Skip to content

Commit

Permalink
enable ramdisk scratch for per-sentence-grammars
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris Dyer committed Sep 17, 2011
1 parent 445c624 commit ed2b1ce
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 4 deletions.
35 changes: 35 additions & 0 deletions training/mpi_batch_optimize.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ namespace mpi = boost::mpi;
#include "ff_register.h"
#include "decoder.h"
#include "filelib.h"
#include "stringlib.h"
#include "optimize.h"
#include "fdict.h"
#include "weights.h"
Expand All @@ -42,6 +43,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
("gaussian_prior,p","Use a Gaussian prior on the weights")
("means,u", po::value<string>(), "File containing the means for Gaussian prior")
("per_sentence_grammar_scratch,P", po::value<string>(), "(Optional) location of scratch space to copy per-sentence grammars for fast access, useful if a RAM disk is available")
("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior");
po::options_description clo("Command line options");
clo.add_options()
Expand Down Expand Up @@ -186,6 +188,36 @@ struct VectorPlus : public binary_function<vector<T>, vector<T>, vector<T> > {
}
};

// Stages the per-sentence grammars referenced by the corpus in a scratch
// directory below `root` and rewrites the corpus entries to use the copies.
//
//   root  existing directory to create the scratch directory in; the process
//         aborts with a message on stderr if it does not exist
//   size, rank  used only to name the scratch directory,
//         <root>/psg.<size>_of_<rank>
//   c     corpus lines; every entry is replaced in place
//
// Each entry is run through ProcessAndStripSGML. If the resulting attributes
// contain "grammar", that file is copied to <scratch dir>/<md5(sentence)>
// (plus ".gz" when the source name ends in ".gz") and the attribute is
// pointed at the copy. The entry is then rebuilt from SGMLOpenSegTag(attrs),
// the sentence text and a closing " </seg>", whether or not a grammar
// attribute was present.
//
// NOTE(review): the directory name is built as <size>_of_<rank>, and the
// call site in main() passes its arguments as (rank, size). Confirm the
// intended order before changing either side, since the two currently
// depend on each other.
void MovePerSentenceGrammars(const string& root, int size, int rank, vector<string>* c) {
  if (!DirectoryExists(root)) {
    cerr << "Can't find scratch space at " << root << endl;
    abort();
  }

  ostringstream dir_name;
  dir_name << root << "/psg." << size << "_of_" << rank;
  const string scratch_dir = dir_name.str();
  MkDirP(scratch_dir);

  for (vector<string>::iterator line = c->begin(); line != c->end(); ++line) {
    // Work on a copy: the entry is only overwritten once it has been rebuilt.
    string text = *line;
    map<string, string> attrs;
    ProcessAndStripSGML(&text, &attrs);

    const map<string, string>::iterator grammar = attrs.find("grammar");
    if (grammar != attrs.end()) {
      const string& original = grammar->second;
      // Keep the ".gz" suffix on the copy so the name still says it is gzipped.
      const bool gzipped = original.size() > 3 &&
                           original.compare(original.size() - 3, 3, ".gz") == 0;
      string scratch_copy = scratch_dir + "/" + md5(text);
      if (gzipped) scratch_copy += ".gz";
      CopyFile(original, scratch_copy);
      grammar->second = scratch_copy;
    }

    ostringstream rebuilt;
    rebuilt << SGMLOpenSegTag(attrs) << ' ' << text << " </seg>";
    *line = rebuilt.str();
  }
}

int main(int argc, char** argv) {
#ifdef HAVE_MPI
mpi::environment env(argc, argv);
Expand Down Expand Up @@ -257,6 +289,9 @@ int main(int argc, char** argv) {
ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
assert(corpus.size() > 0);

if (conf.count("per_sentence_grammar_scratch"))
MovePerSentenceGrammars(conf["per_sentence_grammar_scratch"].as<string>(), rank, size, &corpus);

TrainingObserver observer;
while (!converged) {
observer.Reset();
Expand Down
19 changes: 19 additions & 0 deletions utils/filelib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/socket.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>

using namespace std;

Expand Down Expand Up @@ -32,3 +38,16 @@ void MkDirP(const string& dir) {
}
}

#if 0
// Disabled variant (compiled out by the surrounding #if 0): opens `outf`
// through WriteFile and hands the copy off to the CopyFile overload that
// takes a source path and an output stream.
void CopyFile(const string& inf, const string& outf) {
WriteFile w(outf);
CopyFile(inf,*w);
}
#else
// Copies the file named `inf` to the file named `outf` byte for byte,
// replacing any previous contents of `outf`.
//
// Failures are fatal: if the source cannot be opened, the destination cannot
// be opened, or the write fails, a message is printed to stderr and the
// process aborts, rather than silently leaving an empty or partial copy.
void CopyFile(const std::string& inf, const std::string& outf) {
  // Open the source first so that a missing input never truncates `outf`.
  std::ifstream in(inf.c_str(), std::fstream::binary);
  if (!in) {
    std::cerr << "CopyFile: can't open " << inf << " for reading" << std::endl;
    std::abort();
  }

  std::ofstream of(outf.c_str(), std::fstream::trunc | std::fstream::binary);
  if (!of) {
    std::cerr << "CopyFile: can't open " << outf << " for writing" << std::endl;
    std::abort();
  }

  // operator<<(streambuf*) sets failbit when it inserts no characters, so an
  // empty source is skipped here to keep the error check below meaningful.
  if (in.peek() != std::ifstream::traits_type::eof()) {
    of << in.rdbuf();
  }
  of.flush();

  if (!of || in.bad()) {
    std::cerr << "CopyFile: error copying " << inf << " to " << outf << std::endl;
    std::abort();
  }
}
#endif

5 changes: 1 addition & 4 deletions utils/filelib.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,6 @@ inline void CopyFile(std::string const& inf,std::ostream &out) {
CopyFile(*r,out);
}

// Copies the file named `inf` to the file named `outf`: opens `outf` through
// WriteFile and delegates to the (path, ostream) overload of CopyFile.
inline void CopyFile(std::string const& inf,std::string const& outf) {
WriteFile w(outf);
CopyFile(inf,*w);
}
// Copies the file named `inf` to the file named `outf`.
// Declaration only; the definition is out of line (utils/filelib.cc).
void CopyFile(std::string const& inf,std::string const& outf);

#endif

0 comments on commit ed2b1ce

Please sign in to comment.