Skip to content

Commit

Permalink
stop using JSON; switch to boost serialization for hypergraph IO
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris Dyer committed Oct 19, 2014
1 parent e8451a3 commit 011a87c
Show file tree
Hide file tree
Showing 16 changed files with 156 additions and 116 deletions.
4 changes: 2 additions & 2 deletions decoder/decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -930,7 +930,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
Hypergraph new_hg;
{
ReadFile rf(writer.fname_);
bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg);
bool succeeded = HypergraphIO::ReadFromBinary(rf.stream(), &new_hg);
if (!succeeded) abort();
}
HG::Union(forest, &new_hg);
Expand Down Expand Up @@ -1023,7 +1023,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
Hypergraph new_hg;
{
ReadFile rf(writer.fname_);
bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg);
bool succeeded = HypergraphIO::ReadFromBinary(rf.stream(), &new_hg);
if (!succeeded) abort();
}
HG::Union(forest, &new_hg);
Expand Down
4 changes: 2 additions & 2 deletions decoder/forest_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
using namespace std;

ForestWriter::ForestWriter(const std::string& path, int num) :
fname_(path + '/' + boost::lexical_cast<string>(num) + ".json.gz"), used_(false) {}
fname_(path + '/' + boost::lexical_cast<string>(num) + ".bin.gz"), used_(false) {}

bool ForestWriter::Write(const Hypergraph& forest, bool minimal_rules) {
assert(!used_);
used_ = true;
cerr << " Writing forest to " << fname_ << endl;
WriteFile wf(fname_);
return HypergraphIO::WriteToJSON(forest, minimal_rules, wf.stream());
return HypergraphIO::WriteToBinary(forest, wf.stream());
}

52 changes: 52 additions & 0 deletions decoder/hg.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/serialization/vector.hpp>

#include "feature_vector.h"
#include "small_vector.h"
Expand Down Expand Up @@ -69,6 +70,18 @@ namespace HG {
short int j_;
short int prev_i_;
short int prev_j_;
template<class Archive>
void serialize(Archive & ar, const unsigned int version) {
ar & head_node_;
ar & tail_nodes_;
ar & rule_;
ar & feature_values_;
ar & i_;
ar & j_;
ar & prev_i_;
ar & prev_j_;
ar & id_;
}
void show(std::ostream &o,unsigned mask=SPAN|RULE) const {
o<<'{';
if (mask&CATEGORY)
Expand Down Expand Up @@ -149,6 +162,24 @@ namespace HG {
WordID NT() const { return -cat_; }
EdgesVector in_edges_; // an in edge is an edge with this node as its head. (in edges come from the bottom up to us) indices in edges_
EdgesVector out_edges_; // an out edge is an edge with this node as its tail. (out edges leave us up toward the top/goal). indices in edges_
// Boost.Serialization support (split because load must rebuild cat_).
// cat_ stores a nonterminal category as a *negative* vocabulary id, and
// vocabulary ids are not stable across processes, so the category is
// round-tripped through its string form via TD::Convert.
template<class Archive>
void save(Archive & ar, const unsigned int version) const {
  ar & node_hash;
  ar & id_;
  // Serialize a named lvalue rather than the temporary returned by
  // TD::Convert, mirroring load() and keeping the archive expression
  // well-behaved for archives that take non-const references.
  const std::string cat = TD::Convert(-cat_);
  ar & cat;
  ar & in_edges_;
  ar & out_edges_;
}
template<class Archive>
void load(Archive & ar, const unsigned int version) {
  ar & node_hash;
  ar & id_;
  std::string cat; ar & cat;
  cat_ = -TD::Convert(cat);  // map the string back to the (negative) category id
  ar & in_edges_;
  ar & out_edges_;
}
BOOST_SERIALIZATION_SPLIT_MEMBER()
void copy_fixed(Node const& o) { // nonstructural fields only - structural ones are managed by sorting/pruning/subsetting
node_hash = o.node_hash;
cat_=o.cat_;
Expand Down Expand Up @@ -492,6 +523,27 @@ class Hypergraph {
void set_ids(); // resync edge,node .id_
void check_ids() const; // assert that .id_ have been kept in sync

// Boost.Serialization support (split: load must size the containers before
// reading elements). The field order below is the on-disk format — do not
// reorder. NOTE(review): serializing edges_ pulls in Edge::rule_
// (a shared_ptr), so any TU instantiating these templates must include
// <boost/serialization/shared_ptr.hpp> (hg_io.cc does) — confirm the same
// for every other consumer.
template<class Archive>
void save(Archive & ar, const unsigned int version) const {
// Counts are written first so load() can resize before reading elements.
unsigned ns = nodes_.size(); ar & ns;
unsigned es = edges_.size(); ar & es;
for (auto& n : nodes_) ar & n;
for (auto& e : edges_) ar & e;
// The two flags are round-tripped as int, keeping the archive layout
// independent of how the archive encodes bool.
int x;
x = edges_topo_; ar & x;
x = is_linear_chain_; ar & x;
}
template<class Archive>
void load(Archive & ar, const unsigned int version) {
// Read counts, size the containers, then deserialize in place — element
// reads dispatch to Node::load / Edge::serialize.
unsigned ns; ar & ns; nodes_.resize(ns);
unsigned es; ar & es; edges_.resize(es);
for (auto& n : nodes_) ar & n;
for (auto& e : edges_) ar & e;
int x;
ar & x; edges_topo_ = x;
ar & x; is_linear_chain_ = x;
}
BOOST_SERIALIZATION_SPLIT_MEMBER()
private:
Hypergraph(int num_nodes, int num_edges, bool is_lc) : is_linear_chain_(is_lc), nodes_(num_nodes), edges_(num_edges),edges_topo_(true) {}
};
Expand Down
101 changes: 12 additions & 89 deletions decoder/hg_io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#include <sstream>
#include <iostream>

#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/serialization/shared_ptr.hpp>

#include "fast_lexical_cast.hpp"

#include "tdict.h"
Expand Down Expand Up @@ -271,97 +275,16 @@ bool HypergraphIO::ReadFromJSON(istream* in, Hypergraph* hg) {
return reader.Parse(in);
}

static void WriteRule(const TRule& r, ostream* out) {
if (!r.lhs_) { (*out) << "[X] ||| "; }
JSONParser::WriteEscapedString(r.AsString(), out);
// Deserialize a hypergraph previously written by WriteToBinary.
// Returns false on a corrupt/truncated archive instead of letting the
// Boost archive exception escape — callers (e.g. decoder.cc) test the
// returned bool and abort/skip on failure, matching the contract of the
// JSON reader this replaces.
bool HypergraphIO::ReadFromBinary(istream* in, Hypergraph* hg) {
  hg->clear();
  try {
    boost::archive::binary_iarchive ia(*in);  // input archive (was misnamed 'oa')
    ia >> *hg;
  } catch (const std::exception& e) {
    cerr << "HypergraphIO::ReadFromBinary failed: " << e.what() << endl;
    return false;
  }
  return true;
}

bool HypergraphIO::WriteToJSON(const Hypergraph& hg, bool remove_rules, ostream* out) {
if (hg.empty()) { *out << "{}\n"; return true; }
map<const TRule*, int> rid;
ostream& o = *out;
rid[NULL] = 0;
o << '{';
if (!remove_rules) {
o << "\"rules\":[";
for (int i = 0; i < hg.edges_.size(); ++i) {
const TRule* r = hg.edges_[i].rule_.get();
int &id = rid[r];
if (!id) {
id=rid.size() - 1;
if (id > 1) o << ',';
o << id << ',';
WriteRule(*r, &o);
};
}
o << "],";
}
const bool use_fdict = FD::NumFeats() < 1000;
if (use_fdict) {
o << "\"features\":[";
for (int i = 1; i < FD::NumFeats(); ++i) {
o << (i==1 ? "":",");
JSONParser::WriteEscapedString(FD::Convert(i), &o);
}
o << "],";
}
vector<int> edgemap(hg.edges_.size(), -1); // edges may be in non-topo order
int edge_count = 0;
for (int i = 0; i < hg.nodes_.size(); ++i) {
const Hypergraph::Node& node = hg.nodes_[i];
if (i > 0) { o << ","; }
o << "\"edges\":[";
for (int j = 0; j < node.in_edges_.size(); ++j) {
const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
edgemap[edge.id_] = edge_count;
++edge_count;
o << (j == 0 ? "" : ",") << "{";

o << "\"tail\":[";
for (int k = 0; k < edge.tail_nodes_.size(); ++k) {
o << (k > 0 ? "," : "") << edge.tail_nodes_[k];
}
o << "],";

o << "\"spans\":[" << edge.i_ << "," << edge.j_ << "," << edge.prev_i_ << "," << edge.prev_j_ << "],";

o << "\"feats\":[";
bool first = true;
for (SparseVector<double>::const_iterator it = edge.feature_values_.begin(); it != edge.feature_values_.end(); ++it) {
if (!it->second) continue; // don't write features that have a zero value
if (!it->first) continue; // if the feature set was frozen this might happen
if (!first) o << ',';
if (use_fdict)
o << (it->first - 1);
else {
JSONParser::WriteEscapedString(FD::Convert(it->first), &o);
}
o << ',' << it->second;
first = false;
}
o << "]";
if (!remove_rules) { o << ",\"rule\":" << rid[edge.rule_.get()]; }
o << "}";
}
o << "],";

o << "\"node\":{\"in_edges\":[";
for (int j = 0; j < node.in_edges_.size(); ++j) {
int mapped_edge = edgemap[node.in_edges_[j]];
assert(mapped_edge >= 0);
o << (j == 0 ? "" : ",") << mapped_edge;
}
o << "]";
if (node.cat_ < 0) {
o << ",\"cat\":";
JSONParser::WriteEscapedString(TD::Convert(node.cat_ * -1), &o);
}
char buf[48];
sprintf(buf, "%016lX", node.node_hash);
o << ",\"node_hash\":\"" << buf << "\"";
o << "}";
}
o << "}\n";
bool HypergraphIO::WriteToBinary(const Hypergraph& hg, ostream* out) {
boost::archive::binary_oarchive oa(*out);
oa << hg;
return true;
}

Expand Down
5 changes: 3 additions & 2 deletions decoder/hg_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ struct HypergraphIO {
// see test_data/small.json.gz for an example encoding
static bool ReadFromJSON(std::istream* in, Hypergraph* out);

static bool ReadFromBinary(std::istream* in, Hypergraph* out);
static bool WriteToBinary(const Hypergraph& hg, std::ostream* out);

// if remove_rules is used, the hypergraph is serialized without rule information
// (so it only contains structure and feature information)
static bool WriteToJSON(const Hypergraph& hg, bool remove_rules, std::ostream* out);

static void WriteAsCFG(const Hypergraph& hg);

// Write only the target size information in bottom-up order.
Expand Down
39 changes: 27 additions & 12 deletions decoder/hg_test.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#define BOOST_TEST_MODULE hg_test
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/serialization/shared_ptr.hpp>
#include <boost/serialization/vector.hpp>
#include <sstream>
#include <iostream>
#include "tdict.h"

Expand Down Expand Up @@ -427,19 +432,29 @@ BOOST_AUTO_TEST_CASE(TestGenericKBest) {
}
}

BOOST_AUTO_TEST_CASE(TestReadWriteHG) {
BOOST_AUTO_TEST_CASE(TestReadWriteHG_Boost) {
std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA);
Hypergraph hg,hg2;
CreateHG(path, &hg);
hg.edges_.front().j_ = 23;
hg.edges_.back().prev_i_ = 99;
ostringstream os;
HypergraphIO::WriteToJSON(hg, false, &os);
istringstream is(os.str());
HypergraphIO::ReadFromJSON(&is, &hg2);
BOOST_CHECK_EQUAL(hg2.NumberOfPaths(), hg.NumberOfPaths());
BOOST_CHECK_EQUAL(hg2.edges_.front().j_, 23);
BOOST_CHECK_EQUAL(hg2.edges_.back().prev_i_, 99);
Hypergraph hg;
Hypergraph hg2;
std::string out;
{
CreateHG(path, &hg);
hg.edges_.front().j_ = 23;
hg.edges_.back().prev_i_ = 99;
ostringstream os;
boost::archive::text_oarchive oa(os);
oa << hg;
out = os.str();
}
{
cerr << out << endl;
istringstream is(out);
boost::archive::text_iarchive ia(is);
ia >> hg2;
BOOST_CHECK_EQUAL(hg2.NumberOfPaths(), hg.NumberOfPaths());
BOOST_CHECK_EQUAL(hg2.edges_.front().j_, 23);
BOOST_CHECK_EQUAL(hg2.edges_.back().prev_i_, 99);
}
}

BOOST_AUTO_TEST_SUITE_END()
1 change: 1 addition & 0 deletions decoder/rule_lexer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ void RuleLexer::ReadRules(std::istream* in, RuleLexer::RuleCallback func, const
void RuleLexer::ReadRule(const std::string& srule, RuleCallback func, bool mono, void* extra) {
init_default_feature_names();
scfglex_fname = srule;
lex_mono_rules = mono;
lex_line = 1;
rule_callback_extra = extra;
Expand Down
3 changes: 2 additions & 1 deletion python/cdec/hypergraph.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ cdef extern from "decoder/viterbi.h":
cdef extern from "decoder/hg_io.h" namespace "HypergraphIO":
# Hypergraph JSON / binary I/O
bint ReadFromJSON(istream* inp, Hypergraph* out)
bint WriteToJSON(Hypergraph& hg, bint remove_rules, ostream* out)
bint ReadFromBinary(istream* inp, Hypergraph* out)
bint WriteToBinary(Hypergraph& hg, ostream* out)
# Hypergraph PLF I/O
void ReadFromPLF(string& inp, Hypergraph* out)
string AsPLF(Hypergraph& hg, bint include_global_parentheses)
Expand Down
2 changes: 1 addition & 1 deletion training/dpmert/mr_dpmert_generate_mapper_input.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ int main(int argc, char** argv) {
unsigned dev_set_size = conf["dev_set_size"].as<unsigned>();
for (unsigned i = 0; i < dev_set_size; ++i) {
for (unsigned j = 0; j < directions.size(); ++j) {
cout << forest_repository << '/' << i << ".json.gz " << i << ' ';
cout << forest_repository << '/' << i << ".bin.gz " << i << ' ';
print(cout, origin, "=", ";");
cout << ' ';
print(cout, directions[j], "=", ";");
Expand Down
4 changes: 2 additions & 2 deletions training/dpmert/mr_dpmert_map.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ int main(int argc, char** argv) {
istringstream is(line);
int sent_id;
string file, s_origin, s_direction;
// path-to-file (JSON) sent_ed starting-point search-direction
// path-to-file sent_id starting-point search-direction
is >> file >> sent_id >> s_origin >> s_direction;
SparseVector<double> origin;
ReadSparseVectorString(s_origin, &origin);
Expand All @@ -93,7 +93,7 @@ int main(int argc, char** argv) {
if (last_file != file) {
last_file = file;
ReadFile rf(file);
HypergraphIO::ReadFromJSON(rf.stream(), &hg);
HypergraphIO::ReadFromBinary(rf.stream(), &hg);
}
const ConvexHullWeightFunction wf(origin, direction);
const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
Expand Down
2 changes: 1 addition & 1 deletion training/minrisk/minrisk_optimize.cc
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ int main(int argc, char** argv) {
ReadFile rf(file);
if (kis.size() % 5 == 0) { cerr << '.'; }
if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; }
HypergraphIO::ReadFromJSON(rf.stream(), &hg);
HypergraphIO::ReadFromBinary(rf.stream(), &hg);
hg.Reweight(weights);
curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
if (kbest_file.size())
Expand Down
2 changes: 1 addition & 1 deletion training/pro/mr_pro_map.cc
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ int main(int argc, char** argv) {
const string kbest_file = os.str();
if (FileExists(kbest_file))
J_i.ReadFromFile(kbest_file);
HypergraphIO::ReadFromJSON(rf.stream(), &hg);
HypergraphIO::ReadFromBinary(rf.stream(), &hg);
hg.Reweight(weights);
J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
J_i.WriteToFile(kbest_file);
Expand Down
2 changes: 1 addition & 1 deletion training/rampion/rampion_cccp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ int main(int argc, char** argv) {
ReadFile rf(file);
if (kis.size() % 5 == 0) { cerr << '.'; }
if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; }
HypergraphIO::ReadFromJSON(rf.stream(), &hg);
HypergraphIO::ReadFromBinary(rf.stream(), &hg);
hg.Reweight(weights);
curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
if (kbest_file.size())
Expand Down
5 changes: 3 additions & 2 deletions training/utils/grammar_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::notify(*conf);

if (conf->count("help") || conf->count("input") == 0) {
cerr << "\nUsage: grammar_convert [-options]\n\nConverts a grammar file (in Hiero format) into JSON hypergraph.\n";
cerr << "\nUsage: grammar_convert [-options]\n\nConverts a grammar file (in Hiero format) into serialized hypergraph.\n";
cerr << dcmdline_options << endl;
exit(1);
}
Expand Down Expand Up @@ -254,7 +254,8 @@ void ProcessHypergraph(const vector<double>& w, const po::variables_map& conf, c
if (w.size() > 0) { hg->Reweight(w); }
if (conf.count("collapse_weights")) CollapseWeights(hg);
if (conf["output"].as<string>() == "json") {
HypergraphIO::WriteToJSON(*hg, false, &cout);
cerr << "NOT IMPLEMENTED ... talk to cdyer if you need this functionality\n";
// HypergraphIO::WriteToBinary(*hg, &cout);
if (!ref.empty()) { cerr << "REF: " << ref << endl; }
} else {
vector<WordID> onebest;
Expand Down
Loading

0 comments on commit 011a87c

Please sign in to comment.