Skip to content

Commit

Permalink
tool to deduplicate corpus with hashing
Browse files Browse the repository at this point in the history
  • Loading branch information
CHRISTOPHER DYER authored and CHRISTOPHER DYER committed Feb 4, 2015
1 parent c485a6b commit afd6584
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
utils/dedup_corpus
klm/lm/builder/dump_counts
klm/util/cat_compressed
example_extff/ff_example.lo
Expand Down
6 changes: 5 additions & 1 deletion utils/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
bin_PROGRAMS = reconstruct_weights atools
bin_PROGRAMS = reconstruct_weights atools dedup_corpus

noinst_PROGRAMS = \
ts \
Expand Down Expand Up @@ -98,6 +98,10 @@ atools_SOURCES = atools.cc
atools_LDADD = libutils.a
atools_LDFLAGS = $(STATIC_FLAGS)

dedup_corpus_SOURCES = dedup_corpus.cc
dedup_corpus_LDADD = libutils.a
dedup_corpus_LDFLAGS = $(STATIC_FLAGS)

phmt_SOURCES = phmt.cc
phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
ts_SOURCES = ts.cc
Expand Down
21 changes: 21 additions & 0 deletions utils/dedup_corpus.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include <iostream>
#include "hash.h"

using namespace std;

// Size hint handed to the constructor of the hash set that records the
// hashes of lines already seen (see main).
#define INITIAL_SIZE 20000000

// Deduplicates a corpus read from standard input.
//
// Each input line is hashed with cdec::MurmurHash3_64 (seed 17) and written
// to standard output only the first time its hash value is seen, so the
// first occurrence of every line is kept and input order is preserved.
// Only the 64-bit hash is stored, not the line itself, so two different
// lines whose hashes collide are treated as duplicates.
//
// The program takes no command-line arguments. Returns 0 on success and 1
// if arguments were supplied or if writing to standard output failed.
int main(int argc, char **argv) {
  if (argc != 1) {
    std::cerr << "Usage: " << argv[0] << " < file.txt\n";
    return 1;
  }

  // Only iostreams are used for bulk line I/O here, so C stdio
  // synchronisation and the cin->cout tie (which flushes cout before reads)
  // are unnecessary overhead.
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(0);

  SPARSE_HASH_SET<uint64_t> seen(INITIAL_SIZE);
  std::string line;
  while (std::getline(std::cin, line)) {
    // data() is valid for an empty line; non-const operator[] at index
    // size() is undefined behaviour on pre-C++11 standard libraries.
    const uint64_t h = cdec::MurmurHash3_64(line.data(), line.size(), 17);
    // insert().second is true only when the hash was not already present.
    if (seen.insert(h).second)
      std::cout << line << '\n';
  }

  // Report write failures (e.g. full disk, closed pipe) rather than exiting
  // successfully with truncated output.
  std::cout.flush();
  if (!std::cout) {
    std::cerr << argv[0] << ": error writing to standard output\n";
    return 1;
  }
  return 0;
}

3 changes: 3 additions & 0 deletions utils/hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
# include <sparsehash/dense_hash_map>
# include <sparsehash/dense_hash_set>
# include <sparsehash/sparse_hash_map>
# include <sparsehash/sparse_hash_set>
# define SPARSE_HASH_MAP google::sparse_hash_map
# define SPARSE_HASH_SET google::sparse_hash_set
# define HASH_MAP google::dense_hash_map
# define HASH_SET google::dense_hash_set
# define HASH_MAP_DELETED(h,deleted) do { (h).set_deleted_key(deleted); } while(0)
Expand All @@ -29,6 +31,7 @@
namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; }
#endif
# define SPARSE_HASH_MAP std::unordered_map
# define SPARSE_HASH_SET std::unordered_set
# define HASH_MAP std::unordered_map
# define HASH_SET std::unordered_set
# define HASH_MAP_DELETED(h,deleted)
Expand Down

0 comments on commit afd6584

Please sign in to comment.