From 06655ab9f0657f352220fb554ee2092463d51e5d Mon Sep 17 00:00:00 2001
From: Matheus Centa
Date: Mon, 9 Dec 2019 15:46:30 +0100
Subject: [PATCH 1/5] Add pretrained embeddings option when all walks are in memory

---
 deepwalk/__main__.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/deepwalk/__main__.py b/deepwalk/__main__.py
index 1070e07..3b72268 100644
--- a/deepwalk/__main__.py
+++ b/deepwalk/__main__.py
@@ -12,7 +12,7 @@
 
 from . import graph
 from . import walks as serialized_walks
-from gensim.models import Word2Vec
+from gensim.models import Word2Vec, KeyedVectors
 from .skipgram import Skipgram
 
 from six import text_type as unicode
@@ -72,7 +72,14 @@ def process(args):
     walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
 
     print("Training...")
-    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
+    model = Word2Vec(size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
+    model.build_vocab(walks)
+    total_examples = model.corpus_count
+    if args.embeddings is not None:
+      pretrained_embeddings = KeyedVectors.load_word2vec_format(args.embeddings, binary=False)
+      model.build_vocab([list(pretrained_embeddings.vocab.keys())], update=True)
+      model.intersect_word2vec_format(args.embeddings, binary=False, lockf=1.0)
+    model.train(walks, total_examples=total_examples, epochs=model.iter)
   else:
     print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
@@ -150,6 +157,9 @@ def main():
   parser.add_argument('--workers', default=1, type=int,
                       help='Number of parallel processes.')
 
+  parser.add_argument('--pretrained', nargs='?',
+                      help='Pre-trained embeddings file')
+
   args = parser.parse_args()
 
   numeric_level = getattr(logging, args.log.upper(), None)
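
The pattern PATCH 1/5 introduces (build the vocabulary from the walks, grow it with the pre-trained vocabulary, copy the matching vectors, then train) can be exercised on its own. The sketch below mirrors the patch's calls against the gensim 3.x API, where size=, model.iter, KeyedVectors.vocab and intersect_word2vec_format all still exist; the toy walks and the pretrained.txt file are illustrative assumptions, not part of the patch.

    from gensim.models import Word2Vec, KeyedVectors

    walks = [['1', '2', '3', '2'], ['3', '4', '1', '3']]  # toy random walks

    # Write a tiny pre-trained file in the word2vec C text format so the
    # sketch is self-contained (header line: "<vocab size> <dimensions>").
    with open('pretrained.txt', 'w') as f:
        f.write('2 64\n')
        for node in ('1', '2'):
            f.write(node + ' ' + ' '.join(['0.1'] * 64) + '\n')

    # Building the model without a corpus argument skips the implicit
    # build_vocab/train, leaving room to splice in pre-trained vectors.
    model = Word2Vec(size=64, window=5, min_count=0, sg=1, hs=1, workers=1)
    model.build_vocab(walks)
    total_examples = model.corpus_count  # corpus size, taken before the vocab grows

    # Add any pre-trained words missing from the walks, then overwrite the
    # randomly initialised vectors of shared words; lockf=1.0 keeps the
    # copied vectors trainable rather than frozen.
    pretrained = KeyedVectors.load_word2vec_format('pretrained.txt', binary=False)
    model.build_vocab([list(pretrained.vocab.keys())], update=True)
    model.intersect_word2vec_format('pretrained.txt', binary=False, lockf=1.0)

    model.train(walks, total_examples=total_examples, epochs=model.iter)
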
From a6cea2133d22dc0b05fc2b0251d0104e723de82c Mon Sep 17 00:00:00 2001
From: Matheus Centa
Date: Mon, 9 Dec 2019 23:59:36 +0100
Subject: [PATCH 2/5] Make feature compatible with walks on disk

---
 deepwalk/__main__.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/deepwalk/__main__.py b/deepwalk/__main__.py
index 3b72268..51644cd 100644
--- a/deepwalk/__main__.py
+++ b/deepwalk/__main__.py
@@ -8,6 +8,7 @@
 from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
 from collections import Counter
 from concurrent.futures import ProcessPoolExecutor
+from itertools import tee
 import logging
 
 from . import graph
@@ -97,10 +98,17 @@ def process(args):
       vertex_counts = G.degree(nodes=G.iterkeys())
 
     print("Training...")
-    walks_corpus = serialized_walks.WalksCorpus(walk_files)
-    model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
-                     size=args.representation_size,
+    vocab_walks_corpus, train_walks_corpus = tee(serialized_walks.WalksCorpus(walk_files), 2)
+
+    model = Skipgram(vocabulary_counts=vertex_counts, size=args.representation_size,
                      window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
+    model.build_vocab(vocab_walks_corpus)
+    total_examples = model.corpus_count
+    if args.embeddings is not None:
+      pretrained_embeddings = KeyedVectors.load_word2vec_format(args.embeddings, binary=False)
+      model.build_vocab([list(pretrained_embeddings.vocab.keys())], update=True)
+      model.intersect_word2vec_format(args.embeddings, binary=False, lockf=1.0)
+    model.train(train_walks_corpus, total_examples=total_examples, epochs=model.iter)
 
   model.wv.save_word2vec_format(args.output)

From 6cc7adedc7b734e5e3fdb2bbc5e1efd57ca7f3c5 Mon Sep 17 00:00:00 2001
From: Matheus Centa
Date: Tue, 10 Dec 2019 14:23:59 +0100
Subject: [PATCH 3/5] Fix misnomer in the code

---
 deepwalk/__main__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/deepwalk/__main__.py b/deepwalk/__main__.py
index 51644cd..b946cc5 100644
--- a/deepwalk/__main__.py
+++ b/deepwalk/__main__.py
@@ -76,10 +76,10 @@ def process(args):
     model = Word2Vec(size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
     model.build_vocab(walks)
     total_examples = model.corpus_count
-    if args.embeddings is not None:
-      pretrained_embeddings = KeyedVectors.load_word2vec_format(args.embeddings, binary=False)
+    if args.pretrained is not None:
+      pretrained_embeddings = KeyedVectors.load_word2vec_format(args.pretrained, binary=False)
       model.build_vocab([list(pretrained_embeddings.vocab.keys())], update=True)
-      model.intersect_word2vec_format(args.embeddings, binary=False, lockf=1.0)
+      model.intersect_word2vec_format(args.pretrained, binary=False, lockf=1.0)
       model.train(walks, total_examples=total_examples, epochs=model.iter)
   else:
     print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
@@ -104,10 +104,10 @@ def process(args):
                      window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
     model.build_vocab(vocab_walks_corpus)
     total_examples = model.corpus_count
-    if args.embeddings is not None:
-      pretrained_embeddings = KeyedVectors.load_word2vec_format(args.embeddings, binary=False)
+    if args.pretrained is not None:
+      pretrained_embeddings = KeyedVectors.load_word2vec_format(args.pretrained, binary=False)
       model.build_vocab([list(pretrained_embeddings.vocab.keys())], update=True)
-      model.intersect_word2vec_format(args.embeddings, binary=False, lockf=1.0)
+      model.intersect_word2vec_format(args.pretrained, binary=False, lockf=1.0)
       model.train(train_walks_corpus, total_examples=total_examples, epochs=model.iter)
 
   model.wv.save_word2vec_format(args.output)
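
PATCH 2/5 needs two passes over the serialized walks, one for build_vocab and one for train, but WalksCorpus yields each walk only once; itertools.tee supplies the second pass. A minimal sketch of the same trick, assuming a hypothetical walks.txt file and a stand-in walk_lines() generator in place of WalksCorpus:

    from itertools import tee

    # A tiny walks file, one random walk per line, so the sketch runs as-is.
    with open('walks.txt', 'w') as f:
        f.write('1 2 3 2\n3 4 1 3\n')

    def walk_lines(path):
        """Single-pass generator: yields one walk (a list of node ids) per line."""
        with open(path) as f:
            for line in f:
                yield line.split()

    # tee() splits one single-pass iterator into two logical streams, so the
    # same on-disk corpus can feed both build_vocab() and train().
    vocab_pass, train_pass = tee(walk_lines('walks.txt'), 2)

    # Note that tee buffers whatever one twin has consumed and the other has
    # not, so exhausting vocab_pass first holds the corpus in memory until
    # train_pass catches up.
    vocab = {node for walk in vocab_pass for node in walk}  # stand-in for build_vocab()
    n_walks = sum(1 for _ in train_pass)                    # stand-in for train()
    print(len(vocab), n_walks)  # prints: 4 2
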
From be8b321555e73eb0b8364a8e46760947febe1007 Mon Sep 17 00:00:00 2001
From: Matheus Centa
Date: Fri, 10 Jan 2020 14:27:19 +0100
Subject: [PATCH 4/5] Improve --pretrained help string to state the expected format

---
 deepwalk/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepwalk/__main__.py b/deepwalk/__main__.py
index b946cc5..2ab4c4b 100644
--- a/deepwalk/__main__.py
+++ b/deepwalk/__main__.py
@@ -166,7 +166,7 @@ def main():
                       help='Number of parallel processes.')
 
   parser.add_argument('--pretrained', nargs='?',
-                      help='Pre-trained embeddings file')
+                      help='Pre-trained embeddings file in the “word2vec C format”.')
 
   args = parser.parse_args()

From 15754930412ec239a98ad8b8bc6af6021a1655c5 Mon Sep 17 00:00:00 2001
From: Matheus Centa
Date: Fri, 10 Jan 2020 14:33:57 +0100
Subject: [PATCH 5/5] Make --representation-size and --pretrained mutually exclusive arguments

---
 deepwalk/__main__.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/deepwalk/__main__.py b/deepwalk/__main__.py
index 2ab4c4b..6fc53d4 100644
--- a/deepwalk/__main__.py
+++ b/deepwalk/__main__.py
@@ -142,9 +142,6 @@ def main():
   parser.add_argument('--output', required=True,
                       help='Output representation file')
 
-  parser.add_argument('--representation-size', default=64, type=int,
-                      help='Number of latent dimensions to learn for each node.')
-
   parser.add_argument('--seed', default=0, type=int,
                       help='Seed for random walk generator.')
@@ -165,9 +162,15 @@ def main():
   parser.add_argument('--workers', default=1, type=int,
                       help='Number of parallel processes.')
 
-  parser.add_argument('--pretrained', nargs='?',
-                      help='Pre-trained embeddings file in the “word2vec C format”.')
+  # The --representation-size and --pretrained flags are mutually exclusive in
+  # order to avoid vector dimensions that don't match
+  representation_size_group = parser.add_mutually_exclusive_group(required=True)
+
+  representation_size_group.add_argument('--representation-size', default=64, type=int,
+                                         help='Number of latent dimensions to learn for each node.')
+  representation_size_group.add_argument('--pretrained', nargs='?',
+                                         help='Pre-trained embeddings file in the “word2vec C format”.')
 
   args = parser.parse_args()
 
   numeric_level = getattr(logging, args.log.upper(), None)
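
With the group in PATCH 5/5 marked required=True, argparse accepts exactly one of the two flags per invocation: passing both is an error, and so is passing neither. A self-contained sketch of that behaviour; the flag definitions mirror the patch, while the example command lines and embeddings.txt are illustrative only:

    from argparse import ArgumentParser

    parser = ArgumentParser(prog='deepwalk')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--representation-size', default=64, type=int,
                       help='Number of latent dimensions to learn for each node.')
    group.add_argument('--pretrained', nargs='?',
                       help='Pre-trained embeddings file in the word2vec C format.')

    print(parser.parse_args(['--representation-size', '128']))
    # Namespace(pretrained=None, representation_size=128)
    print(parser.parse_args(['--pretrained', 'embeddings.txt']))
    # Namespace(pretrained='embeddings.txt', representation_size=64)

    # parser.parse_args(['--representation-size', '64', '--pretrained', 'x'])
    #   -> error: argument --pretrained: not allowed with argument --representation-size
    # parser.parse_args([])
    #   -> error: one of the arguments --representation-size --pretrained is required
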