From 00c86d09662bbf48345dbd3ef430a2e004bbe682 Mon Sep 17 00:00:00 2001
From: Sourav Singh
Date: Fri, 21 Oct 2016 07:30:00 -0700
Subject: [PATCH] PEP8 fixes for Gensim

---
 ez_setup.py | 32 +-
 gensim/__init__.py | 8 +-
 gensim/corpora/__init__.py | 3 +-
 gensim/corpora/bleicorpus.py | 36 +-
 gensim/corpora/csvcorpus.py | 3 +-
 gensim/corpora/dictionary.py | 92 +-
 gensim/corpora/hashdictionary.py | 71 +-
 gensim/corpora/indexedcorpus.py | 43 +-
 gensim/corpora/lowcorpus.py | 46 +-
 gensim/corpora/malletcorpus.py | 22 +-
 gensim/corpora/mmcorpus.py | 16 +-
 gensim/corpora/sharded_corpus.py | 121 ++-
 gensim/corpora/svmlightcorpus.py | 13 +-
 gensim/corpora/textcorpus.py | 4 +-
 gensim/corpora/ucicorpus.py | 55 +-
 gensim/corpora/wikicorpus.py | 95 +-
 gensim/examples/dmlcz/__init__.py | 1 -
 gensim/examples/dmlcz/dmlcorpus.py | 89 +-
 gensim/examples/dmlcz/gensim_build.py | 43 +-
 gensim/examples/dmlcz/gensim_genmodel.py | 45 +-
 gensim/examples/dmlcz/gensim_xml.py | 95 +-
 gensim/examples/dmlcz/sources.py | 122 +--
 gensim/interfaces.py | 58 +-
 gensim/matutils.py | 276 ++++--
 gensim/models/__init__.py | 10 +-
 gensim/models/basemodel.py | 9 +-
 gensim/models/coherencemodel.py | 102 +-
 gensim/models/doc2vec.py | 452 +++++++--
 gensim/models/hdpmodel.py | 114 ++-
 gensim/models/lda_dispatcher.py | 104 +-
 gensim/models/lda_worker.py | 65 +-
 gensim/models/ldamodel.py | 352 +++++--
 gensim/models/ldamulticore.py | 92 +-
 gensim/models/ldaseqmodel.py | 518 +++++---
 gensim/models/logentropy_model.py | 7 +-
 gensim/models/lsi_dispatcher.py | 65 +-
 gensim/models/lsi_worker.py | 30 +-
 gensim/models/lsimodel.py | 329 +++++--
 gensim/models/normmodel.py | 6 +-
 gensim/models/phrases.py | 51 +-
 gensim/models/rpmodel.py | 31 +-
 gensim/models/tfidfmodel.py | 25 +-
 gensim/models/word2vec.py | 924 +++++++++++++----
 gensim/models/wrappers/dtmmodel.py | 91 +-
 gensim/models/wrappers/ldamallet.py | 141 ++-
 gensim/models/wrappers/ldavowpalwabbit.py | 28 +-
 gensim/nosy.py | 4 +-
 gensim/parsing/porter.py | 167 ++--
 gensim/parsing/preprocessing.py | 25 +-
 gensim/scripts/glove2word2vec.py | 47 +-
 gensim/scripts/make_wiki_online_nodebug.py | 30 +-
 gensim/scripts/make_wikicorpus.py | 30 +-
 gensim/scripts/word2vec_standalone.py | 92 +-
 gensim/similarities/docsim.py | 254 +++--
 gensim/similarities/index.py | 30 +-
 gensim/summarization/__init__.py | 2 +-
 gensim/summarization/bm25.py | 25 +-
 gensim/summarization/commons.py | 3 +-
 gensim/summarization/graph.py | 14 +-
 gensim/summarization/keywords.py | 39 +-
 gensim/summarization/pagerank_weighted.py | 9 +-
 gensim/summarization/summarizer.py | 56 +-
 gensim/summarization/syntactic_unit.py | 3 +-
 gensim/summarization/textcleaner.py | 40 +-
 gensim/topic_coherence/aggregation.py | 1 +
 .../direct_confirmation_measure.py | 22 +-
 .../indirect_confirmation_measure.py | 45 +-
 .../topic_coherence/probability_estimation.py | 17 +-
 gensim/topic_coherence/segmentation.py | 3 +
 gensim/utils.py | 250 +++--
 setup.py | 33 +-
 71 files changed, 4390 insertions(+), 1786 deletions(-)

diff --git a/ez_setup.py b/ez_setup.py
index 4da59fcd76..086c1497cd 100644
--- a/ez_setup.py
+++ b/ez_setup.py
@@ -32,12 +32,15 @@
 DEFAULT_VERSION = "1.3.2"
 DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/"
 
+
 def _python_cmd(*args):
     args = (sys.executable,) + args
     return subprocess.call(args) == 0
 
+
 def _check_call_py24(cmd, *args, **kwargs):
     res = subprocess.call(cmd, *args, **kwargs)
+
     class CalledProcessError(Exception):
         pass
     if not res == 0:
         raise
CalledProcessError(msg) vars(subprocess).setdefault('check_call', _check_call_py24) + def _install(tarball, install_args=()): # extracting the tarball tmpdir = tempfile.mkdtemp() @@ -137,11 +141,11 @@ def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, e = sys.exc_info()[1] if was_imported: sys.stderr.write( - "The required version of setuptools (>=%s) is not available,\n" - "and can't be installed while this script is running. Please\n" - "install a more recent version first, using\n" - "'easy_install -U setuptools'." - "\n\n(Currently using %r)\n" % (version, e.args[0])) + "The required version of setuptools (>=%s) is not available,\n" + "and can't be installed while this script is running. Please\n" + "install a more recent version first, using\n" + "'easy_install -U setuptools'." + "\n\n(Currently using %r)\n" % (version, e.args[0])) sys.exit(2) else: del pkg_resources, sys.modules['pkg_resources'] # reload ok @@ -151,6 +155,7 @@ def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, return _do_download(version, download_base, to_dir, download_delay) + def _clean_check(cmd, target): """ Run the command to download target. If the command fails, clean up before @@ -163,6 +168,7 @@ def _clean_check(cmd, target): os.unlink(target) raise + def download_file_powershell(url, target): """ Download the file at url to target using Powershell (which will validate @@ -172,10 +178,12 @@ def download_file_powershell(url, target): cmd = [ 'powershell', '-Command', - "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" % vars(), + "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" % + vars(), ] _clean_check(cmd, target) + def has_powershell(): if platform.system() != 'Windows': return False @@ -192,10 +200,12 @@ def has_powershell(): download_file_powershell.viable = has_powershell + def download_file_curl(url, target): cmd = ['curl', url, '--silent', '--output', target] _clean_check(cmd, target) + def has_curl(): cmd = ['curl', '--version'] devnull = open(os.path.devnull, 'wb') @@ -210,10 +220,12 @@ def has_curl(): download_file_curl.viable = has_curl + def download_file_wget(url, target): cmd = ['wget', url, '--quiet', '--output-document', target] _clean_check(cmd, target) + def has_wget(): cmd = ['wget', '--version'] devnull = open(os.path.devnull, 'wb') @@ -228,6 +240,7 @@ def has_wget(): download_file_wget.viable = has_wget + def download_file_insecure(url, target): """ Use Python to download the file, even though it cannot authenticate the @@ -253,6 +266,7 @@ def download_file_insecure(url, target): download_file_insecure.viable = lambda: True + def get_best_downloader(): downloaders = [ download_file_powershell, @@ -265,6 +279,7 @@ def get_best_downloader(): if dl.viable(): return dl + def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, delay=15, downloader_factory=get_best_downloader): @@ -350,6 +365,7 @@ def _build_install_args(options): install_args.append('--user') return install_args + def _parse_args(): """ Parse the command line for options @@ -371,10 +387,12 @@ def _parse_args(): # positional arguments are ignored return options + def main(version=DEFAULT_VERSION): """Install or upgrade setuptools and EasyInstall""" options = _parse_args() - tarball = download_setuptools(download_base=options.download_base, + tarball = download_setuptools( + download_base=options.download_base, downloader_factory=options.downloader_factory) return _install(tarball, 
_build_install_args(options)) diff --git a/gensim/__init__.py b/gensim/__init__.py index 0e3f3db720..2413b34a26 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -7,16 +7,18 @@ import logging try: - __version__ = __import__('pkg_resources').get_distribution('gensim').version + __version__ = __import__( + 'pkg_resources').get_distribution('gensim').version except: __version__ = '?' class NullHandler(logging.Handler): """For python versions <= 2.6; same as `logging.NullHandler` in 2.7.""" + def emit(self, record): pass logger = logging.getLogger('gensim') -if len(logger.handlers) == 0: # To ensure reload() doesn't add another one - logger.addHandler(NullHandler()) \ No newline at end of file +if len(logger.handlers) == 0: # To ensure reload() doesn't add another one + logger.addHandler(NullHandler()) diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index a11a0df229..9b92e08e48 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -3,7 +3,8 @@ """ # bring corpus classes directly into package namespace, to save some typing -from .indexedcorpus import IndexedCorpus # must appear before the other classes +# must appear before the other classes +from .indexedcorpus import IndexedCorpus from .mmcorpus import MmCorpus from .bleicorpus import BleiCorpus diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index b84d080c40..4dc324f7e4 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -51,11 +51,11 @@ def __init__(self, fname, fname_vocab=None): fname_base, _ = path.splitext(fname) fname_dir = path.dirname(fname) for fname_vocab in [ - utils.smart_extension(fname, '.vocab'), - utils.smart_extension(fname, '/vocab.txt'), - utils.smart_extension(fname_base, '.vocab'), - utils.smart_extension(fname_dir, '/vocab.txt'), - ]: + utils.smart_extension(fname, '.vocab'), + utils.smart_extension(fname, '/vocab.txt'), + utils.smart_extension(fname_base, '.vocab'), + utils.smart_extension(fname_dir, '/vocab.txt'), + ]: if path.exists(fname_vocab): break else: @@ -79,7 +79,9 @@ def __iter__(self): def line2doc(self, line): parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: - raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) + raise ValueError( + "invalid format in %s: %s" % + (self.fname, repr(line))) doc = [part.rsplit(':', 1) for part in parts[1:]] doc = [(int(p1), float(p2)) for p1, p2 in doc] return doc @@ -96,7 +98,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): call it directly, call `serialize` instead. 
""" if id2word is None: - logger.info("no word id mapping provided; initializing from corpus") + logger.info( + "no word id mapping provided; initializing from corpus") id2word = utils.dict_from_corpus(corpus) num_terms = len(id2word) else: @@ -109,14 +112,25 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): doc = list(doc) offsets.append(fout.tell()) parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7] - fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts)))) + fout.write( + utils.to_utf8( + "%i %s\n" % + (len(doc), ' '.join(parts)))) - # write out vocabulary, in a format compatible with Blei's topics.py script + # write out vocabulary, in a format compatible with Blei's topics.py + # script fname_vocab = utils.smart_extension(fname, '.vocab') - logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab)) + logger.info( + "saving vocabulary of %i words to %s" % + (num_terms, fname_vocab)) with utils.smart_open(fname_vocab, 'wb') as fout: for featureid in xrange(num_terms): - fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) + fout.write( + utils.to_utf8( + "%s\n" % + id2word.get( + featureid, + '---'))) return offsets diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 6d3288bc0f..6ae8679646 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -45,7 +45,8 @@ def __init__(self, fname, labels): head = ''.join(itertools.islice(utils.smart_open(self.fname), 5)) self.headers = csv.Sniffer().has_header(head) self.dialect = csv.Sniffer().sniff(head) - logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers)) + logger.info("sniffed CSV delimiter=%r, headers=%s" % + (self.dialect.delimiter, self.headers)) def __iter__(self): """ diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 1fd7e31e61..899a3cb564 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -42,6 +42,7 @@ class Dictionary(utils.SaveLoad, Mapping): The main function is `doc2bow`, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. """ + def __init__(self, documents=None, prune_at=2000000): """ If `documents` are given, use them to initialize Dictionary (see `add_documents()`). @@ -89,7 +90,8 @@ def __len__(self): def __str__(self): some_keys = list(itertools.islice(iterkeys(self.token2id), 5)) - return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '') + return "Dictionary(%i unique tokens: %s%s)" % ( + len(self), some_keys, '...' 
if len(self) > 5 else '') @staticmethod def from_documents(documents): @@ -109,14 +111,17 @@ def add_documents(self, documents, prune_at=2000000): Dictionary(5 unique tokens) """ for docno, document in enumerate(documents): - # log progress & run a regular check for pruning, once every 10k docs + # log progress & run a regular check for pruning, once every 10k + # docs if docno % 10000 == 0: if prune_at is not None and len(self) > prune_at: - self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at) + self.filter_extremes( + no_below=0, no_above=1.0, keep_n=prune_at) logger.info("adding document #%i to %s", docno, self) # update Dictionary with the document - self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids + # ignore the result, here we only care about updating token ids + self.doc2bow(document, allow_update=True) logger.info( "built %s from %i documents (total %i corpus positions)", @@ -138,7 +143,8 @@ def doc2bow(self, document, allow_update=False, return_missing=False): If `allow_update` is **not** set, this function is `const`, aka read-only. """ if isinstance(document, string_types): - raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string") + raise TypeError( + "doc2bow expects an array of unicode tokens on input, not a single string") # Construct (word, frequency) mapping. counter = defaultdict(int) @@ -147,20 +153,23 @@ def doc2bow(self, document, allow_update=False, return_missing=False): token2id = self.token2id if allow_update or return_missing: - missing = dict((w, freq) for w, freq in iteritems(counter) if w not in token2id) + missing = dict((w, freq) + for w, freq in iteritems(counter) if w not in token2id) if allow_update: for w in missing: # new id = number of ids made so far; # NOTE this assumes there are no gaps in the id sequence! token2id[w] = len(token2id) - result = dict((token2id[w], freq) for w, freq in iteritems(counter) if w in token2id) + result = dict((token2id[w], freq) + for w, freq in iteritems(counter) if w in token2id) if allow_update: self.num_docs += 1 self.num_pos += sum(itervalues(counter)) self.num_nnz += len(result) - # increase document count for each unique token that appeared in the document + # increase document count for each unique token that appeared in + # the document dfs = self.dfs for tokenid in iterkeys(result): dfs[tokenid] = dfs.get(tokenid, 0) + 1 @@ -187,7 +196,9 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! 
""" - no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold + no_above_abs = int( + no_above * + self.num_docs) # convert fractional threshold to absolute threshold # determine which tokens to keep good_ids = ( @@ -196,13 +207,24 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): good_ids = sorted(good_ids, key=self.dfs.get, reverse=True) if keep_n is not None: good_ids = good_ids[:keep_n] - bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)] - logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10]) + bad_words = [(self[id], self.dfs.get(id, 0)) + for id in set(self).difference(good_ids)] + logger.info( + "discarding %i tokens: %s...", + len(self) - + len(good_ids), + bad_words[ + :10]) logger.info( "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents", - len(good_ids), no_below, no_above_abs, 100.0 * no_above) - - # do the actual filtering, then rebuild dictionary to remove gaps in ids + len(good_ids), + no_below, + no_above_abs, + 100.0 * + no_above) + + # do the actual filtering, then rebuild dictionary to remove gaps in + # ids self.filter_tokens(good_ids=good_ids) logger.info("resulting dictionary: %s", self) @@ -217,12 +239,21 @@ def filter_n_most_frequent(self, remove_n): """ # determine which tokens to keep most_frequent_ids = (v for v in itervalues(self.token2id)) - most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True) + most_frequent_ids = sorted( + most_frequent_ids, + key=self.dfs.get, + reverse=True) most_frequent_ids = most_frequent_ids[:remove_n] - # do the actual filtering, then rebuild dictionary to remove gaps in ids - most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids] - logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10]) - + # do the actual filtering, then rebuild dictionary to remove gaps in + # ids + most_frequent_words = [(self[id], self.dfs.get(id, 0)) + for id in most_frequent_ids] + logger.info( + "discarding %i tokens: %s...", + len(most_frequent_ids), + most_frequent_words[ + :10]) + self.filter_tokens(bad_ids=most_frequent_ids) logger.info("resulting dictionary: %s" % self) @@ -262,12 +293,16 @@ def compactify(self): logger.debug("rebuilding dictionary, shrinking gaps") # build mapping from old id -> new id - idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id)))) + idmap = dict(izip(itervalues(self.token2id), + xrange(len(self.token2id)))) # reassign mappings to new ids - self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id)) + self.token2id = dict( + (token, idmap[tokenid]) for token, tokenid in iteritems( + self.token2id)) self.id2token = {} - self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs)) + self.dfs = dict((idmap[tokenid], freq) + for tokenid, freq in iteritems(self.dfs)) def save_as_text(self, fname, sort_by_word=True): """ @@ -282,10 +317,12 @@ def save_as_text(self, fname, sort_by_word=True): with utils.smart_open(fname, 'wb') as fout: if sort_by_word: for token, tokenid in sorted(iteritems(self.token2id)): - line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0)) + line = "%i\t%s\t%i\n" % ( + tokenid, token, self.dfs.get(tokenid, 0)) fout.write(utils.to_utf8(line)) else: - for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]): + for tokenid, freq in sorted( + 
iteritems(self.dfs), key=lambda item: -item[1]): line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq) fout.write(utils.to_utf8(line)) @@ -351,7 +388,9 @@ def load_from_text(fname): % (fname, line.strip())) wordid = int(wordid) if word in result.token2id: - raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word])) + raise KeyError( + 'token %s is defined as ID %d and as ID %d' % + (word, wordid, result.token2id[word])) result.token2id[word] = wordid result.dfs[wordid] = int(docfreq) return result @@ -389,7 +428,8 @@ def from_corpus(corpus, id2word=None): result.token2id = dict((unicode(i), i) for i in xrange(max_id + 1)) else: # id=>word mapping given: simply copy it - result.token2id = dict((utils.to_unicode(token), id) for id, token in iteritems(id2word)) + result.token2id = dict((utils.to_unicode(token), id) + for id, token in iteritems(id2word)) for id in itervalues(result.token2id): # make sure all token ids have a valid `dfs` entry result.dfs[id] = result.dfs.get(id, 0) diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index 0b15de5df6..1bf790ddb1 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -47,7 +47,13 @@ class HashDictionary(utils.SaveLoad, dict): bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. """ - def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True): + + def __init__( + self, + documents=None, + id_range=32000, + myhash=zlib.adler32, + debug=True): """ By default, keep track of debug statistics and mappings. If you find yourself running out of memory (or are sure you don't need the debug info), set @@ -57,7 +63,8 @@ def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=Tr self.id_range = id_range # hash range: id = myhash(key) % id_range self.debug = debug - # the following (potentially massive!) dictionaries are only formed if `debug` is True + # the following (potentially massive!) 
dictionaries are only formed if + # `debug` is True self.token2id = {} self.id2token = {} # reverse mapping int->set(words) self.dfs = {} # token_id -> how many documents this token_id appeared in @@ -118,7 +125,8 @@ def add_documents(self, documents): for docno, document in enumerate(documents): if docno % 10000 == 0: logger.info("adding document #%i to %s" % (docno, self)) - _ = self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids + # ignore the result, here we only care about updating token ids + _ = self.doc2bow(document, allow_update=True) logger.info( "built %s from %i documents (total %i corpus positions)", self, self.num_docs, self.num_pos) @@ -139,14 +147,18 @@ def doc2bow(self, document, allow_update=False, return_missing=False): """ result = {} missing = {} - document = sorted(document) # convert the input to plain list (needed below) + # convert the input to plain list (needed below) + document = sorted(document) for word_norm, group in itertools.groupby(document): - frequency = len(list(group)) # how many times does this word appear in the input document + # how many times does this word appear in the input document + frequency = len(list(group)) tokenid = self.restricted_hash(word_norm) result[tokenid] = result.get(tokenid, 0) + frequency if self.debug: - # increment document count for each unique token that appeared in the document - self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm, 0) + 1 + # increment document count for each unique token that appeared + # in the document + self.dfs_debug[word_norm] = self.dfs_debug.get( + word_norm, 0) + 1 if allow_update or self.allow_update: self.num_docs += 1 @@ -180,9 +192,18 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): clears some supplementary statistics, for easier debugging and a smaller RAM footprint. 
""" - no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold - ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs] - ok = frozenset(word for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n]) + no_above_abs = int( + no_above * + self.num_docs) # convert fractional threshold to absolute threshold + ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[ + 1] <= no_above_abs] + ok = frozenset( + word for word, + freq in sorted( + ok, + key=lambda item: - + item[1])[ + :keep_n]) self.dfs_debug = dict((word, freq) for word, freq in iteritems(self.dfs_debug) @@ -190,8 +211,10 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): self.token2id = dict((token, tokenid) for token, tokenid in iteritems(self.token2id) if token in self.dfs_debug) - self.id2token = dict((tokenid, set(token for token in tokens if token in self.dfs_debug)) - for tokenid, tokens in iteritems(self.id2token)) + self.id2token = dict( + (tokenid, set( + token for token in tokens if token in self.dfs_debug)) for tokenid, tokens in iteritems( + self.id2token)) self.dfs = dict((tokenid, freq) for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, set())) @@ -199,7 +222,10 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): # for word->document frequency logger.info( "kept statistics for which were in no less than %i and no more than %i (=%.1f%%) documents", - no_below, no_above_abs, 100.0 * no_above) + no_below, + no_above_abs, + 100.0 * + no_above) def save_as_text(self, fname): """ @@ -215,7 +241,20 @@ def save_as_text(self, fname): for tokenid in self.keys(): words = sorted(self[tokenid]) if words: - words_df = [(word, self.dfs_debug.get(word, 0)) for word in words] - words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])] + words_df = [(word, self.dfs_debug.get(word, 0)) + for word in words] + words_df = [ + "%s(%i)" % + item for item in sorted( + words_df, + key=lambda item: - + item[1])] words_df = '\t'.join(words_df) - fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), words_df))) + fout.write( + utils.to_utf8( + "%i\t%i\t%s\n" % + (tokenid, + self.dfs.get( + tokenid, + 0), + words_df))) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index dd3f703899..38363b41e1 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -28,6 +28,7 @@ class IndexedCorpus(interfaces.CorpusABC): + def __init__(self, fname, index_fname=None): """ Initialize this abstract base class, by loading a previously saved index @@ -56,7 +57,15 @@ def __init__(self, fname, index_fname=None): self.length = None @classmethod - def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): + def serialize( + serializer, + fname, + corpus, + id2word=None, + index_fname=None, + progress_cnt=None, + labels=None, + metadata=False): """ Iterate through the document stream `corpus`, saving the documents to `fname` and recording byte offset of each document. Save the resulting index @@ -77,24 +86,36 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres >>> print(mm[42]) # retrieve document no. 42, etc. """ if getattr(corpus, 'fname', None) == fname: - raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname) + raise ValueError( + "identical input vs. 
output corpus filename, refusing to serialize: %s" % + fname) if index_fname is None: index_fname = utils.smart_extension(fname, '.index') if progress_cnt is not None: if labels is not None: - offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer.save_corpus( + fname, + corpus, + id2word, + labels=labels, + progress_cnt=progress_cnt, + metadata=metadata) else: - offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer.save_corpus( + fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) else: if labels is not None: - offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) + offsets = serializer.save_corpus( + fname, corpus, id2word, labels=labels, metadata=metadata) else: - offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata) + offsets = serializer.save_corpus( + fname, corpus, id2word, metadata=metadata) if offsets is None: - raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % + raise NotImplementedError( + "called serialize on class %s which doesn't support indexing!" % serializer.__name__) # store offsets persistently, using pickle @@ -102,7 +123,9 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres # the offsets that are actually stored on disk - we're not storing self.index in any case, the # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure # backwards compatibility - logger.info("saving %s index to %s" % (serializer.__name__, index_fname)) + logger.info( + "saving %s index to %s" % + (serializer.__name__, index_fname)) utils.pickle(offsets, index_fname) def __len__(self): @@ -126,8 +149,8 @@ def __getitem__(self, docno): elif isinstance(docno, (int, numpy.integer)): return self.docbyoffset(self.index[docno]) else: - raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') - + raise ValueError( + 'Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') # endclass IndexedCorpus diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index b87f1108a2..2acf522a40 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -50,6 +50,7 @@ class LowCorpus(IndexedCorpus): in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated by the blank character. """ + def __init__(self, fname, id2word=None, line2words=split_on_space): """ Initialize the corpus from a file. 
@@ -65,30 +66,36 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) - self.fname = fname # input file, see class doc for format - self.line2words = line2words # how to translate lines into words (simply split on space by default) + self.fname = fname # input file, see class doc for format + # how to translate lines into words (simply split on space by default) + self.line2words = line2words self.num_docs = self._calculate_num_docs() if not id2word: # build a list of all word types in the corpus (distinct words) logger.info("extracting vocabulary from the corpus") all_terms = set() - self.use_wordids = False # return documents as (word, wordCount) 2-tuples + # return documents as (word, wordCount) 2-tuples + self.use_wordids = False for doc in self: all_terms.update(word for word, wordCnt in doc) - all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id - self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) + # sort the list of all words; rank in that list = word's integer id + all_terms = sorted(all_terms) + # build a mapping of word id(int) -> word (string) + self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) else: logger.info("using provided word mapping (%i ids)" % len(id2word)) self.id2word = id2word self.num_terms = len(self.word2id) - self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples + # return documents as (wordIndex, wordCount) 2-tuples + self.use_wordids = True logger.info("loaded corpus with %i documents and %i terms from %s" % - (self.num_docs, self.num_terms, fname)) + (self.num_docs, self.num_terms, fname)) def _calculate_num_docs(self): - # the first line in input data is the number of documents (integer). throws exception on bad input. + # the first line in input data is the number of documents (integer). + # throws exception on bad input. with utils.smart_open(self.fname) as fin: try: result = int(next(fin)) @@ -111,7 +118,8 @@ def line2doc(self, line): # as they were in the input. when iterating over the documents, # the (word, count) pairs will appear in the same order as they # were in the input (bar duplicates), which looks better. 
- # if this was not needed, we might as well have used useWords = set(words) + # if this was not needed, we might as well have used useWords = + # set(words) use_words, marker = [], set() for word in words: if (word in uniq_words) and (word not in marker): @@ -119,14 +127,15 @@ def line2doc(self, line): marker.add(word) # construct a list of (wordIndex, wordFrequency) 2-tuples doc = list(zip(map(self.word2id.get, use_words), - map(words.count, use_words))) + map(words.count, use_words))) else: uniq_words = set(words) # construct a list of (word, wordFrequency) 2-tuples doc = list(zip(uniq_words, map(words.count, uniq_words))) # return the document, then forget it and move on to the next one - # note that this way, only one doc is stored in memory at a time, not the whole corpus + # note that this way, only one doc is stored in memory at a time, not + # the whole corpus return doc def __iter__(self): @@ -135,7 +144,7 @@ def __iter__(self): """ with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): - if lineno > 0: # ignore the first line = number of documents + if lineno > 0: # ignore the first line = number of documents yield self.line2doc(line) @staticmethod @@ -147,7 +156,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): call it directly, call `serialize` instead. """ if id2word is None: - logger.info("no word id mapping provided; initializing from corpus") + logger.info( + "no word id mapping provided; initializing from corpus") id2word = utils.dict_from_corpus(corpus) logger.info("storing corpus in List-Of-Words format into %s" % fname) @@ -160,14 +170,16 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): for wordid, value in doc: if abs(int(value) - value) > 1e-6: truncated += 1 - words.extend([utils.to_unicode(id2word[wordid])] * int(value)) + words.extend( + [utils.to_unicode(id2word[wordid])] * int(value)) offsets.append(fout.tell()) fout.write(utils.to_utf8('%s\n' % ' '.join(words))) if truncated: - logger.warning("List-of-words format can only save vectors with " - "integer elements; %i float entries were truncated to integer value" % - truncated) + logger.warning( + "List-of-words format can only save vectors with " + "integer elements; %i float entries were truncated to integer value" % + truncated) return offsets def docbyoffset(self, offset): diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index f8410845e6..31bbf060ee 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -36,6 +36,7 @@ class MalletCorpus(LowCorpus): Note that language/label is *not* considered in Gensim. 
""" + def __init__(self, fname, id2word=None, metadata=False): self.metadata = metadata LowCorpus.__init__(self, fname, id2word) @@ -56,7 +57,8 @@ def __iter__(self): yield self.line2doc(line) def line2doc(self, line): - l = [word for word in utils.to_unicode(line).strip().split(' ') if word] + l = [word for word in utils.to_unicode( + line).strip().split(' ') if word] docid, doclang, words = l[0], l[1], l[2:] doc = super(MalletCorpus, self).line2doc(' '.join(words)) @@ -82,7 +84,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): """ if id2word is None: - logger.info("no word id mapping provided; initializing from corpus") + logger.info( + "no word id mapping provided; initializing from corpus") id2word = utils.dict_from_corpus(corpus) logger.info("storing corpus in Mallet format into %s" % fname) @@ -101,14 +104,19 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): for wordid, value in doc: if abs(int(value) - value) > 1e-6: truncated += 1 - words.extend([utils.to_unicode(id2word[wordid])] * int(value)) + words.extend( + [utils.to_unicode(id2word[wordid])] * int(value)) offsets.append(fout.tell()) - fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words)))) + fout.write( + utils.to_utf8( + '%s %s %s\n' % + (doc_id, doc_lang, ' '.join(words)))) if truncated: - logger.warning("Mallet format can only save vectors with " - "integer elements; %i float entries were truncated to integer value" % - truncated) + logger.warning( + "Mallet format can only save vectors with " + "integer elements; %i float entries were truncated to integer value" % + truncated) return offsets diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index a9a879db3e..7b5bfa3cc2 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -23,6 +23,7 @@ class MmCorpus(matutils.MmReader, IndexedCorpus): """ Corpus in the Matrix Market format. """ + def __init__(self, fname): # avoid calling super(), too confusing IndexedCorpus.__init__(self, fname) @@ -37,7 +38,12 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def save_corpus( + fname, + corpus, + id2word=None, + progress_cnt=1000, + metadata=False): """ Save a corpus in the Matrix Market format to disk. @@ -46,6 +52,12 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ logger.info("storing corpus in Matrix Market format to %s" % fname) num_terms = len(id2word) if id2word is not None else None - return matutils.MmWriter.write_corpus(fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata) + return matutils.MmWriter.write_corpus( + fname, + corpus, + num_terms=num_terms, + index=True, + progress_cnt=progress_cnt, + metadata=metadata) # endclass MmCorpus diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index d2d8301019..b2cd42b7d5 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -34,7 +34,8 @@ import theano _default_dtype = theano.config.floatX except ImportError: - logger.info('Could not import Theano, will use standard float for default ShardedCorpus dtype.') + logger.info( + 'Could not import Theano, will use standard float for default ShardedCorpus dtype.') from six.moves import xrange @@ -140,6 +141,7 @@ class ShardedCorpus(IndexedCorpus): the current shard, or opens a new one. 
The shard size is constant, except for the last shard. """ + def __init__(self, output_prefix, corpus, dim=None, shardsize=4096, overwrite=False, sparse_serialization=False, sparse_retrieval=False, gensim=False): @@ -233,10 +235,10 @@ def __init__(self, output_prefix, corpus, dim=None, self.current_shard = None # The current shard itself (numpy ndarray) self.current_shard_n = None # Current shard is the current_shard_n-th self.current_offset = None # The index into the dataset which - # corresponds to index 0 of current shard + # corresponds to index 0 of current shard logger.info('Initializing sharded corpus with prefix ' - '{0}'.format(output_prefix)) + '{0}'.format(output_prefix)) if (not os.path.isfile(output_prefix)) or overwrite: logger.info('Building from corpus...') self.init_shards(output_prefix, corpus, shardsize) @@ -245,29 +247,36 @@ def __init__(self, output_prefix, corpus, dim=None, # and retain information about how the corpus # was serialized. logger.info('Saving ShardedCorpus object to ' - '{0}'.format(self.output_prefix)) + '{0}'.format(self.output_prefix)) self.save() else: logger.info('Cloning existing...') self.init_by_clone() - def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtype): + def init_shards( + self, + output_prefix, + corpus, + shardsize=4096, + dtype=_default_dtype): """Initialize shards from the corpus.""" if not gensim.utils.is_corpus(corpus): - raise ValueError('Cannot initialize shards without a corpus to read' - ' from! (Got corpus type: {0})'.format(type(corpus))) + raise ValueError( + 'Cannot initialize shards without a corpus to read' + ' from! (Got corpus type: {0})'.format( + type(corpus))) proposed_dim = self._guess_n_features(corpus) if proposed_dim != self.dim: if self.dim is None: logger.info('Deriving dataset dimension from corpus: ' - '{0}'.format(proposed_dim)) + '{0}'.format(proposed_dim)) else: logger.warn('Dataset dimension derived from input corpus diffe' - 'rs from initialization argument, using corpus.' - '(corpus {0}, init arg {1})'.format(proposed_dim, - self.dim)) + 'rs from initialization argument, using corpus.' + '(corpus {0}, init arg {1})'.format(proposed_dim, + self.dim)) self.dim = proposed_dim self.offsets = [0] @@ -276,16 +285,22 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp logger.info('Running init from corpus.') - for n, doc_chunk in enumerate(gensim.utils.grouper(corpus, chunksize=shardsize)): - logger.info('Chunk no. {0} at {1} s'.format(n, time.clock() - start_time)) + for n, doc_chunk in enumerate( + gensim.utils.grouper( + corpus, chunksize=shardsize)): + logger.info( + 'Chunk no. {0} at {1} s'.format( + n, time.clock() - start_time)) - current_shard = numpy.zeros((len(doc_chunk), self.dim), dtype=dtype) + current_shard = numpy.zeros( + (len(doc_chunk), self.dim), dtype=dtype) logger.debug('Current chunk dimension: ' - '{0} x {1}'.format(len(doc_chunk), self.dim)) + '{0} x {1}'.format(len(doc_chunk), self.dim)) for i, doc in enumerate(doc_chunk): doc = dict(doc) - current_shard[i][list(doc)] = list(gensim.matutils.itervalues(doc)) + current_shard[i][list(doc)] = list( + gensim.matutils.itervalues(doc)) # Handles the updating as well. 
if self.sparse_serialization: @@ -294,7 +309,10 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp self.save_shard(current_shard) end_time = time.clock() - logger.info('Built {0} shards in {1} s.'.format(self.n_shards, end_time - start_time)) + logger.info( + 'Built {0} shards in {1} s.'.format( + self.n_shards, + end_time - start_time)) def init_by_clone(self): """ @@ -311,9 +329,11 @@ def init_by_clone(self): if self.dim is None: logger.info('Loaded dataset dimension: {0}'.format(temp.dim)) else: - logger.warn('Loaded dataset dimension differs from init arg ' - 'dimension, using loaded dim. ' - '(loaded {0}, init {1})'.format(temp.dim, self.dim)) + logger.warn( + 'Loaded dataset dimension differs from init arg ' + 'dimension, using loaded dim. ' + '(loaded {0}, init {1})'.format( + temp.dim, self.dim)) self.dim = temp.dim # To be consistent with the loaded data! @@ -328,7 +348,7 @@ def save_shard(self, shard, n=None, filename=None): """ new_shard = False if n is None: - n = self.n_shards # Saving the *next* one by default. + n = self.n_shards # Saving the *next* one by default. new_shard = True if not filename: @@ -344,7 +364,7 @@ def load_shard(self, n): """ Load (unpickle) the n-th shard as the "live" part of the dataset into the Dataset object.""" - #logger.debug('ShardedCorpus loading shard {0}, ' + # logger.debug('ShardedCorpus loading shard {0}, ' # 'current shard: {1}'.format(n, self.current_shard_n)) # No-op if the shard is already open. @@ -353,7 +373,8 @@ def load_shard(self, n): filename = self._shard_name(n) if not os.path.isfile(filename): - raise ValueError('Attempting to load nonexistent shard no. {0}'.format(n)) + raise ValueError( + 'Attempting to load nonexistent shard no. {0}'.format(n)) shard = gensim.utils.unpickle(filename) self.current_shard = shard @@ -402,7 +423,7 @@ def in_current(self, offset): """ return (self.current_offset <= offset) \ - and (offset < self.offsets[self.current_shard_n + 1]) + and (offset < self.offsets[self.current_shard_n + 1]) def in_next(self, offset): """ @@ -413,9 +434,9 @@ def in_next(self, offset): """ if self.current_shard_n == self.n_shards: - return False # There's no next shard. + return False # There's no next shard. return (self.offsets[self.current_shard_n + 1] <= offset) \ - and (offset < self.offsets[self.current_shard_n + 2]) + and (offset < self.offsets[self.current_shard_n + 2]) def resize_shards(self, shardsize): """ @@ -471,8 +492,8 @@ def resize_shards(self, shardsize): os.remove(old_shard_name) except Exception as e: logger.error('Exception occurred during old shard no. {0} ' - 'removal: {1}.\nAttempting to at least move ' - 'new shards in.'.format(old_shard_n, str(e))) + 'removal: {1}.\nAttempting to at least move ' + 'new shards in.'.format(old_shard_n, str(e))) finally: # If something happens with cleaning up - try to at least get the # new guys in. @@ -482,8 +503,9 @@ def resize_shards(self, shardsize): # If something happens when we're in this stage, we're screwed. except Exception as e: print(e) - raise RuntimeError('Resizing completely failed for some reason.' - ' Sorry, dataset is probably ruined...') + raise RuntimeError( + 'Resizing completely failed for some reason.' + ' Sorry, dataset is probably ruined...') finally: # Sets the new shard stats. 
self.n_shards = n_new_shards @@ -527,18 +549,22 @@ def _guess_n_features(self, corpus): return self._guess_n_features(corpus.corpus) else: if not self.dim: - raise TypeError('Couldn\'t find number of features, ' - 'refusing to guess (dimension set to {0},' - 'type of corpus: {1}).'.format(self.dim, type(corpus))) + raise TypeError( + 'Couldn\'t find number of features, ' + 'refusing to guess (dimension set to {0},' + 'type of corpus: {1}).'.format( + self.dim, type(corpus))) else: logger.warn('Couldn\'t find number of features, trusting ' - 'supplied dimension ({0})'.format(self.dim)) + 'supplied dimension ({0})'.format(self.dim)) n_features = self.dim if self.dim and n_features != self.dim: - logger.warn('Discovered inconsistent dataset dim ({0}) and ' - 'feature count from corpus ({1}). Coercing to dimension' - ' given by argument.'.format(self.dim, n_features)) + logger.warn( + 'Discovered inconsistent dataset dim ({0}) and ' + 'feature count from corpus ({1}). Coercing to dimension' + ' given by argument.'.format( + self.dim, n_features)) return n_features @@ -604,7 +630,7 @@ def __getitem__(self, offset): # This fails on one-past # slice indexing; that's why there's a code branch here. - #logger.debug('ShardedCorpus: Retrieving slice {0}: ' + # logger.debug('ShardedCorpus: Retrieving slice {0}: ' # 'shard {1}'.format((offset.start, offset.stop), # (first_shard, last_shard))) @@ -613,7 +639,7 @@ def __getitem__(self, offset): # The easy case: both in one shard. if first_shard == last_shard: s_result = self.current_shard[start - self.current_offset: - stop - self.current_offset] + stop - self.current_offset] # Handle different sparsity settings: s_result = self._getitem_format(s_result) @@ -647,15 +673,15 @@ def __getitem__(self, offset): # to (stop - current_offset) shard_start = start - self.current_offset shard_stop = self.offsets[self.current_shard_n + 1] - \ - self.current_offset + self.current_offset - #s_result[result_start:result_stop] = self.current_shard[ + # s_result[result_start:result_stop] = self.current_shard[ # shard_start:shard_stop] s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop) # First and last get special treatment, these are in between - for shard_n in xrange(first_shard+1, last_shard): + for shard_n in xrange(first_shard + 1, last_shard): self.load_shard(shard_n) result_start = result_stop @@ -746,11 +772,16 @@ def _getitem_sparse2gensim(self, result): """ def row_sparse2gensim(row_idx, csr_matrix): - indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx+1]] - g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices] + indices = csr_matrix.indices[csr_matrix.indptr[ + row_idx]:csr_matrix.indptr[row_idx + 1]] + g_row = [(col_idx, csr_matrix[row_idx, col_idx]) + for col_idx in indices] return g_row - output = (row_sparse2gensim(i, result) for i in xrange(result.shape[0])) + output = ( + row_sparse2gensim( + i, result) for i in xrange( + result.shape[0])) return output diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 4fdc764b16..6f1df9faf5 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -58,7 +58,7 @@ def __init__(self, fname, store_labels=True): IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) - self.fname = fname # input file, see class doc for format + self.fname = fname # input file, see class doc for format self.length = None self.store_labels = store_labels 
self.labels = [] @@ -94,7 +94,8 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): offsets = [] with utils.smart_open(fname, 'wb') as fout: for docno, doc in enumerate(corpus): - label = labels[docno] if labels else 0 # target class is 0 by default + # target class is 0 by default + label = labels[docno] if labels else 0 offsets.append(fout.tell()) fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label))) return offsets @@ -114,12 +115,13 @@ def line2doc(self, line): line = utils.to_unicode(line) line = line[: line.find('#')].strip() if not line: - return None # ignore comments and empty lines + return None # ignore comments and empty lines parts = line.split() if not parts: raise ValueError('invalid line format in %s' % self.fname) target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]] - doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based + # ignore 'qid' features, convert 1-based feature ids to 0-based + doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] return doc, target @staticmethod @@ -127,7 +129,8 @@ def doc2line(doc, label=0): """ Output the document in SVMlight format, as a string. Inverse function to `line2doc`. """ - pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base + pairs = ' '.join("%i:%s" % (termid + 1, termval) + for termid, termval in doc) # +1 to convert 0-base to 1-base return "%s %s\n" % (label, pairs) # endclass SvmLightCorpus diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 36cfabe301..f1df0f7aa2 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -52,6 +52,7 @@ class TextCorpus(interfaces.CorpusABC): implementation. """ + def __init__(self, input=None): super(TextCorpus, self).__init__() self.input = input @@ -89,7 +90,8 @@ def get_texts(self): """ # Instead of raising NotImplementedError, let's provide a sample implementation: # assume documents are lines in a single file (one document per line). - # Yield each document as a list of lowercase tokens, via `utils.tokenize`. + # Yield each document as a list of lowercase tokens, via + # `utils.tokenize`. with self.getstream() as lines: for lineno, line in enumerate(lines): if self.metadata: diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 44b2a772d9..cd1b7ca397 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -29,6 +29,7 @@ class UciReader(MmReader): + def __init__(self, input): """ Initialize the reader. @@ -50,7 +51,8 @@ def __init__(self, input): except StopIteration: pass - logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' % + logger.info( + 'accepted corpus with %i documents, %i features, %i non-zero entries' % (self.num_docs, self.num_terms, self.num_nnz)) def skip_headers(self, input_file): @@ -91,7 +93,8 @@ def update_headers(self, num_docs, num_terms, num_nnz): Update headers with actual values. 
""" offset = 0 - values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]] + values = [utils.to_utf8(str(n)) + for n in [num_docs, num_terms, num_nnz]] for value in values: if len(value) > len(self.FAKE_HEADER): @@ -118,7 +121,8 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): offsets.append(posnow) poslast = posnow - vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights + # integer count, not floating weights + vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] max_id, veclen = writer.write_vector(docno, vector) num_terms = max(num_terms, 1 + max_id) num_nnz += veclen @@ -126,12 +130,13 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): if num_docs * num_terms != 0: logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % - (num_docs, num_terms, - 100.0 * num_nnz / (num_docs * num_terms), - num_nnz, - num_docs * num_terms)) + (num_docs, num_terms, + 100.0 * num_nnz / (num_docs * num_terms), + num_nnz, + num_docs * num_terms)) - # now write proper headers, by seeking and overwriting the spaces written earlier + # now write proper headers, by seeking and overwriting the spaces + # written earlier writer.update_headers(num_docs, num_terms, num_nnz) writer.close() @@ -145,6 +150,7 @@ class UciCorpus(UciReader, IndexedCorpus): """ Corpus in the UCI bag-of-words format. """ + def __init__(self, fname, fname_vocab=None): IndexedCorpus.__init__(self, fname) UciReader.__init__(self, fname) @@ -165,7 +171,7 @@ def __iter__(self): (yielding one document at a time). """ for docId, doc in super(UciCorpus, self).__iter__(): - yield doc # get rid of docId, return the sparse vector only + yield doc # get rid of docId, return the sparse vector only def create_dictionary(self): """ @@ -175,7 +181,8 @@ def create_dictionary(self): dictionary = Dictionary() # replace dfs with defaultdict to avoid downstream KeyErrors - # uci vocabularies may contain terms that are not used in the document data + # uci vocabularies may contain terms that are not used in the document + # data dictionary.dfs = defaultdict(int) dictionary.id2token = self.id2word @@ -186,7 +193,9 @@ def create_dictionary(self): for docno, doc in enumerate(self): if docno % 10000 == 0: - logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs)) + logger.info( + 'PROGRESS: processing document %i of %i' % + (docno, self.num_docs)) for word, count in doc: dictionary.dfs[word] += 1 @@ -195,7 +204,12 @@ def create_dictionary(self): return dictionary @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def save_corpus( + fname, + corpus, + id2word=None, + progress_cnt=10000, + metadata=False): """ Save a corpus in the UCI Bag-of-Words format. @@ -206,7 +220,8 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False) call it directly, call `serialize` instead. 
""" if id2word is None: - logger.info("no word id mapping provided; initializing from corpus") + logger.info( + "no word id mapping provided; initializing from corpus") id2word = utils.dict_from_corpus(corpus) num_terms = len(id2word) else: @@ -214,13 +229,21 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False) # write out vocabulary fname_vocab = utils.smart_extension(fname, '.vocab') - logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab)) + logger.info( + "saving vocabulary of %i words to %s" % + (num_terms, fname_vocab)) with utils.smart_open(fname_vocab, 'wb') as fout: for featureid in xrange(num_terms): - fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) + fout.write( + utils.to_utf8( + "%s\n" % + id2word.get( + featureid, + '---'))) logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname) - return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt) + return UciWriter.write_corpus( + fname, corpus, index=True, progress_cnt=progress_cnt) # endclass UciCorpus diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 1a53b282e9..2738af46fc 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -21,7 +21,8 @@ import bz2 import logging import re -from xml.etree.cElementTree import iterparse # LXML isn't faster, so let's go with the built-in solution +# LXML isn't faster, so let's go with the built-in solution +from xml.etree.cElementTree import iterparse import multiprocessing from gensim import utils @@ -32,24 +33,44 @@ logger = logging.getLogger('gensim.corpora.wikicorpus') -# ignore articles shorter than ARTICLE_MIN_WORDS characters (after full preprocessing) +# ignore articles shorter than ARTICLE_MIN_WORDS characters (after full +# preprocessing) ARTICLE_MIN_WORDS = 50 RE_P0 = re.compile('', re.DOTALL | re.UNICODE) # comments -RE_P1 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # footnotes -RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages +RE_P1 = re.compile( + ' ].*?)(|/>)', + re.DOTALL | re.UNICODE) # footnotes +RE_P2 = re.compile( + "(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", + re.UNICODE) # links to languages RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE) # template -RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description -RE_P6 = re.compile("\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE) # simplify links, keep description -RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images -RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files -RE_P9 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # outside links -RE_P10 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # math content +RE_P5 = re.compile( + '\[(\w+):\/\/(.*?)(( (.*?))|())\]', + re.UNICODE) # remove URL, keep description +RE_P6 = re.compile("\[([^][]*)\|([^][]*)\]", re.DOTALL | + re.UNICODE) # simplify links, keep description +RE_P7 = re.compile( + '\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', + re.UNICODE) # keep description of images +RE_P8 = re.compile( + '\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', + re.UNICODE) # keep description of files +RE_P9 = re.compile( + ' ].*?)(|/>)', + re.DOTALL | re.UNICODE) # outside links +RE_P10 = re.compile( + ' ].*?)(|/>)', + re.DOTALL | re.UNICODE) # math content RE_P11 = 
re.compile('<(.*?)>', re.DOTALL | re.UNICODE) # all other tags -RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting -RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting +RE_P12 = re.compile( + '\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', + re.UNICODE) # table formatting +RE_P13 = re.compile( + '\n(\||\!)(.*?\|)*([^|]*?)', + re.UNICODE) # table cell formatting RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE) # categories # Remove File and Image template RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) @@ -91,7 +112,8 @@ def remove_markup(text): text = re.sub(RE_P11, "", text) # remove all remaining tags text = re.sub(RE_P14, '', text) # remove categories text = re.sub(RE_P5, '\\3', text) # remove urls, keep description - text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only + # simplify links, keep description only + text = re.sub(RE_P6, '\\2', text) # remove table markup text = text.replace('||', '\n|') # each table cell on a separate line text = re.sub(RE_P12, '\n', text) # remove formatting lines @@ -103,7 +125,8 @@ def remove_markup(text): # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists' # TODO is this really desirable? - text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text + # promote all remaining markup to plain text + text = text.replace('[', '').replace(']', '') return text @@ -171,9 +194,13 @@ def tokenize(content): Return list of tokens as utf8 bytestrings. Ignore words shorted than 2 or longer that 15 characters (not bytes!). """ - # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.) - return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore') - if 2 <= len(token) <= 15 and not token.startswith('_')] + # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, + # russian etc.) + return [ + token.encode('utf8') for token in utils.tokenize( + content, + lower=True, + errors='ignore') if 2 <= len(token) <= 15 and not token.startswith('_')] def get_namespace(tag): @@ -258,7 +285,16 @@ class WikiCorpus(TextCorpus): >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word """ - def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): + + def __init__( + self, + fname, + processes=None, + lemmatize=utils.has_pattern(), + dictionary=None, + filter_namespaces=( + '0', + )): """ Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. @@ -296,16 +332,31 @@ def get_texts(self): """ articles, articles_all = 0, 0 positions, positions_all = 0, 0 - texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) + texts = ( + (text, + self.lemmatize, + title, + pageid) for title, + text, + pageid in extract_pages( + bz2.BZ2File( + self.fname), + self.filter_namespaces)) pool = multiprocessing.Pool(self.processes) # process the corpus in smaller chunks of docs, because multiprocessing.Pool # is dumb and would load the entire input into RAM at once... 
- for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): - for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): + for group in utils.chunkize( + texts, + chunksize=10 * + self.processes, + maxsize=1): + for tokens, title, pageid in pool.imap( + process_article, group): # chunksize=10): articles_all += 1 positions_all += len(tokens) # article redirects and short stubs are pruned here - if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): + if len(tokens) < ARTICLE_MIN_WORDS or any( + title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): continue articles += 1 positions += len(tokens) diff --git a/gensim/examples/dmlcz/__init__.py b/gensim/examples/dmlcz/__init__.py index 8b13789179..e69de29bb2 100644 --- a/gensim/examples/dmlcz/__init__.py +++ b/gensim/examples/dmlcz/__init__.py @@ -1 +0,0 @@ - diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py index 63c9f16855..59975023d8 100644 --- a/gensim/examples/dmlcz/dmlcorpus.py +++ b/gensim/examples/dmlcz/dmlcorpus.py @@ -15,7 +15,7 @@ import os.path from gensim import interfaces, matutils -import dictionary # for constructing word->id mappings +import dictionary # for constructing word->id mappings logger = logging.getLogger('gensim.corpora.dmlcorpus') @@ -34,39 +34,46 @@ class DmlConfig(object): output files and which articles to accept for the corpus (= an additional filter over the sources). """ - def __init__(self, configId, resultDir, acceptLangs = None): - self.resultDir = resultDir # output files will be stored in this directory - self.configId = configId # configId is a string that is used as filename prefix for all files, so keep it simple - self.sources = {} # all article sources; see sources.DmlSource class for an example of source - if acceptLangs is None: # which languages to accept - acceptLangs = set(['any']) # if not specified, accept all languages (including unknown/unspecified) + def __init__(self, configId, resultDir, acceptLangs=None): + self.resultDir = resultDir # output files will be stored in this directory + # configId is a string that is used as filename prefix for all files, + # so keep it simple + self.configId = configId + # all article sources; see sources.DmlSource class for an example of + # source + self.sources = {} + + if acceptLangs is None: # which languages to accept + # if not specified, accept all languages (including + # unknown/unspecified) + acceptLangs = set(['any']) self.acceptLangs = set(acceptLangs) logger.info('initialized %s' % self) - def resultFile(self, fname): return os.path.join(self.resultDir, self.configId + '_' + fname) - def acceptArticle(self, metadata): - lang = metadata.get('language', 'unk') # if there was no language field in the article metadata, set language to 'unk' = unknown + # if there was no language field in the article metadata, set language + # to 'unk' = unknown + lang = metadata.get('language', 'unk') if 'any' not in self.acceptLangs and lang not in self.acceptLangs: return False return True - def addSource(self, source): sourceId = str(source) assert sourceId not in self.sources, "source %s already present in the config!" 
% sourceId self.sources[sourceId] = source - def __str__(self): - return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" % - (self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs))) -#endclass DmlConfig - + return ( + "DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" % + (self.configId, ', '.join( + self.sources.iterkeys()), ', '.join( + self.acceptLangs))) +# endclass DmlConfig class DmlCorpus(interfaces.CorpusABC): @@ -79,16 +86,15 @@ class DmlCorpus(interfaces.CorpusABC): DmlCorpus has methods for building a dictionary (mapping between words and their ids). """ + def __init__(self): self.documents = [] self.config = None self.dictionary = dictionary.Dictionary() - def __len__(self): return len(self.documents) - def __iter__(self): """ The function that defines a corpus -- iterating over the corpus yields @@ -100,9 +106,9 @@ def __iter__(self): source = self.config.sources[sourceId] contents = source.getContent(docUri) - words = [source.normalizeWord(word) for word in source.tokenize(contents)] - yield self.dictionary.doc2bow(words, allowUpdate = False) - + words = [source.normalizeWord(word) + for word in source.tokenize(contents)] + yield self.dictionary.doc2bow(words, allowUpdate=False) def buildDictionary(self): """ @@ -112,25 +118,27 @@ def buildDictionary(self): them into tokens and converting tokens to their ids (creating new ids as necessary). """ - logger.info("creating dictionary from %i articles" % len(self.documents)) + logger.info("creating dictionary from %i articles" % + len(self.documents)) self.dictionary = dictionary.Dictionary() numPositions = 0 for docNo, (sourceId, docUri) in enumerate(self.documents): if docNo % 1000 == 0: logger.info("PROGRESS: at document #%i/%i (%s, %s)" % - (docNo, len(self.documents), sourceId, docUri)) + (docNo, len(self.documents), sourceId, docUri)) source = self.config.sources[sourceId] contents = source.getContent(docUri) - words = [source.normalizeWord(word) for word in source.tokenize(contents)] + words = [source.normalizeWord(word) + for word in source.tokenize(contents)] numPositions += len(words) - # convert to bag-of-words, but ignore the result -- here we only care about updating token ids - _ = self.dictionary.doc2bow(words, allowUpdate = True) + # convert to bag-of-words, but ignore the result -- here we only + # care about updating token ids + _ = self.dictionary.doc2bow(words, allowUpdate=True) logger.info("built %s from %i documents (total %i corpus positions)" % - (self.dictionary, len(self.documents), numPositions)) - + (self.dictionary, len(self.documents), numPositions)) - def processConfig(self, config, shuffle = False): + def processConfig(self, config, shuffle=False): """ Parse the directories specified in the config, looking for suitable articles. 
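The doc2bow conversion used by DmlCorpus.__iter__ and buildDictionary above behaves like the public gensim Dictionary API; a minimal sketch, assuming the modern snake_case interface rather than the dmlcz-local dictionary module:

from gensim.corpora import Dictionary

texts = [['graph', 'minors', 'survey'], ['graph', 'trees']]
dictionary = Dictionary(texts)                         # build the word -> id mapping
bow = dictionary.doc2bow(['graph', 'graph', 'survey'])
print(bow)                                             # (token_id, count) pairs, e.g. [(0, 2), (2, 1)]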
@@ -148,24 +156,27 @@ def processConfig(self, config, shuffle = False): logger.info("processing source '%s'" % sourceId) accepted = [] for articleUri in source.findArticles(): - meta = source.getMeta(articleUri) # retrieve metadata (= dictionary of key->value) - if config.acceptArticle(meta): # do additional filtering on articles, based on the article's metadata + # retrieve metadata (= dictionary of key->value) + meta = source.getMeta(articleUri) + if config.acceptArticle( + meta): # do additional filtering on articles, based on the article's metadata accepted.append((sourceId, articleUri)) logger.info("accepted %i articles for source '%s'" % - (len(accepted), sourceId)) + (len(accepted), sourceId)) self.documents.extend(accepted) if not self.documents: - logger.warning('no articles at all found from the config; something went wrong!') + logger.warning( + 'no articles at all found from the config; something went wrong!') if shuffle: - logger.info("shuffling %i documents for random order" % len(self.documents)) + logger.info("shuffling %i documents for random order" % + len(self.documents)) import random random.shuffle(self.documents) logger.info("accepted total of %i articles for %s" % - (len(self.documents), str(config))) - + (len(self.documents), str(config))) def saveDictionary(self, fname): logger.info("saving dictionary mapping to %s" % fname) @@ -194,7 +205,6 @@ def saveDocuments(self, fname): fout.write("%i\t%s\n" % (docNo, repr(docId))) fout.close() - def saveAsText(self): """ Store the corpus to disk, in a human-readable text format. @@ -211,7 +221,6 @@ def saveAsText(self): self.saveDocuments(self.config.resultFile('docids.txt')) matutils.MmWriter.writeCorpus(self.config.resultFile('bow.mm'), self) - def articleDir(self, docNo): """ Return absolute normalized path on filesystem to article no. `docNo`. @@ -220,7 +229,6 @@ def articleDir(self, docNo): source = self.config.sources[sourceId] return os.path.join(source.baseDir, outPath) - def getMeta(self, docNo): """ Return metadata for article no. `docNo`. 
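saveAsText() above hands the bag-of-words stream to a Matrix Market writer; the equivalent round trip with gensim's public MmCorpus looks roughly like this sketch (file path is illustrative):

from gensim.corpora import MmCorpus

bow_corpus = [[(0, 1.0), (2, 3.0)], [(1, 0.5)]]     # two sparse documents
MmCorpus.serialize('/tmp/bow.mm', bow_corpus)       # writes bow.mm plus an index file
print(list(MmCorpus('/tmp/bow.mm')))                # streams the documents back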
@@ -228,5 +236,4 @@ def getMeta(self, docNo): sourceId, uri = self.documents[docNo] source = self.config.sources[sourceId] return source.getMeta(uri) -#endclass DmlCorpus - +# endclass DmlCorpus diff --git a/gensim/examples/dmlcz/gensim_build.py b/gensim/examples/dmlcz/gensim_build.py index 4e258ada8d..44b6e31f58 100755 --- a/gensim/examples/dmlcz/gensim_build.py +++ b/gensim/examples/dmlcz/gensim_build.py @@ -27,10 +27,16 @@ if AT_HOME: SOURCE_LIST = [ - sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'), - sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'), - sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'), - ] + sources.DmlCzSource( + 'dmlcz', + '/Users/kofola/workspace/dml/data/dmlcz/'), + sources.DmlSource( + 'numdam', + '/Users/kofola/workspace/dml/data/numdam/'), + sources.ArxmlivSource( + 'arxmliv', + '/Users/kofola/workspace/dml/data/arxmliv/'), + ] # SOURCE_LIST = [ # sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'), @@ -41,22 +47,25 @@ else: SOURCE_LIST = [ - sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'), - sources.DmlSource('numdam', '/data/dmlcz/data/numdam'), - sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'), - ] + sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'), + sources.DmlSource('numdam', '/data/dmlcz/data/numdam'), + sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'), + ] RESULT_DIR = '/data/dmlcz/xrehurek/results' def buildDmlCorpus(config): dml = dmlcorpus.DmlCorpus() - dml.processConfig(config, shuffle = True) + dml.processConfig(config, shuffle=True) dml.buildDictionary() - dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words + # ignore too (in)frequent words + dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) - dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) - dml.saveAsText() # save id mappings and documents as text data (matrix market format) + # save the mappings as binary data (actual documents are not saved, only + # their URIs) + dml.save(config.resultFile('.pkl')) + dml.saveAsText() # save id mappings and documents as text data (matrix market format) return dml @@ -73,8 +82,14 @@ def buildDmlCorpus(config): sys.exit(1) language = sys.argv[1] - # construct the config, which holds information about sources, data file filenames etc. - config = dmlcorpus.DmlConfig('%s_%s' % (PREFIX, language), resultDir=RESULT_DIR, acceptLangs=[language]) + # construct the config, which holds information about sources, data file + # filenames etc. 
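The filterExtremes(noBelow=5, noAbove=0.3) pruning in buildDmlCorpus corresponds to Dictionary.filter_extremes in the public gensim API; a small sketch with made-up documents:

from gensim.corpora import Dictionary

docs = [['the', 'cat', 'sat'], ['the', 'dog', 'sat'], ['the', 'bird']]
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=2, no_above=0.7)   # drop rare and overly common tokens
print(dictionary.token2id)                             # only 'sat' survives in this toy example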
+ config = dmlcorpus.DmlConfig( + '%s_%s' % + (PREFIX, + language), + resultDir=RESULT_DIR, + acceptLangs=[language]) for source in SOURCE_LIST: config.addSource(source) buildDmlCorpus(config) diff --git a/gensim/examples/dmlcz/gensim_genmodel.py b/gensim/examples/dmlcz/gensim_genmodel.py index 428f8b5536..2b2f9d3c47 100755 --- a/gensim/examples/dmlcz/gensim_genmodel.py +++ b/gensim/examples/dmlcz/gensim_genmodel.py @@ -25,15 +25,14 @@ # internal method parameters -DIM_RP = 300 # dimensionality for random projections -DIM_LSI = 200 # for lantent semantic indexing -DIM_LDA = 100 # for latent dirichlet allocation - +DIM_RP = 300 # dimensionality for random projections +DIM_LSI = 200 # for lantent semantic indexing +DIM_LDA = 100 # for latent dirichlet allocation if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level = logging.INFO) + logging.root.setLevel(level=logging.INFO) logging.info("running %s" % ' '.join(sys.argv)) program = os.path.basename(sys.argv[0]) @@ -46,32 +45,45 @@ method = sys.argv[2].strip().lower() logging.info("loading corpus mappings") - config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), - resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) - - logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) - id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) + config = dmlcorpus.DmlConfig( + '%s_%s' % + (gensim_build.PREFIX, + language), + resultDir=gensim_build.RESULT_DIR, + acceptLangs=[language]) + + logging.info( + "loading word id mapping from %s" % + config.resultFile('wordids.txt')) + id2word = dmlcorpus.DmlCorpus.loadDictionary( + config.resultFile('wordids.txt')) logging.info("loaded %i word ids" % len(id2word)) corpus = MmCorpus(config.resultFile('bow.mm')) if method == 'tfidf': - model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) + model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) model.save(config.resultFile('model_tfidf.pkl')) elif method == 'lda': - model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA) + model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA) model.save(config.resultFile('model_lda.pkl')) elif method == 'lsi': # first, transform word counts to tf-idf weights - tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) + tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) # then find the transformation from tf-idf to latent space - model = lsimodel.LsiModel(tfidf[corpus], id2word = id2word, numTopics = DIM_LSI) + model = lsimodel.LsiModel( + tfidf[corpus], + id2word=id2word, + numTopics=DIM_LSI) model.save(config.resultFile('model_lsi.pkl')) elif method == 'rp': # first, transform word counts to tf-idf weights - tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) + tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) # then find the transformation from tf-idf to latent space - model = rpmodel.RpModel(tfidf[corpus], id2word = id2word, numTopics = DIM_RP) + model = rpmodel.RpModel( + tfidf[corpus], + id2word=id2word, + numTopics=DIM_RP) model.save(config.resultFile('model_rp.pkl')) else: raise ValueError('unknown topic extraction method: %s' % repr(method)) @@ -79,4 +91,3 @@ MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus]) logging.info("finished running %s" % program) - diff --git a/gensim/examples/dmlcz/gensim_xml.py 
b/gensim/examples/dmlcz/gensim_xml.py index 8ac2b265c2..72ec13395e 100755 --- a/gensim/examples/dmlcz/gensim_xml.py +++ b/gensim/examples/dmlcz/gensim_xml.py @@ -28,10 +28,14 @@ DRY_RUN = False # how many 'most similar' documents to store in each similar.xml? -MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored) -MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit). - -# if there are no similar articles (after the pruning), do we still want to generate similar.xml? +# prune based on similarity score (all below MIN_SCORE are ignored) +MIN_SCORE = 0.0 +# prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store +# all of them (no limit). +MAX_SIMILAR = 10 + +# if there are no similar articles (after the pruning), do we still want +# to generate similar.xml? SAVE_EMPTY = True # xml template for similar articles @@ -47,7 +51,8 @@ """ -# template for the whole similar.xml file (will be filled with multiple ARTICLE instances) +# template for the whole similar.xml file (will be filled with multiple +# ARTICLE instances) SIMILAR = """\ %s @@ -55,37 +60,47 @@ """ - def generateSimilar(corpus, index, method): - for docNo, topSims in enumerate(index): # for each document + for docNo, topSims in enumerate(index): # for each document # store similarities to the following file - outfile = os.path.join(corpus.articleDir(docNo), 'similar_%s.xml' % method) - - articles = [] # collect similars in this list - for docNo2, score in topSims: # for each most similar article - if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring) + outfile = os.path.join( + corpus.articleDir(docNo), + 'similar_%s.xml' % + method) + + articles = [] # collect similars in this list + for docNo2, score in topSims: # for each most similar article + # if similarity is above MIN_SCORE and not identity (=always + # maximum similarity, boring) + if score > MIN_SCORE and docNo != docNo2: source, (intId, pathId) = corpus.documents[docNo2] meta = corpus.getMeta(docNo2) - suffix, author, title = '', meta.get('author', ''), meta.get('title', '') - articles.append(ARTICLE % locals()) # add the similar article to output + suffix, author, title = '', meta.get( + 'author', ''), meta.get('title', '') + # add the similar article to output + articles.append(ARTICLE % locals()) if len(articles) >= MAX_SIMILAR: break # now `articles` holds multiple strings in similar_*.xml format if SAVE_EMPTY or articles: - output = ''.join(articles) # concat all similars to one string - if not DRY_RUN: # only open output files for writing if DRY_RUN is false - logging.info("generating %s (%i similars)" % (outfile, len(articles))) + output = ''.join(articles) # concat all similars to one string + if not DRY_RUN: # only open output files for writing if DRY_RUN is false + logging.info( + "generating %s (%i similars)" % + (outfile, len(articles))) outfile = open(outfile, 'w') - outfile.write(SIMILAR % output) # add xml headers and print to file + # add xml headers and print to file + outfile.write(SIMILAR % output) outfile.close() else: - logging.info("would be generating %s (%i similars):%s\n" % (outfile, len(articles), output)) + logging.info( + "would be generating %s (%i similars):%s\n" % + (outfile, len(articles), output)) else: logging.debug("skipping %s (no similar found)" % outfile) - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 
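The MIN_SCORE / MAX_SIMILAR pruning performed inside generateSimilar() reduces to the following plain-Python sketch (`prune` is a hypothetical helper, shown only to make the filtering rule explicit):

MIN_SCORE, MAX_SIMILAR = 0.0, 10

def prune(docno, top_sims):
    # drop self-similarity and low scores, then cap the list (MAX_SIMILAR == 0 means no limit)
    kept = [(other, score) for other, score in top_sims
            if score > MIN_SCORE and other != docno]
    return kept[:MAX_SIMILAR] if MAX_SIMILAR else kept

print(prune(0, [(0, 1.0), (3, 0.8), (7, 0.0)]))        # -> [(3, 0.8)]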
logging.root.setLevel(level=logging.INFO) @@ -101,25 +116,39 @@ def generateSimilar(corpus, index, method): method = sys.argv[2].strip().lower() logging.info("loading corpus mappings") - config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), - resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) - - logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) - id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) + config = dmlcorpus.DmlConfig( + '%s_%s' % + (gensim_build.PREFIX, + language), + resultDir=gensim_build.RESULT_DIR, + acceptLangs=[language]) + + logging.info( + "loading word id mapping from %s" % + config.resultFile('wordids.txt')) + id2word = dmlcorpus.DmlCorpus.loadDictionary( + config.resultFile('wordids.txt')) logging.info("loaded %i word ids" % len(id2word)) corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl')) input = MmCorpus(config.resultFile('_%s.mm' % method)) - assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) - - # initialize structure for similarity queries - if method == 'lsi' or method == 'rp': # for these methods, use dense vectors - index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms) + assert len(input) == len( + corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) + + # initialize structure for similarity queries + if method == 'lsi' or method == 'rp': # for these methods, use dense vectors + index = MatrixSimilarity( + input, + numBest=MAX_SIMILAR + 1, + numFeatures=input.numTerms) else: index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1) - index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) - generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format + # do not normalize query vectors during similarity queries (the index is + # already built normalized, so it would be a no-op) + index.normalize = False + # for each document, print MAX_SIMILAR nearest documents to a xml file, in + # dml-cz specific format + generateSimilar(corpus, index, method) logging.info("finished running %s" % program) - diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py index f6244ad361..000acfb350 100644 --- a/gensim/examples/dmlcz/sources.py +++ b/gensim/examples/dmlcz/sources.py @@ -20,7 +20,7 @@ import os.path import re -import xml.sax # for parsing arxmliv articles +import xml.sax # for parsing arxmliv articles from gensim import utils @@ -44,6 +44,7 @@ class ArticleSource(object): This class is just an ABC interface; see eg. DmlSource or ArxmlivSource classes for concrete instances. """ + def __init__(self, sourceId): self.sourceId = sourceId @@ -64,8 +65,7 @@ def tokenize(self, content): def normalizeWord(self, word): raise NotImplementedError('Abstract Base Class') -#endclass ArticleSource - +# endclass ArticleSource class DmlSource(ArticleSource): @@ -79,6 +79,7 @@ class DmlSource(ArticleSource): See the ArticleSource class for general info on sources. """ + def __init__(self, sourceId, baseDir): self.sourceId = sourceId self.baseDir = os.path.normpath(baseDir) @@ -94,36 +95,39 @@ def parseDmlMeta(cls, xmlfile): result = {} xml = open(xmlfile) for line in xml: - if line.find('
<article>') >= 0: # skip until the beginning of <article> tag
+ if line.find( + '<article>') >= 0: # skip until the beginning of <article> tag
break for line in xml: - if line.find('</article>') >= 0: # end of </article>, we're done
+ if line.find('</article>') >= 0: # end of </article>
, we're done break - p = re.search(PAT_TAG, line) # HAX assumes one element = one line; proper xml parsing probably better... but who cares + # HAX assumes one element = one line; proper xml parsing probably + # better... but who cares + p = re.search(PAT_TAG, line) if p: name, cont = p.groups() name = name.split()[0] name, cont = name.strip(), cont.strip() if name == 'msc': if len(cont) != 5: - logger.warning('invalid MSC=%s in %s' % (cont, xmlfile)) + logger.warning( + 'invalid MSC=%s in %s' % + (cont, xmlfile)) result.setdefault('msc', []).append(cont) continue if name == 'idMR': - cont = cont[2:] # omit MR from MR123456 + cont = cont[2:] # omit MR from MR123456 if name and cont: result[name] = cont xml.close() return result - def idFromDir(self, path): assert len(path) > len(self.baseDir) - intId = path[1 + path.rfind('#') : ] - pathId = path[len(self.baseDir) + 1 : ] + intId = path[1 + path.rfind('#'):] + pathId = path[len(self.baseDir) + 1:] return (intId, pathId) - def isArticle(self, path): # in order to be valid, the article directory must start with '#' if not os.path.basename(path).startswith('#'): @@ -138,10 +142,11 @@ def isArticle(self, path): return False return True - def findArticles(self): dirTotal = artAccepted = 0 - logger.info("looking for '%s' articles inside %s" % (self.sourceId, self.baseDir)) + logger.info( + "looking for '%s' articles inside %s" % + (self.sourceId, self.baseDir)) for root, dirs, files in os.walk(self.baseDir): dirTotal += 1 root = os.path.normpath(root) @@ -149,8 +154,7 @@ def findArticles(self): artAccepted += 1 yield self.idFromDir(root) logger.info('%i directories processed, found %i articles' % - (dirTotal, artAccepted)) - + (dirTotal, artAccepted)) def getContent(self, uri): """ @@ -160,7 +164,6 @@ def getContent(self, uri): filename = os.path.join(self.baseDir, pathId, 'fulltext.txt') return open(filename).read() - def getMeta(self, uri): """ Return article metadata as a attribute->value dictionary. @@ -169,15 +172,15 @@ def getMeta(self, uri): filename = os.path.join(self.baseDir, pathId, 'meta.xml') return DmlSource.parseDmlMeta(filename) - def tokenize(self, content): - return [token.encode('utf8') for token in utils.tokenize(content, errors = 'ignore') if not token.isdigit()] - + return [token.encode('utf8') for token in utils.tokenize( + content, errors='ignore') if not token.isdigit()] def normalizeWord(self, word): wordU = unicode(word, 'utf8') - return wordU.lower().encode('utf8') # lowercase and then convert back to bytestring -#endclass DmlSource + # lowercase and then convert back to bytestring + return wordU.lower().encode('utf8') +# endclass DmlSource class DmlCzSource(DmlSource): @@ -190,13 +193,13 @@ class DmlCzSource(DmlSource): See the ArticleSource class for general info on sources. 
""" + def idFromDir(self, path): assert len(path) > len(self.baseDir) dmlczId = open(os.path.join(path, 'dspace_id')).read().strip() - pathId = path[len(self.baseDir) + 1 : ] + pathId = path[len(self.baseDir) + 1:] return (dmlczId, pathId) - def isArticle(self, path): # in order to be valid, the article directory must start with '#' if not os.path.basename(path).startswith('#'): @@ -206,7 +209,14 @@ def isArticle(self, path): logger.info('missing dspace_id in %s' % path) return False # and contain either fulltext.txt or fulltext_dspace.txt file - if not (os.path.exists(os.path.join(path, 'fulltext.txt')) or os.path.exists(os.path.join(path, 'fulltext-dspace.txt'))): + if not ( + os.path.exists( + os.path.join( + path, + 'fulltext.txt')) or os.path.exists( + os.path.join( + path, + 'fulltext-dspace.txt'))): logger.info('missing fulltext in %s' % path) return False # and contain the meta.xml file @@ -215,7 +225,6 @@ def isArticle(self, path): return False return True - def getContent(self, uri): """ Return article content as a single large string. @@ -236,8 +245,7 @@ def getContent(self, uri): assert os.path.exists(filename2) filename = filename2 return open(filename).read() -#endclass DmlCzSource - +# endclass DmlCzSource class ArxmlivSource(ArticleSource): @@ -252,13 +260,16 @@ class ArxmlivSource(ArticleSource): See the ArticleSource class for general info on sources. """ class ArxmlivContentHandler(xml.sax.handler.ContentHandler): + def __init__(self): - self.path = [''] # help structure for sax event parsing - self.tokens = [] # will contain tokens once parsing is finished + self.path = [''] # help structure for sax event parsing + self.tokens = [] # will contain tokens once parsing is finished def startElement(self, name, attr): - # for math tokens, we only care about Math elements directly below

- if name == 'Math' and self.path[-1] == 'p' and attr.get('mode', '') == 'inline': + # for math tokens, we only care about Math elements directly below + #

+ if name == 'Math' and self.path[ + -1] == 'p' and attr.get('mode', '') == 'inline': tex = attr.get('tex', '') if tex and not tex.isdigit(): self.tokens.append('$%s$' % tex.encode('utf8')) @@ -270,10 +281,11 @@ def endElement(self, name): def characters(self, text): # for text, we only care about tokens directly within the

tag if self.path[-1] == 'p': - tokens = [token.encode('utf8') for token in utils.tokenize(text, errors = 'ignore') if not token.isdigit()] + tokens = [ + token.encode('utf8') for token in utils.tokenize( + text, errors='ignore') if not token.isdigit()] self.tokens.extend(tokens) - #endclass ArxmlivHandler - + # endclass ArxmlivHandler class ArxmlivErrorHandler(xml.sax.handler.ErrorHandler): # Python2.5 implementation of xml.sax is broken -- character streams and @@ -282,30 +294,27 @@ class ArxmlivErrorHandler(xml.sax.handler.ErrorHandler): # the middle, resulting in invalid tokens... # This is not really a problem with arxmliv xml files themselves, so ignore # these errors silently. + def error(self, exception): pass # logger.debug("SAX error parsing xml: %s" % exception) warning = fatalError = error - #endclass ArxmlivErrorHandler - + # endclass ArxmlivErrorHandler def __init__(self, sourceId, baseDir): self.sourceId = sourceId self.baseDir = os.path.normpath(baseDir) - def __str__(self): return self.sourceId - def idFromDir(self, path): assert len(path) > len(self.baseDir) - intId = path[1 + path.rfind('#') : ] - pathId = path[len(self.baseDir) + 1 : ] + intId = path[1 + path.rfind('#'):] + pathId = path[len(self.baseDir) + 1:] return (intId, pathId) - def isArticle(self, path): # in order to be valid, the article directory must start with '#' if not os.path.basename(path).startswith('#'): @@ -316,10 +325,11 @@ def isArticle(self, path): return False return True - def findArticles(self): dirTotal = artAccepted = 0 - logger.info("looking for '%s' articles inside %s" % (self.sourceId, self.baseDir)) + logger.info( + "looking for '%s' articles inside %s" % + (self.sourceId, self.baseDir)) for root, dirs, files in os.walk(self.baseDir): dirTotal += 1 root = os.path.normpath(root) @@ -327,8 +337,7 @@ def findArticles(self): artAccepted += 1 yield self.idFromDir(root) logger.info('%i directories processed, found %i articles' % - (dirTotal, artAccepted)) - + (dirTotal, artAccepted)) def getContent(self, uri): """ @@ -338,15 +347,15 @@ def getContent(self, uri): filename = os.path.join(self.baseDir, pathId, 'tex.xml') return open(filename).read() - def getMeta(self, uri): """ Return article metadata as an attribute->value dictionary. """ # intId, pathId = uri # filename = os.path.join(self.baseDir, pathId, 'tex.xml') - return {'language': 'eng'} # TODO maybe parse out some meta; but currently not needed for anything... - + # TODO maybe parse out some meta; but currently not needed for + # anything... + return {'language': 'eng'} def tokenize(self, content): """ @@ -358,15 +367,16 @@ def tokenize(self, content): a dollar sign prefix and suffix. """ handler = ArxmlivSource.ArxmlivContentHandler() - xml.sax.parseString(content, handler, ArxmlivSource.ArxmlivErrorHandler()) + xml.sax.parseString( + content, + handler, + ArxmlivSource.ArxmlivErrorHandler()) return handler.tokens - def normalizeWord(self, word): - if word[0] == '$': # ignore math tokens + if word[0] == '$': # ignore math tokens return word wordU = unicode(word, 'utf8') - return wordU.lower().encode('utf8') # lowercase and then convert back to bytestring -#endclass ArxmlivSource - - + # lowercase and then convert back to bytestring + return wordU.lower().encode('utf8') +# endclass ArxmlivSource diff --git a/gensim/interfaces.py b/gensim/interfaces.py index e1024723b0..3645e06209 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -48,16 +48,17 @@ class CorpusABC(utils.SaveLoad): state, and **not** the documents themselves. 
See the `save_corpus` static method for serializing the actual stream content. """ + def __iter__(self): """ Iterate over the corpus, yielding one document at a time. """ raise NotImplementedError('cannot instantiate abstract base class') - def save(self, *args, **kwargs): import warnings - warnings.warn("corpus.save() stores only the (tiny) iteration object; " + warnings.warn( + "corpus.save() stores only the (tiny) iteration object; " "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)") super(CorpusABC, self).save(*args, **kwargs) @@ -68,9 +69,11 @@ def __len__(self): This method is just the least common denominator and should really be overridden when possible. """ - raise NotImplementedError("must override __len__() before calling len(corpus)") + raise NotImplementedError( + "must override __len__() before calling len(corpus)") # logger.warning("performing full corpus scan to determine its length; was this intended?") -# return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus +# return sum(1 for doc in self) # sum(empty generator) == 0, so this works +# even for an empty corpus @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): @@ -98,13 +101,15 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): # example code: logger.info("converting corpus to ??? format: %s" % fname) with utils.smart_open(fname, 'wb') as fout: - for doc in corpus: # iterate over the document stream - fmt = str(doc) # format the document appropriately... - fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk -#endclass CorpusABC + for doc in corpus: # iterate over the document stream + fmt = str(doc) # format the document appropriately... + # serialize the formatted document to disk + fout.write(utils.to_utf8("%s\n" % fmt)) +# endclass CorpusABC class TransformedCorpus(CorpusABC): + def __init__(self, obj, corpus, chunksize=None): self.obj, self.corpus, self.chunksize = obj, corpus, chunksize self.metadata = False @@ -123,10 +128,13 @@ def __iter__(self): def __getitem__(self, docno): if hasattr(self.corpus, '__getitem__'): - return self.obj[self.corpus[docno]] + return self.obj[self.corpus[docno]] else: - raise RuntimeError('Type {} does not support slicing.'.format(type(self.corpus))) -#endclass TransformedCorpus + raise RuntimeError( + 'Type {} does not support slicing.'.format( + type( + self.corpus))) +# endclass TransformedCorpus class TransformationABC(utils.SaveLoad): @@ -155,14 +163,13 @@ def __getitem__(self, vec): """ raise NotImplementedError('cannot instantiate abstract base class') - def _apply(self, corpus, chunksize=None): """ Apply the transformation to a whole corpus (as opposed to a single document) and return the result as another corpus. """ return TransformedCorpus(self, corpus, chunksize) -#endclass TransformationABC +# endclass TransformationABC class SimilarityABC(utils.SaveLoad): @@ -181,16 +188,15 @@ class SimilarityABC(utils.SaveLoad): similarities of each document in the corpus against the whole corpus (ie., the query is each corpus document in turn). 
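The query contract described for SimilarityABC is easiest to see through the public similarity classes; a minimal sketch along the lines of the gensim tutorials (toy data, scores illustrative):

from gensim import corpora, models, similarities

texts = [['human', 'computer', 'interaction'], ['graph', 'trees'], ['graph', 'minors', 'trees']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

index = similarities.MatrixSimilarity(lsi[corpus], num_best=2)   # keep the top-2 hits per query
query = lsi[dictionary.doc2bow(['graph', 'minors'])]
print(index[query])                                              # list of (document_id, similarity) pairs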
""" + def __init__(self, corpus): raise NotImplementedError("cannot instantiate Abstract Base Class") - def get_similarities(self, doc): # (Sparse)MatrixSimilarity override this method so that they both use the # same __getitem__ method, defined below raise NotImplementedError("cannot instantiate Abstract Base Class") - def __getitem__(self, query): """Get similarities of document `query` to all documents in the corpus. @@ -223,18 +229,20 @@ def __getitem__(self, query): # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): - return [matutils.full2sparse_clipped(v, self.num_best) for v in result] + return [ + matutils.full2sparse_clipped( + v, self.num_best) for v in result] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best) - def __iter__(self): """ For each index document, compute cosine similarity against all other documents in the index and yield the result. """ - # turn off query normalization (vectors in the index are assumed to be already normalized) + # turn off query normalization (vectors in the index are assumed to be + # already normalized) norm = self.normalize self.normalize = False @@ -245,11 +253,13 @@ def __iter__(self): # # After computing similarities of the bigger query in `self[chunk]`, # yield the resulting similarities one after another, so that it looks - # exactly the same as if they had been computed with many small queries. + # exactly the same as if they had been computed with many small + # queries. try: chunking = self.chunksize > 1 except AttributeError: - # chunking not supported; fall back to the (slower) mode of 1 query=1 document + # chunking not supported; fall back to the (slower) mode of 1 + # query=1 document chunking = False if chunking: # assumes `self.corpus` holds the index as a 2-d numpy array. @@ -259,8 +269,10 @@ def __iter__(self): # scipy.sparse doesn't allow slicing beyond real size of the matrix # (unlike numpy). so, clip the end of the chunk explicitly to make # scipy.sparse happy - chunk_end = min(self.index.shape[0], chunk_start + self.chunksize) - chunk = self.index[chunk_start : chunk_end] + chunk_end = min( + self.index.shape[0], + chunk_start + self.chunksize) + chunk = self.index[chunk_start: chunk_end] for sim in self[chunk]: yield sim else: @@ -269,4 +281,4 @@ def __iter__(self): # restore old normalization value self.normalize = norm -#endclass SimilarityABC +# endclass SimilarityABC diff --git a/gensim/matutils.py b/gensim/matutils.py index 4c92be4562..d12e99a4f3 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -26,7 +26,8 @@ from six.moves import xrange, zip as izip # scipy is not a stable package yet, locations change, so try to work -# around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8) +# around differences (currently only concerns location of 'triu' in scipy +# 0.7 vs. 0.8) try: from scipy.linalg.basic import triu except ImportError: @@ -41,7 +42,8 @@ def triu_indices(n, k=0): a = triu(m, k) return numpy.where(a != 0) -blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] +blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[ + 0] logger = logging.getLogger(__name__) @@ -53,7 +55,8 @@ def argsort(x, topn=None, reverse=False): If reverse is True, return the greatest elements instead, in descending order. 
""" - x = numpy.asarray(x) # unify code path for when `x` is not a numpy array (list, tuple...) + x = numpy.asarray( + x) # unify code path for when `x` is not a numpy array (list, tuple...) if topn is None: topn = x.size if topn <= 0: @@ -64,10 +67,18 @@ def argsort(x, topn=None, reverse=False): return numpy.argsort(x)[:topn] # numpy >= 1.8 has a fast partial argsort, use that! most_extreme = numpy.argpartition(x, topn)[:topn] - return most_extreme.take(numpy.argsort(x.take(most_extreme))) # resort topn into order + return most_extreme.take( + numpy.argsort( + x.take(most_extreme))) # resort topn into order -def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0): +def corpus2csc( + corpus, + num_terms=None, + dtype=numpy.float64, + num_docs=None, + num_nnz=None, + printprogress=0): """ Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format, with documents as columns. @@ -90,26 +101,34 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n if num_nnz is None: num_nnz = corpus.num_nnz except AttributeError: - pass # not a MmCorpus... + pass # not a MmCorpus... if printprogress: logger.info("creating sparse matrix from corpus") if num_terms is not None and num_docs is not None and num_nnz is not None: - # faster and much more memory-friendly version of creating the sparse csc + # faster and much more memory-friendly version of creating the sparse + # csc posnow, indptr = 0, [0] - indices = numpy.empty((num_nnz,), dtype=numpy.int32) # HACK assume feature ids fit in 32bit integer + # HACK assume feature ids fit in 32bit integer + indices = numpy.empty((num_nnz,), dtype=numpy.int32) data = numpy.empty((num_nnz,), dtype=dtype) for docno, doc in enumerate(corpus): if printprogress and docno % printprogress == 0: logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs)) posnext = posnow + len(doc) indices[posnow: posnext] = [feature_id for feature_id, _ in doc] - data[posnow: posnext] = [feature_weight for _, feature_weight in doc] + data[ + posnow: posnext] = [ + feature_weight for _, + feature_weight in doc] indptr.append(posnext) posnow = posnext assert posnow == num_nnz, "mismatch between supplied and computed number of non-zeros" - result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype) + result = scipy.sparse.csc_matrix( + (data, indices, indptr), shape=( + num_terms, num_docs), dtype=dtype) else: - # slower version; determine the sparse matrix parameters during iteration + # slower version; determine the sparse matrix parameters during + # iteration num_nnz, data, indices, indptr = 0, [], [], [0] for docno, doc in enumerate(corpus): if printprogress and docno % printprogress == 0: @@ -124,7 +143,9 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n # now num_docs, num_terms and num_nnz contain the correct values data = numpy.asarray(data, dtype=dtype) indices = numpy.asarray(indices) - result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype) + result = scipy.sparse.csc_matrix( + (data, indices, indptr), shape=( + num_terms, num_docs), dtype=dtype) return result @@ -139,19 +160,25 @@ def pad(mat, padrow, padcol): padcol = 0 rows, cols = mat.shape return numpy.bmat([[mat, numpy.matrix(numpy.zeros((rows, padcol)))], - [numpy.matrix(numpy.zeros((padrow, cols + padcol)))]]) + [numpy.matrix(numpy.zeros((padrow, cols + padcol)))]]) def zeros_aligned(shape, dtype, 
order='C', align=128): """Like `numpy.zeros()`, but the array will be aligned at `align` byte boundary.""" nbytes = numpy.prod(shape, dtype=numpy.int64) * numpy.dtype(dtype).itemsize - buffer = numpy.zeros(nbytes + align, dtype=numpy.uint8) # problematic on win64 ("maximum allowed dimension exceeded") + # problematic on win64 ("maximum allowed dimension exceeded") + buffer = numpy.zeros(nbytes + align, dtype=numpy.uint8) start_index = -buffer.ctypes.data % align - return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order) + return buffer[ + start_index: start_index + + nbytes].view(dtype).reshape( + shape, + order=order) def ismatrix(m): - return isinstance(m, numpy.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) + return isinstance( + m, numpy.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) def any2sparse(vec, eps=1e-9): @@ -167,7 +194,8 @@ def scipy2sparse(vec, eps=1e-9): """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples).""" vec = vec.tocsr() assert vec.shape[0] == 1 - return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps] + return [(int(pos), float(val)) + for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps] class Scipy2Corpus(object): @@ -177,6 +205,7 @@ class Scipy2Corpus(object): This is the mirror function to `corpus2csc`. """ + def __init__(self, vecs): """ `vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array, @@ -204,7 +233,8 @@ def sparse2full(doc, length): This is the mirror function to `full2sparse`. """ - result = numpy.zeros(length, dtype=numpy.float32) # fill with zeroes (default value) + result = numpy.zeros( + length, dtype=numpy.float32) # fill with zeroes (default value) doc = dict(doc) # overwrite some of the zeroes with explicit values result[list(doc)] = list(itervalues(doc)) @@ -233,7 +263,8 @@ def full2sparse_clipped(vec, topn, eps=1e-9): """ # use numpy.argpartition/argsort and only form tuples that are actually returned. - # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on. + # this is about 40x faster than explicitly forming all 2-tuples to run + # sort() or heapq.nlargest() on. if topn <= 0: return [] vec = numpy.asarray(vec, dtype=float) @@ -261,11 +292,11 @@ def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32): result[:, docno] = sparse2full(doc, num_terms) assert docno + 1 == num_docs else: - result = numpy.column_stack(sparse2full(doc, num_terms) for doc in corpus) + result = numpy.column_stack( + sparse2full(doc, num_terms) for doc in corpus) return result.astype(dtype) - class Dense2Corpus(object): """ Treat dense numpy array as a sparse, streamed gensim corpus. @@ -276,6 +307,7 @@ class Dense2Corpus(object): This is the mirror function to `corpus2dense`. """ + def __init__(self, dense, documents_columns=True): if documents_columns: self.dense = dense.T @@ -288,7 +320,7 @@ def __iter__(self): def __len__(self): return len(self.dense) -#endclass DenseCorpus +# endclass DenseCorpus class Sparse2Corpus(object): @@ -298,19 +330,23 @@ class Sparse2Corpus(object): This is the mirror function to `corpus2csc`. 
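The corpus2csc / Sparse2Corpus mirroring described above can be checked with a tiny round trip:

from gensim import matutils

bow_corpus = [[(0, 1.0), (2, 2.0)], [(1, 3.0)]]
csc = matutils.corpus2csc(bow_corpus, num_terms=3)   # scipy.sparse CSC matrix, documents as columns
print(csc.shape)                                     # (3, 2)
print(list(matutils.Sparse2Corpus(csc)))             # back to streamed (id, weight) documents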
""" + def __init__(self, sparse, documents_columns=True): if documents_columns: self.sparse = sparse.tocsc() else: - self.sparse = sparse.tocsr().T # make sure shape[1]=number of docs (needed in len()) + # make sure shape[1]=number of docs (needed in len()) + self.sparse = sparse.tocsr().T def __iter__(self): - for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]): + for indprev, indnow in izip( + self.sparse.indptr, self.sparse.indptr[ + 1:]): yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow])) def __len__(self): return self.sparse.shape[1] -#endclass Sparse2Corpus +# endclass Sparse2Corpus def veclen(vec): @@ -320,12 +356,14 @@ def veclen(vec): assert length > 0.0, "sparse documents must not contain any explicit zero entries" return length + def ret_normalized_vec(vec, length): if length != 1.0: return [(termid, val / length) for termid, val in vec] else: return list(vec) + def ret_log_normalize_vec(vec, axis=1): log_max = 100.0 if len(vec.shape) == 1: @@ -362,7 +400,9 @@ def unitvec(vec, norm='l2'): or numpy array=>numpy array, scipy.sparse=>scipy.sparse). """ if norm not in ('l1', 'l2'): - raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm) + raise ValueError( + "'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % + norm) if scipy.sparse.issparse(vec): vec = vec.tocsr() if norm == 'l1': @@ -390,7 +430,8 @@ def unitvec(vec, norm='l2'): except: return vec - if isinstance(first, (tuple, list)) and len(first) == 2: # gensim sparse format + if isinstance(first, (tuple, list)) and len( + first) == 2: # gensim sparse format if norm == 'l1': length = float(sum(abs(val) for _, val in vec)) if norm == 'l2': @@ -413,9 +454,10 @@ def cossim(vec1, vec2): vec2len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec2))) assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries" if len(vec2) < len(vec1): - vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector - result = sum(value * vec2.get(index, 0.0) for index, value in iteritems(vec1)) - result /= vec1len * vec2len # rescale by vector lengths + vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector + result = sum(value * vec2.get(index, 0.0) + for index, value in iteritems(vec1)) + result /= vec1len * vec2len # rescale by vector lengths return result @@ -427,10 +469,12 @@ def isbow(vec): if scipy.sparse.issparse(vec): vec = vec.todense().tolist() try: - id_, val_ = vec[0] # checking first value to see if it is in bag of words format by unpacking + # checking first value to see if it is in bag of words format by + # unpacking + id_, val_ = vec[0] id_, val_ = int(id_), float(val_) except IndexError: - return True # this is to handle the empty input case + return True # this is to handle the empty input case except Exception: return False return True @@ -446,9 +490,12 @@ def kullback_leibler(vec1, vec2, num_features=None): if scipy.sparse.issparse(vec1): vec1 = vec1.toarray() if scipy.sparse.issparse(vec2): - vec2 = vec2.toarray() # converted both the vectors to dense in case they were in sparse matrix - if isbow(vec1) and isbow(vec2): # if they are in bag of words format we make it dense - if num_features != None: # if not None, make as large as the documents drawing from + # converted both the vectors to dense in case they were in sparse + # matrix + vec2 = vec2.toarray() + if isbow(vec1) and isbow( + vec2): # if 
they are in bag of words format we make it dense + if num_features is not None: # if not None, make as large as the documents drawing from dense1 = sparse2full(vec1, num_features) dense2 = sparse2full(vec2, num_features) return entropy(dense1, dense2) @@ -459,7 +506,8 @@ def kullback_leibler(vec1, vec2, num_features=None): return entropy(dense1, dense2) else: # this conversion is made because if it is not in bow format, it might be a list within a list after conversion - # the scipy implementation of Kullback fails in such a case so we pick up only the nested list. + # the scipy implementation of Kullback fails in such a case so we pick + # up only the nested list. if len(vec1) == 1: vec1 = vec1[0] if len(vec2) == 1: @@ -476,15 +524,25 @@ def hellinger(vec1, vec2): vec1 = vec1.toarray() if scipy.sparse.issparse(vec2): vec2 = vec2.toarray() - if isbow(vec1) and isbow(vec2): - # if it is a bag of words format, instead of converting to dense we use dictionaries to calculate appropriate distance + if isbow(vec1) and isbow(vec2): + # if it is a bag of words format, instead of converting to dense we use + # dictionaries to calculate appropriate distance vec1, vec2 = dict(vec1), dict(vec2) - if len(vec2) < len(vec1): - vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector - sim = numpy.sqrt(0.5*sum((numpy.sqrt(value) - numpy.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) + if len(vec2) < len(vec1): + vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector + sim = numpy.sqrt( + 0.5 * + sum( + (numpy.sqrt(value) - + numpy.sqrt( + vec2.get( + index, + 0.0)))**2 for index, + value in iteritems(vec1))) return sim else: - sim = numpy.sqrt(0.5 * ((numpy.sqrt(vec1) - numpy.sqrt(vec2))**2).sum()) + sim = numpy.sqrt( + 0.5 * ((numpy.sqrt(vec1) - numpy.sqrt(vec2))**2).sum()) return sim @@ -502,18 +560,21 @@ def jaccard(vec1, vec2): vec1 = vec1.toarray() if scipy.sparse.issparse(vec2): vec2 = vec2.toarray() - if isbow(vec1) and isbow(vec2): + if isbow(vec1) and isbow(vec2): # if it's in bow format, we use the following definitions: # union = sum of the 'weights' of both the bags - # intersection = lowest weight for a particular id; basically the number of common words or items - union = sum(weight for id_, weight in vec1) + sum(weight for id_, weight in vec2) + # intersection = lowest weight for a particular id; basically the + # number of common words or items + union = sum(weight for id_, weight in vec1) + \ + sum(weight for id_, weight in vec2) vec1, vec2 = dict(vec1), dict(vec2) intersection = 0.0 for feature_id, feature_weight in iteritems(vec1): intersection += min(feature_weight, vec2.get(feature_id, 0.0)) return 1 - float(intersection) / float(union) else: - # if it isn't in bag of words format, we can use sets to calculate intersection and union + # if it isn't in bag of words format, we can use sets to calculate + # intersection and union if isinstance(vec1, numpy.ndarray): vec1 = vec1.tolist() if isinstance(vec2, numpy.ndarray): @@ -533,18 +594,18 @@ def qr_destroy(la): because the memory used in `la[0]` is reclaimed earlier. 
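The bag-of-words distance helpers above (hellinger, jaccard) accept gensim sparse vectors directly; a short usage sketch:

from gensim import matutils

vec1 = [(0, 1.0), (1, 3.0)]
vec2 = [(0, 2.0), (2, 1.0)]
print(matutils.hellinger(vec1, vec2))   # Hellinger distance between the two sparse vectors
print(matutils.jaccard(vec1, vec2))     # 1 - (weight intersection / weight union)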
""" a = numpy.asfortranarray(la[0]) - del la[0], la # now `a` is the only reference to the input matrix + del la[0], la # now `a` is the only reference to the input matrix m, n = a.shape # perform q, r = QR(a); code hacked out of scipy.linalg.qr logger.debug("computing QR of %s dense matrix" % str(a.shape)) geqrf, = get_lapack_funcs(('geqrf',), (a,)) qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True) qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True) - del a # free up mem + del a # free up mem assert info >= 0 r = triu(qr[:n, :n]) - if m < n: # rare case, #features < #topics - qr = qr[:, :m] # retains fortran order + if m < n: # rare case, #features < #topics + qr = qr[:, :m] # retains fortran order gorgqr, = get_lapack_funcs(('orgqr',), (qr,)) q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True) q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True) @@ -553,7 +614,6 @@ def qr_destroy(la): return q, r - class MmWriter(object): """ Store a corpus in Matrix Market format. @@ -570,31 +630,38 @@ class MmWriter(object): """ - HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n' # the only supported MM format + # the only supported MM format + HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n' def __init__(self, fname): self.fname = fname if fname.endswith(".gz") or fname.endswith('.bz2'): - raise NotImplementedError("compressed output not supported with MmWriter") - self.fout = utils.smart_open(self.fname, 'wb+') # open for both reading and writing + raise NotImplementedError( + "compressed output not supported with MmWriter") + # open for both reading and writing + self.fout = utils.smart_open(self.fname, 'wb+') self.headers_written = False - def write_headers(self, num_docs, num_terms, num_nnz): self.fout.write(MmWriter.HEADER_LINE) if num_nnz < 0: - # we don't know the matrix shape/density yet, so only log a general line + # we don't know the matrix shape/density yet, so only log a general + # line logger.info("saving sparse matrix to %s" % self.fname) - self.fout.write(utils.to_utf8(' ' * 50 + '\n')) # 48 digits must be enough for everybody + # 48 digits must be enough for everybody + self.fout.write(utils.to_utf8(' ' * 50 + '\n')) else: - logger.info("saving sparse %sx%s matrix with %i non-zero entries to %s" % - (num_docs, num_terms, num_nnz, self.fname)) - self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz))) + logger.info( + "saving sparse %sx%s matrix with %i non-zero entries to %s" % + (num_docs, num_terms, num_nnz, self.fname)) + self.fout.write( + utils.to_utf8( + '%s %s %s\n' % + (num_docs, num_terms, num_nnz))) self.last_docno = -1 self.headers_written = True - def fake_headers(self, num_docs, num_terms, num_nnz): stats = '%i %i %i' % (num_docs, num_terms, num_nnz) if len(stats) > 50: @@ -602,7 +669,6 @@ def fake_headers(self, num_docs, num_terms, num_nnz): self.fout.seek(len(MmWriter.HEADER_LINE)) self.fout.write(utils.to_utf8(stats)) - def write_vector(self, docno, vector): """ Write a single sparse vector to the file. @@ -610,16 +676,27 @@ def write_vector(self, docno, vector): Sparse vector is any iterable yielding (field id, field value) pairs. """ assert self.headers_written, "must write Matrix Market file headers before writing data!" - assert self.last_docno < docno, "documents %i and %i not in sequential order!" 
% (self.last_docno, docno) - vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12) # ignore near-zero entries - for termid, weight in vector: # write term ids in sorted order - self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight))) # +1 because MM format starts counting from 1 + assert self.last_docno < docno, "documents %i and %i not in sequential order!" % ( + self.last_docno, docno) + vector = sorted((i, w) for i, w in vector if abs(w) + > 1e-12) # ignore near-zero entries + for termid, weight in vector: # write term ids in sorted order + # +1 because MM format starts counting from 1 + self.fout.write( + utils.to_utf8( + "%i %i %s\n" % + (docno + 1, termid + 1, weight))) self.last_docno = docno return (vector[-1][0], len(vector)) if vector else (-1, 0) - @staticmethod - def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False): + def write_corpus( + fname, + corpus, + progress_cnt=1000, + index=False, + num_terms=None, + metadata=False): """ Save the vector space representation of an entire corpus to disk. @@ -628,10 +705,13 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, """ mw = MmWriter(fname) - # write empty headers to the file (with enough space to be overwritten later) - mw.write_headers(-1, -1, -1) # will print 50 spaces followed by newline on the stats line + # write empty headers to the file (with enough space to be overwritten + # later) + # will print 50 spaces followed by newline on the stats line + mw.write_headers(-1, -1, -1) - # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors + # calculate necessary header info (nnz elements, num terms, num docs) + # while writing out vectors _num_terms, num_nnz = 0, 0 docno, poslast = -1, -1 offsets = [] @@ -673,14 +753,14 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, num_nnz, num_docs * num_terms)) - # now write proper headers, by seeking and overwriting the spaces written earlier + # now write proper headers, by seeking and overwriting the spaces + # written earlier mw.fake_headers(num_docs, num_terms, num_nnz) mw.close() if index: return offsets - def __del__(self): """ Automatic destructor which closes the underlying file. @@ -689,15 +769,13 @@ def __del__(self): to work! Closing the file explicitly via the close() method is preferred and safer. """ - self.close() # does nothing if called twice (on an already closed file), so no worries - + self.close() # does nothing if called twice (on an already closed file), so no worries def close(self): logger.debug("closing %s" % self.fname) if hasattr(self, 'fout'): self.fout.close() -#endclass MmWriter - +# endclass MmWriter class MmReader(object): @@ -709,6 +787,7 @@ class MmReader(object): matrix at once (unlike scipy.io.mmread). This allows us to process corpora which are larger than the available RAM. """ + def __init__(self, input, transposed=True): """ Initialize the matrix reader. 
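The on-disk layout produced by MmWriter (and consumed by MmReader below) can be inspected via the write_corpus helper; the file path and the exact spacing of the stats line are illustrative:

from gensim import matutils

bow_corpus = [[(0, 0.5), (2, 1.0)], [(1, 2.0)]]
matutils.MmWriter.write_corpus('/tmp/example.mm', bow_corpus)
print(open('/tmp/example.mm').read())
# %%MatrixMarket matrix coordinate real general
# 2 3 3          (num_docs num_terms num_nnz, padded out to the reserved 50 characters)
# 1 1 0.5        (1-based "docno termno value" triples, one per non-zero entry)
# 1 3 1.0
# 2 2 2.0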
@@ -726,8 +805,9 @@ def __init__(self, input, transposed=True): try: header = utils.to_unicode(next(lines)).strip() if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): - raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % - (self.input, header)) + raise ValueError( + "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % + (self.input, header)) except StopIteration: pass @@ -735,13 +815,15 @@ def __init__(self, input, transposed=True): for lineno, line in enumerate(lines): line = utils.to_unicode(line) if not line.startswith('%'): - self.num_docs, self.num_terms, self.num_nnz = map(int, line.split()) + self.num_docs, self.num_terms, self.num_nnz = map( + int, line.split()) if not self.transposed: self.num_docs, self.num_terms = self.num_terms, self.num_docs break - logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" % - (self.num_docs, self.num_terms, self.num_nnz)) + logger.info( + "accepted corpus with %i documents, %i features, %i non-zero entries" % + (self.num_docs, self.num_terms, self.num_nnz)) def __len__(self): return self.num_docs @@ -774,26 +856,33 @@ def __iter__(self): previd = -1 for line in lines: - docid, termid, val = utils.to_unicode(line).split() # needed for python3 + docid, termid, val = utils.to_unicode( + line).split() # needed for python3 if not self.transposed: termid, docid = docid, termid - docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) # -1 because matrix market indexes are 1-based => convert to 0-based + # -1 because matrix market indexes are 1-based => convert to 0-based + docid, termid, val = int( + docid) - 1, int(termid) - 1, float(val) assert previd <= docid, "matrix columns must come in ascending order" if docid != previd: - # change of document: return the document read so far (its id is prevId) + # change of document: return the document read so far (its + # id is prevId) if previd >= 0: yield previd, document # return implicit (empty) documents between previous id and new id - # too, to keep consistent document numbering and corpus length + # too, to keep consistent document numbering and corpus + # length for previd in xrange(previd + 1, docid): yield previd, [] - # from now on start adding fields to a new document, with a new id + # from now on start adding fields to a new document, with a + # new id previd = docid document = [] - document.append((termid, val,)) # add another field to the current document + # add another field to the current document + document.append((termid, val,)) # handle the last document, as a special case if previd >= 0: @@ -804,7 +893,6 @@ def __iter__(self): for previd in xrange(previd + 1, self.num_docs): yield previd, [] - def docbyoffset(self, offset): """Return document at file offset `offset` (in bytes)""" # empty documents are not stored explicitly in MM format, so the index marks @@ -816,19 +904,21 @@ def docbyoffset(self, offset): else: fin = self.input - fin.seek(offset) # works for gzip/bz2 input, too + fin.seek(offset) # works for gzip/bz2 input, too previd, document = -1, [] for line in fin: docid, termid, val = line.split() if not self.transposed: termid, docid = docid, termid - docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) # -1 because matrix market indexes are 1-based => convert to 0-based + # -1 because matrix market indexes are 1-based => convert to 0-based + docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) 
assert previd <= docid, "matrix columns must come in ascending order" if docid != previd: if previd >= 0: return document previd = docid - document.append((termid, val,)) # add another field to the current document + # add another field to the current document + document.append((termid, val,)) return document -#endclass MmReader +# endclass MmReader diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 79ab1cca9b..8c4ba6b74a 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -40,20 +40,22 @@ class VocabTransform(interfaces.TransformationABC): >>> ... """ + def __init__(self, old2new, id2token=None): # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems()) self.old2new = old2new self.id2token = id2token - def __getitem__(self, bow): """ Return representation with the ids transformed. """ - # if the input vector is in fact a corpus, return a transformed corpus as a result + # if the input vector is in fact a corpus, return a transformed corpus + # as a result is_corpus, bow = utils.is_corpus(bow) if is_corpus: return self._apply(bow) - return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new) -#endclass VocabTransform + return sorted((self.old2new[oldid], weight) + for oldid, weight in bow if oldid in self.old2new) +# endclass VocabTransform diff --git a/gensim/models/basemodel.py b/gensim/models/basemodel.py index cba87a9d70..2287cdbf05 100644 --- a/gensim/models/basemodel.py +++ b/gensim/models/basemodel.py @@ -1,4 +1,5 @@ class BaseTopicModel(object): + def print_topic(self, topicno, topn=10): """ Return a single topic as a formatted string. See `show_topic()` for parameters. @@ -7,10 +8,14 @@ def print_topic(self, topicno, topn=10): '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"' """ - return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)]) + return ' + '.join(['%.3f*"%s"' % (v, k) + for k, v in self.show_topic(topicno, topn)]) def print_topics(self, num_topics=20, num_words=10): """Alias for `show_topics()` that prints the `num_words` most probable words for `topics` number of topics to log. 
Set `topics=-1` to print all topics.""" - return self.show_topics(num_topics=num_topics, num_words=num_words, log=True) + return self.show_topics( + num_topics=num_topics, + num_words=num_words, + log=True) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 20fa9e14bb..288ed76906 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -21,9 +21,12 @@ import logging from gensim import interfaces -from gensim.topic_coherence import (segmentation, probability_estimation, - direct_confirmation_measure, indirect_confirmation_measure, - aggregation) +from gensim.topic_coherence import ( + segmentation, + probability_estimation, + direct_confirmation_measure, + indirect_confirmation_measure, + aggregation) from gensim.matutils import argsort from gensim.utils import is_corpus, FakeDict from gensim.models.ldamodel import LdaModel @@ -40,22 +43,26 @@ make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') coherence_dict = { - 'u_mass': make_pipeline(segmentation.s_one_pre, - probability_estimation.p_boolean_document, - direct_confirmation_measure.log_conditional_probability, - aggregation.arithmetic_mean), - 'c_v': make_pipeline(segmentation.s_one_set, - probability_estimation.p_boolean_sliding_window, - indirect_confirmation_measure.cosine_similarity, - aggregation.arithmetic_mean), - 'c_uci': make_pipeline(segmentation.s_one_one, - probability_estimation.p_boolean_sliding_window, - direct_confirmation_measure.log_ratio_measure, - aggregation.arithmetic_mean), - 'c_npmi': make_pipeline(segmentation.s_one_one, - probability_estimation.p_boolean_sliding_window, - direct_confirmation_measure.log_ratio_measure, - aggregation.arithmetic_mean), + 'u_mass': make_pipeline( + segmentation.s_one_pre, + probability_estimation.p_boolean_document, + direct_confirmation_measure.log_conditional_probability, + aggregation.arithmetic_mean), + 'c_v': make_pipeline( + segmentation.s_one_set, + probability_estimation.p_boolean_sliding_window, + indirect_confirmation_measure.cosine_similarity, + aggregation.arithmetic_mean), + 'c_uci': make_pipeline( + segmentation.s_one_one, + probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean), + 'c_npmi': make_pipeline( + segmentation.s_one_one, + probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean), } sliding_windows_dict = { @@ -64,6 +71,7 @@ 'c_npmi': 10 } + class CoherenceModel(interfaces.TransformationABC): """ Objects of this class allow for building and maintaining a model for topic @@ -87,7 +95,17 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. 
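# A minimal, self-contained sketch of the coherence pipeline described above;
# the toy texts and every parameter value are illustrative assumptions, not
# part of this patch.
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

texts = [['human', 'computer', 'interaction'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=2)

# 'u_mass' works from the bag-of-words corpus; the sliding-window measures
# ('c_v', 'c_uci', 'c_npmi') need the tokenized texts instead.
cm_umass = CoherenceModel(model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass')
cm_cv = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
print(cm_umass.get_coherence(), cm_cv.get_coherence())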
""" - def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10): + + def __init__( + self, + model=None, + topics=None, + texts=None, + corpus=None, + dictionary=None, + window_size=None, + coherence='c_v', + topn=10): """ Args: ---- @@ -125,14 +143,16 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= if model is None and topics is None: raise ValueError("One of model or topics has to be provided.") elif topics is not None and dictionary is None: - raise ValueError("dictionary has to be provided if topics are to be used.") + raise ValueError( + "dictionary has to be provided if topics are to be used.") if texts is None and corpus is None: raise ValueError("One of texts or corpus has to be provided.") # Check if associated dictionary is provided. if dictionary is None: if isinstance(model.id2word, FakeDict): - raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model" - " should be set as the associated dictionary.") + raise ValueError( + "The associated dictionary should be provided with the corpus or 'id2word' for topic model" + " should be set as the associated dictionary.") else: self.dictionary = model.id2word else: @@ -143,18 +163,25 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.corpus = corpus elif texts is not None: self.texts = texts - self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] + self.corpus = [ + self.dictionary.doc2bow(text) for text in self.texts] else: - raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence) + raise ValueError( + "Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % + coherence) # Check for correct inputs for c_v coherence measure. elif coherence in sliding_window_based: self.window_size = window_size if texts is None: - raise ValueError("'texts' should be provided for %s coherence." % coherence) + raise ValueError( + "'texts' should be provided for %s coherence." % + coherence) else: self.texts = texts else: - raise ValueError("%s coherence is not currently supported." % coherence) + raise ValueError( + "%s coherence is not currently supported." % + coherence) self.topn = topn self.model = model if model is not None: @@ -187,8 +214,9 @@ def _get_topics(self): bestn = argsort(topic, topn=self.topn, reverse=True) topics.append(bestn) else: - raise ValueError("This topic model is not currently supported. Supported topic models are" - "LdaModel, LdaVowpalWabbit and LdaMallet.") + raise ValueError( + "This topic model is not currently supported. 
Supported topic models are" + "LdaModel, LdaVowpalWabbit and LdaMallet.") return topics def get_coherence(self): @@ -198,20 +226,24 @@ def get_coherence(self): measure = coherence_dict[self.coherence] segmented_topics = measure.seg(self.topics) if self.coherence in boolean_document_based: - per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics) - confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs) + per_topic_postings, num_docs = measure.prob( + self.corpus, segmented_topics) + confirmed_measures = measure.conf( + segmented_topics, per_topic_postings, num_docs) elif self.coherence in sliding_window_based: if self.window_size is not None: self.window_size = sliding_windows_dict[self.coherence] - per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=self.window_size) + per_topic_postings, num_windows = measure.prob( + texts=self.texts, segmented_topics=segmented_topics, dictionary=self.dictionary, window_size=self.window_size) if self.coherence == 'c_v': - confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) + confirmed_measures = measure.conf( + self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) else: if self.coherence == 'c_npmi': normalize = True else: # For c_uci normalize = False - confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize) + confirmed_measures = measure.conf( + segmented_topics, per_topic_postings, num_windows, normalize=normalize) return measure.aggr(confirmed_measures) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c9f39f3299..ace956b836 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -50,7 +50,8 @@ repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \ sqrt, newaxis, ndarray, dot, vstack, dtype, divide as np_divide -from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc +# utility fnc for pickling, common scipy operations etc +from gensim import utils, matutils from gensim.models.word2vec import Word2Vec, Vocab, train_cbow_pair, train_sg_pair, train_batch_sg from six.moves import xrange, zip from six import string_types, integer_types, itervalues @@ -59,16 +60,29 @@ try: from gensim.models.doc2vec_inner import train_document_dbow, train_document_dm, train_document_dm_concat - from gensim.models.word2vec_inner import FAST_VERSION # blas-adaptation shared from word2vec + # blas-adaptation shared from word2vec + from gensim.models.word2vec_inner import FAST_VERSION logger.debug('Fast version of {0} is being used'.format(__name__)) except ImportError: logger.warning('Slow version of {0} is being used'.format(__name__)) - # failed... fall back to plain numpy (20-80x slower training than the above) + # failed... 
fall back to plain numpy (20-80x slower training than the + # above) FAST_VERSION = -1 - def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, - train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + def train_document_dbow( + model, + doc_words, + doctag_indexes, + alpha, + work=None, + train_words=False, + learn_doctags=True, + learn_words=True, + learn_hidden=True, + word_vectors=None, + word_locks=None, + doctag_vectors=None, + doctag_locks=None): """ Update distributed bag of words model ("PV-DBOW") by training on a single document. @@ -99,15 +113,32 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, train_batch_sg(model, [doc_words], alpha, work) for doctag_index in doctag_indexes: for word in doc_words: - train_sg_pair(model, word, doctag_index, alpha, learn_vectors=learn_doctags, - learn_hidden=learn_hidden, context_vectors=doctag_vectors, - context_locks=doctag_locks) + train_sg_pair( + model, + word, + doctag_index, + alpha, + learn_vectors=learn_doctags, + learn_hidden=learn_hidden, + context_vectors=doctag_vectors, + context_locks=doctag_locks) return len(doc_words) - def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, - learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + def train_document_dm( + model, + doc_words, + doctag_indexes, + alpha, + work=None, + neu1=None, + learn_doctags=True, + learn_words=True, + learn_hidden=True, + word_vectors=None, + word_locks=None, + doctag_vectors=None, + doctag_locks=None): """ Update distributed memory model ("PV-DM") by training on a single document. 
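# A small check related to the import fallback above: FAST_VERSION is -1
# exactly when the compiled doc2vec/word2vec routines could not be imported,
# in which case the pure-NumPy implementations defined in this block are used.
from gensim.models.doc2vec import FAST_VERSION

if FAST_VERSION == -1:
    print('doc2vec: training with the slow pure-Python code paths')
else:
    print('doc2vec: using the optimized extension (flag %d)' % FAST_VERSION)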
@@ -142,16 +173,33 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N model.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code + reduced_window = model.random.randint( + model.window) # `b` in the original doc2vec code start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos] - l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0) + window_pos = enumerate( + word_vocabs[ + start:( + pos + + model.window + + 1 - + reduced_window)], + start) + word2_indexes = [ + word2.index for pos2, + word2 in window_pos if pos2 != pos] + l1 = np_sum(word_vectors[word2_indexes], axis=0) + \ + np_sum(doctag_vectors[doctag_indexes], axis=0) count = len(word2_indexes) + len(doctag_indexes) - if model.cbow_mean and count > 1 : + if model.cbow_mean and count > 1: l1 /= count - neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha, - learn_vectors=False, learn_hidden=learn_hidden) + neu1e = train_cbow_pair( + model, + word, + word2_indexes, + l1, + alpha, + learn_vectors=False, + learn_hidden=learn_hidden) if not model.cbow_mean and count > 1: neu1e /= count if learn_doctags: @@ -163,9 +211,20 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N return len(word_vocabs) - def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, - learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + def train_document_dm_concat( + model, + doc_words, + doctag_indexes, + alpha, + work=None, + neu1=None, + learn_doctags=True, + learn_words=True, + learn_hidden=True, + word_vectors=None, + word_locks=None, + doctag_vectors=None, + doctag_locks=None): """ Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context window word vectors (rather than a sum or average). @@ -197,38 +256,63 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, model.vocab[w].sample_int > model.random.rand() * 2**32] doctag_len = len(doctag_indexes) if doctag_len != model.dm_tag_count: - return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?) + # skip doc without expected number of doctag(s) (TODO: warn/pad?) 
+ return 0 null_word = model.vocab['\0'] pre_pad_count = model.window post_pad_count = model.window padded_document_indexes = ( (pre_pad_count * [null_word.index]) # pre-padding - + [word.index for word in word_vocabs if word is not None] # elide out-of-Vocabulary words + # elide out-of-Vocabulary words + + [word.index for word in word_vocabs if word is not None] + (post_pad_count * [null_word.index]) # post-padding ) - for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count): + for pos in range( + pre_pad_count, + len(padded_document_indexes) - + post_pad_count): word_context_indexes = ( - padded_document_indexes[(pos - pre_pad_count): pos] # preceding words - + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] # following words + padded_document_indexes[ + (pos - pre_pad_count): pos] # preceding words + # following words + + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] ) word_context_len = len(word_context_indexes) - predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]] + predict_word = model.vocab[ + model.index2word[ + padded_document_indexes[pos]]] # numpy advanced-indexing copies; concatenate, flatten to 1d - l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel() - neu1e = train_cbow_pair(model, predict_word, None, l1, alpha, - learn_hidden=learn_hidden, learn_vectors=False) + l1 = concatenate( + (doctag_vectors[doctag_indexes], + word_vectors[word_context_indexes])).ravel() + neu1e = train_cbow_pair( + model, + predict_word, + None, + l1, + alpha, + learn_hidden=learn_hidden, + learn_vectors=False) # filter by locks and shape for addition to source vectors - e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes])) - neu1e_r = (neu1e.reshape(-1, model.vector_size) - * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size)) + e_locks = concatenate( + (doctag_locks[doctag_indexes], + word_locks[word_context_indexes])) + neu1e_r = (neu1e.reshape(-1, + model.vector_size) * np_repeat(e_locks, + model.vector_size).reshape(-1, + model.vector_size)) if learn_doctags: np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len]) if learn_words: - np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:]) + np_add.at( + word_vectors, + word_context_indexes, + neu1e_r[ + doctag_len:]) return len(padded_document_indexes) - pre_pad_count - post_pad_count @@ -243,14 +327,18 @@ class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): Replaces "sentence as a list of words" from Word2Vec. """ + def __str__(self): return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) # for compatibility class LabeledSentence(TaggedDocument): + def __init__(self, *args, **kwargs): - warnings.warn('LabeledSentence has been replaced by TaggedDocument', DeprecationWarning) + warnings.warn( + 'LabeledSentence has been replaced by TaggedDocument', + DeprecationWarning) class DocvecsArray(utils.SaveLoad): @@ -278,10 +366,12 @@ class DocvecsArray(utils.SaveLoad): implementation, based on another persistence mechanism like LMDB, LevelDB, or SQLite, should also be possible. 
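# A hedged sketch of the doctag storage described above; the two toy documents
# and all training parameters are illustrative assumptions, not part of this patch.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(words=['human', 'computer', 'interaction'], tags=['doc_0']),
        TaggedDocument(words=['graph', 'minors', 'survey'], tags=['doc_1'])]
model = Doc2Vec(docs, size=20, min_count=1, iter=10)

print(model.docvecs.count)         # number of doctags seen during the vocabulary scan
print(model.docvecs['doc_0'][:5])  # string doctags are looked up through the mapping above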
""" + def __init__(self, mapfile_path=None): self.doctags = {} # string -> Doctag (only filled if necessary) self.max_rawint = -1 # highest rawint-indexed doctag - self.offset2doctag = [] # int offset-past-(max_rawint+1) -> String (only filled if necessary) + # int offset-past-(max_rawint+1) -> String (only filled if necessary) + self.offset2doctag = [] self.count = 0 self.mapfile_path = mapfile_path @@ -293,7 +383,8 @@ def note_doctag(self, key, document_no, document_length): if key in self.doctags: self.doctags[key] = self.doctags[key].repeat(document_length) else: - self.doctags[key] = Doctag(len(self.offset2doctag), document_length, 1) + self.doctags[key] = Doctag( + len(self.offset2doctag), document_length, 1) self.offset2doctag.append(key) self.count = self.max_rawint + 1 + len(self.offset2doctag) @@ -366,14 +457,27 @@ def estimated_lookup_memory(self): def reset_weights(self, model): length = max(len(self.doctags), self.count) if self.mapfile_path: - self.doctag_syn0 = np_memmap(self.mapfile_path+'.doctag_syn0', dtype=REAL, - mode='w+', shape=(length, model.vector_size)) - self.doctag_syn0_lockf = np_memmap(self.mapfile_path+'.doctag_syn0_lockf', dtype=REAL, - mode='w+', shape=(length,)) + self.doctag_syn0 = np_memmap( + self.mapfile_path + + '.doctag_syn0', + dtype=REAL, + mode='w+', + shape=( + length, + model.vector_size)) + self.doctag_syn0_lockf = np_memmap( + self.mapfile_path + + '.doctag_syn0_lockf', + dtype=REAL, + mode='w+', + shape=( + length, + )) self.doctag_syn0_lockf.fill(1.0) else: self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL) - self.doctag_syn0_lockf = ones((length,), dtype=REAL) # zeros suppress learning + self.doctag_syn0_lockf = ones( + (length,), dtype=REAL) # zeros suppress learning for i in xrange(length): # construct deterministic seed from index AND model seed @@ -396,18 +500,28 @@ def init_sims(self, replace=False): logger.info("precomputing L2-norms of doc weight vectors") if replace: for i in xrange(self.doctag_syn0.shape[0]): - self.doctag_syn0[i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1)) + self.doctag_syn0[ + i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1)) self.doctag_syn0norm = self.doctag_syn0 else: if self.mapfile_path: self.doctag_syn0norm = np_memmap( - self.mapfile_path+'.doctag_syn0norm', dtype=REAL, + self.mapfile_path + '.doctag_syn0norm', dtype=REAL, mode='w+', shape=self.doctag_syn0.shape) else: - self.doctag_syn0norm = empty(self.doctag_syn0.shape, dtype=REAL) - np_divide(self.doctag_syn0, sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis], self.doctag_syn0norm) - - def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end=None, indexer=None): + self.doctag_syn0norm = empty( + self.doctag_syn0.shape, dtype=REAL) + np_divide(self.doctag_syn0, sqrt( + (self.doctag_syn0 ** 2).sum(-1))[..., newaxis], self.doctag_syn0norm) + + def most_similar( + self, + positive=[], + negative=[], + topn=10, + clip_start=0, + clip_end=None, + indexer=None): """ Find the top-N most similar docvecs known from training. Positive docs contribute positively towards the similarity, negative docs negatively. 
@@ -425,10 +539,12 @@ def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end clip_end = clip_end or len(self.doctag_syn0norm) if isinstance(positive, string_types + integer_types) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) + # allow calls like most_similar('dog'), as a shorthand for + # most_similar(['dog']) positive = [positive] - # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs + # add weights for each doc, if not already present; default to 1.0 for + # positive and -1.0 for negative docs positive = [ (doc, 1.0) if isinstance(doc, string_types + (ndarray,) + integer_types) else doc for doc in positive @@ -444,7 +560,10 @@ def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end if isinstance(doc, ndarray): mean.append(weight * doc) elif doc in self.doctags or doc < self.count: - mean.append(weight * self.doctag_syn0norm[self._int_index(doc)]) + mean.append( + weight * + self.doctag_syn0norm[ + self._int_index(doc)]) all_docs.add(self._int_index(doc)) else: raise KeyError("doc '%s' not in trained set" % doc) @@ -460,7 +579,8 @@ def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end return dists best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True) # ignore (don't return) docs from the input - result = [(self.index_to_doctag(sim), float(dists[sim])) for sim in best if sim not in all_docs] + result = [(self.index_to_doctag(sim), float(dists[sim])) + for sim in best if sim not in all_docs] return result[:topn] def doesnt_match(self, docs): @@ -472,11 +592,14 @@ def doesnt_match(self, docs): """ self.init_sims() - docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count] # filter out unknowns + docs = [doc for doc in docs if doc in self.doctags or 0 <= + doc < self.count] # filter out unknowns logger.debug("using docs %s" % docs) if not docs: raise ValueError("cannot select a doc from an empty list") - vectors = vstack(self.doctag_syn0norm[self._int_index(doc)] for doc in docs).astype(REAL) + vectors = vstack( + self.doctag_syn0norm[ + self._int_index(doc)] for doc in docs).astype(REAL) mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) dists = dot(vectors, mean) return sorted(zip(dists, docs))[0][1] @@ -497,19 +620,39 @@ def n_similarity(self, ds1, ds2): """ v1 = [self[doc] for doc in ds1] v2 = [self[doc] for doc in ds2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) - - def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=0.1, min_alpha=0.0001, steps=5): + return dot( + matutils.unitvec( + array(v1).mean( + axis=0)), matutils.unitvec( + array(v2).mean( + axis=0))) + + def similarity_unseen_docs( + self, + model, + doc_words1, + doc_words2, + alpha=0.1, + min_alpha=0.0001, + steps=5): """ Compute cosine similarity between two post-bulk out of training documents. Document should be a list of (word) tokens. 
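# A sketch of the unseen-document similarity helper documented above, again
# reusing the toy `model` from the earlier sketch; the token lists and the
# inference settings are illustrative assumptions.
tokens_a = ['human', 'computer', 'interaction']
tokens_b = ['graph', 'minors', 'survey']
sim = model.docvecs.similarity_unseen_docs(model, tokens_a, tokens_b,
                                           alpha=0.1, min_alpha=0.0001, steps=5)
print('cosine similarity of the two inferred vectors: %.3f' % sim)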
""" - d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps) - d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps) + d1 = model.infer_vector( + doc_words=doc_words1, + alpha=alpha, + min_alpha=min_alpha, + steps=steps) + d2 = model.infer_vector( + doc_words=doc_words2, + alpha=alpha, + min_alpha=min_alpha, + steps=steps) return dot(matutils.unitvec(d1), matutils.unitvec(d2)) - - + + class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): """A string document tag discovered during the initial vocabulary scan. (The document-vector equivalent of a Vocab object.) @@ -524,15 +667,38 @@ class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): __slots__ = () def repeat(self, word_count): - return self._replace(word_count=self.word_count + word_count, doc_count=self.doc_count + 1) + return self._replace( + word_count=self.word_count + word_count, + doc_count=self.doc_count + 1) class Doc2Vec(Word2Vec): """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf""" - def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5, - max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001, - dm=1, hs=1, negative=0, dbow_words=0, dm_mean=0, dm_concat=0, dm_tag_count=1, - docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs): + + def __init__( + self, + documents=None, + size=300, + alpha=0.025, + window=8, + min_count=5, + max_vocab_size=None, + sample=0, + seed=1, + workers=1, + min_alpha=0.0001, + dm=1, + hs=1, + negative=0, + dbow_words=0, + dm_mean=0, + dm_concat=0, + dm_tag_count=1, + docvecs=None, + docvecs_mapfile=None, + comment=None, + trim_rule=None, + **kwargs): """ Initialize the model from an iterable of `documents`. Each document is a TaggedDocument object that will be used for training. @@ -553,7 +719,7 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5, `alpha` is the initial learning rate (will linearly drop to zero as training progresses). - `seed` = for the random number generator. + `seed` = for the random number generator. Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED @@ -570,7 +736,7 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5, `workers` = use this many worker threads to train the model (=faster training with multicore machines). - `iter` = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, + `iter` = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, but values of 10 or 20 are common in published 'Paragraph Vector' experiments. `hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0). @@ -600,16 +766,28 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5, of the model. 
""" - super(Doc2Vec, self).__init__( - size=size, alpha=alpha, window=window, min_count=min_count, max_vocab_size=max_vocab_size, - sample=sample, seed=seed, workers=workers, min_alpha=min_alpha, - sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean, - null_word=dm_concat, **kwargs) + super(Doc2Vec, + self).__init__(size=size, + alpha=alpha, + window=window, + min_count=min_count, + max_vocab_size=max_vocab_size, + sample=sample, + seed=seed, + workers=workers, + min_alpha=min_alpha, + sg=(1 + dm) % 2, + hs=hs, + negative=negative, + cbow_mean=dm_mean, + null_word=dm_concat, + **kwargs) self.dbow_words = dbow_words self.dm_concat = dm_concat self.dm_tag_count = dm_tag_count if self.dm and self.dm_concat: - self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size + self.layer1_size = (self.dm_tag_count + + (2 * self.window)) * self.vector_size else: self.layer1_size = size self.docvecs = docvecs or DocvecsArray(docvecs_mapfile) @@ -633,8 +811,11 @@ def clear_sims(self): def reset_weights(self): if self.dm and self.dm_concat: # expand l1 size to match concatenated tags+words length - self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size - logger.info("using concatenative %d-dimensional layer1" % (self.layer1_size)) + self.layer1_size = (self.dm_tag_count + + (2 * self.window)) * self.vector_size + logger.info( + "using concatenative %d-dimensional layer1" % + (self.layer1_size)) super(Doc2Vec, self).reset_weights() self.docvecs.reset_weights(self) @@ -643,25 +824,41 @@ def reset_from(self, other_model): self.docvecs.borrow_from(other_model.docvecs) super(Doc2Vec, self).reset_from(other_model) - def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False): + def scan_vocab( + self, + documents, + progress_per=10000, + trim_rule=None, + update=False): logger.info("collecting all words and their counts") document_no = -1 total_words = 0 min_reduce = 1 - interval_start = default_timer() - 0.00001 # guard against next sample being identical + # guard against next sample being identical + interval_start = default_timer() - 0.00001 interval_count = 0 checked_string_types = 0 vocab = defaultdict(int) for document_no, document in enumerate(documents): if not checked_string_types: if isinstance(document.words, string_types): - logger.warn("Each 'words' should be a list of words (usually unicode strings)." - "First 'words' here is instead plain %s." % type(document.words)) + logger.warn( + "Each 'words' should be a list of words (usually unicode strings)." + "First 'words' here is instead plain %s." 
% + type( + document.words)) checked_string_types += 1 if document_no % progress_per == 0: - interval_rate = (total_words - interval_count) / (default_timer() - interval_start) - logger.info("PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", - document_no, total_words, interval_rate, len(vocab), len(self.docvecs)) + interval_rate = (total_words - interval_count) / \ + (default_timer() - interval_start) + logger.info( + "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", + document_no, + total_words, + interval_rate, + len(vocab), + len( + self.docvecs)) interval_start = default_timer() interval_count = total_words document_length = len(document.words) @@ -689,15 +886,32 @@ def _do_train_job(self, job, alpha, inits): indexed_doctags = self.docvecs.indexed_doctags(doc.tags) doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags if self.sg: - tally += train_document_dbow(self, doc.words, doctag_indexes, alpha, work, + tally += train_document_dbow(self, + doc.words, + doctag_indexes, + alpha, + work, train_words=self.dbow_words, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + doctag_vectors=doctag_vectors, + doctag_locks=doctag_locks) elif self.dm_concat: - tally += train_document_dm_concat(self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + tally += train_document_dm_concat(self, + doc.words, + doctag_indexes, + alpha, + work, + neu1, + doctag_vectors=doctag_vectors, + doctag_locks=doctag_locks) else: - tally += train_document_dm(self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + tally += train_document_dm(self, + doc.words, + doctag_indexes, + alpha, + work, + neu1, + doctag_vectors=doctag_vectors, + doctag_locks=doctag_locks) self.docvecs.trained_item(indexed_doctags) return tally, self._raw_word_count(job) @@ -722,17 +936,40 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): for i in range(steps): if self.sg: - train_document_dbow(self, doc_words, doctag_indexes, alpha, work, - learn_words=False, learn_hidden=False, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + train_document_dbow( + self, + doc_words, + doctag_indexes, + alpha, + work, + learn_words=False, + learn_hidden=False, + doctag_vectors=doctag_vectors, + doctag_locks=doctag_locks) elif self.dm_concat: - train_document_dm_concat(self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + train_document_dm_concat( + self, + doc_words, + doctag_indexes, + alpha, + work, + neu1, + learn_words=False, + learn_hidden=False, + doctag_vectors=doctag_vectors, + doctag_locks=doctag_locks) else: - train_document_dm(self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + train_document_dm( + self, + doc_words, + doctag_indexes, + alpha, + work, + neu1, + learn_words=False, + learn_hidden=False, + doctag_vectors=doctag_vectors, + doctag_locks=doctag_locks) alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha return doctag_vectors[0] @@ -741,7 +978,8 @@ def estimate_memory(self, vocab_size=None, report=None): """Estimate required memory for a model using current settings.""" report = report or {} report['doctag_lookup'] = self.docvecs.estimated_lookup_memory() - report['doctag_syn0'] = self.docvecs.count 
* self.vector_size * dtype(REAL).itemsize + report['doctag_syn0'] = self.docvecs.count * \ + self.vector_size * dtype(REAL).itemsize return super(Doc2Vec, self).estimate_memory(vocab_size, report=report) def __str__(self): @@ -782,6 +1020,7 @@ def __str__(self): class TaggedBrownCorpus(object): """Iterate over documents from the Brown corpus (part of NLTK data), yielding each document out as a TaggedDocument object.""" + def __init__(self, dirname): self.dirname = dirname @@ -794,9 +1033,15 @@ def __iter__(self): line = utils.to_unicode(line) # each file line is a single document in the Brown corpus # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + token_tags = [t.split('/') + for t in line.split() if len(t.split('/')) == 2] + # ignore words with non-alphabetic tags like ",", "!" etc + # (punctuation, weird stuff) + words = [ + "%s/%s" % + (token.lower(), tag[ + :2]) for token, tag in token_tags if tag[ + :2].isalpha()] if not words: # don't bother sending out empty documents continue yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) @@ -807,6 +1052,7 @@ class TaggedLineDocument(object): Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically from the document line number.""" + def __init__(self, source): """ `source` can be either a string (filename) or a file object. diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 7e19cd3a7f..081ebd576e 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -33,7 +33,8 @@ from __future__ import with_statement -import logging, time +import logging +import time import numpy as np import scipy.special as sp @@ -97,6 +98,7 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): class SuffStats(object): + def __init__(self, T, Wt, Dt): self.m_chunksize = Dt self.m_var_sticks_ss = np.zeros(T) @@ -120,6 +122,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): Model persistency is achieved through its `load`/`save` methods. 
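# A hedged usage sketch for the online HDP estimator whose constructor follows;
# the toy corpus is an illustrative assumption, and estimation starts as soon
# as a corpus is passed in.
from gensim.corpora import Dictionary
from gensim.models.hdpmodel import HdpModel

texts = [['human', 'computer', 'interaction'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

hdp = HdpModel(corpus, id2word=dictionary)
for topic in hdp.show_topics(num_topics=3, num_words=5, formatted=True):
    print(topic)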
""" + def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, @@ -162,7 +165,8 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, self.m_var_sticks[1] = range(T - 1, 0, -1) self.m_varphi_ss = np.zeros(T) - self.m_lambda = np.random.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta + self.m_lambda = np.random.gamma( + 1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta self.m_eta = eta self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda) @@ -182,7 +186,8 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, if self.outputdir: self.save_options() - # if a training corpus was provided, start estimating the model right away + # if a training corpus was provided, start estimating the model right + # away if corpus is not None: self.update(corpus) @@ -191,7 +196,9 @@ def inference(self, chunk): raise RuntimeError("model must be trained to perform inference") chunk = list(chunk) if len(chunk) > 1: - logger.debug("performing inference on a chunk of %i documents" % len(chunk)) + logger.debug( + "performing inference on a chunk of %i documents" % + len(chunk)) gamma = np.zeros((len(chunk), self.lda_beta.shape[0])) for d, doc in enumerate(chunk): @@ -209,11 +216,12 @@ def __getitem__(self, bow, eps=0.01): gamma = self.inference([bow])[0] topic_dist = gamma / sum(gamma) if sum(gamma) != 0 else [] - return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) - if topicvalue >= eps] + return [(topicid, topicvalue) for topicid, + topicvalue in enumerate(topic_dist) if topicvalue >= eps] def update(self, corpus): - save_freq = max(1, int(10000 / self.chunksize)) # save every 10k docs, roughly + # save every 10k docs, roughly + save_freq = max(1, int(10000 / self.chunksize)) chunks_processed = 0 start_time = time.clock() @@ -223,7 +231,10 @@ def update(self, corpus): self.m_num_docs_processed += len(chunk) chunks_processed += 1 - if self.update_finished(start_time, chunks_processed, self.m_num_docs_processed): + if self.update_finished( + start_time, + chunks_processed, + self.m_num_docs_processed): self.update_expectations() alpha, beta = self.hdp_to_lda() self.lda_alpha = alpha @@ -237,7 +248,10 @@ def update(self, corpus): self.update_expectations() # self.save_topics(self.m_num_docs_processed) self.print_topics(20) - logger.info('PROGRESS: finished document %i of %i', self.m_num_docs_processed, self.m_D) + logger.info( + 'PROGRESS: finished document %i of %i', + self.m_num_docs_processed, + self.m_D) def update_finished(self, start_time, chunks_processed, docs_processed): return ( @@ -299,7 +313,7 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, chunkids = [unique_words[id] for id in doc_word_ids] Elogbeta_doc = self.m_Elogbeta[:, doc_word_ids] - ## very similar to the hdp equations + # very similar to the hdp equations v = np.zeros((2, self.m_K - 1)) v[0] = 1.0 v[1] = self.m_alpha @@ -314,17 +328,19 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, iter = 0 max_iter = 100 - # not yet support second level optimization yet, to be done in the future + # not yet support second level optimization yet, to be done in the + # future while iter < max_iter and (converge < 0.0 or converge > var_converge): - ### update variational parameters + # update variational parameters # var_phi if iter < 3: - var_phi = np.dot(phi.T, (Elogbeta_doc * doc_word_counts).T) + var_phi = 
np.dot(phi.T, (Elogbeta_doc * doc_word_counts).T) (log_var_phi, log_norm) = matutils.ret_log_normalize_vec(var_phi) var_phi = np.exp(log_var_phi) else: - var_phi = np.dot(phi.T, (Elogbeta_doc * doc_word_counts).T) + Elogsticks_1st + var_phi = np.dot( + phi.T, (Elogbeta_doc * doc_word_counts).T) + Elogsticks_1st (log_var_phi, log_norm) = matutils.ret_log_normalize_vec(var_phi) var_phi = np.exp(log_var_phi) @@ -354,14 +370,17 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, log_alpha = np.log(self.m_alpha) likelihood += (self.m_K - 1) * log_alpha dig_sum = sp.psi(np.sum(v, 0)) - likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum)) - likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v)) + likelihood += np.sum((np.array([1.0, self.m_alpha]) + [:, np.newaxis] - v) * (sp.psi(v) - dig_sum)) + likelihood -= np.sum(sp.gammaln(np.sum(v, 0)) + ) - np.sum(sp.gammaln(v)) # Z part likelihood += np.sum((Elogsticks_2nd - log_phi) * phi) # X part, the data part - likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc_word_counts)) + likelihood += np.sum(phi.T * np.dot(var_phi, + Elogbeta_doc * doc_word_counts)) converge = (likelihood - old_likelihood) / abs(old_likelihood) old_likelihood = likelihood @@ -374,7 +393,8 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, # update the suff_stat ss # this time it only contains information from one doc ss.m_var_sticks_ss += np.sum(var_phi, 0) - ss.m_var_beta_ss[:, chunkids] += np.dot(var_phi.T, phi.T * doc_word_counts) + ss.m_var_beta_ss[ + :, chunkids] += np.dot(var_phi.T, phi.T * doc_word_counts) return likelihood @@ -390,8 +410,8 @@ def update_lambda(self, sstats, word_list, opt_o): # Update appropriate columns of lambda based on documents. self.m_lambda[:, word_list] = self.m_lambda[:, word_list] * (1 - rhot) + \ rhot * self.m_D * sstats.m_var_beta_ss / sstats.m_chunksize - self.m_lambda_sum = (1 - rhot) * self.m_lambda_sum + \ - rhot * self.m_D * np.sum(sstats.m_var_beta_ss, axis=1) / sstats.m_chunksize + self.m_lambda_sum = (1 - rhot) * self.m_lambda_sum + rhot * self.m_D * \ + np.sum(sstats.m_var_beta_ss, axis=1) / sstats.m_chunksize self.m_updatect += 1 self.m_timestamp[word_list] = self.m_updatect @@ -403,7 +423,7 @@ def update_lambda(self, sstats, word_list, opt_o): if opt_o: self.optimal_ordering() - ## update top level sticks + # update top level sticks self.m_var_sticks[0] = self.m_varphi_ss[:self.m_T - 1] + 1.0 var_phi_sum = np.flipud(self.m_varphi_ss[1:]) self.m_var_sticks[1] = np.flipud(np.cumsum(var_phi_sum)) + self.m_gamma @@ -435,7 +455,12 @@ def update_expectations(self): self.m_timestamp[:] = self.m_updatect self.m_status_up_to_date = True - def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): + def show_topics( + self, + num_topics=20, + num_words=20, + log=False, + formatted=True): """ Print the `num_words` most probable words for `topics` number of topics. Set `topics=-1` to print all topics. 
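# A sketch of the LDA-style approximation returned by hdp_to_lda(), which is
# touched a few hunks further down; this reuses the toy `hdp` model from the
# previous sketch, and the expected shapes are (T,) for alpha and
# (T, vocabulary size) for beta.
alpha, beta = hdp.hdp_to_lda()
print(alpha.shape, beta.shape)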
@@ -453,7 +478,8 @@ def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): def save_topics(self, doc_count=None): """legacy method; use `self.save()` instead""" if not self.outputdir: - logger.error("cannot store topics without having specified an output directory") + logger.error( + "cannot store topics without having specified an output directory") if doc_count is None: fname = 'final' @@ -467,7 +493,8 @@ def save_topics(self, doc_count=None): def save_options(self): """legacy method; use `self.save()` instead""" if not self.outputdir: - logger.error("cannot store options without having specified an output directory") + logger.error( + "cannot store options without having specified an output directory") return fname = '%s/options.dat' % self.outputdir with utils.smart_open(fname, 'wb') as fout: @@ -488,7 +515,8 @@ def hdp_to_lda(self): Compute the LDA almost equivalent HDP. """ # alpha - sticks = self.m_var_sticks[0] / (self.m_var_sticks[0] + self.m_var_sticks[1]) + sticks = self.m_var_sticks[0] / \ + (self.m_var_sticks[0] + self.m_var_sticks[1]) alpha = np.zeros(self.m_T) left = 1.0 for i in xrange(0, self.m_T - 1): @@ -498,8 +526,8 @@ def hdp_to_lda(self): alpha = alpha * self.m_alpha # beta - beta = (self.m_lambda + self.m_eta) / (self.m_W * self.m_eta + \ - self.m_lambda_sum[:, np.newaxis]) + beta = (self.m_lambda + self.m_eta) / (self.m_W * + self.m_eta + self.m_lambda_sum[:, np.newaxis]) return (alpha, beta) @@ -512,7 +540,8 @@ def evaluate_test_corpus(self, corpus): for i, doc in enumerate(corpus): if len(doc) > 0: doc_word_ids, doc_word_counts = zip(*doc) - likelihood, gamma = lda_e_step(doc_word_ids, doc_word_counts, self.lda_alpha, self.lda_beta) + likelihood, gamma = lda_e_step( + doc_word_ids, doc_word_counts, self.lda_alpha, self.lda_beta) theta = gamma / np.sum(gamma) lda_betad = self.lda_beta[:, doc_word_ids] log_predicts = np.log(np.dot(theta, lda_betad)) @@ -520,15 +549,22 @@ def evaluate_test_corpus(self, corpus): logger.info('TEST: %6d %.5f' % (i, doc_score)) score += likelihood total_words += sum(doc_word_counts) - logger.info('TEST: average score: %.5f, total score: %.5f, test docs: %d' % (score / total_words, score, len(corpus))) + logger.info( + 'TEST: average score: %.5f, total score: %.5f, test docs: %d' % + (score / total_words, score, len(corpus))) return score -#endclass HdpModel +# endclass HdpModel class HdpTopicFormatter(object): (STYLE_GENSIM, STYLE_PRETTY) = (1, 2) - def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None): + def __init__( + self, + dictionary=None, + topic_data=None, + topic_file=None, + style=None): if dictionary is None: raise ValueError('no dictionary!') @@ -554,7 +590,12 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None def print_topics(self, num_topics=10, num_words=10): return self.show_topics(num_topics, num_words, True) - def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): + def show_topics( + self, + num_topics=10, + num_words=10, + log=False, + formatted=True): shown = [] if num_topics < 0: num_topics = len(self.data) @@ -583,13 +624,16 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): return shown def show_topic_terms(self, topic_data, num_words): - return [(self.dictionary[wid], weight) for (weight, wid) in topic_data[:num_words]] + return [(self.dictionary[wid], weight) + for (weight, wid) in topic_data[:num_words]] def format_topic(self, topic_id, topic_terms): if self.STYLE_GENSIM == 
self.style: - fmt = ' + '.join(['%.3f*%s' % (weight, word) for (word, weight) in topic_terms]) + fmt = ' + '.join(['%.3f*%s' % (weight, word) + for (word, weight) in topic_terms]) else: - fmt = '\n'.join([' %20s %.8f' % (word, weight) for (word, weight) in topic_terms]) + fmt = '\n'.join([' %20s %.8f' % (word, weight) + for (word, weight) in topic_terms]) fmt = (topic_id, fmt) return fmt diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py index 484e497b5f..2132745604 100755 --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -44,7 +44,7 @@ # timeout for the Queue object put/get blocking methods. # it should theoretically be infinity, but then keyboard interrupts don't work. # so this is really just a hack, see http://bugs.python.org/issue1360 -HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year +HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year LDA_DISPATCHER_PREFIX = 'gensim.lda_dispatcher' @@ -62,7 +62,9 @@ def __init__(self, maxsize=MAX_JOBS_QUEUE, ns_conf={}): use the `initialize()` function to populate it with workers etc. """ self.maxsize = maxsize - self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) + # a pyro proxy to this object (unknown at init time, but will be set + # later) + self.callback = None self.ns_conf = ns_conf @Pyro4.expose @@ -76,24 +78,32 @@ def initialize(self, **model_params): self._jobsdone = 0 self._jobsreceived = 0 - # locate all available workers and store their proxies, for subsequent RMI calls + # locate all available workers and store their proxies, for subsequent + # RMI calls self.workers = {} with utils.getNS(**self.ns_conf) as ns: - self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) + self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[ + LDA_DISPATCHER_PREFIX]) for name, uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)): try: worker = Pyro4.Proxy(uri) workerid = len(self.workers) # make time consuming methods work asynchronously - logger.info("registering worker #%i at %s" % (workerid, uri)) - worker.initialize(workerid, dispatcher=self.callback, **model_params) + logger.info( + "registering worker #%i at %s" % + (workerid, uri)) + worker.initialize( + workerid, dispatcher=self.callback, **model_params) self.workers[workerid] = worker except Pyro4.errors.PyroError: - logger.warning("unresponsive worker at %s, deleting it from the name server" % uri) + logger.warning( + "unresponsive worker at %s, deleting it from the name server" % + uri) ns.remove(name) if not self.workers: - raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!') + raise RuntimeError( + 'no workers found; run some lda_worker scripts on your machines first!') @Pyro4.expose def getworkers(self): @@ -106,14 +116,18 @@ def getworkers(self): def getjob(self, worker_id): logger.info("worker #%i requesting a new job" % worker_id) job = self.jobs.get(block=True, timeout=1) - logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize())) + logger.info( + "worker #%i got a new job (%i left)" % + (worker_id, self.jobs.qsize())) return job @Pyro4.expose def putjob(self, job): self._jobsreceived += 1 self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) - logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize()) + logger.info( + "added a new job (len(queue)=%i items)" % + self.jobs.qsize()) @Pyro4.expose def getstate(self): @@ -121,9 +135,11 @@ def getstate(self): Merge states from 
across all workers and return the result. """ logger.info("end of input, assigning all remaining jobs") - logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived)) + logger.debug( + "jobs done: %s, jobs received: %s" % + (self._jobsdone, self._jobsreceived)) while self._jobsdone < self._jobsreceived: - time.sleep(0.5) # check every half a second + time.sleep(0.5) # check every half a second logger.info("merging states from %i workers" % len(self.workers)) workers = list(self.workers.values()) @@ -159,14 +175,13 @@ def jobdone(self, workerid): """ self._jobsdone += 1 logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone)) - self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way) - + # tell the worker to ask for another job, asynchronously (one-way) + self.workers[workerid].requestjob() def jobsdone(self): """Wrap self._jobsdone, needed for remote access through Pyro proxies""" return self._jobsdone - @Pyro4.oneway def exit(self): """ @@ -176,32 +191,63 @@ def exit(self): logger.info("terminating worker %s" % workerid) worker.exit() logger.info("terminating dispatcher") - os._exit(0) # exit the whole process (not just this thread ala sys.exit()) -#endclass Dispatcher + # exit the whole process (not just this thread ala sys.exit()) + os._exit(0) +# endclass Dispatcher def main(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--maxsize", help="How many jobs (=chunks of N documents) " - "to keep 'pre-fetched' in a queue (default: %(default)s)", - type=int, default=MAX_JOBS_QUEUE) - parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) - parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) - parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", - action='store_const', default=True, const=False) - parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) - parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", - const=logging.INFO, default=logging.WARNING) + parser.add_argument( + "--maxsize", + help="How many jobs (=chunks of N documents) " + "to keep 'pre-fetched' in a queue (default: %(default)s)", + type=int, + default=MAX_JOBS_QUEUE) + parser.add_argument( + "--host", + help="Nameserver hostname (default: %(default)s)", + default=None) + parser.add_argument( + "--port", + help="Nameserver port (default: %(default)s)", + default=None, + type=int) + parser.add_argument( + "--no-broadcast", + help="Disable broadcast (default: %(default)s)", + action='store_const', + default=True, + const=False) + parser.add_argument( + "--hmac", + help="Nameserver hmac key (default: %(default)s)", + default=None) + parser.add_argument( + '-v', + '--verbose', + help='Verbose flag', + action='store_const', + dest="loglevel", + const=logging.INFO, + default=logging.WARNING) args = parser.parse_args() - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) + logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=args.loglevel) logger.info("running %s", " ".join(sys.argv)) ns_conf = {"broadcast": args.no_broadcast, "host": args.host, "port": args.port, "hmac_key": args.hmac} - utils.pyro_daemon(LDA_DISPATCHER_PREFIX, Dispatcher(maxsize=args.maxsize, ns_conf=ns_conf), ns_conf=ns_conf) + utils.pyro_daemon( + LDA_DISPATCHER_PREFIX, 
+ Dispatcher( + maxsize=args.maxsize, + ns_conf=ns_conf), + ns_conf=ns_conf) logger.info("finished running %s", " ".join(sys.argv)) diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py index fbae4c0fff..9666422613 100755 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -33,21 +33,23 @@ logger = logging.getLogger('gensim.models.lda_worker') -# periodically save intermediate models after every SAVE_DEBUG updates (0 for never) +# periodically save intermediate models after every SAVE_DEBUG updates (0 +# for never) SAVE_DEBUG = 0 LDA_WORKER_PREFIX = 'gensim.lda_worker' class Worker(object): + def __init__(self): self.model = None @Pyro4.expose def initialize(self, myid, dispatcher, **model_params): self.lock_update = threading.Lock() - self.jobsdone = 0 # how many jobs has this worker completed? - self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? + self.jobsdone = 0 # how many jobs has this worker completed? + self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? self.dispatcher = dispatcher self.finished = False logger.info("initializing worker #%s" % myid) @@ -60,7 +62,8 @@ def requestjob(self): Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. """ if self.model is None: - raise RuntimeError("worker must be initialized before receiving jobs") + raise RuntimeError( + "worker must be initialized before receiving jobs") job = None while job is None and not self.finished: @@ -70,13 +73,14 @@ def requestjob(self): # no new job: try again, unless we're finished with all work continue if job is not None: - logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone)) + logger.info( + "worker #%s received job #%i" % + (self.myid, self.jobsdone)) self.processjob(job) self.dispatcher.jobdone(self.myid) else: logger.info("worker #%i stopping asking for jobs" % self.myid) - @utils.synchronous('lock_update') def processjob(self, job): logger.debug("starting to process job #%i" % self.jobsdone) @@ -94,7 +98,7 @@ def getstate(self): (self.myid, self.jobsdone)) result = self.model.state assert isinstance(result, ldamodel.LdaState) - self.model.clear() # free up mem in-between two EM cycles + self.model.clear() # free up mem in-between two EM cycles self.finished = True return result @@ -108,26 +112,47 @@ def reset(self, state): self.model.state.reset() self.finished = False - @Pyro4.oneway def exit(self): logger.info("terminating worker #%i" % self.myid) os._exit(0) -#endclass Worker +# endclass Worker def main(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) - parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) - parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", - action='store_const', default=True, const=False) - parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) - parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", - const=logging.INFO, default=logging.WARNING) + parser.add_argument( + "--host", + help="Nameserver hostname (default: %(default)s)", + default=None) + parser.add_argument( + "--port", + help="Nameserver port (default: %(default)s)", + default=None, + type=int) + parser.add_argument( + "--no-broadcast", + 
help="Disable broadcast (default: %(default)s)", + action='store_const', + default=True, + const=False) + parser.add_argument( + "--hmac", + help="Nameserver hmac key (default: %(default)s)", + default=None) + parser.add_argument( + '-v', + '--verbose', + help='Verbose flag', + action='store_const', + dest="loglevel", + const=logging.INFO, + default=logging.WARNING) args = parser.parse_args() - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) + logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=args.loglevel) logger.info("running %s", " ".join(sys.argv)) ns_conf = {"broadcast": args.no_broadcast, @@ -135,7 +160,11 @@ def main(): "port": args.port, "hmac_key": args.hmac} - utils.pyro_daemon(LDA_WORKER_PREFIX, Worker(), random_suffix=True, ns_conf=ns_conf) + utils.pyro_daemon( + LDA_WORKER_PREFIX, + Worker(), + random_suffix=True, + ns_conf=ns_conf) logger.info("finished running %s", " ".join(sys.argv)) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 7def8966b3..327a039914 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -91,19 +91,21 @@ def update_dir_prior(prior, N, logphat, rho): return prior + def get_random_state(seed): - """ Turn seed into a np.random.RandomState instance. - - Method originally from maciejkula/glove-python, and written by @joshloyal - """ - if seed is None or seed is numpy.random: - return numpy.random.mtrand._rand - if isinstance(seed, (numbers.Integral, numpy.integer)): - return numpy.random.RandomState(seed) - if isinstance(seed, numpy.random.RandomState): + """ Turn seed into a np.random.RandomState instance. + + Method originally from maciejkula/glove-python, and written by @joshloyal + """ + if seed is None or seed is numpy.random: + return numpy.random.mtrand._rand + if isinstance(seed, (numbers.Integral, numpy.integer)): + return numpy.random.RandomState(seed) + if isinstance(seed, numpy.random.RandomState): return seed - raise ValueError('%r cannot be used to seed a numpy.random.RandomState' - ' instance' % seed) + raise ValueError('%r cannot be used to seed a numpy.random.RandomState' + ' instance' % seed) + class LdaState(utils.SaveLoad): """ @@ -113,6 +115,7 @@ class LdaState(utils.SaveLoad): reduce traffic. """ + def __init__(self, eta, shape): self.eta = eta self.sstats = numpy.zeros(shape) @@ -168,8 +171,10 @@ def blend(self, rhot, other, targetsize=None): if other.numdocs == 0 or targetsize == other.numdocs: scale = 1.0 else: - logger.info("merging changes from %i documents into a model of %i documents", - other.numdocs, targetsize) + logger.info( + "merging changes from %i documents into a model of %i documents", + other.numdocs, + targetsize) scale = 1.0 * targetsize / other.numdocs self.sstats += rhot * scale * other.sstats @@ -212,6 +217,7 @@ class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel): Model persistency is achieved through its `load`/`save` methods. 
""" + def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, @@ -272,10 +278,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, # store user-supplied parameters self.id2word = id2word if corpus is None and self.id2word is None: - raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') + raise ValueError( + 'at least one of corpus/id2word must be specified, to establish input space dimensionality') if self.id2word is None: - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + logger.warning( + "no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) elif len(self.id2word) > 0: @@ -284,7 +292,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, self.num_terms = 0 if self.num_terms == 0: - raise ValueError("cannot compute LDA over an empty collection (no terms)") + raise ValueError( + "cannot compute LDA over an empty collection (no terms)") self.distributed = bool(distributed) self.num_topics = int(num_topics) @@ -300,7 +309,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') - assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) + assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % ( + str(self.alpha.shape), self.num_topics) self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta') @@ -321,28 +331,42 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, self.numworkers = 1 else: if self.optimize_alpha: - raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA") + raise NotImplementedError( + "auto-optimizing alpha not implemented in distributed LDA") # set up distributed version try: import Pyro4 with utils.getNS(**ns_conf) as ns: from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX - self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) - logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri)) - self.dispatcher.initialize(id2word=self.id2word, num_topics=self.num_topics, - chunksize=chunksize, alpha=alpha, eta=eta, distributed=False) + self.dispatcher = Pyro4.Proxy( + ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) + logger.debug("looking for dispatcher at %s" % + str(self.dispatcher._pyroUri)) + self.dispatcher.initialize( + id2word=self.id2word, + num_topics=self.num_topics, + chunksize=chunksize, + alpha=alpha, + eta=eta, + distributed=False) self.numworkers = len(self.dispatcher.getworkers()) - logger.info("using distributed version with %i workers" % self.numworkers) + logger.info( + "using distributed version with %i workers" % + self.numworkers) except Exception as err: logger.error("failed to initialize distributed LDA (%s)", err) - raise RuntimeError("failed to initialize distributed LDA (%s)" % err) + raise RuntimeError( + "failed to initialize distributed LDA (%s)" % + err) # Initialize the variational distribution q(beta|lambda) self.state = LdaState(self.eta, (self.num_topics, self.num_terms)) - self.state.sstats = self.random_state.gamma(100., 1. 
/ 100., (self.num_topics, self.num_terms)) + self.state.sstats = self.random_state.gamma( + 100., 1. / 100., (self.num_topics, self.num_terms)) self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats)) - # if a training corpus was provided, start estimating the model right away + # if a training corpus was provided, start estimating the model right + # away if corpus is not None: use_numpy = self.dispatcher is not None self.update(corpus, chunks_as_numpy=use_numpy) @@ -355,18 +379,29 @@ def init_dir_prior(self, prior, name): if isinstance(prior, six.string_types): if prior == 'symmetric': - logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics) - init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)]) + logger.info( + "using symmetric %s at %s", + name, + 1.0 / self.num_topics) + init_prior = numpy.asarray( + [1.0 / self.num_topics for i in xrange(self.num_topics)]) elif prior == 'asymmetric': - init_prior = numpy.asarray([1.0 / (i + numpy.sqrt(self.num_topics)) for i in xrange(self.num_topics)]) + init_prior = numpy.asarray( + [1.0 / (i + numpy.sqrt(self.num_topics)) for i in xrange(self.num_topics)]) init_prior /= init_prior.sum() logger.info("using asymmetric %s %s", name, list(init_prior)) elif prior == 'auto': is_auto = True - init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)]) - logger.info("using autotuned %s, starting with %s", name, list(init_prior)) + init_prior = numpy.asarray( + [1.0 / self.num_topics for i in xrange(self.num_topics)]) + logger.info( + "using autotuned %s, starting with %s", + name, + list(init_prior)) else: - raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior)) + raise ValueError( + "Unable to determine proper %s value given '%s'" % + (name, prior)) elif isinstance(prior, list): init_prior = numpy.asarray(prior) elif isinstance(prior, numpy.ndarray): @@ -374,21 +409,29 @@ def init_dir_prior(self, prior, name): elif isinstance(prior, numpy.number) or isinstance(prior, numbers.Real): init_prior = numpy.asarray([prior] * self.num_topics) else: - raise ValueError("%s must be either a numpy array of scalars, list of scalars, or scalar" % name) + raise ValueError( + "%s must be either a numpy array of scalars, list of scalars, or scalar" % + name) if name == 'eta': # please note the difference in shapes between alpha and eta: # alpha is a row: [0.1, 0.1] # eta is a column: [[0.1], # [0.1]] - if init_prior.shape == (self.num_topics,) or init_prior.shape == (1, self.num_topics): - init_prior = init_prior.reshape((self.num_topics, 1)) # this statement throws ValueError if eta did not match self.num_topics + if init_prior.shape == ( + self.num_topics, + ) or init_prior.shape == ( + 1, + self.num_topics): + # this statement throws ValueError if eta did not match + # self.num_topics + init_prior = init_prior.reshape((self.num_topics, 1)) return init_prior, is_auto def __str__(self): - return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % \ - (self.num_terms, self.num_topics, self.decay, self.chunksize) + return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( + self.num_terms, self.num_topics, self.decay, self.chunksize) def sync_state(self): self.expElogbeta = numpy.exp(self.state.get_Elogbeta()) @@ -422,10 +465,13 @@ def inference(self, chunk, collect_sstats=False): # convert iterators/generators to plain list, so we have len() etc. 
chunk = list(chunk) if len(chunk) > 1: - logger.debug("performing inference on a chunk of %i documents", len(chunk)) + logger.debug( + "performing inference on a chunk of %i documents", + len(chunk)) # Initialize the variational distribution q(theta|gamma) for the chunk - gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)) + gamma = self.random_state.gamma( + 100., 1. / 100., (len(chunk), self.num_topics)) Elogtheta = dirichlet_expectation(gamma) expElogtheta = numpy.exp(Elogtheta) if collect_sstats: @@ -440,7 +486,8 @@ def inference(self, chunk, collect_sstats=False): # to Blei's original LDA-C code, cool!). for d, doc in enumerate(chunk): if doc and not isinstance(doc[0][0], six.integer_types): - # make sure the term IDs are ints, otherwise numpy will get upset + # make sure the term IDs are ints, otherwise numpy will get + # upset ids = [int(id) for id, _ in doc] else: ids = [id for id, _ in doc] @@ -461,7 +508,8 @@ def inference(self, chunk, collect_sstats=False): # We represent phi implicitly to save memory and time. # Substituting the value of the optimal phi back into # the update for gamma gives this update. Cf. Lee&Seung 2001. - gammad = self.alpha + expElogthetad * numpy.dot(cts / phinorm, expElogbetad.T) + gammad = self.alpha + expElogthetad * \ + numpy.dot(cts / phinorm, expElogbetad.T) Elogthetad = dirichlet_expectation(gammad) expElogthetad = numpy.exp(Elogthetad) phinorm = numpy.dot(expElogthetad, expElogbetad) + 1e-100 @@ -498,7 +546,8 @@ def do_estep(self, chunk, state=None): state = self.state gamma, sstats = self.inference(chunk, collect_sstats=True) state.sstats += sstats - state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator + # avoids calling len(chunk) on a generator + state.numdocs += gamma.shape[0] return gamma def update_alpha(self, gammat, rho): @@ -520,12 +569,17 @@ def update_eta(self, lambdat, rho): word weights `eta` given the last `lambdat`. 
""" if self.eta.shape[1] != 1: - raise ValueError("Can't use update_eta with eta matrices, only column vectors.") + raise ValueError( + "Can't use update_eta with eta matrices, only column vectors.") N = float(lambdat.shape[1]) - logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics, 1)) + logphat = (sum(dirichlet_expectation(lambda_) + for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics, 1)) self.eta = update_dir_prior(self.eta, N, logphat, rho) - logger.info("optimized eta %s", list(self.eta.reshape((self.num_topics)))) + logger.info( + "optimized eta %s", list( + self.eta.reshape( + (self.num_topics)))) return self.eta @@ -540,14 +594,24 @@ def log_perplexity(self, chunk, total_docs=None): total_docs = len(chunk) corpus_words = sum(cnt for document in chunk for _, cnt in document) subsample_ratio = 1.0 * total_docs / len(chunk) - perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) + perwordbound = self.bound( + chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) logger.info("%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words" % (perwordbound, numpy.exp2(-perwordbound), len(chunk), corpus_words)) return perwordbound - def update(self, corpus, chunksize=None, decay=None, offset=None, - passes=None, update_every=None, eval_every=None, iterations=None, - gamma_threshold=None, chunks_as_numpy=False): + def update( + self, + corpus, + chunksize=None, + decay=None, + offset=None, + passes=None, + update_every=None, + eval_every=None, + iterations=None, + gamma_threshold=None, + chunks_as_numpy=False): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations @@ -579,7 +643,8 @@ def update(self, corpus, chunksize=None, decay=None, offset=None, For other parameter settings, see :class:`LdaModel` constructor. 
""" - # use parameters given in constructor, unless user explicitly overrode them + # use parameters given in constructor, unless user explicitly overrode + # them if decay is None: decay = self.decay if offset is None: @@ -598,7 +663,8 @@ def update(self, corpus, chunksize=None, decay=None, offset=None, try: lencorpus = len(corpus) except: - logger.warning("input corpus stream has no len(); counting documents") + logger.warning( + "input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: logger.warning("LdaModel.update() called with an empty corpus") @@ -611,24 +677,39 @@ def update(self, corpus, chunksize=None, decay=None, offset=None, if update_every: updatetype = "online" - updateafter = min(lencorpus, update_every * self.numworkers * chunksize) + updateafter = min( + lencorpus, + update_every * + self.numworkers * + chunksize) else: updatetype = "batch" updateafter = lencorpus - evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize) + evalafter = min( + lencorpus, + (eval_every or 0) * + self.numworkers * + chunksize) updates_per_pass = max(1, lencorpus / updateafter) - logger.info("running %s LDA training, %s topics, %i passes over " - "the supplied corpus of %i documents, updating model once " - "every %i documents, evaluating perplexity every %i documents, " - "iterating %ix with a convergence threshold of %f", - updatetype, self.num_topics, passes, lencorpus, - updateafter, evalafter, iterations, - gamma_threshold) + logger.info( + "running %s LDA training, %s topics, %i passes over " + "the supplied corpus of %i documents, updating model once " + "every %i documents, evaluating perplexity every %i documents, " + "iterating %ix with a convergence threshold of %f", + updatetype, + self.num_topics, + passes, + lencorpus, + updateafter, + evalafter, + iterations, + gamma_threshold) if updates_per_pass * passes < 10: - logger.warning("too few updates, training might not converge; consider " - "increasing the number of passes or iterations to improve accuracy") + logger.warning( + "too few updates, training might not converge; consider " + "increasing the number of passes or iterations to improve accuracy") # rho is the "speed" of updating; TODO try other fncs # pass_ + num_updates handles increasing the starting t for each pass, @@ -645,21 +726,38 @@ def rho(): dirty = False reallen = 0 - for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy)): - reallen += len(chunk) # keep track of how many documents we've processed so far - - if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)): + for chunk_no, chunk in enumerate(utils.grouper( + corpus, chunksize, as_numpy=chunks_as_numpy)): + # keep track of how many documents we've processed so far + reallen += len(chunk) + + if eval_every and ( + (reallen == lencorpus) or ( + (chunk_no + 1) % + (eval_every * self.numworkers) == 0)): self.log_perplexity(chunk, total_docs=lencorpus) if self.dispatcher: - # add the chunk to dispatcher's job queue, so workers can munch on it - logger.info('PROGRESS: pass %i, dispatching documents up to #%i/%i', - pass_, chunk_no * chunksize + len(chunk), lencorpus) - # this will eventually block until some jobs finish, because the queue has a small finite length + # add the chunk to dispatcher's job queue, so workers can + # munch on it + logger.info( + 'PROGRESS: pass %i, dispatching documents up to #%i/%i', + pass_, + chunk_no * + chunksize + + len(chunk), + 
lencorpus) + # this will eventually block until some jobs finish, + # because the queue has a small finite length self.dispatcher.putjob(chunk) else: - logger.info('PROGRESS: pass %i, at document #%i/%i', - pass_, chunk_no * chunksize + len(chunk), lencorpus) + logger.info( + 'PROGRESS: pass %i, at document #%i/%i', + pass_, + chunk_no * + chunksize + + len(chunk), + lencorpus) gammat = self.do_estep(chunk, other) if self.optimize_alpha: @@ -668,11 +766,14 @@ def rho(): dirty = True del chunk - # perform an M step. determine when based on update_every, don't do this after every chunk - if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0: + # perform an M step. determine when based on update_every, + # don't do this after every chunk + if update_every and ( + chunk_no + 1) % (update_every * self.numworkers) == 0: if self.dispatcher: # distributed mode: wait for all workers to finish - logger.info("reached the end of input; now waiting for all remaining jobs to finish") + logger.info( + "reached the end of input; now waiting for all remaining jobs to finish") other = self.dispatcher.getstate() self.do_mstep(rho(), other, pass_ > 0) del other # frees up memory @@ -685,13 +786,15 @@ def rho(): dirty = False # endfor single corpus iteration if reallen != lencorpus: - raise RuntimeError("input corpus size changed during training (don't use generators as input)") + raise RuntimeError( + "input corpus size changed during training (don't use generators as input)") if dirty: # finish any remaining updates if self.dispatcher: # distributed mode: wait for all workers to finish - logger.info("reached the end of input; now waiting for all remaining jobs to finish") + logger.info( + "reached the end of input; now waiting for all remaining jobs to finish") other = self.dispatcher.getstate() self.do_mstep(rho(), other, pass_ > 0) del other @@ -737,7 +840,8 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0): _lambda = self.state.get_lambda() Elogbeta = dirichlet_expectation(_lambda) - for d, doc in enumerate(corpus): # stream the input doc-by-doc, in case it's too large to fit in RAM + for d, doc in enumerate( + corpus): # stream the input doc-by-doc, in case it's too large to fit in RAM if d % self.chunksize == 0: logger.debug("bound: at document #%i", d) if gamma is None: @@ -747,14 +851,18 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0): Elogthetad = dirichlet_expectation(gammad) # E[log p(doc | theta, beta)] - score += numpy.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, id]) for id, cnt in doc) + score += numpy.sum(cnt * logsumexp(Elogthetad + + Elogbeta[:, id]) for id, cnt in doc) - # E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector + # E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is + # a vector score += numpy.sum((self.alpha - gammad) * Elogthetad) score += numpy.sum(gammaln(gammad) - gammaln(self.alpha)) - score += gammaln(numpy.sum(self.alpha)) - gammaln(numpy.sum(gammad)) + score += gammaln(numpy.sum(self.alpha)) - \ + gammaln(numpy.sum(gammad)) - # compensate likelihood for when `corpus` above is only a sample of the whole corpus + # compensate likelihood for when `corpus` above is only a sample of the + # whole corpus score *= subsample_ratio # E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar @@ -769,7 +877,12 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0): score += numpy.sum(gammaln(sum_eta) - gammaln(numpy.sum(_lambda, 1))) return score - def show_topics(self, num_topics=10, 
num_words=10, log=False, formatted=True): + def show_topics( + self, + num_topics=10, + num_words=10, + log=False, + formatted=True): """ For `num_topics` number of topics, return `num_words` most significant words (10 words per topic, by default). @@ -790,11 +903,14 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): else: num_topics = min(num_topics, self.num_topics) - # add a little random jitter, to randomize results around the same alpha - sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha)) + # add a little random jitter, to randomize results around the same + # alpha + sort_alpha = self.alpha + 0.0001 * \ + self.random_state.rand(len(self.alpha)) sorted_topics = list(matutils.argsort(sort_alpha)) - chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:] + chosen_topics = sorted_topics[ + :num_topics // 2] + sorted_topics[-num_topics // 2:] shown = [] for i in chosen_topics: @@ -817,7 +933,8 @@ def show_topic(self, topicid, topn=10): Only return 2-tuples for the topn most probable words (ignore the rest). """ - return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)] + return [(self.id2word[id], value) + for id, value in self.get_topic_terms(topicid, topn)] def get_topic_terms(self, topicid, topn=10): """ @@ -887,14 +1004,20 @@ def top_topics(self, corpus, num_words=20): co_doc_frequency = len(m_docs.intersection(l_docs)) # add to the coherence sum for these two words m, l - coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs)) + coherence += numpy.log((co_doc_frequency + + 1.0) / len(l_docs)) coherence_scores.append((str_topics[t], coherence)) top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True) return top_topics - def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False): + def get_document_topics( + self, + bow, + minimum_probability=None, + minimum_phi_value=None, + per_word_topics=False): """ Return topic distribution for the given document `bow`, as a list of (topic_id, topic_probability) 2-tuples. 
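The reformatted `get_document_topics` keeps its original contract: a sparse list of `(topic_id, topic_probability)` pairs, plus per-word topic assignments and phi values when `per_word_topics=True` (the 3-tuple return visible in the following hunk). A minimal usage sketch; the toy corpus, dictionary and parameter values below are purely illustrative and not part of this patch:

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    # toy corpus: two short "documents" as token lists
    texts = [["human", "interface", "computer"],
             ["graph", "trees", "graph", "minors"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

    bow = dictionary.doc2bow(["graph", "minors"])
    # default: sparse list of (topic_id, probability) pairs
    print(lda.get_document_topics(bow, minimum_probability=0.01))
    # with per-word topic assignments and phi values as well
    doc_topics, word_topics, word_phis = lda.get_document_topics(bow, per_word_topics=True)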
@@ -907,11 +1030,13 @@ def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=N """ if minimum_probability is None: minimum_probability = self.minimum_probability - minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output + # never allow zero values in sparse output + minimum_probability = max(minimum_probability, 1e-8) if minimum_phi_value is None: minimum_phi_value = self.minimum_probability - minimum_phi_value = max(minimum_phi_value, 1e-8) # never allow zero values in sparse output + # never allow zero values in sparse output + minimum_phi_value = max(minimum_phi_value, 1e-8) # if the input vector is a corpus, return a transformed corpus is_corpus, corpus = utils.is_corpus(bow) @@ -921,32 +1046,36 @@ def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=N gamma, phis = self.inference([bow], collect_sstats=True) topic_dist = gamma[0] / sum(gamma[0]) # normalize distribution - document_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) - if topicvalue >= minimum_probability] + document_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate( + topic_dist) if topicvalue >= minimum_probability] if not per_word_topics: return document_topics else: - word_topic = [] # contains word and corresponding topic - word_phi = [] # contains word and phi values + word_topic = [] # contains word and corresponding topic + word_phi = [] # contains word and phi values for word_type, weight in bow: - phi_values = [] # contains (phi_value, topic) pairing to later be sorted - phi_topic = [] # contains topic and corresponding phi value to be returned 'raw' to user + # contains (phi_value, topic) pairing to later be sorted + phi_values = [] + phi_topic = [] # contains topic and corresponding phi value to be returned 'raw' to user for topic_id in range(0, self.num_topics): if phis[topic_id][word_type] >= minimum_phi_value: # appends phi values for each topic for that word # these phi values are scaled by feature length - phi_values.append((phis[topic_id][word_type], topic_id)) + phi_values.append( + (phis[topic_id][word_type], topic_id)) phi_topic.append((topic_id, phis[topic_id][word_type])) - # list with ({word_id => [(topic_0, phi_value), (topic_1, phi_value) ...]). + # list with ({word_id => [(topic_0, phi_value), (topic_1, + # phi_value) ...]). word_phi.append((word_type, phi_topic)) # sorts the topics based on most likely topic - # returns a list like ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]). + # returns a list like ({word_id => [topic_id_most_probable, + # topic_id_second_most_probable, ...]). 
sorted_phi_values = sorted(phi_values, reverse=True) topics_sorted = [x[1] for x in sorted_phi_values] word_topic.append((word_type, topics_sorted)) - return (document_topics, word_topic, word_phi) # returns 2-tuple + return (document_topics, word_topic, word_phi) # returns 2-tuple def get_term_topics(self, word_id, minimum_probability=None): """ @@ -955,7 +1084,8 @@ def get_term_topics(self, word_id, minimum_probability=None): """ if minimum_probability is None: minimum_probability = self.minimum_probability - minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output + # never allow zero values in sparse output + minimum_probability = max(minimum_probability, 1e-8) # if user enters word instead of id in vocab, change to get id if isinstance(word_id, str): @@ -968,7 +1098,6 @@ def get_term_topics(self, word_id, minimum_probability=None): return values - def __getitem__(self, bow, eps=None): """ Return topic distribution for the given document `bow`, as a list of @@ -1007,14 +1136,20 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs): for an example on how to work around these issues. """ if self.state is not None: - self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs) + self.state.save( + utils.smart_extension( + fname, + '.state'), + *args, + **kwargs) # make sure 'state' and 'dispatcher' are ignored from the pickled object, even if # someone sets the ignore list themselves if ignore is not None and ignore: if isinstance(ignore, six.string_types): ignore = [ignore] - ignore = [e for e in ignore if e] # make sure None and '' are not in the list + # make sure None and '' are not in the list + ignore = [e for e in ignore if e] ignore = list(set(['state', 'dispatcher']) | set(ignore)) else: ignore = ['state', 'dispatcher'] @@ -1034,7 +1169,12 @@ def load(cls, fname, *args, **kwargs): result = super(LdaModel, cls).load(fname, *args, **kwargs) state_fname = utils.smart_extension(fname, '.state') try: - result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs) + result.state = super( + LdaModel, + cls).load( + state_fname, + *args, + **kwargs) except Exception as e: logging.warning("failed to load state from %s: %s", state_fname, e) return result diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py index e7ab9c983c..ce0563dba8 100644 --- a/gensim/models/ldamulticore.py +++ b/gensim/models/ldamulticore.py @@ -77,6 +77,7 @@ class LdaMulticore(LdaModel): Model persistency is achieved through its `load`/`save` methods. """ + def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, chunksize=2000, passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, @@ -124,7 +125,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively. 
- + `random_state` can be a numpy.random.RandomState object or the seed for one Example: @@ -139,13 +140,25 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, self.batch = batch if isinstance(alpha, six.string_types) and alpha == 'auto': - raise NotImplementedError("auto-tuning alpha not implemented in multicore LDA; use plain LdaModel.") - - super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics, - id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta, - decay=decay, offset=offset, eval_every=eval_every, iterations=iterations, - gamma_threshold=gamma_threshold, random_state=random_state) - + raise NotImplementedError( + "auto-tuning alpha not implemented in multicore LDA; use plain LdaModel.") + + super( + LdaMulticore, + self).__init__( + corpus=corpus, + num_topics=num_topics, + id2word=id2word, + chunksize=chunksize, + passes=passes, + alpha=alpha, + eta=eta, + decay=decay, + offset=offset, + eval_every=eval_every, + iterations=iterations, + gamma_threshold=gamma_threshold, + random_state=random_state) def update(self, corpus, chunks_as_numpy=False): """ @@ -168,7 +181,8 @@ def update(self, corpus, chunks_as_numpy=False): try: lencorpus = len(corpus) except: - logger.warning("input corpus stream has no len(); counting documents") + logger.warning( + "input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: logger.warning("LdaMulticore.update() called with an empty corpus") @@ -185,14 +199,22 @@ def update(self, corpus, chunks_as_numpy=False): evalafter = min(lencorpus, (self.eval_every or 0) * updateafter) updates_per_pass = max(1, lencorpus / updateafter) - logger.info("running %s LDA training, %s topics, %i passes over the" + logger.info( + "running %s LDA training, %s topics, %i passes over the" " supplied corpus of %i documents, updating every %i documents," " evaluating every ~%i documents, iterating %ix with a convergence threshold of %f", - updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter, - self.iterations, self.gamma_threshold) + updatetype, + self.num_topics, + self.passes, + lencorpus, + updateafter, + evalafter, + self.iterations, + self.gamma_threshold) if updates_per_pass * self.passes < 10: - logger.warning("too few updates, training might not converge; consider " + logger.warning( + "too few updates, training might not converge; consider " "increasing the number of passes or iterations to improve accuracy") job_queue = Queue(maxsize=2 * self.workers) @@ -202,7 +224,8 @@ def update(self, corpus, chunks_as_numpy=False): # pass_ + num_updates handles increasing the starting t for each pass, # while allowing it to "reset" on the first pass of each update def rho(): - return pow(self.offset + pass_ + (self.num_updates / self.chunksize), -self.decay) + return pow(self.offset + pass_ + + (self.num_updates / self.chunksize), -self.decay) logger.info("training LDA model using %i processes", self.workers) pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,)) @@ -221,41 +244,57 @@ def process_result_queue(force=False): other.merge(result_queue.get()) queue_size[0] -= 1 merged_new = True - if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)): + if (force and merged_new and queue_size[0] == 0) or ( + not self.batch and (other.numdocs >= updateafter)): self.do_mstep(rho(), other, pass_ > 0) other.reset() - if self.eval_every is not None and ((force and queue_size[0] == 0) or 
(self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)): + if self.eval_every is not None and ( + (force and queue_size[0] == 0) or ( + self.eval_every != 0 and ( + self.num_updates / updateafter) % + self.eval_every == 0)): self.log_perplexity(chunk, total_docs=lencorpus) - chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=chunks_as_numpy) + chunk_stream = utils.grouper( + corpus, self.chunksize, as_numpy=chunks_as_numpy) for chunk_no, chunk in enumerate(chunk_stream): - reallen += len(chunk) # keep track of how many documents we've processed so far + # keep track of how many documents we've processed so far + reallen += len(chunk) # put the chunk into the workers' input job queue chunk_put = False while not chunk_put: try: - job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1) + job_queue.put( + (chunk_no, chunk, self), block=False, timeout=0.1) chunk_put = True queue_size[0] += 1 - logger.info('PROGRESS: pass %i, dispatched chunk #%i = ' + logger.info( + 'PROGRESS: pass %i, dispatched chunk #%i = ' 'documents up to #%i/%i, outstanding queue size %i', - pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0]) + pass_, + chunk_no, + chunk_no * + self.chunksize + + len(chunk), + lencorpus, + queue_size[0]) except queue.Full: # in case the input job queue is full, keep clearing the # result queue, to make sure we don't deadlock process_result_queue() process_result_queue() - #endfor single corpus pass + # endfor single corpus pass # wait for all outstanding jobs to finish while queue_size[0] > 0: process_result_queue(force=True) if reallen != lencorpus: - raise RuntimeError("input corpus size changed during training (don't use generators as input)") - #endfor entire update + raise RuntimeError( + "input corpus size changed during training (don't use generators as input)") + # endfor entire update pool.terminate() @@ -270,7 +309,10 @@ def worker_e_step(input_queue, result_queue): while True: logger.debug("getting a new job") chunk_no, chunk, worker_lda = input_queue.get() - logger.debug("processing chunk #%i of %i documents", chunk_no, len(chunk)) + logger.debug( + "processing chunk #%i of %i documents", + chunk_no, + len(chunk)) worker_lda.state.reset() worker_lda.do_estep(chunk) # TODO: auto-tune alpha? del chunk diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 22d1f9fd37..f187bdf6e4 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -32,6 +32,7 @@ logger = logging.getLogger('gensim.models.ldaseqmodel') + class LdaSeqModel(utils.SaveLoad): """ The constructor estimates Dynamic Topic Model parameters based @@ -48,9 +49,24 @@ class LdaSeqModel(utils.SaveLoad): saves the model to disk. 
""" - def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, - initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, - random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): + def __init__( + self, + corpus=None, + time_slice=None, + id2word=None, + alphas=0.01, + num_topics=10, + initialize='gensim', + sstats=None, + lda_model=None, + obs_variance=0.5, + chain_variance=0.005, + passes=10, + random_state=None, + lda_inference_max_iter=25, + em_min_iter=6, + em_max_iter=20, + chunksize=100): """ `corpus` is any iterable gensim corpus @@ -76,10 +92,12 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ """ self.id2word = id2word if corpus is None and self.id2word is None: - raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') + raise ValueError( + 'at least one of corpus/id2word must be specified, to establish input space dimensionality') if self.id2word is None: - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + logger.warning( + "no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.vocab_len = len(self.id2word) elif len(self.id2word) > 0: @@ -91,7 +109,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ try: self.corpus_len = len(corpus) except: - logger.warning("input corpus stream has no len(); counting documents") + logger.warning( + "input corpus stream has no len(); counting documents") self.corpus_len = sum(1 for _ in corpus) self.time_slice = time_slice @@ -109,22 +128,36 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ self.alphas = numpy.full(num_topics, alphas) # topic_chains contains for each topic a 'state space language model' object which in turn has information about each topic - # the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities. + # the sslm class is described below and contains information on + # topic-word probabilities and doc-topic probabilities. self.topic_chains = [] for topic in range(0, num_topics): - sslm_ = sslm(num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance) + sslm_ = sslm( + num_time_slices=self.num_time_slices, + vocab_len=self.vocab_len, + num_topics=self.num_topics, + chain_variance=chain_variance, + obs_variance=obs_variance) self.topic_chains.append(sslm_) - # the following are class variables which are to be integrated during Document Influence Model + # the following are class variables which are to be integrated during + # Document Influence Model self.top_doc_phis = None self.influence = None self.renormalized_influence = None self.influence_sum_lgl = None - # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM. + # if a corpus and time_slice is provided, depending on the user choice + # of initializing LDA, we start DTM. 
if corpus is not None and time_slice is not None: if initialize == 'gensim': - lda_model = ldamodel.LdaModel(corpus, id2word=self.id2word, num_topics=self.num_topics, passes=passes, alpha=self.alphas, random_state=random_state) + lda_model = ldamodel.LdaModel( + corpus, + id2word=self.id2word, + num_topics=self.num_topics, + passes=passes, + alpha=self.alphas, + random_state=random_state) self.sstats = numpy.transpose(lda_model.state.sstats) if initialize == 'ldamodel': self.sstats = numpy.transpose(lda_model.state.sstats) @@ -132,28 +165,50 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ self.sstats = sstats # initialize model from sstats - self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats) + self.init_ldaseq_ss( + chain_variance, + obs_variance, + self.alphas, + self.sstats) # fit DTM - self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize) - - - def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): + self.fit_lda_seq( + corpus, + lda_inference_max_iter, + em_min_iter, + em_max_iter, + chunksize) + + def init_ldaseq_ss( + self, + topic_chain_variance, + topic_obs_variance, + alpha, + init_suffstats): """ Method to initialize State Space Language Model, topic wise. """ self.alphas = alpha for k, chain in enumerate(self.topic_chains): sstats = init_suffstats[:, k] - sslm.sslm_counts_init(chain, topic_obs_variance, topic_chain_variance, sstats) + sslm.sslm_counts_init( + chain, + topic_obs_variance, + topic_chain_variance, + sstats) # initialize the below matrices only if running DIM # ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) # ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - - def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize): + def fit_lda_seq( + self, + corpus, + lda_inference_max_iter, + em_min_iter, + em_max_iter, + chunksize): """ fit an lda sequence model: @@ -181,7 +236,8 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, convergence = LDASQE_EM_THRESHOLD + 1 iter_ = 0 - while iter_ < em_min_iter or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter): + while iter_ < em_min_iter or ( + (convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter): logger.info(" EM iter %i", iter_) logger.info("E Step") @@ -191,14 +247,27 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, # initiate sufficient statistics topic_suffstats = [] for topic in range(0, num_topics): - topic_suffstats.append(numpy.resize(numpy.zeros(vocab_len * data_len), (vocab_len, data_len))) + topic_suffstats.append( + numpy.resize( + numpy.zeros( + vocab_len * data_len), (vocab_len, data_len))) # set up variables - gammas = numpy.resize(numpy.zeros(corpus_len * num_topics), (corpus_len, num_topics)) - lhoods = numpy.resize(numpy.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1)) + gammas = numpy.resize( + numpy.zeros( + corpus_len * + num_topics), + (corpus_len, + num_topics)) + lhoods = numpy.resize( + numpy.zeros( + corpus_len * num_topics + 1), + (corpus_len, + num_topics + 1)) # compute the likelihood of a sequential corpus under an LDA # seq model and find the evidence lower bound. 
This is the E - Step - bound, gammas = self.lda_seq_infer(corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize) + bound, gammas = self.lda_seq_infer( + corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize) self.gammas = gammas logger.info("M Step") @@ -211,7 +280,9 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, # if max_iter is too low, increase iterations. if lda_inference_max_iter < LOWER_ITER: lda_inference_max_iter *= ITER_MULT_LOW - logger.info("Bound went down, increasing iterations to %i", lda_inference_max_iter) + logger.info( + "Bound went down, increasing iterations to %i", + lda_inference_max_iter) # check for convergence convergence = numpy.fabs((bound - old_bound) / old_bound) @@ -219,17 +290,30 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, if convergence < LDASQE_EM_THRESHOLD: lda_inference_max_iter = MAX_ITER - logger.info("Starting final iterations, max iter is %i", lda_inference_max_iter) + logger.info( + "Starting final iterations, max iter is %i", + lda_inference_max_iter) convergence = 1.0 - logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, convergence) + logger.info( + "iteration %i iteration lda seq bound is %f convergence is %f", + iter_, + bound, + convergence) iter_ += 1 return bound - - def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize): + def lda_seq_infer( + self, + corpus, + topic_suffstats, + gammas, + lhoods, + iter_, + lda_inference_max_iter, + chunksize): """ Inference or E- Step. This is used to set up the gensim LdaModel to be used for each time-slice. @@ -239,28 +323,51 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_, lda_infe vocab_len = self.vocab_len bound = 0.0 - lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word) - lda.topics = numpy.array(numpy.split(numpy.zeros(vocab_len * num_topics), vocab_len)) - ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda) + lda = ldamodel.LdaModel( + num_topics=num_topics, + alpha=self.alphas, + id2word=self.id2word) + lda.topics = numpy.array( + numpy.split( + numpy.zeros( + vocab_len * + num_topics), + vocab_len)) + ldapost = LdaPost( + max_doc_len=self.max_doc_len, + num_topics=num_topics, + lda=lda) model = "DTM" if model == "DTM": - bound, gammas = self.inferDTMseq(corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize) + bound, gammas = self.inferDTMseq( + corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize) elif model == "DIM": self.InfluenceTotalFixed(corpus) - bound, gammas = self.inferDIMseq(corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize) + bound, gammas = self.inferDIMseq( + corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize) return bound, gammas - - def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize): + def inferDTMseq( + self, + corpus, + topic_suffstats, + gammas, + lhoods, + lda, + ldapost, + iter_, + bound, + lda_inference_max_iter, + chunksize): """ Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound. 
Need to pass the LdaSeq model, corpus, sufficient stats, gammas and lhoods matrices previously created, and LdaModel and LdaPost class objects. """ - doc_index = 0 # overall doc_index in corpus - time = 0 # current time-slice + doc_index = 0 # overall doc_index in corpus + time = 0 # current time-slice doc_num = 0 # doc-index in current time-lice num_topics = self.num_topics lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice @@ -270,10 +377,12 @@ def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, ite for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): # iterates chunk size for constant memory footprint for doc in chunk: - # this is used to update the time_slice and create a new lda_seq slice every new time_slice + # this is used to update the time_slice and create a new + # lda_seq slice every new time_slice if doc_index > time_slice[time]: time += 1 - lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice + lda = self.make_lda_seq_slice( + lda, time) # create lda_seq slice doc_num = 0 gam = gammas[doc_index] @@ -283,14 +392,18 @@ def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, ite ldapost.lhood = lhood ldapost.doc = doc - # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible. + # TODO: replace fit_lda_post with appropriate ldamodel + # functions, if possible. if iter_ == 0: - doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter) + doc_lhood = LdaPost.fit_lda_post( + ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter) else: - doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter) + doc_lhood = LdaPost.fit_lda_post( + ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter) if topic_suffstats is not None: - topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats) + topic_suffstats = LdaPost.update_lda_seq_ss( + ldapost, time, doc, topic_suffstats) gammas[doc_index] = ldapost.gamma bound += doc_lhood @@ -299,18 +412,17 @@ def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, ite return bound, gammas - def make_lda_seq_slice(self, lda, time): """ set up the LDA model topic-word values with that of ldaseq. """ for k in range(0, self.num_topics): - lda.topics[:, k] = numpy.copy(self.topic_chains[k].e_log_prob[:, time]) + lda.topics[:, k] = numpy.copy( + self.topic_chains[k].e_log_prob[:, time]) lda.alpha = numpy.copy(self.alphas) return lda - def fit_lda_seq_topics(self, topic_suffstats): """ Fit lda sequence topic wise. @@ -325,7 +437,6 @@ def fit_lda_seq_topics(self, topic_suffstats): return lhood - def print_topic_times(self, topic, top_terms=20): """ Prints one topic showing each time-slice. @@ -336,17 +447,15 @@ def print_topic_times(self, topic, top_terms=20): return topics - def print_topics(self, time=0, top_terms=20): """ Prints all topics in a particular time-slice. 
""" - topics =[] + topics = [] for topic in range(0, self.num_topics): topics.append(self.print_topic(topic, time, top_terms)) return topics - def print_topic(self, topic, time=0, top_terms=20): """ Topic is the topic number @@ -361,7 +470,6 @@ def print_topic(self, topic, time=0, top_terms=20): beststr = [(self.id2word[id_], round(topic[id_], 3)) for id_ in bestn] return beststr - def doc_topics(self, doc_number): """ On passing the LdaSeqModel trained ldaseq object, the doc_number of your document in the corpus, @@ -371,7 +479,6 @@ def doc_topics(self, doc_number): doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis] return doc_topic[doc_number] - def dtm_vis(self, time, corpus): """ returns term_frequency, vocab, doc_lengths, topic-term distributions and doc_topic distributions, specified by pyLDAvis format. @@ -381,7 +488,15 @@ def dtm_vis(self, time, corpus): doc_topic = numpy.copy(self.gammas) doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis] - topic_term = [numpy.exp(numpy.transpose(chain.e_log_prob)[time]) / numpy.exp(numpy.transpose(chain.e_log_prob)[time]).sum() for k, chain in enumerate(self.topic_chains)] + topic_term = [ + numpy.exp( + numpy.transpose( + chain.e_log_prob)[time]) / + numpy.exp( + numpy.transpose( + chain.e_log_prob)[time]).sum() for k, + chain in enumerate( + self.topic_chains)] doc_lengths = [len(doc) for doc_no, doc in enumerate(corpus)] @@ -389,16 +504,17 @@ def dtm_vis(self, time, corpus): for doc_no, doc in enumerate(corpus): for pair in doc: term_frequency[pair[0]] += pair[1] - + vocab = [self.id2word[i] for i in range(0, len(self.id2word))] # returns numpy arrays for doc_topic proportions, topic_term proportions, and document_lengths, term_frequency. - # these should be passed to the `pyLDAvis.prepare` method to visualise one time-slice of DTM topics. - return doc_topic, numpy.array(topic_term), doc_lengths, term_frequency, vocab - + # these should be passed to the `pyLDAvis.prepare` method to visualise + # one time-slice of DTM topics. + return doc_topic, numpy.array( + topic_term), doc_lengths, term_frequency, vocab def dtm_coherence(self, time): """ - returns all topics of a particular time-slice without probabilitiy values for it to be used + returns all topics of a particular time-slice without probabilitiy values for it to be used for either "u_mass" or "c_v" coherence. """ coherence_topics = [] @@ -414,13 +530,26 @@ def __getitem__(self, doc): """ Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed. 
""" - lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word) - lda_model.topics = numpy.array(numpy.split(numpy.zeros(self.vocab_len * self.num_topics), self.vocab_len)) - ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc) + lda_model = ldamodel.LdaModel( + num_topics=self.num_topics, + alpha=self.alphas, + id2word=self.id2word) + lda_model.topics = numpy.array( + numpy.split( + numpy.zeros( + self.vocab_len * + self.num_topics), + self.vocab_len)) + ldapost = LdaPost( + num_topics=self.num_topics, + max_doc_len=len(doc), + lda=lda_model, + doc=doc) time_lhoods = [] for time in range(0, self.num_time_slices): - lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice + lda_model = self.make_lda_seq_slice( + lda_model, time) # create lda_seq slice lhood = LdaPost.fit_lda_post(ldapost, 0, time, self) time_lhoods.append(lhood) @@ -441,23 +570,57 @@ class sslm(utils.SaveLoad): `fwd_mean`, `fwd_variance` are the forward posterior values. `zeta` is an extra variational parameter with a value for each time-slice """ - def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): + + def __init__( + self, + vocab_len=None, + num_time_slices=None, + num_topics=None, + obs_variance=0.5, + chain_variance=0.005): self.vocab_len = vocab_len self.num_time_slices = num_time_slices self.obs_variance = obs_variance - self.chain_variance= chain_variance + self.chain_variance = chain_variance self.num_topics = num_topics # setting up matrices - self.obs = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) - self.e_log_prob = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) - self.mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) - self.fwd_mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) - self.fwd_variance = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) - self.variance = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) + self.obs = numpy.array( + numpy.split( + numpy.zeros( + num_time_slices * + vocab_len), + vocab_len)) + self.e_log_prob = numpy.array( + numpy.split( + numpy.zeros( + num_time_slices * + vocab_len), + vocab_len)) + self.mean = numpy.array( + numpy.split( + numpy.zeros( + (num_time_slices + 1) * vocab_len), + vocab_len)) + self.fwd_mean = numpy.array( + numpy.split( + numpy.zeros( + (num_time_slices + 1) * vocab_len), + vocab_len)) + self.fwd_variance = numpy.array( + numpy.split( + numpy.zeros( + (num_time_slices + 1) * vocab_len), + vocab_len)) + self.variance = numpy.array( + numpy.split( + numpy.zeros( + (num_time_slices + 1) * vocab_len), + vocab_len)) self.zeta = numpy.zeros(num_time_slices) - # the following are class variables which are to be integrated during Document Influence Model + # the following are class variables which are to be integrated during + # Document Influence Model self.m_update_coeff = None self.mean_t = None self.variance_t = None @@ -467,7 +630,6 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va self.w_phi_l_sq = None self.m_update_coeff_g = None - def update_zeta(self): """ Updates the Zeta Variational Parameter. @@ -475,10 +637,19 @@ def update_zeta(self): It is the value of variational parameter zeta which maximizes the lower bound. 
""" for j, val in enumerate(self.zeta): - self.zeta[j] = numpy.sum(numpy.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2)) + self.zeta[j] = numpy.sum( + numpy.exp( + self.mean[ + :, + j + + 1] + + self.variance[ + :, + j + + 1] / + 2)) return self.zeta - def compute_post_variance(self, word, chain_variance): """ Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] @@ -487,9 +658,9 @@ def compute_post_variance(self, word, chain_variance): Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * (fwd_variance[t - 1] + obs_variance) - + Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) - = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) + = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) """ INIT_VARIANCE_CONST = 1000 @@ -501,22 +672,24 @@ def compute_post_variance(self, word, chain_variance): fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST for t in range(1, T + 1): if self.obs_variance: - c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) + c = self.obs_variance / \ + (fwd_variance[t - 1] + chain_variance + self.obs_variance) else: c = 0 fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance) - # backward pass + # backward pass variance[T] = fwd_variance[T] for t in range(T - 1, -1, -1): if fwd_variance[t] > 0.0: - c = numpy.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) + c = numpy.power( + (fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) else: - c = 0 - variance[t] = (c * (variance[t + 1] - chain_variance)) + ((1 - c) * fwd_variance[t]) + c = 0 + variance[t] = (c * (variance[t + 1] - chain_variance) + ) + ((1 - c) * fwd_variance[t]) return variance, fwd_variance - def compute_post_mean(self, word, chain_variance): """ @@ -526,9 +699,9 @@ def compute_post_mean(self, word, chain_variance): Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta - + Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) - = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] + = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] """ T = self.num_time_slices @@ -537,10 +710,11 @@ def compute_post_mean(self, word, chain_variance): mean = self.mean[word] fwd_mean = self.fwd_mean[word] - # forward + # forward fwd_mean[0] = 0 for t in range(1, T + 1): - c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) + c = self.obs_variance / \ + (fwd_variance[t - 1] + chain_variance + self.obs_variance) fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1] # backward pass @@ -553,7 +727,6 @@ def compute_post_mean(self, word, chain_variance): mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1] return mean, fwd_mean - def compute_expected_log_prob(self): """ Compute the expected log probability given values of m. 
@@ -561,10 +734,10 @@ def compute_expected_log_prob(self): The below implementation is the result of solving the equation and is as implemented in the original Blei DTM code. """ for (w, t), val in numpy.ndenumerate(self.e_log_prob): - self.e_log_prob[w][t] = self.mean[w][t + 1] - numpy.log(self.zeta[t]) + self.e_log_prob[w][t] = self.mean[w][ + t + 1] - numpy.log(self.zeta[t]) return self.e_log_prob - def sslm_counts_init(self, obs_variance, chain_variance, sstats): """ Initialize State Space Language Model with LDA sufficient statistics. @@ -587,13 +760,14 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats): # compute post variance, mean for w in range(0, W): - self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) - self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance) + self.variance[w], self.fwd_variance[ + w] = self.compute_post_variance(w, self.chain_variance) + self.mean[w], self.fwd_mean[ + w] = self.compute_post_mean(w, self.chain_variance) self.zeta = self.update_zeta() self.e_log_prob = self.compute_expected_log_prob() - def fit_sslm(self, sstats): """ Fits variational distribution. @@ -611,7 +785,8 @@ def fit_sslm(self, sstats): totals = numpy.zeros(sstats.shape[1]) # computing variance, fwd_variance - self.variance, self.fwd_variance = map(numpy.array, list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)]))) + self.variance, self.fwd_variance = map(numpy.array, list( + zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)]))) # column sum of sstats totals = sstats.sum(axis=0) @@ -636,15 +811,18 @@ def fit_sslm(self, sstats): bound = self.compute_bound_fixed(sstats, totals) converged = numpy.fabs((bound - old_bound) / old_bound) - logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, converged) + logger.info( + "iteration %i iteration lda seq bound is %f convergence is %f", + iter_, + bound, + converged) self.e_log_prob = self.compute_expected_log_prob() return bound - def compute_bound(self, sstats, totals): """ - Compute log probability bound. + Compute log probability bound. Forumula is as described in appendix of DTM by Blei. (formula no. 
5) """ W = self.vocab_len @@ -659,11 +837,13 @@ def compute_bound(self, sstats, totals): chain_variance = self.chain_variance # computing mean, fwd_mean - self.mean, self.fwd_mean = map(numpy.array, (zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)]))) + self.mean, self.fwd_mean = map(numpy.array, (zip( + *[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)]))) self.zeta = self.update_zeta() for w in range(0, W): - val += (self.variance[w][0] - self.variance[w][T]) / 2 * chain_variance + val += (self.variance[w][0] - self.variance[w] + [T]) / 2 * chain_variance logger.info("Computing bound, all times") @@ -683,20 +863,21 @@ def compute_bound(self, sstats, totals): # exp_i = numpy.exp(-prev_m) # term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) - term_1 += (numpy.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) + term_1 += (numpy.power(m - prev_m, 2) / (2 * chain_variance) + ) - (v / chain_variance) - numpy.log(chain_variance) term_2 += sstats[w][t - 1] * m - ent += numpy.log(v) / 2 # note the 2pi's cancel with term1 (see doc) + # note the 2pi's cancel with term1 (see doc) + ent += numpy.log(v) / 2 term_3 = -totals[t - 1] * numpy.log(self.zeta[t - 1]) val += term_2 + term_3 + ent - term_1 return val - def update_obs(self, sstats, totals): """ Function to perform optimization of obs. Parameters are suff_stats set up in the fit_sslm method. - + TODO: This is by far the slowest function in the whole algorithm. Replacing or improving the performance of this would greatly speed things up. @@ -725,7 +906,7 @@ def update_obs(self, sstats, totals): if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None: obs = self.obs[w] norm_cutoff_obs = numpy.copy(obs) - else: + else: if counts_norm < OBS_NORM_CUTOFF: w_counts = numpy.zeros(len(w_counts)) @@ -742,7 +923,14 @@ def update_obs(self, sstats, totals): if model == "DTM": # slowest part of method - obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0) + obs = optimize.fmin_cg( + f=f_obs, + fprime=df_obs, + x0=obs, + gtol=TOL, + args=args, + epsilon=STEP_SIZE, + disp=0) if model == "DIM": pass runs += 1 @@ -753,10 +941,9 @@ def update_obs(self, sstats, totals): self.obs[w] = obs self.zeta = self.update_zeta() - + return self.obs, self.zeta - def compute_mean_deriv(self, word, time, deriv): """ Used in helping find the optimum function. @@ -772,7 +959,8 @@ def compute_mean_deriv(self, word, time, deriv): # forward pass for t in range(1, T + 1): if self.obs_variance > 0.0: - w = self.obs_variance / (fwd_variance[t - 1] + self.chain_variance + self.obs_variance) + w = self.obs_variance / \ + (fwd_variance[t - 1] + self.chain_variance + self.obs_variance) else: w = 0.0 val = w * deriv[t - 1] @@ -784,13 +972,19 @@ def compute_mean_deriv(self, word, time, deriv): if self.chain_variance == 0.0: w = 0.0 else: - w = self.chain_variance / (fwd_variance[t] + self.chain_variance) + w = self.chain_variance / \ + (fwd_variance[t] + self.chain_variance) deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1] return deriv - - def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): + def compute_obs_deriv( + self, + word, + word_counts, + totals, + mean_deriv_mtx, + deriv): """ Derivation of obs which is used in derivative function [df_obs] while optimizing. 
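# Illustrative sketch, not part of the patch: update_obs above hands f_obs and
# its gradient df_obs to scipy's conjugate-gradient optimizer. The same calling
# pattern on a toy quadratic; all names and values here are hypothetical.
import numpy
from scipy import optimize

def f(x, *args):
    (target,) = args
    return numpy.sum((x - target) ** 2)

def df(x, *args):
    (target,) = args
    return 2.0 * (x - target)

x0 = numpy.zeros(3)
x_opt = optimize.fmin_cg(f=f, fprime=df, x0=x0, args=(numpy.ones(3),),
                         gtol=1e-6, disp=0)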
""" @@ -828,7 +1022,8 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): dmean_u_prev = mean_deriv[u - 1] term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev) - term2 += (word_counts[u - 1] - (totals[u - 1] * self.temp_vect[u - 1] / self.zeta[u - 1])) * dmean_u + term2 += (word_counts[u - 1] - (totals[u - 1] * \ + self.temp_vect[u - 1] / self.zeta[u - 1])) * dmean_u model = "DTM" if model == "DIM": @@ -837,15 +1032,17 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): if self.chain_variance: term1 = - (term1 / self.chain_variance) - term1 = term1 - (mean[0] * mean_deriv[0]) / (init_mult * self.chain_variance) + term1 = term1 - (mean[0] * mean_deriv[0]) / \ + (init_mult * self.chain_variance) else: term1 = 0.0 deriv[t] = term1 + term2 + term3 + term4 - + return deriv # endclass sslm + class LdaPost(utils.SaveLoad): """ @@ -854,7 +1051,14 @@ class LdaPost(utils.SaveLoad): to update phi, gamma. End game would be to somehow replace LdaPost entirely with LdaModel. """ - def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None): + def __init__( + self, + doc=None, + lda=None, + max_doc_len=None, + num_topics=None, + gamma=None, + lhood=None): self.doc = doc self.lda = lda @@ -866,21 +1070,27 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= self.lhood = numpy.zeros(num_topics + 1) if max_doc_len is not None and num_topics is not None: - self.phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) - self.log_phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) - - # the following are class variables which are to be integrated during Document Influence Model + self.phi = numpy.resize( + numpy.zeros( + max_doc_len * + num_topics), + (max_doc_len, + num_topics)) + self.log_phi = numpy.resize(numpy.zeros( + max_doc_len * num_topics), (max_doc_len, num_topics)) + + # the following are class variables which are to be integrated during + # Document Influence Model self.doc_weight = None self.renormalized_doc_weight = None - def update_phi(self, doc_number, time): """ Update variational multinomial parameters, based on a document and a time-slice. This is done based on the original Blei-LDA paper, where: log_phi := beta * exp(Ψ(gamma)), over every topic for every word. - + TODO: incorporate lee-sueng trick used in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. """ num_topics = self.lda.num_topics @@ -904,22 +1114,21 @@ def update_phi(self, doc_number, time): v = numpy.logaddexp(v, log_phi_row[i]) # subtract every element by v - log_phi_row = log_phi_row - v + log_phi_row = log_phi_row - v phi_row = numpy.exp(log_phi_row) self.log_phi[n] = log_phi_row self.phi[n] = phi_row - n +=1 # increase iteration + n += 1 # increase iteration return self.phi, self.log_phi - def update_gamma(self): """ update variational dirichlet parameters as described in the original Blei LDA paper: gamma = alpha + sum(phi), over every topic for every word. """ self.gamma = numpy.copy(self.lda.alpha) - n = 0 # keep track of number of iterations for phi, log_phi + n = 0 # keep track of number of iterations for phi, log_phi for word_id, count in self.doc: phi_row = self.phi[n] for k in range(0, self.lda.num_topics): @@ -928,18 +1137,16 @@ def update_gamma(self): return self.gamma - def init_lda_post(self): """ Initialize variational posterior, does not return anything. 
""" total = sum(count for word_id, count in self.doc) self.gamma.fill(self.lda.alpha[0] + float(total) / self.lda.num_topics) - self.phi[:len(self.doc),:] = 1.0 / self.lda.num_topics + self.phi[:len(self.doc), :] = 1.0 / self.lda.num_topics # doc_weight used during DIM # ldapost.doc_weight = None - def compute_lda_lhood(self): """ compute the likelihood bound @@ -949,7 +1156,7 @@ def compute_lda_lhood(self): # to be used in DIM # sigma_l = 0 - # sigma_d = 0 + # sigma_d = 0 lhood = gammaln(numpy.sum(self.lda.alpha)) - gammaln(gamma_sum) self.lhood[num_topics] = lhood @@ -965,12 +1172,14 @@ def compute_lda_lhood(self): # influence_term = - ((influence_topic * influence_topic + sigma_l * sigma_l) / 2.0 / (sigma_d * sigma_d)) e_log_theta_k = digamma(self.gamma[k]) - digsum - lhood_term = (self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + gammaln(self.gamma[k]) - gammaln(self.lda.alpha[k]) + lhood_term = (self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + \ + gammaln(self.gamma[k]) - gammaln(self.lda.alpha[k]) # TODO: check why there's an IF n = 0 for word_id, count in self.doc: if self.phi[n][k] > 0: - lhood_term += count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k]) + lhood_term += count * \ + self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k]) n += 1 self.lhood[k] = lhood_term lhood += lhood_term @@ -979,8 +1188,17 @@ def compute_lda_lhood(self): return lhood - def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED = 1e-8, - lda_inference_max_iter = 25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): + def fit_lda_post( + self, + doc_number, + time, + ldaseq, + LDA_INFERENCE_CONVERGED=1e-8, + lda_inference_max_iter=25, + g=None, + g3_matrix=None, + g4_matrix=None, + g5_matrix=None): """ Posterior inference for lda. g, g3, g4 and g5 are matrices used in Document Influence Model and not used currently. @@ -989,7 +1207,7 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED = 1e-8, self.init_lda_post() # sum of counts in a doc total = sum(count for word_id, count in self.doc) - + model = "DTM" if model == "DIM": # if in DIM then we initialise some variables here @@ -1010,7 +1228,8 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED = 1e-8, if model == "DTM" or sslm is None: self.phi, self.log_phi = self.update_phi(doc_number, time) elif model == "DIM" and sslm is not None: - self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) + self.phi, self.log_phi = self.update_phi_fixed( + doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) lhood = self.compute_lda_lhood() converged = numpy.fabs((lhood_old - lhood) / (lhood_old * total)) @@ -1025,14 +1244,14 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED = 1e-8, if model == "DTM" or sslm is None: self.phi, self.log_phi = self.update_phi(doc_number, time) elif model == "DIM" and sslm is not None: - self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) + self.phi, self.log_phi = self.update_phi_fixed( + doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) lhood = self.compute_lda_lhood() converged = numpy.fabs((lhood_old - lhood) / (lhood_old * total)) return lhood - def update_lda_seq_ss(self, time, doc, topic_suffstats): """ Update lda sequence sufficient statistics from an lda posterior. 
@@ -1067,11 +1286,12 @@ def f_obs(x, *args): term2 = 0 # term 3 and 4 for DIM - term3 = 0 + term3 = 0 term4 = 0 sslm.obs[word] = x - sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance) + sslm.mean[word], sslm.fwd_mean[ + word] = sslm.compute_post_mean(word, sslm.chain_variance) mean = sslm.mean[word] variance = sslm.variance[word] @@ -1087,7 +1307,8 @@ def f_obs(x, *args): val = mean_t - mean_t_prev term1 += val * val - term2 += word_counts[t - 1] * mean_t - totals[t - 1] * numpy.exp(mean_t + variance[t] / 2) / sslm.zeta[t - 1] + term2 += word_counts[t - 1] * mean_t - totals[t - 1] * \ + numpy.exp(mean_t + variance[t] / 2) / sslm.zeta[t - 1] model = "DTM" if model == "DIM": @@ -1095,9 +1316,10 @@ def f_obs(x, *args): pass if sslm.chain_variance > 0.0: - + term1 = - (term1 / (2 * sslm.chain_variance)) - term1 = term1 - mean[0] * mean[0] / (2 * init_mult * sslm.chain_variance) + term1 = term1 - mean[0] * mean[0] / \ + (2 * init_mult * sslm.chain_variance) else: term1 = 0.0 @@ -1105,21 +1327,23 @@ def f_obs(x, *args): return final -def df_obs(x, *args): +def df_obs(x, *args): """ Derivative of function which optimises obs. """ sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args sslm.obs[word] = x - sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance) + sslm.mean[word], sslm.fwd_mean[ + word] = sslm.compute_post_mean(word, sslm.chain_variance) model = "DTM" if model == "DTM": - deriv = sslm.compute_obs_deriv(word, word_counts, totals, mean_deriv_mtx, deriv) + deriv = sslm.compute_obs_deriv( + word, word_counts, totals, mean_deriv_mtx, deriv) elif model == "DIM": - deriv = sslm.compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) + deriv = sslm.compute_obs_deriv_fixed( + p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) return numpy.negative(deriv) - \ No newline at end of file diff --git a/gensim/models/logentropy_model.py b/gensim/models/logentropy_model.py index d4bfc93479..1877cfe207 100644 --- a/gensim/models/logentropy_model.py +++ b/gensim/models/logentropy_model.py @@ -82,8 +82,8 @@ def initialize(self, corpus): # and finally compute the global weights logger.info("calculating global log entropy weights for %i " - "documents and %i features (%i matrix non-zeros)" - % (self.n_docs, len(glob_freq), self.n_words)) + "documents and %i features (%i matrix non-zeros)" + % (self.n_docs, len(glob_freq), self.n_words)) logger.debug('iterating over corpus') for doc_no2, bow in enumerate(corpus): for key, freq in bow: @@ -91,7 +91,8 @@ def initialize(self, corpus): glob_freq[key]) self.entr[key] = self.entr.get(key, 0.0) + p if doc_no2 != doc_no: - raise ValueError("LogEntropyModel doesn't support generators as training data") + raise ValueError( + "LogEntropyModel doesn't support generators as training data") logger.debug('iterating over keys') for key in self.entr: diff --git a/gensim/models/lsi_dispatcher.py b/gensim/models/lsi_dispatcher.py index 8c4fb78dd3..1da7f45ea3 100755 --- a/gensim/models/lsi_dispatcher.py +++ b/gensim/models/lsi_dispatcher.py @@ -15,7 +15,11 @@ from __future__ import with_statement -import os, sys, logging, threading, time +import os +import sys +import logging +import threading +import time from six import iteritems, itervalues try: from Queue import Queue @@ -37,8 +41,7 @@ # timeout for the Queue object put/get blocking methods. # it should really be infinity, but then keyboard interrupts don't work. 
# so this is really just a hack, see http://bugs.python.org/issue1360 -HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year - +HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year class Dispatcher(object): @@ -55,7 +58,9 @@ def __init__(self, maxsize=0): """ self.maxsize = maxsize self.workers = {} - self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) + # a pyro proxy to this object (unknown at init time, but will be set + # later) + self.callback = None @Pyro4.expose def initialize(self, **model_params): @@ -68,24 +73,32 @@ def initialize(self, **model_params): self._jobsdone = 0 self._jobsreceived = 0 - # locate all available workers and store their proxies, for subsequent RMI calls + # locate all available workers and store their proxies, for subsequent + # RMI calls self.workers = {} with utils.getNS() as ns: - self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self + self.callback = Pyro4.Proxy( + 'PYRONAME:gensim.lsi_dispatcher') # = self for name, uri in iteritems(ns.list(prefix='gensim.lsi_worker')): try: worker = Pyro4.Proxy(uri) workerid = len(self.workers) # make time consuming methods work asynchronously - logger.info("registering worker #%i from %s" % (workerid, uri)) - worker.initialize(workerid, dispatcher=self.callback, **model_params) + logger.info( + "registering worker #%i from %s" % + (workerid, uri)) + worker.initialize( + workerid, dispatcher=self.callback, **model_params) self.workers[workerid] = worker except Pyro4.errors.PyroError: - logger.exception("unresponsive worker at %s, deleting it from the name server" % uri) + logger.exception( + "unresponsive worker at %s, deleting it from the name server" % + uri) ns.remove(name) if not self.workers: - raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!') + raise RuntimeError( + 'no workers found; run some lsi_worker scripts on your machines first!') @Pyro4.expose def getworkers(self): @@ -98,14 +111,18 @@ def getworkers(self): def getjob(self, worker_id): logger.info("worker #%i requesting a new job" % worker_id) job = self.jobs.get(block=True, timeout=1) - logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize())) + logger.info( + "worker #%i got a new job (%i left)" % + (worker_id, self.jobs.qsize())) return job @Pyro4.expose def putjob(self, job): self._jobsreceived += 1 self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) - logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize()) + logger.info( + "added a new job (len(queue)=%i items)" % + self.jobs.qsize()) @Pyro4.expose def getstate(self): @@ -113,14 +130,17 @@ def getstate(self): Merge projections from across all workers and return the final projection. """ logger.info("end of input, assigning all remaining jobs") - logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived)) + logger.debug( + "jobs done: %s, jobs received: %s" % + (self._jobsdone, self._jobsreceived)) while self._jobsdone < self._jobsreceived: - time.sleep(0.5) # check every half a second + time.sleep(0.5) # check every half a second # TODO: merge in parallel, so that we're done in `log_2(workers)` merges, # and not `workers - 1` merges! # but merging only takes place once, after all input data has been processed, - # so the overall effect would be small... compared to the amount of coding :-) + # so the overall effect would be small... 
compared to the amount of + # coding :-) logger.info("merging states from %i workers" % len(self.workers)) workers = list(self.workers.items()) result = workers[0][1].getstate() @@ -156,14 +176,12 @@ def jobdone(self, workerid): self._jobsdone += 1 logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone)) worker = self.workers[workerid] - worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way) - + worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way) def jobsdone(self): """Wrap self._jobsdone, needed for remote access through proxies""" return self._jobsdone - @Pyro4.oneway def exit(self): """ @@ -173,13 +191,15 @@ def exit(self): logger.info("terminating worker %s" % workerid) worker.exit() logger.info("terminating dispatcher") - os._exit(0) # exit the whole process (not just this thread ala sys.exit()) -#endclass Dispatcher - + # exit the whole process (not just this thread ala sys.exit()) + os._exit(0) +# endclass Dispatcher def main(): - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) logger.info("running %s" % " ".join(sys.argv)) program = os.path.basename(sys.argv[0]) @@ -197,6 +217,5 @@ def main(): logger.info("finished running %s" % program) - if __name__ == '__main__': main() diff --git a/gensim/models/lsi_worker.py b/gensim/models/lsi_worker.py index b9de939962..763b64b774 100755 --- a/gensim/models/lsi_worker.py +++ b/gensim/models/lsi_worker.py @@ -17,7 +17,9 @@ from __future__ import with_statement -import os, sys, logging +import os +import sys +import logging import threading import tempfile try: @@ -31,19 +33,20 @@ logger = logging.getLogger('gensim.models.lsi_worker') -SAVE_DEBUG = 0 # save intermediate models after every SAVE_DEBUG updates (0 for never) - +# save intermediate models after every SAVE_DEBUG updates (0 for never) +SAVE_DEBUG = 0 class Worker(object): + def __init__(self): self.model = None @Pyro4.expose def initialize(self, myid, dispatcher, **model_params): self.lock_update = threading.Lock() - self.jobsdone = 0 # how many jobs has this worker completed? - self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? + self.jobsdone = 0 # how many jobs has this worker completed? + self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? self.dispatcher = dispatcher self.finished = False logger.info("initializing worker #%s" % myid) @@ -56,7 +59,8 @@ def requestjob(self): Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. 
""" if self.model is None: - raise RuntimeError("worker must be initialized before receiving jobs") + raise RuntimeError( + "worker must be initialized before receiving jobs") job = None while job is None and not self.finished: @@ -66,13 +70,14 @@ def requestjob(self): # no new job: try again, unless we're finished with all work continue if job is not None: - logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone)) + logger.info( + "worker #%s received job #%i" % + (self.myid, self.jobsdone)) self.processjob(job) self.dispatcher.jobdone(self.myid) else: logger.info("worker #%i stopping asking for jobs" % self.myid) - @utils.synchronous('lock_update') def processjob(self, job): self.model.add_documents(job) @@ -97,17 +102,17 @@ def reset(self): self.model.projection = self.model.projection.empty_like() self.finished = False - @Pyro4.oneway def exit(self): logger.info("terminating worker #%i" % self.myid) os._exit(0) -#endclass Worker - +# endclass Worker def main(): - logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) logger.info("running %s" % " ".join(sys.argv)) program = os.path.basename(sys.argv[0]) @@ -121,6 +126,5 @@ def main(): logger.info("finished running %s" % program) - if __name__ == '__main__': main() diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 3f44028667..dd4156d3e3 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -84,7 +84,8 @@ def clip_spectrum(s, k, discard=0.001): """ # compute relative contribution of eigenvalues towards the energy spectrum rel_spectrum = numpy.abs(1.0 - numpy.cumsum(s / numpy.sum(s))) - # ignore the last `discard` mass (or 1/k, whichever is smaller) of the spectrum + # ignore the last `discard` mass (or 1/k, whichever is smaller) of the + # spectrum small = 1 + len(numpy.where(rel_spectrum > min(discard, 1.0 / k))[0]) k = min(k, small) # clip against k logger.info("keeping %i factors (discarding %.3f%% of energy spectrum)", @@ -107,7 +108,15 @@ def ascarray(a, name=''): class Projection(utils.SaveLoad): - def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS): + + def __init__( + self, + m, + k, + docs=None, + use_svdlibc=False, + power_iters=P2_EXTRA_ITERS, + extra_dims=P2_EXTRA_DIMS): """ Construct the (U, S) projection from a corpus `docs`. The projection can be later updated by merging it with another Projection via `self.merge()`. 
@@ -131,11 +140,16 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER try: import sparsesvd except ImportError: - raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`") - logger.info("computing sparse SVD of %s matrix", str(docs.shape)) + raise ImportError( + "`sparsesvd` module requested but not found; run `easy_install sparsesvd`") + logger.info( + "computing sparse SVD of %s matrix", str( + docs.shape)) if not scipy.sparse.issparse(docs): docs = matutils.corpus2csc(docs) - ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested + # ask for extra factors, because for some reason SVDLIBC + # sometimes returns fewer factors than requested + ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) u = ut.T del ut, vt k = clip_spectrum(s**2, self.k) @@ -145,7 +159,11 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER self.u, self.s = None, None def empty_like(self): - return Projection(self.m, self.k, power_iters=self.power_iters, extra_dims=self.extra_dims) + return Projection( + self.m, + self.k, + power_iters=self.power_iters, + extra_dims=self.extra_dims) def merge(self, other, decay=1.0): """ @@ -158,14 +176,17 @@ def merge(self, other, decay=1.0): # the other projection is empty => do nothing return if self.u is None: - # we are empty => result of merge is the other projection, whatever it is + # we are empty => result of merge is the other projection, whatever + # it is self.u = other.u.copy() self.s = other.s.copy() return if self.m != other.m: - raise ValueError("vector space mismatch: update is using %s features, expected %s" % - (other.m, self.m)) - logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape)) + raise ValueError( + "vector space mismatch: update is using %s features, expected %s" % + (other.m, self.m)) + logger.info("merging projections: %s + %s", + str(self.u.shape), str(other.u.shape)) m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1] # TODO Maybe keep the bases as elementary reflectors, without # forming explicit matrices with ORGQR. @@ -179,13 +200,14 @@ def merge(self, other, decay=1.0): self.u = ascarray(self.u, 'self.u') other.u -= numpy.dot(self.u, c) - other.u = [other.u] # do some reference magic and call qr_destroy, to save RAM + # do some reference magic and call qr_destroy, to save RAM + other.u = [other.u] q, r = matutils.qr_destroy(other.u) # q, r = QR(component) assert not other.u # find the rotation that diagonalizes r - k = numpy.bmat([[numpy.diag(decay * self.s), numpy.multiply(c, other.s)], - [matutils.pad(numpy.array([]).reshape(0, 0), min(m, n2), n1), numpy.multiply(r, other.s)]]) + k = numpy.bmat([[numpy.diag(decay * self.s), numpy.multiply(c, other.s)], [matutils.pad( + numpy.array([]).reshape(0, 0), min(m, n2), n1), numpy.multiply(r, other.s)]]) logger.debug("computing SVD of %s dense matrix", k.shape) try: # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'. @@ -194,14 +216,21 @@ def merge(self, other, decay=1.0): # see http://www.mail-archive.com/numpy-discussion@scipy.org/msg07224.html and # bug ticket http://projects.scipy.org/numpy/ticket/706 # sdoering: replaced numpy's linalg.svd with scipy's linalg.svd: - u_k, s_k, _ = scipy.linalg.svd(k, full_matrices=False) # TODO *ugly overkill*!! only need first self.k SVD factors... 
but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :( //sdoering: maybe there is one in scipy? + # TODO *ugly overkill*!! only need first self.k SVD factors... but + # there is no LAPACK wrapper for partial svd/eigendecomp in numpy + # :( //sdoering: maybe there is one in scipy? + u_k, s_k, _ = scipy.linalg.svd(k, full_matrices=False) except scipy.linalg.LinAlgError: logger.error("SVD(A) failed; trying SVD(A * A^T)") - u_k, s_k, _ = scipy.linalg.svd(numpy.dot(k, k.T), full_matrices=False) # if this fails too, give up with an exception - s_k = numpy.sqrt(s_k) # go back from eigen values to singular values + # if this fails too, give up with an exception + u_k, s_k, _ = scipy.linalg.svd( + numpy.dot(k, k.T), full_matrices=False) + # go back from eigen values to singular values + s_k = numpy.sqrt(s_k) k = clip_spectrum(s_k**2, self.k) - u1_k, u2_k, s_k = numpy.array(u_k[:n1, :k]), numpy.array(u_k[n1:, :k]), s_k[:k] + u1_k, u2_k, s_k = numpy.array( + u_k[:n1, :k]), numpy.array(u_k[n1:, :k]), s_k[:k] # update & rotate current basis U = [U, U']*[U1_k, U2_k] logger.debug("updating orthonormal basis U") @@ -213,14 +242,15 @@ def merge(self, other, decay=1.0): q = numpy.dot(q, u2_k) self.u += q - # make each column of U start with a non-negative number (to force canonical decomposition) + # make each column of U start with a non-negative number (to force + # canonical decomposition) if self.u.shape[0] > 0: for i in xrange(self.u.shape[1]): if self.u[0, i] < 0.0: self.u[:, i] *= -1.0 # diff = numpy.dot(self.u.T, self.u) - numpy.eye(self.u.shape[1]) # logger.info('orth error after=%f' % numpy.sum(diff * diff)) -#endclass Projection +# endclass Projection class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): @@ -244,9 +274,18 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): .. [2] https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q4-how-do-you-output-the-u-s-vt-matrices-of-lsi """ - def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, - decay=1.0, distributed=False, onepass=True, - power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS): + + def __init__( + self, + corpus=None, + num_topics=200, + id2word=None, + chunksize=20000, + decay=1.0, + distributed=False, + onepass=True, + power_iters=P2_EXTRA_ITERS, + extra_samples=P2_EXTRA_DIMS): """ `num_topics` is the number of requested factors (latent dimensions). 
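# Illustrative sketch, not part of the patch: typical end-to-end use of the
# constructor being reformatted above, on a tiny made-up corpus.
from gensim import corpora, models

texts = [["human", "interface", "computer"],
         ["survey", "user", "computer", "system"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
print(lsi[corpus[0]])   # LSI representation of the first document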
@@ -285,23 +324,30 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, self.decay = float(decay) if distributed: if not onepass: - logger.warning("forcing the one-pass algorithm for distributed LSA") + logger.warning( + "forcing the one-pass algorithm for distributed LSA") onepass = True self.onepass = onepass self.extra_samples, self.power_iters = extra_samples, power_iters if corpus is None and self.id2word is None: - raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') + raise ValueError( + 'at least one of corpus/id2word must be specified, to establish input space dimensionality') if self.id2word is None: - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + logger.warning( + "no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) else: self.num_terms = 1 + max([-1] + self.id2word.keys()) self.docs_processed = 0 - self.projection = Projection(self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples) + self.projection = Projection( + self.num_terms, + self.num_topics, + power_iters=self.power_iters, + extra_dims=self.extra_samples) self.numworkers = 1 if not distributed: @@ -309,28 +355,40 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, self.dispatcher = None else: if not onepass: - raise NotImplementedError("distributed stochastic LSA not implemented yet; " - "run either distributed one-pass, or serial randomized.") + raise NotImplementedError( + "distributed stochastic LSA not implemented yet; " + "run either distributed one-pass, or serial randomized.") try: import Pyro4 dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') - logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri)) - dispatcher.initialize(id2word=self.id2word, num_topics=num_topics, - chunksize=chunksize, decay=decay, - power_iters=self.power_iters, extra_samples=self.extra_samples, - distributed=False, onepass=onepass) + logger.debug( + "looking for dispatcher at %s", str( + dispatcher._pyroUri)) + dispatcher.initialize( + id2word=self.id2word, + num_topics=num_topics, + chunksize=chunksize, + decay=decay, + power_iters=self.power_iters, + extra_samples=self.extra_samples, + distributed=False, + onepass=onepass) self.dispatcher = dispatcher self.numworkers = len(dispatcher.getworkers()) - logger.info("using distributed version with %i workers", self.numworkers) + logger.info( + "using distributed version with %i workers", + self.numworkers) except Exception as err: - # distributed version was specifically requested, so this is an error state + # distributed version was specifically requested, so this is an + # error state logger.error("failed to initialize distributed LSI (%s)", err) - raise RuntimeError("failed to initialize distributed LSI (%s)" % err) + raise RuntimeError( + "failed to initialize distributed LSI (%s)" % + err) if corpus is not None: self.add_documents(corpus) - def add_documents(self, corpus, chunksize=None, decay=None): """ Update singular value decomposition to take into account a new @@ -348,7 +406,8 @@ def add_documents(self, corpus, chunksize=None, decay=None): """ logger.info("updating model with new documents") - # get computation parameters; if not specified, use the ones from constructor + # get computation parameters; if not specified, use the ones from + # constructor if 
chunksize is None: chunksize = self.chunksize if decay is None: @@ -356,38 +415,51 @@ def add_documents(self, corpus, chunksize=None, decay=None): if not scipy.sparse.issparse(corpus): if not self.onepass: - # we are allowed multiple passes over the input => use a faster, randomized two-pass algo + # we are allowed multiple passes over the input => use a + # faster, randomized two-pass algo update = Projection(self.num_terms, self.num_topics, None) update.u, update.s = stochastic_svd( corpus, self.num_topics, num_terms=self.num_terms, chunksize=chunksize, extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) - self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0 + self.docs_processed += len(corpus) if hasattr(corpus, + '__len__') else 0 else: # the one-pass algo doc_no = 0 if self.dispatcher: logger.info('initializing %s workers', self.numworkers) self.dispatcher.reset() - for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): + for chunk_no, chunk in enumerate( + utils.grouper(corpus, chunksize)): logger.info("preparing a new chunk of documents") nnz = sum(len(doc) for doc in chunk) # construct the job as a sparse matrix, to minimize memory overhead # definitely avoid materializing it as a dense matrix! logger.debug("converting corpus to csc format") - job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz) + job = matutils.corpus2csc(chunk, num_docs=len( + chunk), num_terms=self.num_terms, num_nnz=nnz) del chunk doc_no += job.shape[1] if self.dispatcher: - # distributed version: add this job to the job queue, so workers can work on it + # distributed version: add this job to the job queue, + # so workers can work on it logger.debug("creating job #%i", chunk_no) - self.dispatcher.putjob(job) # put job into queue; this will eventually block, because the queue has a small finite size + # put job into queue; this will eventually block, + # because the queue has a small finite size + self.dispatcher.putjob(job) del job logger.info("dispatched documents up to #%s", doc_no) else: - # serial version, there is only one "worker" (myself) => process the job directly - update = Projection(self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, power_iters=self.power_iters) + # serial version, there is only one "worker" (myself) + # => process the job directly + update = Projection( + self.num_terms, + self.num_topics, + job, + extra_dims=self.extra_samples, + power_iters=self.power_iters) del job self.projection.merge(update, decay=decay) del update @@ -396,7 +468,8 @@ def add_documents(self, corpus, chunksize=None, decay=None): # wait for all workers to finish (distributed version only) if self.dispatcher: - logger.info("reached the end of input; now waiting for all remaining jobs to finish") + logger.info( + "reached the end of input; now waiting for all remaining jobs to finish") self.projection = self.dispatcher.getstate() self.docs_processed += doc_no # logger.info("top topics after adding %i documents" % doc_no) @@ -404,9 +477,16 @@ def add_documents(self, corpus, chunksize=None, decay=None): else: assert not self.dispatcher, "must be in serial mode to receive jobs" assert self.onepass, "distributed two-pass algo not supported yet" - update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters) + update = Projection( + self.num_terms, + self.num_topics, + corpus.tocsc(), + 
extra_dims=self.extra_samples, + power_iters=self.power_iters) self.projection.merge(update, decay=decay) - logger.info("processed sparse job of %i documents", corpus.shape[1]) + logger.info( + "processed sparse job of %i documents", + corpus.shape[1]) self.docs_processed += corpus.shape[1] def __str__(self): @@ -424,7 +504,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512): """ assert self.projection.u is not None, "decomposition not initialized yet" - # if the input vector is in fact a corpus, return a transformed corpus as a result + # if the input vector is in fact a corpus, return a transformed corpus + # as a result is_corpus, bow = utils.is_corpus(bow) if is_corpus and chunksize: # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`. @@ -436,9 +517,17 @@ def __getitem__(self, bow, scaled=False, chunksize=512): if not is_corpus: bow = [bow] - # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication - vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype) - topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T # (x^T * u).T = u^-1 * x + # convert input to scipy.sparse CSC, then do "sparse * dense = dense" + # multiplication + vec = matutils.corpus2csc( + bow, + num_terms=self.num_terms, + dtype=self.projection.u.dtype) + topic_dist = ( + vec.T * + self.projection.u[ + :, + :self.num_topics]).T # (x^T * u).T = u^-1 * x # # convert input to dense, then do dense * dense multiplication # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse), but consumes more memory @@ -458,7 +547,9 @@ def __getitem__(self, bow, scaled=False, chunksize=512): topic_dist = topic_dist.reshape(-1) if scaled: - topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist # s^-1 * u^-1 * x + # s^-1 * u^-1 * x + topic_dist = ( + 1.0 / self.projection.s[:self.num_topics]) * topic_dist # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight), # with no zero weights. @@ -492,7 +583,12 @@ def show_topic(self, topicno, topn=10): most = matutils.argsort(numpy.abs(c), topn, reverse=True) return [(self.id2word[val], 1.0 * c[val] / norm) for val in most] - def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True): + def show_topics( + self, + num_topics=-1, + num_words=10, + log=False, + formatted=True): """ Return `num_topics` most significant topics (return all by default). For each topic, show `num_words` most significant words (10 words by default). 
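# Illustrative sketch, not part of the patch: the fold-in performed by
# LsiModel.__getitem__ above, written densely for one toy document vector.
# All shapes and values here are assumptions for demonstration.
import numpy

num_terms, num_topics = 5, 2
u = numpy.random.rand(num_terms, num_topics)   # left singular vectors
s = numpy.array([3.0, 1.5])                    # singular values
x = numpy.array([1.0, 0.0, 2.0, 0.0, 1.0])     # dense bag-of-words counts

topic_dist = u[:, :num_topics].T.dot(x)        # u^-1 * x, i.e. (x^T * u).T
scaled = topic_dist / s[:num_topics]           # s^-1 * u^-1 * x (the scaled=True case)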
@@ -514,7 +610,11 @@ def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True): topic = self.show_topic(i, topn=num_words) shown.append((i, topic)) if log: - logger.info("topic #%i(%.3f): %s", i, self.projection.s[i], topic) + logger.info( + "topic #%i(%.3f): %s", + i, + self.projection.s[i], + topic) return shown def print_debug(self, num_topics=5, num_words=10): @@ -542,8 +642,21 @@ def save(self, fname, *args, **kwargs): """ if self.projection is not None: - self.projection.save(utils.smart_extension(fname, '.projection'), *args, **kwargs) - super(LsiModel, self).save(fname, *args, ignore=['projection', 'dispatcher'], **kwargs) + self.projection.save( + utils.smart_extension( + fname, + '.projection'), + *args, + **kwargs) + super( + LsiModel, + self).save( + fname, + *args, + ignore=[ + 'projection', + 'dispatcher'], + **kwargs) @classmethod def load(cls, fname, *args, **kwargs): @@ -559,11 +672,15 @@ def load(cls, fname, *args, **kwargs): result = super(LsiModel, cls).load(fname, *args, **kwargs) projection_fname = utils.smart_extension(fname, '.projection') try: - result.projection = super(LsiModel, cls).load(projection_fname, *args, **kwargs) + result.projection = super( + LsiModel, cls).load( + projection_fname, *args, **kwargs) except Exception as e: - logging.warning("failed to load projection from %s: %s" % (projection_fname, e)) + logging.warning( + "failed to load projection from %s: %s" % + (projection_fname, e)) return result -#endclass LsiModel +# endclass LsiModel def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): @@ -584,7 +701,9 @@ def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): for topic in sorted(iterkeys(result)): weights = sorted(result[topic], key=lambda x: -abs(x[0])) _, most = weights[0] - if u[most, topic] < 0.0: # the most significant word has a negative sign => flip sign of u[most] + if u[ + most, + topic] < 0.0: # the most significant word has a negative sign => flip sign of u[most] normalize = -1.0 else: normalize = 1.0 @@ -603,7 +722,12 @@ def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): if len(neg) >= num_neg: break - logger.info('topic #%s(%.3f): %s, ..., %s', topic, s[topic], ', '.join(pos), ', '.join(neg)) + logger.info( + 'topic #%s(%.3f): %s, ..., %s', + topic, + s[topic], + ', '.join(pos), + ', '.join(neg)) def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, @@ -630,10 +754,14 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, """ rank = int(rank) if extra_dims is None: - samples = max(10, 2 * rank) # use more samples than requested factors, to improve accuracy + # use more samples than requested factors, to improve accuracy + samples = max(10, 2 * rank) else: samples = rank + int(extra_dims) - logger.info("using %i extra samples and %i power iterations", samples - rank, power_iters) + logger.info( + "using %i extra samples and %i power iterations", + samples - rank, + power_iters) num_terms = int(num_terms) @@ -645,14 +773,25 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, if scipy.sparse.issparse(corpus): m, n = corpus.shape - assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. 
%i parameter" % (m, num_terms) - o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix - sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, - corpus.data, o.ravel(), y.ravel()) # y = corpus * o + assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % ( + m, num_terms) + o = numpy.random.normal( + 0.0, 1.0, (n, samples)).astype( + y.dtype) # draw a random gaussian matrix + sparsetools.csc_matvecs( + m, + n, + samples, + corpus.indptr, + corpus.indices, + corpus.data, + o.ravel(), + y.ravel()) # y = corpus * o del o # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype! - # so check for equal dtype explicitly, to avoid the extra memory footprint if possible + # so check for equal dtype explicitly, to avoid the extra memory + # footprint if possible if y.dtype != dtype: y = y.astype(dtype) @@ -664,21 +803,26 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, for power_iter in xrange(power_iters): q = corpus.T * q q = [corpus * q] - q, _ = matutils.qr_destroy(q) # orthonormalize the range after each power iteration step + # orthonormalize the range after each power iteration step + q, _ = matutils.qr_destroy(q) else: num_docs = 0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i', (chunk_no * chunksize)) # construct the chunk as a sparse matrix, to minimize memory overhead - # definitely avoid materializing it as a dense (num_terms x chunksize) matrix! + # definitely avoid materializing it as a dense (num_terms x + # chunksize) matrix! s = sum(len(doc) for doc in chunk) - chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC + # documents = columns of sparse CSC + chunk = matutils.corpus2csc( + chunk, num_terms=num_terms, dtype=dtype) m, n = chunk.shape assert m == num_terms assert n <= chunksize # the very last chunk of A is allowed to be smaller in size num_docs += n logger.debug("multiplying chunk * gauss") - o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix + o = numpy.random.normal(0.0, 1.0, (n, samples)).astype( + dtype) # draw a random gaussian matrix sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o chunk.data, o.ravel(), y.ravel()) del chunk, o @@ -690,8 +834,13 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, yold = q.copy() q[:] = 0.0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): - logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs) - chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC + logger.info( + 'PROGRESS: at document #%i/%i', + chunk_no * chunksize, + num_docs) + # documents = columns of sparse CSC + chunk = matutils.corpus2csc( + chunk, num_terms=num_terms, dtype=dtype) tmp = chunk.T * yold tmp = chunk * tmp del chunk @@ -713,19 +862,33 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, # again, construct X incrementally, in chunks of `chunksize` documents from the streaming # input corpus A, to avoid using O(number of documents) memory x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64) - logger.info("2nd phase: constructing %s covariance matrix", str(x.shape)) + logger.info( + "2nd phase: constructing %s covariance matrix", str( + 
x.shape)) for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): - logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs) - chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype) + logger.info( + 'PROGRESS: at document #%i/%i', + chunk_no * chunksize, + num_docs) + chunk = matutils.corpus2csc( + chunk, num_terms=num_terms, dtype=qt.dtype) b = qt * chunk # dense * sparse matrix multiply del chunk - x += numpy.dot(b, b.T) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :( + # TODO should call the BLAS routine SYRK, but there is no SYRK + # wrapper in scipy :( + x += numpy.dot(b, b.T) del b # now we're ready to compute decomposition of the small matrix X - logger.info("running dense decomposition on %s covariance matrix", str(x.shape)) - u, s, vt = scipy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :) - s = numpy.sqrt(s) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus + logger.info( + "running dense decomposition on %s covariance matrix", str( + x.shape)) + # could use linalg.eigh, but who cares... and svd returns the factors + # already sorted :) + u, s, vt = scipy.linalg.svd(x) + # sqrt to go back from singular values of X to singular values of B = + # singular values of the corpus + s = numpy.sqrt(s) q = qt.T.copy() del qt diff --git a/gensim/models/normmodel.py b/gensim/models/normmodel.py index 07bdcce650..f5ae9c1604 100644 --- a/gensim/models/normmodel.py +++ b/gensim/models/normmodel.py @@ -29,6 +29,7 @@ class NormModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods """ + def __init__(self, corpus=None, norm='l2'): """ Compute the 'l1' or 'l2' normalization by normalizing separately @@ -48,7 +49,8 @@ def __init__(self, corpus=None, norm='l2'): pass def __str__(self): - return "NormModel(num_docs=%s, num_nnz=%s, norm=%s)" % (self.num_docs, self.num_nnz, self.norm) + return "NormModel(num_docs=%s, num_nnz=%s, norm=%s)" % ( + self.num_docs, self.num_nnz, self.norm) def calc_norm(self, corpus): """ @@ -72,4 +74,4 @@ def normalize(self, bow): def __getitem__(self, bow): return self.normalize(bow) -#endclass NormModel +# endclass NormModel diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 93bbfba12c..f90d9d9e5c 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -79,6 +79,7 @@ class Phrases(interfaces.TransformationABC): and `phrases[corpus]` syntax. 
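# Illustrative sketch, not part of the patch: the second phase of
# stochastic_svd in the lsimodel.py hunks above recovers the corpus' singular
# values from the small covariance matrix X = B * B^T; checked on a random toy
# matrix with hypothetical dimensions.
import numpy
import scipy.linalg

B = numpy.random.rand(4, 10)        # stand-in for the projected corpus (rank x num_docs)
X = B.dot(B.T)                      # small covariance matrix
u, s, vt = scipy.linalg.svd(X)
singular_values = numpy.sqrt(s)     # singular values of B itself

assert numpy.allclose(singular_values,
                      scipy.linalg.svd(B, full_matrices=False)[1])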
""" + def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000): """ @@ -117,7 +118,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, self.min_count = min_count self.threshold = threshold self.max_vocab_size = max_vocab_size - self.vocab = defaultdict(int) # mapping between utf8 token => its count + # mapping between utf8 token => its count + self.vocab = defaultdict(int) self.min_reduce = 1 # ignore any tokens with count smaller than this self.delimiter = delimiter self.progress_per = progress_per @@ -132,7 +134,11 @@ def __str__(self): self.threshold, self.max_vocab_size) @staticmethod - def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): + def learn_vocab( + sentences, + max_vocab_size, + delimiter=b'_', + progress_per=10000): """Collect unigram/bigram counts from the `sentences` iterable.""" sentence_no = -1 total_words = 0 @@ -141,8 +147,9 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): min_reduce = 1 for sentence_no, sentence in enumerate(sentences): if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" % - (sentence_no, total_words, len(vocab))) + logger.info( + "PROGRESS: at sentence #%i, processed %i words and %i word types" % + (sentence_no, total_words, len(vocab))) sentence = [utils.any2utf8(w) for w in sentence] for bigram in zip(sentence, sentence[1:]): vocab[bigram[0]] += 1 @@ -157,8 +164,9 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): utils.prune_vocab(vocab, min_reduce) min_reduce += 1 - logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" % - (len(vocab), total_words, sentence_no + 1)) + logger.info( + "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" % + (len(vocab), total_words, sentence_no + 1)) return min_reduce, vocab def add_vocab(self, sentences): @@ -171,7 +179,8 @@ def add_vocab(self, sentences): # directly, but gives the new sentences a fighting chance to collect # sufficient counts, before being pruned out by the (large) accummulated # counts collected in previous learn_vocab runs. 
- min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) + min_reduce, vocab = self.learn_vocab( + sentences, self.max_vocab_size, self.delimiter, self.progress_per) if len(self.vocab) > 0: logger.info("merging %i counts into %s", len(vocab), self) @@ -216,7 +225,8 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): pab = float(vocab[bigram_word]) score = (pab - min_count) / pa / pb * len(vocab) # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) + # bigram_word, pab, self.min_count, pa, pb, + # len(self.vocab), score) if score > threshold: if as_tuples: yield ((word_a, word_b), score) @@ -245,7 +255,8 @@ def __getitem__(self, sentence): nonviolence leo_tolstoy """ - warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class") + warnings.warn( + "For a faster implementation, use the gensim.models.phrases.Phraser class") try: is_single = not sentence or isinstance(sentence[0], string_types) except: @@ -270,7 +281,8 @@ def __getitem__(self, sentence): pab = float(vocab[bigram_word]) score = (pab - min_count) / pa / pb * len(vocab) # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) + # bigram_word, pab, self.min_count, pa, pb, + # len(self.vocab), score) if score > threshold: new_s.append(bigram_word) last_bigram = True @@ -310,6 +322,7 @@ class Phraser(interfaces.TransformationABC): other values.) """ + def __init__(self, phrases_model): self.threshold = phrases_model.threshold self.min_count = phrases_model.min_count @@ -318,15 +331,18 @@ def __init__(self, phrases_model): corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter) logger.info('source_vocab length %i', len(phrases_model.vocab)) count = 0 - for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True): + for bigram, score in phrases_model.export_phrases( + corpus, self.delimiter, as_tuples=True): if bigram in self.phrasegrams: logger.info('Phraser repeat %s', bigram) - self.phrasegrams[bigram] = (phrases_model.vocab[self.delimiter.join(bigram)], score) + self.phrasegrams[bigram] = ( + phrases_model.vocab[ + self.delimiter.join(bigram)], score) count += 1 if not count % 50000: logger.info('Phraser added %i phrasegrams', count) - logger.info('Phraser built with %i %i phrasegrams', count, len(self.phrasegrams)) - + logger.info('Phraser built with %i %i phrasegrams', + count, len(self.phrasegrams)) def __getitem__(self, sentence): """ @@ -354,7 +370,8 @@ def __getitem__(self, sentence): delimiter = self.delimiter for word_a, word_b in zip(s, s[1:]): bigram_tuple = (word_a, word_b) - if phrasegrams.get(bigram_tuple, (-1, -1))[1] > self.threshold and not last_bigram: + if phrasegrams.get(bigram_tuple, (-1, -1) + )[1] > self.threshold and not last_bigram: bigram_word = delimiter.join((word_a, word_b)) new_s.append(bigram_word) last_bigram = True @@ -373,7 +390,9 @@ def __getitem__(self, sentence): if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.basicConfig( + format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', + level=logging.INFO) logging.info("running %s" % " ".join(sys.argv)) # check and process cmdline input diff --git a/gensim/models/rpmodel.py 
b/gensim/models/rpmodel.py index 4e6cd4a590..a9aaeb5e4d 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -34,6 +34,7 @@ class RpModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ + def __init__(self, corpus, id2word=None, num_topics=300): """ `id2word` is a mapping from word ids (integers) to words (strings). It is @@ -46,14 +47,16 @@ def __init__(self, corpus, id2word=None, num_topics=300): self.initialize(corpus) def __str__(self): - return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics) + return "RpModel(num_terms=%s, num_topics=%s)" % ( + self.num_terms, self.num_topics) def initialize(self, corpus): """ Initialize the random projection matrix. """ if self.id2word is None: - logger.info("no word id mapping provided; initializing from corpus, assuming identity") + logger.info( + "no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) else: @@ -63,9 +66,12 @@ def initialize(self, corpus): logger.info("constructing %s random matrix" % str(shape)) # Now construct the projection matrix itself. # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection", - # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1). - randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape) # convert from 0/1 to +1/-1 - self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32) # convert from int32 to floats, for faster multiplications + # and his (1) scenario of Theorem 1.1 in particular (all entries are + # +1/-1). + # convert from 0/1 to +1/-1 + randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape) + # convert from int32 to floats, for faster multiplications + self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32) # TODO: check whether the Fortran-order shenanigans still make sense. In the original # code (~2010), this made a BIG difference for numpy BLAS implementations; perhaps now the wrappers # are smarter and this is no longer needed? @@ -74,7 +80,8 @@ def __getitem__(self, bow): """ Return RP representation of the input vector and/or corpus. """ - # if the input vector is in fact a corpus, return a transformed corpus as result + # if the input vector is in fact a corpus, return a transformed corpus + # as result is_corpus, bow = utils.is_corpus(bow) if is_corpus: return self._apply(bow) @@ -83,15 +90,19 @@ def __getitem__(self, bow): # This is a hack to work around a bug in numpy, where a FORTRAN-order array # unpickled from disk segfaults on using it. 
self.freshly_loaded = False - self.projection = self.projection.copy('F') # simply making a fresh copy fixes the broken array + # simply making a fresh copy fixes the broken array + self.projection = self.projection.copy('F') - vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics) + vec = matutils.sparse2full(bow, self.num_terms).reshape( + self.num_terms, 1) / numpy.sqrt(self.num_topics) vec = numpy.asfortranarray(vec, dtype=numpy.float32) - topic_dist = numpy.dot(self.projection, vec) # (k, d) * (d, 1) = (k, 1) + topic_dist = numpy.dot( + self.projection, + vec) # (k, d) * (d, 1) = (k, 1) return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat) if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)] def __setstate__(self, state): self.__dict__ = state self.freshly_loaded = True -#endclass RpModel +# endclass RpModel diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 592cc9537c..652d93ad4a 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -49,6 +49,7 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ + def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, wglobal=df2idf, normalize=True): """ @@ -87,8 +88,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, # statistics we need to construct the IDF mapping. we can skip the # step that goes through the corpus (= an optimization). if corpus is not None: - logger.warning("constructor received both corpus and explicit " - "inverse document frequencies; ignoring the corpus") + logger.warning( + "constructor received both corpus and explicit " + "inverse document frequencies; ignoring the corpus") self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) @@ -99,10 +101,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, # be initialized in some other way pass - def __str__(self): - return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz) - + return "TfidfModel(num_docs=%s, num_nnz=%s)" % ( + self.num_docs, self.num_nnz) def initialize(self, corpus): """ @@ -126,16 +127,17 @@ def initialize(self, corpus): # and finally compute the idf weights n_features = max(dfs) if dfs else 0 - logger.info("calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" % - (self.num_docs, n_features, self.num_nnz)) + logger.info( + "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" % + (self.num_docs, n_features, self.num_nnz)) self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) - def __getitem__(self, bow, eps=1e-12): """ Return tf-idf representation of the input vector and/or corpus. 
""" - # if the input vector is in fact a corpus, return a transformed corpus as a result + # if the input vector is in fact a corpus, return a transformed corpus + # as a result is_corpus, bow = utils.is_corpus(bow) if is_corpus: return self._apply(bow) @@ -153,6 +155,7 @@ def __getitem__(self, bow, eps=1e-12): vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) - vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] + vector = [(termid, weight) + for termid, weight in vector if abs(weight) > eps] return vector -#endclass TfidfModel +# endclass TfidfModel diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 9bb50698c3..93fca54354 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -89,7 +89,8 @@ double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis,\ ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray, vstack -from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc +# utility fnc for pickling, common scipy operations etc +from gensim import utils, matutils from gensim.corpora.dictionary import Dictionary from six import iteritems, itervalues, string_types from six.moves import xrange @@ -124,14 +125,21 @@ def train_batch_sg(model, sentences, alpha, work=None): word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + reduced_window = model.random.randint( + model.window) # `b` in the original word2vec code - # now go over all words from the (reduced) window, predicting each one in turn + # now go over all words from the (reduced) window, predicting + # each one in turn start = max(0, pos - model.window + reduced_window) - for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): + for pos2, word2 in enumerate( + word_vocabs[ + start:( + pos + model.window + 1 - reduced_window)], start): # don't train on the `word` itself if pos2 != pos: - train_sg_pair(model, model.index2word[word.index], word2.index, alpha) + train_sg_pair( + model, model.index2word[ + word.index], word2.index, alpha) result += len(word_vocabs) return result @@ -151,11 +159,23 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + reduced_window = model.random.randint( + model.window) # `b` in the original word2vec code start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] - l1 = np_sum(model.syn0[word2_indices], axis=0) # 1 x vector_size + window_pos = enumerate( + word_vocabs[ + start:( + pos + + model.window + + 1 - + reduced_window)], + start) + word2_indices = [ + word2.index for pos2, word2 in window_pos if ( + word2 is not None and pos2 != pos)] + l1 = np_sum( + model.syn0[word2_indices], + axis=0) # 1 x vector_size if word2_indices and model.cbow_mean: l1 /= len(word2_indices) train_cbow_pair(model, word, word2_indices, l1, alpha) @@ 
-183,9 +203,11 @@ def score_sentence_sg(model, sentence, work=None): if word is None: continue # OOV word in the input sentence => skip - # now go over all words from the window, predicting each one in turn + # now go over all words from the window, predicting each one in + # turn start = max(0, pos - model.window) - for pos2, word2 in enumerate(word_vocabs[start : pos + model.window + 1], start): + for pos2, word2 in enumerate( + word_vocabs[start: pos + model.window + 1], start): # don't train on OOV words and on the `word` itself if word2 is not None and pos2 != pos: log_prob_sentence += score_sg_pair(model, word, word2) @@ -213,25 +235,44 @@ def score_sentence_cbow(model, sentence, alpha, work=None, neu1=None): continue # OOV word in the input sentence => skip start = max(0, pos - model.window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) - word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] + window_pos = enumerate( + word_vocabs[ + start:( + pos + + model.window + + 1)], + start) + word2_indices = [ + word2.index for pos2, + word2 in window_pos if ( + word2 is not None and pos2 != pos)] l1 = np_sum(model.syn0[word2_indices], axis=0) # 1 x layer1_size if word2_indices and model.cbow_mean: l1 /= len(word2_indices) - log_prob_sentence += score_cbow_pair(model, word, word2_indices, l1) + log_prob_sentence += score_cbow_pair(model, + word, word2_indices, l1) return log_prob_sentence # If pyemd C extension is available, import it. -# If pyemd is attempted to be used, but isn't installed, ImportError will be raised. +# If pyemd is attempted to be used, but isn't installed, ImportError will +# be raised. try: from pyemd import emd PYEMD_EXT = True except ImportError: PYEMD_EXT = False -def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None): + +def train_sg_pair( + model, + word, + context_index, + alpha, + learn_vectors=True, + learn_hidden=True, + context_vectors=None, + context_locks=None): if context_vectors is None: context_vectors = model.syn0 if context_locks is None: @@ -241,38 +282,50 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h return predict_word = model.vocab[word] # target word (NN output) - l1 = context_vectors[context_index] # input word (NN input/projection layer) + # input word (NN input/projection layer) + l1 = context_vectors[context_index] lock_factor = context_locks[context_index] neu1e = zeros(l1.shape) if model.hs: - # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance) - l2a = deepcopy(model.syn1[predict_word.point]) # 2d matrix, codelen x layer1_size + # work on the entire tree at once, to push as much work into numpy's C + # routines as possible (performance) + # 2d matrix, codelen x layer1_size + l2a = deepcopy(model.syn1[predict_word.point]) fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T))) # propagate hidden -> output - ga = (1 - predict_word.code - fa) * alpha # vector of error gradients multiplied by the learning rate + # vector of error gradients multiplied by the learning rate + ga = (1 - predict_word.code - fa) * alpha if learn_hidden: - model.syn1[predict_word.point] += outer(ga, l1) # learn hidden -> output + # learn hidden -> output + model.syn1[predict_word.point] += outer(ga, l1) neu1e += dot(ga, l2a) # save error if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence 
(label = 0) + # use this word (label = 1) + `negative` other random words not from + # this sentence (label = 0) word_indices = [predict_word.index] while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) + w = model.cum_table.searchsorted( + model.random.randint(model.cum_table[-1])) if w != predict_word.index: word_indices.append(w) l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size fb = 1. / (1. + exp(-dot(l1, l2b.T))) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate + # vector of error gradients multiplied by the learning rate + gb = (model.neg_labels - fb) * alpha if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output + # learn hidden -> output + model.syn1neg[word_indices] += outer(gb, l1) neu1e += dot(gb, l2b) # save error if learn_vectors: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.syn0[word2.index], if that is l1) + # learn input -> hidden (mutates model.syn0[word2.index], if that is + # l1) + l1 += neu1e * lock_factor return neu1e + def sigmoid(p): if p > 0: return 1. / (1. + exp(-p)) @@ -281,29 +334,42 @@ def sigmoid(p): else: raise ValueError -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True): + +def train_cbow_pair( + model, + word, + input_word_indices, + l1, + alpha, + learn_vectors=True, + learn_hidden=True): neu1e = zeros(l1.shape) if model.hs: l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size fa = 1. / (1. + exp(-dot(l1, l2a.T))) # propagate hidden -> output - ga = (1. - word.code - fa) * alpha # vector of error gradients multiplied by the learning rate + # vector of error gradients multiplied by the learning rate + ga = (1. 
- word.code - fa) * alpha if learn_hidden: model.syn1[word.point] += outer(ga, l1) # learn hidden -> output neu1e += dot(ga, l2a) # save error if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) + # use this word (label = 1) + `negative` other random words not from + # this sentence (label = 0) word_indices = [word.index] while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) + w = model.cum_table.searchsorted( + model.random.randint(model.cum_table[-1])) if w != word.index: word_indices.append(w) l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size fb = sigmoid(dot(l1, l2b.T)) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate + # vector of error gradients multiplied by the learning rate + gb = (model.neg_labels - fb) * alpha if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output + # learn hidden -> output + model.syn1neg[word_indices] += outer(gb, l1) neu1e += dot(gb, l2b) # save error if learn_vectors: @@ -320,14 +386,14 @@ def score_sg_pair(model, word, word2): l1 = model.syn0[word2.index] l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 - lprob = -log(1.0 + exp(-sgn*dot(l1, l2a.T))) + lprob = -log(1.0 + exp(-sgn * dot(l1, l2a.T))) return sum(lprob) def score_cbow_pair(model, word, word2_indices, l1): l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 - lprob = -log(1.0 + exp(-sgn*dot(l1, l2a.T))) + lprob = -log(1.0 + exp(-sgn * dot(l1, l2a.T))) return sum(lprob) @@ -337,6 +403,7 @@ class Vocab(object): and for constructing binary trees (incl. both word leaves and inner nodes). """ + def __init__(self, **kwargs): self.count = 0 self.__dict__.update(kwargs) @@ -345,7 +412,11 @@ def __lt__(self, other): # used for sorting in a priority queue return self.count < other.count def __str__(self): - vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')] + vals = [ + '%s:%r' % + (key, + self.__dict__[key]) for key in sorted( + self.__dict__) if not key.startswith('_')] return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) @@ -357,11 +428,29 @@ class Word2Vec(utils.SaveLoad): compatible with the original word2vec implementation via `save_word2vec_format()` and `load_word2vec_format()`. """ + def __init__( - self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH): + self, + sentences=None, + size=100, + alpha=0.025, + window=5, + min_count=5, + max_vocab_size=None, + sample=1e-3, + seed=1, + workers=3, + min_alpha=0.0001, + sg=0, + hs=0, + negative=5, + cbow_mean=1, + hashfxn=hash, + iter=5, + null_word=0, + trim_rule=None, + sorted_vocab=1, + batch_words=MAX_WORDS_IN_BATCH): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. 
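For orientation, a minimal usage sketch of the Word2Vec constructor whose signature is reformatted above (the toy corpus and parameter values below are illustrative only and are not part of this patch; `min_count=1` just keeps the tiny vocabulary from being pruned):

    from gensim.models import Word2Vec

    sentences = [
        ["human", "interface", "computer"],
        ["survey", "user", "computer", "system", "response", "time"],
    ]
    model = Word2Vec(sentences, size=50, window=5, min_count=1, sg=1, workers=1)
    vector = model["computer"]   # numpy vector of length `size`
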
@@ -433,20 +522,24 @@ def __init__( """ if FAST_VERSION == -1: - logger.warning('Slow version of {0} is being used'.format(__name__)) + logger.warning( + 'Slow version of {0} is being used'.format(__name__)) else: logger.debug('Fast version of {0} is being used'.format(__name__)) self.vocab = {} # mapping from a word (string) to a Vocab object - self.index2word = [] # map from a word's matrix index (int) to word (string) + # map from a word's matrix index (int) to word (string) + self.index2word = [] self.sg = int(sg) self.cum_table = None # for negative sampling self.vector_size = int(size) self.layer1_size = int(size) if size % 4 != 0: - logger.warning("consider setting layer size to a multiple of 4 for greater performance") + logger.warning( + "consider setting layer size to a multiple of 4 for greater performance") self.alpha = float(alpha) - self.min_alpha_yet_reached = float(alpha) # To warn user if alpha increases + # To warn user if alpha increases + self.min_alpha_yet_reached = float(alpha) self.window = int(window) self.max_vocab_size = max_vocab_size self.seed = seed @@ -468,7 +561,8 @@ def __init__( if sentences is not None: if isinstance(sentences, GeneratorType): - raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") + raise TypeError( + "You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) self.train(sentences) @@ -487,11 +581,13 @@ def make_cum_table(self, power=0.75, domain=2**31 - 1): vocab_size = len(self.index2word) self.cum_table = zeros(vocab_size, dtype=uint32) # compute sum of all power (Z in paper) - train_words_pow = float(sum([self.vocab[word].count**power for word in self.vocab])) + train_words_pow = float( + sum([self.vocab[word].count**power for word in self.vocab])) cumulative = 0.0 for word_index in range(vocab_size): cumulative += self.vocab[self.index2word[word_index]].count**power - self.cum_table[word_index] = round(cumulative / train_words_pow * domain) + self.cum_table[word_index] = round( + cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -501,16 +597,23 @@ def create_binary_tree(self): will have shorter binary codes. Called internally from `build_vocab()`. 
""" - logger.info("constructing a huffman tree from %i words", len(self.vocab)) + logger.info( + "constructing a huffman tree from %i words", len( + self.vocab)) # build the huffman tree heap = list(itervalues(self.vocab)) heapq.heapify(heap) for i in xrange(len(self.vocab) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) - heapq.heappush(heap, Vocab(count=min1.count + min2.count, index=i + len(self.vocab), left=min1, right=min2)) - - # recurse over the tree, assigning a binary code to each vocabulary word + heapq.heappush(heap, + Vocab(count=min1.count + min2.count, + index=i + len(self.vocab), + left=min1, + right=min2)) + + # recurse over the tree, assigning a binary code to each vocabulary + # word if heap: max_depth, stack = 0, [(heap[0], [], [])] while stack: @@ -521,23 +624,54 @@ def create_binary_tree(self): max_depth = max(len(codes), max_depth) else: # inner node => continue recursion - points = array(list(points) + [node.index - len(self.vocab)], dtype=uint32) - stack.append((node.left, array(list(codes) + [0], dtype=uint8), points)) - stack.append((node.right, array(list(codes) + [1], dtype=uint8), points)) - - logger.info("built huffman tree with maximum node depth %i", max_depth) + points = array( + list(points) + [node.index - len(self.vocab)], dtype=uint32) + stack.append( + (node.left, + array( + list(codes) + [0], + dtype=uint8), + points)) + stack.append( + (node.right, + array( + list(codes) + [1], + dtype=uint8), + points)) - def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): + logger.info( + "built huffman tree with maximum node depth %i", + max_depth) + + def build_vocab( + self, + sentences, + keep_raw_vocab=False, + trim_rule=None, + progress_per=10000, + update=False): """ Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. """ - self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule, update=update) # initial survey - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.scan_vocab( + sentences, + progress_per=progress_per, + trim_rule=trim_rule, + update=update) # initial survey + self.scale_vocab( + keep_raw_vocab=keep_raw_vocab, + trim_rule=trim_rule, + update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - def scan_vocab(self, sentences, progress_per=10000, trim_rule=None, update=False): + def scan_vocab( + self, + sentences, + progress_per=10000, + trim_rule=None, + update=False): """Do an initial scan of all words appearing in sentences.""" logger.info("collecting all words and their counts") sentence_no = -1 @@ -548,26 +682,39 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None, update=False for sentence_no, sentence in enumerate(sentences): if not checked_string_types: if isinstance(sentence, string_types): - logger.warn("Each 'sentences' item should be a list of words (usually unicode strings)." - "First item here is instead plain %s.", type(sentence)) + logger.warn( + "Each 'sentences' item should be a list of words (usually unicode strings)." 
+ "First item here is instead plain %s.", type(sentence)) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, sum(itervalues(vocab)) + total_words, len(vocab)) + logger.info( + "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", sentence_no, sum( + itervalues(vocab)) + total_words, len(vocab)) for word in sentence: vocab[word] += 1 if self.max_vocab_size and len(vocab) > self.max_vocab_size: - total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + total_words += utils.prune_vocab(vocab, + min_reduce, trim_rule=trim_rule) min_reduce += 1 total_words += sum(itervalues(vocab)) - logger.info("collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1) + logger.info( + "collected %i word types from a corpus of %i raw words and %i sentences", + len(vocab), + total_words, + sentence_no + 1) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab - def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, update=False): + def scale_vocab( + self, + min_count=None, + sample=None, + dry_run=False, + keep_raw_vocab=False, + trim_rule=None, + update=False): """ Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). @@ -601,19 +748,31 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab retain_words.append(word) retain_total += v if not dry_run: - self.vocab[word] = Vocab(count=v, index=len(self.index2word)) + self.vocab[word] = Vocab( + count=v, index=len(self.index2word)) self.index2word.append(word) else: drop_unique += 1 drop_total += v original_unique_total = len(retain_words) + drop_unique - retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info("min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique) + retain_unique_pct = len(retain_words) * \ + 100 / max(original_unique_total, 1) + logger.info( + "min_count=%d retains %i unique words (%i%% of original %i, drops %i)", + min_count, + len(retain_words), + retain_unique_pct, + original_unique_total, + drop_unique) original_total = retain_total + drop_total retain_pct = retain_total * 100 / max(original_total, 1) - logger.info("min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - min_count, retain_total, retain_pct, original_total, drop_total) + logger.info( + "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", + min_count, + retain_total, + retain_pct, + original_total, + drop_total) else: logger.info("Updating model with new vocabulary") new_total = pre_exist_total = 0 @@ -629,18 +788,27 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab new_words.append(word) new_total += v if not dry_run: - self.vocab[word] = Vocab(count=v, index=len(self.index2word)) + self.vocab[word] = Vocab( + count=v, index=len(self.index2word)) self.index2word.append(word) else: drop_unique += 1 drop_total += v - original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique - pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) - new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info("""New added %i unique words (%i%% of original 
%i) + original_unique_total = len( + pre_exist_words) + len(new_words) + drop_unique + pre_exist_unique_pct = len( + pre_exist_words) * 100 / max(original_unique_total, 1) + new_unique_pct = len(new_words) * 100 / \ + max(original_unique_total, 1) + logger.info( + """New added %i unique words (%i%% of original %i) and increased the count of %i pre-existing words (%i%% of original %i)""", - len(new_words), new_unique_pct, original_unique_total, - len(pre_exist_words), pre_exist_unique_pct, original_unique_total) + len(new_words), + new_unique_pct, + original_unique_total, + len(pre_exist_words), + pre_exist_unique_pct, + original_unique_total) retain_words = new_words + pre_exist_words retain_total = new_total + pre_exist_total @@ -652,13 +820,15 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab # traditional meaning: set parameter as proportion of total threshold_count = sample * retain_total else: - # new shorthand: sample >= 1 means downsample all words with higher count than sample + # new shorthand: sample >= 1 means downsample all words with higher + # count than sample threshold_count = int(sample * (3 + sqrt(5)) / 2) downsample_total, downsample_unique = 0, 0 for w in retain_words: v = self.raw_vocab[w] - word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v) + word_probability = (sqrt(v / threshold_count) + + 1) * (threshold_count / v) if word_probability < 1.0: downsample_unique += 1 downsample_total += word_probability * v @@ -669,19 +839,28 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab self.vocab[w].sample_int = int(round(word_probability * 2**32)) if not dry_run and not keep_raw_vocab: - logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) + logger.info( + "deleting the raw counts dictionary of %i items", len( + self.raw_vocab)) self.raw_vocab = defaultdict(int) - logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) + logger.info( + "sample=%g downsamples %i most-common words", + sample, + downsample_unique) logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total) # return from each step: words-affected, resulting-corpus-size - report_values = {'drop_unique': drop_unique, 'retain_total': retain_total, - 'downsample_unique': downsample_unique, 'downsample_total': int(downsample_total)} + report_values = { + 'drop_unique': drop_unique, + 'retain_total': retain_total, + 'downsample_unique': downsample_unique, + 'downsample_total': int(downsample_total)} # print extra memory estimates - report_values['memory'] = self.estimate_memory(vocab_size=len(retain_words)) + report_values['memory'] = self.estimate_memory( + vocab_size=len(retain_words)) return report_values @@ -699,7 +878,8 @@ def finalize_vocab(self, update=False): self.make_cum_table() if self.null_word: # create null pseudo-word for padding when using concatenative L1 (run-of-words) - # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter + # this word is only ever input – never predicted – so count, + # huffman-point, etc doesn't matter word, v = '\0', Vocab(count=1, sample_int=0) v.index = len(self.vocab) self.index2word.append(word) @@ -714,7 +894,8 @@ def sort_vocab(self): """Sort the vocabulary so the most frequent words have the lowest indexes.""" if hasattr(self, 'syn0'): raise RuntimeError("must sort before initializing 
vectors/weights") - self.index2word.sort(key=lambda word: self.vocab[word].count, reverse=True) + self.index2word.sort(key=lambda word: self.vocab[ + word].count, reverse=True) for i, word in enumerate(self.index2word): self.vocab[word].index = i @@ -759,11 +940,13 @@ def train(self, sentences, total_words=None, word_count=0, """ if FAST_VERSION < 0: import warnings - warnings.warn("C extension not loaded for Word2Vec, training will be slow. " - "Install a C compiler and reinstall gensim for fast training.") + warnings.warn( + "C extension not loaded for Word2Vec, training will be slow. " + "Install a C compiler and reinstall gensim for fast training.") self.neg_labels = [] if self.negative > 0: - # precompute negative labels optimization for pure-python training + # precompute negative labels optimization for pure-python + # training self.neg_labels = zeros(self.negative + 1) self.neg_labels[0] = 1. @@ -774,16 +957,21 @@ def train(self, sentences, total_words=None, word_count=0, self.hs, self.sample, self.negative, self.window) if not self.vocab: - raise RuntimeError("you must first build vocabulary before training the model") + raise RuntimeError( + "you must first build vocabulary before training the model") if not hasattr(self, 'syn0'): - raise RuntimeError("you must first finalize vocabulary before training the model") + raise RuntimeError( + "you must first finalize vocabulary before training the model") if total_words is None and total_examples is None: if self.corpus_count: total_examples = self.corpus_count - logger.info("expecting %i sentences, matching count from corpus used for vocabulary survey", total_examples) + logger.info( + "expecting %i sentences, matching count from corpus used for vocabulary survey", + total_examples) else: - raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations") + raise ValueError( + "you must provide either total_words or total_examples, to enable alpha and progress calculations") job_tally = 0 @@ -794,7 +982,8 @@ def train(self, sentences, total_words=None, word_count=0, def worker_loop(): """Train the model, lifting lists of sentences from the job_queue.""" - work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory + work = matutils.zeros_aligned( + self.layer1_size, dtype=REAL) # per-thread private work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) jobs_processed = 0 while True: @@ -803,8 +992,10 @@ def worker_loop(): progress_queue.put(None) break # no more jobs => quit this worker sentences, alpha = job - tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1)) - progress_queue.put((len(sentences), tally, raw_tally)) # report back progress + tally, raw_tally = self._do_train_job( + sentences, alpha, (work, neu1)) + progress_queue.put( + (len(sentences), tally, raw_tally)) # report back progress jobs_processed += 1 logger.debug("worker exiting, processed %i jobs", jobs_processed) @@ -814,7 +1005,8 @@ def job_producer(): pushed_words, pushed_examples = 0, 0 next_alpha = self.alpha if next_alpha > self.min_alpha_yet_reached: - logger.warn("Effective 'alpha' higher than previous training cycles") + logger.warn( + "Effective 'alpha' higher than previous training cycles") self.min_alpha_yet_reached = next_alpha job_no = 0 @@ -830,7 +1022,10 @@ def job_producer(): # no => submit the existing job logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha) + 
job_no, + batch_size, + len(job_batch), + next_alpha) job_no += 1 job_queue.put((job_batch, next_alpha)) @@ -844,13 +1039,16 @@ def job_producer(): # words-based decay pushed_words += self._raw_word_count(job_batch) progress = 1.0 * pushed_words / total_words - next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress + next_alpha = self.alpha - \ + (self.alpha - self.min_alpha) * progress next_alpha = max(self.min_alpha, next_alpha) - # add the sentence that didn't fit as the first item of a new job + # add the sentence that didn't fit as the first item of a + # new job job_batch, batch_size = [sentence], sentence_length - # add the last job too (may be significantly smaller than batch_words) + # add the last job too (may be significantly smaller than + # batch_words) if job_batch: logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", @@ -870,11 +1068,13 @@ def job_producer(): job_queue.put(None) logger.debug("job loop exiting, total %i jobs", job_no) - # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( + # buffer ahead only a limited number of jobs.. this is the reason we + # can't simply use ThreadPool :( job_queue = Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)] + workers = [threading.Thread(target=worker_loop) + for _ in xrange(self.workers)] unfinished_worker_count = len(workers) workers.append(threading.Thread(target=job_producer)) @@ -889,7 +1089,9 @@ def job_producer(): report = progress_queue.get() # blocks if workers too slow if report is None: # a thread reporting that it finished unfinished_worker_count -= 1 - logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) + logger.info( + "worker thread finished; awaiting finish of %i more threads", + unfinished_worker_count) continue examples, trained_words, raw_words = report job_tally += 1 @@ -906,29 +1108,44 @@ def job_producer(): # examples-based progress % logger.info( "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", - 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue)) + 100.0 * example_count / total_examples, + trained_word_count / elapsed, + utils.qsize(job_queue), + utils.qsize(progress_queue)) else: # words-based progress % logger.info( "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", - 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue)) + 100.0 * raw_word_count / total_words, + trained_word_count / elapsed, + utils.qsize(job_queue), + utils.qsize(progress_queue)) next_report = elapsed + report_delay # all done; report the final stats elapsed = default_timer() - start logger.info( "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed) + raw_word_count, + trained_word_count, + elapsed, + trained_word_count / + elapsed) if job_tally < 10 * self.workers: - logger.warn("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") + logger.warn( + "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") # check that the input corpus hasn't changed during iteration if total_examples and total_examples != 
example_count: - logger.warn("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) + logger.warn( + "supplied example count (%i) did not equal expected count (%i)", + example_count, + total_examples) if total_words and total_words != raw_word_count: - logger.warn("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) + logger.warn( + "supplied raw word count (%i) did not equal expected count (%i)", + raw_word_count, + total_words) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -936,7 +1153,13 @@ def job_producer(): return trained_word_count # basics copied from the train() function - def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1): + def score( + self, + sentences, + total_sentences=int(1e6), + chunksize=100, + queue_factor=2, + report_delay=1): """ Score the log probability for a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. @@ -956,23 +1179,26 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """ if FAST_VERSION < 0: import warnings - warnings.warn("C extension compilation failed, scoring will be slow. " - "Install a C compiler and reinstall gensim for fastness.") + warnings.warn( + "C extension compilation failed, scoring will be slow. " + "Install a C compiler and reinstall gensim for fastness.") logger.info( "scoring sentences with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative) + "using sg=%s hs=%s sample=%s and negative=%s", self.workers, len( + self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative) if not self.vocab: - raise RuntimeError("you must first build vocabulary before scoring new data") + raise RuntimeError( + "you must first build vocabulary before scoring new data") if not self.hs: raise RuntimeError("we have only implemented score for hs") def worker_loop(): """Train the model, lifting lists of sentences from the jobs queue.""" - work = zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) + work = zeros( + 1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = job_queue.get() @@ -991,11 +1217,13 @@ def worker_loop(): progress_queue.put(ns) # report progress start, next_report = default_timer(), 1.0 - # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( + # buffer ahead only a limited number of jobs.. this is the reason we + # can't simply use ThreadPool :( job_queue = Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)] + workers = [threading.Thread(target=worker_loop) + for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() @@ -1024,11 +1252,14 @@ def worker_loop(): "reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) for _ in xrange(self.workers): - job_queue.put(None) # give the workers heads up that they can finish -- no more work! 
+ # give the workers heads up that they can finish -- no more + # work! + job_queue.put(None) push_done = True try: while done_jobs < (job_no + 1) or not push_done: - ns = progress_queue.get(push_done) # only block after all jobs pushed + # only block after all jobs pushed + ns = progress_queue.get(push_done) sentence_count += ns done_jobs += 1 elapsed = default_timer() - start @@ -1036,7 +1267,8 @@ def worker_loop(): logger.info( "PROGRESS: at %.2f%% sentences, %.0f sentences/s", 100.0 * sentence_count, sentence_count / elapsed) - next_report = elapsed + report_delay # don't flood log, wait report_delay seconds + # don't flood log, wait report_delay seconds + next_report = elapsed + report_delay else: # loop ended by job count; really done break @@ -1065,37 +1297,46 @@ def update_weights(self): # randomize the remaining words for i in xrange(len(self.syn0), len(self.vocab)): # construct deterministic seed from word AND seed argument - newsyn0[i-len(self.syn0)] = self.seeded_vector(self.index2word[i] + str(self.seed)) + newsyn0[ + i - len(self.syn0)] = self.seeded_vector(self.index2word[i] + str(self.seed)) self.syn0 = vstack([self.syn0, newsyn0]) if self.hs: - self.syn1 = vstack([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) + self.syn1 = vstack( + [self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) if self.negative: - self.syn1neg = vstack([self.syn1neg, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) + self.syn1neg = vstack([self.syn1neg, zeros( + (gained_vocab, self.layer1_size), dtype=REAL)]) self.syn0norm = None # do not suppress learning for already learned words - self.syn0_lockf = ones(len(self.vocab), dtype=REAL) # zeros suppress learning + self.syn0_lockf = ones(len(self.vocab), + dtype=REAL) # zeros suppress learning def reset_weights(self): """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" logger.info("resetting layer weights") self.syn0 = empty((len(self.vocab), self.vector_size), dtype=REAL) - # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once + # randomize weights vector by vector, rather than materializing a huge + # random matrix in RAM at once for i in xrange(len(self.vocab)): # construct deterministic seed from word AND seed argument - self.syn0[i] = self.seeded_vector(self.index2word[i] + str(self.seed)) + self.syn0[i] = self.seeded_vector( + self.index2word[i] + str(self.seed)) if self.hs: self.syn1 = zeros((len(self.vocab), self.layer1_size), dtype=REAL) if self.negative: - self.syn1neg = zeros((len(self.vocab), self.layer1_size), dtype=REAL) + self.syn1neg = zeros( + (len(self.vocab), self.layer1_size), dtype=REAL) self.syn0norm = None - self.syn0_lockf = ones(len(self.vocab), dtype=REAL) # zeros suppress learning + self.syn0_lockf = ones(len(self.vocab), + dtype=REAL) # zeros suppress learning def seeded_vector(self, seed_string): """Create one 'random' vector (but deterministic by seed_string)""" - # Note: built-in hash() may vary by Python version or even (in Py3.x) per launch + # Note: built-in hash() may vary by Python version or even (in Py3.x) + # per launch once = random.RandomState(self.hashfxn(seed_string) & 0xffffffff) return (once.rand(self.vector_size) - 0.5) / self.vector_size @@ -1113,23 +1354,39 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False): if fvocab is not None: logger.info("storing vocabulary in %s" % (fvocab)) with utils.smart_open(fvocab, 'wb') as vout: - for word, vocab in 
sorted(iteritems(self.vocab), key=lambda item: -item[1].count): + for word, vocab in sorted(iteritems( + self.vocab), key=lambda item: -item[1].count): vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count))) - logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.vector_size, fname)) + logger.info("storing %sx%s projection weights into %s" % + (len(self.vocab), self.vector_size, fname)) assert (len(self.vocab), self.vector_size) == self.syn0.shape with utils.smart_open(fname, 'wb') as fout: fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape)) # store in sorted order: most frequent words at the top - for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): + for word, vocab in sorted( + iteritems( + self.vocab), key=lambda item: -item[1].count): row = self.syn0[vocab.index] if binary: fout.write(utils.to_utf8(word) + b" " + row.tostring()) else: - fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row)))) + fout.write( + utils.to_utf8( + "%s %s\n" % + (word, ' '.join( + "%f" % + val for val in row)))) @classmethod - def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): + def load_word2vec_format( + cls, + fname, + fvocab=None, + binary=False, + encoding='utf8', + unicode_errors='strict', + limit=None, + datatype=REAL): """ Load the input-hidden weight matrix from the original C word2vec-tool format. @@ -1170,7 +1427,8 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', logger.info("loading projection weights from %s", fname) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = map(int, header.split()) # throws for invalid file format + vocab_size, vector_size = map( + int, header.split()) # throws for invalid file format if limit: vocab_size = min(vocab_size, limit) result = cls(size=vector_size) @@ -1179,17 +1437,23 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', def add_word(word, weights): word_id = len(result.vocab) if word in result.vocab: - logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname) + logger.warning( + "duplicate word '%s' in %s, ignoring all but first", word, fname) return if counts is None: - # most common scenario: no vocab file given. just make up some bogus counts, in descending order - result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id) + # most common scenario: no vocab file given. just make up + # some bogus counts, in descending order + result.vocab[word] = Vocab( + index=word_id, count=vocab_size - word_id) elif word in counts: # use count from the vocab file - result.vocab[word] = Vocab(index=word_id, count=counts[word]) + result.vocab[word] = Vocab( + index=word_id, count=counts[word]) else: - # vocab file given, but word is missing -- set count to None (TODO: or raise?) - logger.warning("vocabulary file is incomplete: '%s' is missing", word) + # vocab file given, but word is missing -- set count to + # None (TODO: or raise?) 
+ logger.warning( + "vocabulary file is incomplete: '%s' is missing", word) result.vocab[word] = Vocab(index=word_id, count=None) result.syn0[word_id] = weights result.index2word.append(word) @@ -1204,20 +1468,30 @@ def add_word(word, weights): if ch == b' ': break if ch == b'': - raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") - if ch != b'\n': # ignore newlines in front of words (some binary files have) + raise EOFError( + "unexpected end of input; is count incorrect or file otherwise damaged?") + # ignore newlines in front of words (some binary files + # have) + if ch != b'\n': word.append(ch) - word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) + word = utils.to_unicode( + b''.join(word), encoding=encoding, errors=unicode_errors) weights = fromstring(fin.read(binary_len), dtype=REAL) add_word(word, weights) else: for line_no in xrange(vocab_size): line = fin.readline() if line == b'': - raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") - parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") + raise EOFError( + "unexpected end of input; is count incorrect or file otherwise damaged?") + parts = utils.to_unicode( + line.rstrip(), + encoding=encoding, + errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) + raise ValueError( + "invalid vector on line %s (is this really the text format?)" % + (line_no)) word, weights = parts[0], list(map(REAL, parts[1:])) add_word(word, weights) if result.syn0.shape[0] != len(result.vocab): @@ -1231,7 +1505,13 @@ def add_word(word, weights): logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname)) return result - def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): + def intersect_word2vec_format( + self, + fname, + lockf=0.0, + binary=False, + encoding='utf8', + unicode_errors='strict'): """ Merge the input-hidden weight matrix from the original C word2vec-tool format given, where it intersects with the current vocabulary. (No words are added to the @@ -1248,10 +1528,14 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut logger.info("loading projection weights from %s" % (fname)) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = map(int, header.split()) # throws for invalid file format + vocab_size, vector_size = map( + int, header.split()) # throws for invalid file format if not vector_size == self.vector_size: - raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) - # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? + raise ValueError( + "incompatible vector size %d in file %s" % + (vector_size, fname)) + # TOCONSIDER: maybe mismatched vectors still useful enough to + # merge (truncating/padding)? 
if binary: binary_len = dtype(REAL).itemsize * vector_size for line_no in xrange(vocab_size): @@ -1261,26 +1545,42 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut ch = fin.read(1) if ch == b' ': break - if ch != b'\n': # ignore newlines in front of words (some binary files have) + # ignore newlines in front of words (some binary files + # have) + if ch != b'\n': word.append(ch) - word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) + word = utils.to_unicode( + b''.join(word), encoding=encoding, errors=unicode_errors) weights = fromstring(fin.read(binary_len), dtype=REAL) if word in self.vocab: overlap_count += 1 self.syn0[self.vocab[word].index] = weights - self.syn0_lockf[self.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes + # lock-factor: 0.0 stops further changes + self.syn0_lockf[self.vocab[word].index] = lockf else: for line_no, line in enumerate(fin): - parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") + parts = utils.to_unicode( + line.rstrip(), + encoding=encoding, + errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) + raise ValueError( + "invalid vector on line %s (is this really the text format?)" % + (line_no)) word, weights = parts[0], list(map(REAL, parts[1:])) if word in self.vocab: overlap_count += 1 self.syn0[self.vocab[word].index] = weights - logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.syn0.shape, fname)) - - def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None): + logger.info("merged %d vectors into %s matrix from %s" % + (overlap_count, self.syn0.shape, fname)) + + def most_similar( + self, + positive=[], + negative=[], + topn=10, + restrict_vocab=None, + indexer=None): """ Find the top-N most similar words. Positive words contribute positively towards the similarity, negative words negatively. 
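As a hedged usage sketch of the positive/negative analogy arithmetic described in the docstring above (the word lists and the returned score are illustrative, not taken from this patch, and assume a model trained on a suitably large corpus):

    model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
    # e.g. [('queen', 0.71)] -- positive words pull the query vector toward them,
    # negative words push it away, and input words are excluded from the result
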
@@ -1306,10 +1606,12 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, i self.init_sims() if isinstance(positive, string_types) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) + # allow calls like most_similar('dog'), as a shorthand for + # most_similar(['dog']) positive = [positive] - # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words + # add weights for each word, if not already present; default to 1.0 for + # positive and -1.0 for negative words positive = [ (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word for word in positive @@ -1336,13 +1638,19 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, i if indexer is not None: return indexer.most_similar(mean, topn) - limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab] + limited = self.syn0norm if restrict_vocab is None else self.syn0norm[ + :restrict_vocab] dists = dot(limited, mean) if not topn: return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) + best = matutils.argsort( + dists, + topn=topn + + len(all_words), + reverse=True) # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] + result = [(self.index2word[sim], float(dists[sim])) + for sim in best if sim not in all_words] return result[:topn] def wmdistance(self, document1, document2): @@ -1378,7 +1686,8 @@ def wmdistance(self, document1, document2): """ if not PYEMD_EXT: - raise ImportError("Please install pyemd Python package to compute WMD.") + raise ImportError( + "Please install pyemd Python package to compute WMD.") # Remove out-of-vocabulary words. len_pre_oov1 = len(document1) @@ -1388,8 +1697,10 @@ def wmdistance(self, document1, document2): diff1 = len_pre_oov1 - len(document1) diff2 = len_pre_oov2 - len(document2) if diff1 > 0 or diff2 > 0: - logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', - diff1, diff2) + logger.info( + 'Removed %d and %d OOV words from document 1 and 2 (respectively).', + diff1, + diff2) if len(document1) == 0 or len(document2) == 0: logger.info('At least one of the documents had no words that were' @@ -1414,7 +1725,8 @@ def wmdistance(self, document1, document2): if np_sum(distance_matrix) == 0.0: # `emd` gets stuck if the distance matrix contains only zeros. - logger.info('The distance matrix is all zeros. Aborting (returning inf).') + logger.info( + 'The distance matrix is all zeros. 
Aborting (returning inf).') return float('inf') def nbow(document): @@ -1457,7 +1769,8 @@ def most_similar_cosmul(self, positive=[], negative=[], topn=10): self.init_sims() if isinstance(positive, string_types) and not negative: - # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) + # allow calls like most_similar_cosmul('dog'), as a shorthand for + # most_similar_cosmul(['dog']) positive = [positive] all_words = set() @@ -1484,9 +1797,14 @@ def word_vec(word): if not topn: return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) + best = matutils.argsort( + dists, + topn=topn + + len(all_words), + reverse=True) # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] + result = [(self.index2word[sim], float(dists[sim])) + for sim in best if sim not in all_words] return result[:topn] def similar_by_word(self, word, topn=10, restrict_vocab=None): @@ -1507,7 +1825,10 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None): """ - return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab) + return self.most_similar( + positive=[word], + topn=topn, + restrict_vocab=restrict_vocab) def similar_by_vector(self, vector, topn=10, restrict_vocab=None): """ @@ -1527,7 +1848,10 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None): """ - return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab) + return self.most_similar( + positive=[vector], + topn=topn, + restrict_vocab=restrict_vocab) def doesnt_match(self, words): """ @@ -1541,17 +1865,18 @@ def doesnt_match(self, words): """ self.init_sims() - words = [word for word in words if word in self.vocab] # filter out OOV words + # filter out OOV words + words = [word for word in words if word in self.vocab] logger.debug("using words %s" % words) if not words: raise ValueError("cannot select a word from an empty list") - vectors = vstack(self.syn0norm[self.vocab[word].index] for word in words).astype(REAL) + vectors = vstack(self.syn0norm[self.vocab[word].index] + for word in words).astype(REAL) mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) dists = dot(vectors, mean) return sorted(zip(dists, words))[0][1] def __getitem__(self, words): - """ Accept a single word or a list of words as input. @@ -1574,7 +1899,8 @@ def __getitem__(self, words): """ if isinstance(words, string_types): - # allow calls like trained_model['office'], as a shorthand for trained_model[['office']] + # allow calls like trained_model['office'], as a shorthand for + # trained_model[['office']] return self.syn0[self.vocab[words].index] return vstack([self.syn0[self.vocab[word].index] for word in words]) @@ -1620,7 +1946,6 @@ def n_similarity(self, ws1, ws2): return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) - def init_sims(self, replace=False): """ Precompute L2-normalized vectors. 
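The hunk below reformats the row-wise L2 normalization performed by `init_sims()`. As a standalone numpy sketch of the same computation (made-up data, illustrative only):

    import numpy as np

    syn0 = np.random.rand(5, 4).astype(np.float32)   # stand-in for model.syn0
    syn0norm = syn0 / np.sqrt((syn0 ** 2).sum(-1))[..., np.newaxis]
    # each row of syn0norm now has (approximately) unit L2 norm
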
@@ -1641,7 +1966,8 @@ def init_sims(self, replace=False): if hasattr(self, 'syn1'): del self.syn1 else: - self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL) + self.syn0norm = ( + self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL) def estimate_memory(self, vocab_size=None, report=None): """Estimate required memory for a model using current settings and provided vocabulary size.""" @@ -1650,12 +1976,17 @@ def estimate_memory(self, vocab_size=None, report=None): report['vocab'] = vocab_size * (700 if self.hs else 500) report['syn0'] = vocab_size * self.vector_size * dtype(REAL).itemsize if self.hs: - report['syn1'] = vocab_size * self.layer1_size * dtype(REAL).itemsize + report['syn1'] = vocab_size * \ + self.layer1_size * dtype(REAL).itemsize if self.negative: - report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize + report['syn1neg'] = vocab_size * \ + self.layer1_size * dtype(REAL).itemsize report['total'] = sum(report.values()) - logger.info("estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total']) + logger.info( + "estimated required memory for %i words and %i dimensions: %i bytes", + vocab_size, + self.vector_size, + report['total']) return report @staticmethod @@ -1663,10 +1994,18 @@ def log_accuracy(section): correct, incorrect = len(section['correct']), len(section['incorrect']) if correct + incorrect > 0: logger.info("%s: %.1f%% (%i/%i)" % - (section['section'], 100.0 * correct / (correct + incorrect), - correct, correct + incorrect)) - - def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): + (section['section'], 100.0 * + correct / + (correct + + incorrect), correct, correct + + incorrect)) + + def accuracy( + self, + questions, + restrict_vocab=30000, + most_similar=most_similar, + case_insensitive=True): """ Compute accuracy of the model. `questions` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. @@ -1688,46 +2027,72 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c This method corresponds to the `compute-accuracy` script of the original C word2vec. 
""" - ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab) + ok_vocab = [(w, self.vocab[w]) + for w in self.index2word[:restrict_vocab]] + ok_vocab = dict((w.upper(), v) for w, v in reversed( + ok_vocab)) if case_insensitive else dict(ok_vocab) sections, section = [], None for line_no, line in enumerate(utils.smart_open(questions)): - # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed + # TODO: use level3 BLAS (=evaluate multiple questions at once), for + # speed line = utils.to_unicode(line) if line.startswith(': '): # a new section starts => store the old section if section: sections.append(section) self.log_accuracy(section) - section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} + section = { + 'section': line.lstrip(': ').strip(), + 'correct': [], + 'incorrect': []} else: if not section: - raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) + raise ValueError( + "missing section header before line #%i in %s" % + (line_no, questions)) try: if case_insensitive: - a, b, c, expected = [word.upper() for word in line.split()] + a, b, c, expected = [word.upper() + for word in line.split()] else: a, b, c, expected = [word for word in line.split()] except: - logger.info("skipping invalid line #%i in %s" % (line_no, questions)) + logger.info( + "skipping invalid line #%i in %s" % + (line_no, questions)) continue if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip())) + logger.debug( + "skipping line #%i with OOV words: %s" % + (line_no, line.strip())) continue original_vocab = self.vocab self.vocab = ok_vocab ignore = set([a, b, c]) # input words to be ignored predicted = None - # find the most likely prediction, ignoring OOV words and input words - sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) + # find the most likely prediction, ignoring OOV words and input + # words + sims = most_similar( + self, + positive=[ + b, + c], + negative=[a], + topn=False, + restrict_vocab=restrict_vocab) self.vocab = original_vocab for index in matutils.argsort(sims, reverse=True): - predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] + predicted = self.index2word[index].upper( + ) if case_insensitive else self.index2word[index] if predicted in ok_vocab and predicted not in ignore: if predicted != expected: - logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) + logger.debug( + "%s: expected %s, predicted %s", + line.strip(), + expected, + predicted) break if predicted == expected: section['correct'].append((a, b, c, expected)) @@ -1748,11 +2113,14 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c return sections def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.index2word), self.vector_size, self.alpha) + return "%s(vocab=%s, size=%s, alpha=%s)" % ( + self.__class__.__name__, len(self.index2word), self.vector_size, self.alpha) def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors, recalculable table - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table']) + # don't bother storing the cached normalized vectors, recalculable + # table + 
kwargs['ignore'] = kwargs.get( + 'ignore', ['syn0norm', 'table', 'cum_table']) super(Word2Vec, self).save(*args, **kwargs) save.__doc__ = utils.SaveLoad.save.__doc__ @@ -1785,6 +2153,7 @@ def load(cls, *args, **kwargs): class BrownCorpus(object): """Iterate over sentences from the Brown corpus (part of NLTK data).""" + def __init__(self, dirname): self.dirname = dirname @@ -1797,9 +2166,15 @@ def __iter__(self): line = utils.to_unicode(line) # each file line is a single sentence in the Brown corpus # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + token_tags = [t.split('/') + for t in line.split() if len(t.split('/')) == 2] + # ignore words with non-alphabetic tags like ",", "!" etc + # (punctuation, weird stuff) + words = [ + "%s/%s" % + (token.lower(), tag[ + :2]) for token, tag in token_tags if tag[ + :2].isalpha()] if not words: # don't bother sending out empty sentences continue yield words @@ -1807,26 +2182,37 @@ def __iter__(self): class Text8Corpus(object): """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .""" + def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH): self.fname = fname self.max_sentence_length = max_sentence_length def __iter__(self): # the entire corpus is one gigantic line -- there are no sentence marks at all - # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens + # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 + # tokens sentence, rest = [], b'' with utils.smart_open(self.fname) as fin: while True: - text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM + # avoid loading the entire file (=1 line) into RAM + text = rest + fin.read(8192) if text == rest: # EOF words = utils.to_unicode(text).split() - sentence.extend(words) # return the last chunk of words, too (may be shorter/longer) + # return the last chunk of words, too (may be + # shorter/longer) + sentence.extend(words) if sentence: yield sentence break - last_token = text.rfind(b' ') # last token may have been split in two... keep for next iteration - words, rest = (utils.to_unicode(text[:last_token]).split(), - text[last_token:].strip()) if last_token >= 0 else ([], text) + # last token may have been split in two... keep for next + # iteration + last_token = text.rfind(b' ') + words, rest = ( + utils.to_unicode( + text[ + :last_token]).split(), text[ + last_token:].strip()) if last_token >= 0 else ( + [], text) sentence.extend(words) while len(sentence) >= self.max_sentence_length: yield sentence[:self.max_sentence_length] @@ -1838,7 +2224,11 @@ class LineSentence(object): Simple format: one sentence = one line; words already preprocessed and separated by whitespace. """ - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): + def __init__( + self, + source, + max_sentence_length=MAX_WORDS_IN_BATCH, + limit=None): """ `source` can be either a string or a file object. Clip the file to the first `limit` lines (or no clipped if limit is None, the default). 
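# Illustrative sketch, not from the patch itself: LineSentence, whose
# constructor reformatting closes the hunk above, streams one
# whitespace-tokenised sentence per line straight from disk, so training never
# needs the whole corpus in RAM. The filenames below are hypothetical
# placeholders.
from gensim.models.word2vec import Word2Vec, LineSentence

sentences = LineSentence('corpus.txt', limit=10000)   # clip to the first 10k lines
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.save('corpus.w2v')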
@@ -1867,7 +2257,7 @@ def __iter__(self): line = utils.to_unicode(line).split() i = 0 while i < len(line): - yield line[i : i + self.max_sentence_length] + yield line[i: i + self.max_sentence_length] i += self.max_sentence_length except AttributeError: # If it didn't work like a file, use it as a string filename @@ -1876,11 +2266,12 @@ def __iter__(self): line = utils.to_unicode(line).split() i = 0 while i < len(line): - yield line[i : i + self.max_sentence_length] + yield line[i: i + self.max_sentence_length] i += self.max_sentence_length -# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 +# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window +# 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": import argparse logging.basicConfig( @@ -1895,24 +2286,81 @@ def __iter__(self): print(globals()['__doc__'] % locals()) sys.exit(1) - from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle + # avoid referencing __main__ in pickle + from gensim.models.word2vec import Word2Vec seterr(all='raise') # don't ignore numpy errors parser = argparse.ArgumentParser() - parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True) - parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") - parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) - parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) - parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) - parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") + parser.add_argument( + "-train", + help="Use text data from file TRAIN to train the model", + required=True) + parser.add_argument( + "-output", + help="Use file OUTPUT to save the resulting word vectors") + parser.add_argument( + "-window", + help="Set max skip length WINDOW between words; default is 5", + type=int, + default=5) + parser.add_argument( + "-size", + help="Set size of word vectors; default is 100", + type=int, + default=100) + parser.add_argument( + "-sample", + help="Set threshold for occurrence of words. 
Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", + type=float, + default=1e-3) + parser.add_argument( + "-hs", + help="Use Hierarchical Softmax; default is 0 (not used)", + type=int, + default=0, + choices=[ + 0, + 1]) + parser.add_argument( + "-negative", + help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", + type=int, + default=5) + parser.add_argument( + "-threads", + help="Use THREADS threads (default 12)", + type=int, + default=12) + parser.add_argument( + "-iter", + help="Run more training iterations (default 5)", + type=int, + default=5) + parser.add_argument( + "-min_count", + help="This will discard words that appear less than MIN_COUNT times; default is 5", + type=int, + default=5) + parser.add_argument( + "-cbow", + help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", + type=int, + default=1, + choices=[ + 0, + 1]) + parser.add_argument( + "-binary", + help="Save the resulting vectors in binary mode; default is 0 (off)", + type=int, + default=0, + choices=[ + 0, + 1]) + parser.add_argument( + "-accuracy", + help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 72dba1a741..469b0ad244 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -42,8 +42,22 @@ class DtmModel(utils.SaveLoad): """ def __init__( - self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100, id2word=None, prefix=None, - lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10, alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True): + self, + dtm_path, + corpus=None, + time_slices=None, + mode='fit', + model='dtm', + num_topics=100, + id2word=None, + prefix=None, + lda_sequence_min_iter=6, + lda_sequence_max_iter=20, + lda_max_em_iter=10, + alpha=0.01, + top_chain_var=0.005, + rng_seed=0, + initialize_lda=True): """ `dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`. 
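# Illustrative sketch, not from the patch itself: the DtmModel wrapper whose
# constructor is reformatted above drives the standalone DTM binary via
# subprocess. The binary path, toy corpus and time slices are hypothetical
# placeholders; time_slices must sum to the number of documents.
from gensim.corpora import Dictionary
from gensim.models.wrappers.dtmmodel import DtmModel

texts = [["economy", "bank"], ["bank", "credit"],
         ["river", "bank"], ["river", "water"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
time_slices = [2, 2]   # two documents in each of the two time slices

model = DtmModel('/path/to/dtm-binary', corpus=corpus, time_slices=time_slices,
                 num_topics=2, id2word=dictionary, initialize_lda=True)
print(model.show_topic(topicid=0, time=0, num_words=5))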
@@ -72,33 +86,40 @@ def __init__( """ if not os.path.isfile(dtm_path): - raise ValueError("dtm_path must point to the binary file, not to a folder") + raise ValueError( + "dtm_path must point to the binary file, not to a folder") self.dtm_path = dtm_path self.id2word = id2word if self.id2word is None: - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + logger.warning( + "no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) else: - self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys()) + self.num_terms = 0 if not self.id2word else 1 + \ + max(self.id2word.keys()) if self.num_terms == 0: - raise ValueError("cannot compute DTM over an empty collection (no terms)") + raise ValueError( + "cannot compute DTM over an empty collection (no terms)") self.num_topics = num_topics try: lencorpus = len(corpus) except: - logger.warning("input corpus stream has no len(); counting documents") + logger.warning( + "input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: raise ValueError("cannot compute DTM over an empty corpus") - if model == "fixed" and any([i == 0 for i in [len(text) for text in corpus.get_texts()]]): + if model == "fixed" and any( + [i == 0 for i in [len(text) for text in corpus.get_texts()]]): raise ValueError("""There is a text without words in the input corpus. This breaks method='fixed' (The DIM model).""") if lencorpus != sum(time_slices): - raise ValueError("mismatched timeslices %{slices} for corpus of len {clen}".format( - slices=sum(time_slices), clen=lencorpus)) + raise ValueError( + "mismatched timeslices %{slices} for corpus of len {clen}".format( + slices=sum(time_slices), clen=lencorpus)) self.lencorpus = lencorpus if prefix is None: rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_' @@ -134,7 +155,8 @@ def fout_gamma(self): return self.prefix + 'train_out/lda-seq/' + 'gam.dat' def fout_prob(self): - return self.prefix + 'train_out/lda-seq/' + 'topic-{i}-var-e-log-prob.dat' + return self.prefix + 'train_out/lda-seq/' + \ + 'topic-{i}-var-e-log-prob.dat' def fout_observations(self): return self.prefix + 'train_out/lda-seq/' + 'topic-{i}-var-obs.dat' @@ -217,17 +239,24 @@ def train(self, corpus, time_slices, mode, model): # normalize proportions self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis] - self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices))) - self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices))) + self.lambda_ = np.zeros( + (self.num_topics, self.num_terms * len(self.time_slices))) + self.obs_ = np.zeros( + (self.num_topics, self.num_terms * len(self.time_slices))) for t in range(self.num_topics): - topic = "%03d" % t - self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic)) - self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic)) + topic = "%03d" % t + self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic)) + self.obs_[t, :] = np.loadtxt( + self.fout_observations().format(i=topic)) # cast to correct shape, lambda[5,10,0] is the proportion of the 10th # topic in doc 5 at time 0 - self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices)) - self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices)) + self.lambda_.shape = ( + self.num_topics, self.num_terms, len( + self.time_slices)) + self.obs_.shape = ( + self.num_topics, 
self.num_terms, len( + self.time_slices)) # extract document influence on topics for each time slice # influences_time[0] , influences at time 0 if model == 'fixed': @@ -241,7 +270,13 @@ def train(self, corpus, time_slices, mode, model): def print_topics(self, num_topics=10, times=5, num_words=10): return self.show_topics(num_topics, times, num_words, log=True) - def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted=True): + def show_topics( + self, + num_topics=10, + times=5, + num_words=10, + log=False, + formatted=True): """ Print the `num_words` most probable words for `num_topics` number of topics at 'times' time slices. Set `topics=-1` to print all topics. @@ -255,7 +290,7 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted else: num_topics = min(num_topics, self.num_topics) chosen_topics = range(num_topics) - # add a little random jitter, to randomize results around the same + # add a little random jitter, to randomize results around the same # alpha # sort_alpha = self.alpha + 0.0001 * \ # numpy.random.rand(len(self.alpha)) @@ -302,7 +337,8 @@ def show_topic(self, topicid, time, num_words=50): def print_topic(self, topicid, time, num_words=10): """Return the given topic, formatted as a string.""" - return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, num_words)]) + return ' + '.join(['%.3f*%s' % + v for v in self.show_topic(topicid, time, num_words)]) def dtm_vis(self, corpus, time): """ @@ -310,7 +346,8 @@ def dtm_vis(self, corpus, time): all of these are needed to visualise topics for DTM for a particular time-slice via pyLDAvis. input parameter is the year to do the visualisation. """ - topic_term = np.exp(self.lambda_[:,:,time]) / np.exp(self.lambda_[:,:,time]).sum() + topic_term = np.exp( + self.lambda_[:, :, time]) / np.exp(self.lambda_[:, :, time]).sum() topic_term = topic_term * self.num_topics doc_topic = self.gamma_ @@ -324,19 +361,21 @@ def dtm_vis(self, corpus, time): vocab = [self.id2word[i] for i in range(0, len(self.id2word))] # returns numpy arrays for doc_topic proportions, topic_term proportions, and document_lengths, term_frequency. - # these should be passed to the `pyLDAvis.prepare` method to visualise one time-slice of DTM topics. + # these should be passed to the `pyLDAvis.prepare` method to visualise + # one time-slice of DTM topics. return doc_topic, topic_term, doc_lengths, term_frequency, vocab def dtm_coherence(self, time, num_words=20): """ - returns all topics of a particular time-slice without probabilitiy values for it to be used + returns all topics of a particular time-slice without probabilitiy values for it to be used for either "u_mass" or "c_v" coherence. TODO: because of print format right now can only return for 1st time-slice. - should we fix the coherence printing or make changes to the print statements to mirror DTM python? + should we fix the coherence printing or make changes to the print statements to mirror DTM python? 
""" coherence_topics = [] for topic_no in range(0, self.num_topics): - topic = self.show_topic(topicid=topic_no, time=time, num_words=num_words) + topic = self.show_topic( + topicid=topic_no, time=time, num_words=num_words) coherence_topic = [] for prob, word in topic: coherence_topic.append(word) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index cce75823ec..99f2ac9b19 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -55,8 +55,19 @@ class LdaMallet(utils.SaveLoad): takes place by passing around data files on disk and calling Java with subprocess.call(). """ - def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None, workers=4, prefix=None, - optimize_interval=0, iterations=1000, topic_threshold=0.0): + + def __init__( + self, + mallet_path, + corpus=None, + num_topics=100, + alpha=50, + id2word=None, + workers=4, + prefix=None, + optimize_interval=0, + iterations=1000, + topic_threshold=0.0): """ `mallet_path` is path to the mallet executable, e.g. `/home/kofola/mallet-2.0.7/bin/mallet`. @@ -78,15 +89,18 @@ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=N self.mallet_path = mallet_path self.id2word = id2word if self.id2word is None: - logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + logger.warning( + "no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) else: - self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys()) + self.num_terms = 0 if not self.id2word else 1 + \ + max(self.id2word.keys()) if self.num_terms == 0: - raise ValueError("cannot compute LDA over an empty collection (no terms)") + raise ValueError( + "cannot compute LDA over an empty collection (no terms)") self.num_topics = num_topics - self.topic_threshold=topic_threshold + self.topic_threshold = topic_threshold self.alpha = alpha if prefix is None: rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_' @@ -127,10 +141,15 @@ def corpus2mallet(self, corpus, file_like): """ for docno, doc in enumerate(corpus): if self.id2word: - tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), []) + tokens = sum(([self.id2word[tokenid]] * int(cnt) + for tokenid, cnt in doc), []) else: - tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), []) - file_like.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens)))) + tokens = sum(([str(tokenid)] * int(cnt) + for tokenid, cnt in doc), []) + file_like.write( + utils.to_utf8( + "%s 0 %s\n" % + (docno, ' '.join(tokens)))) def convert_input(self, corpus, infer=False, serialize_corpus=True): """ @@ -139,18 +158,23 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True): """ if serialize_corpus: - logger.info("serializing temporary corpus to %s", self.fcorpustxt()) + logger.info( + "serializing temporary corpus to %s", + self.fcorpustxt()) with smart_open(self.fcorpustxt(), 'wb') as fout: self.corpus2mallet(corpus, fout) # convert the text file above into MALLET's internal format - cmd = self.mallet_path + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s" + cmd = self.mallet_path + \ + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s" if infer: cmd += ' --use-pipe-from ' + self.fcorpusmallet() cmd = cmd % 
(self.fcorpustxt(), self.fcorpusmallet() + '.infer') else: cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet()) - logger.info("converting temporary corpus to MALLET format with %s", cmd) + logger.info( + "converting temporary corpus to MALLET format with %s", + cmd) check_output(cmd, shell=True) def train(self, corpus): @@ -158,15 +182,25 @@ def train(self, corpus): cmd = self.mallet_path + " train-topics --input %s --num-topics %s --alpha %s --optimize-interval %s "\ "--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s "\ "--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s" - cmd = cmd % ( - self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, self.workers, - self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer(), self.topic_threshold) - # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory + cmd = cmd % (self.fcorpusmallet(), + self.num_topics, + self.alpha, + self.optimize_interval, + self.workers, + self.fstate(), + self.fdoctopics(), + self.ftopickeys(), + self.iterations, + self.finferencer(), + self.topic_threshold) + # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + + # runs out of memory logger.info("training MALLET LDA with %s", cmd) check_output(cmd, shell=True) self.word_topics = self.load_word_topics() - # NOTE - we are still keeping the wordtopics variable to not break backward compatibility. - # word_topics has replaced wordtopics throughout the code; wordtopics just stores the values of word_topics when train is called. + # NOTE - we are still keeping the wordtopics variable to not break backward compatibility. + # word_topics has replaced wordtopics throughout the code; wordtopics + # just stores the values of word_topics when train is called. self.wordtopics = self.word_topics def __getitem__(self, bow, iterations=100): @@ -176,8 +210,13 @@ def __getitem__(self, bow, iterations=100): bow = [bow] self.convert_input(bow, infer=True) - cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s" - cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations, self.topic_threshold) + cmd = self.mallet_path + \ + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s" + cmd = cmd % (self.fcorpusmallet() + '.infer', + self.finferencer(), + self.fdoctopics() + '.infer', + iterations, + self.topic_threshold) logger.info("inferring topics with MALLET LDA '%s'", cmd) check_output(cmd, shell=True) result = list(self.read_doctopics(self.fdoctopics() + '.infer')) @@ -185,7 +224,8 @@ def __getitem__(self, bow, iterations=100): def load_word_topics(self): logger.info("loading assigned topics from %s", self.fstate()) - word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32) + word_topics = numpy.zeros( + (self.num_topics, self.num_terms), dtype=numpy.float32) if hasattr(self.id2word, 'token2id'): word2id = self.id2word.token2id else: @@ -193,8 +233,10 @@ def load_word_topics(self): with utils.smart_open(self.fstate()) as fin: _ = next(fin) # header - self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]]) - assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. 
requested topics" + self.alpha = numpy.array([float(val) + for val in next(fin).split()[2:]]) + assert len( + self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics" _ = next(fin) # beta for lineno, line in enumerate(fin): line = utils.to_unicode(line) @@ -215,7 +257,12 @@ def load_document_topics(self): """ return self.read_doctopics(self.fdoctopics()) - def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): + def show_topics( + self, + num_topics=10, + num_words=10, + log=False, + formatted=True): """ Print the `num_words` most probable words for `num_topics` number of topics. Set `num_topics=-1` to print all topics. @@ -228,9 +275,13 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): chosen_topics = range(num_topics) else: num_topics = min(num_topics, self.num_topics) - sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha)) # add a little random jitter, to randomize results around the same alpha + # add a little random jitter, to randomize results around the same + # alpha + sort_alpha = self.alpha + 0.0001 * \ + numpy.random.rand(len(self.alpha)) sorted_topics = list(matutils.argsort(sort_alpha)) - chosen_topics = sorted_topics[ : num_topics//2] + sorted_topics[-num_topics//2 : ] + chosen_topics = sorted_topics[ + : num_topics // 2] + sorted_topics[-num_topics // 2:] shown = [] for i in chosen_topics: if formatted: @@ -252,8 +303,8 @@ def show_topic(self, topicid, num_words=10): return beststr def print_topic(self, topicid, num_words=10): - return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, num_words)]) - + return ' + '.join(['%.3f*%s' % + v for v in self.show_topic(topicid, num_words)]) def get_version(self, direc_path): """" @@ -266,12 +317,12 @@ def get_version(self, direc_path): Check version of mallet via jar file """ archive = zipfile.ZipFile(direc_path, 'r') - if u'cc/mallet/regression/' not in archive.namelist(): + if u'cc/mallet/regression/' not in archive.namelist(): return '2.0.7' else: return '2.0.8RC3' except Exception: - + xml_path = direc_path.split("bin")[0] try: doc = et.parse(xml_path + "pom.xml").getroot() @@ -279,8 +330,6 @@ def get_version(self, direc_path): return doc.find(namespace + 'version').text.split("-")[0] except Exception: return "Can't parse pom.xml version file" - - def read_doctopics(self, fname, eps=1e-6, renorm=True): """ @@ -310,7 +359,7 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True): if mallet_version == "2.0.7": """ - 1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364 1 0.005575655428533364 + 1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364 1 0.005575655428533364 2 2 0 0.9184413079632608 40.009062076892971008 3 0.009062076892971008 2 0.009062076892971008 1 0.009062076892971008 In the above example there is a mix of the above if and elif statement. There are neither `2*num_topics` nor `num_topics` elements. It has 2 formats 40.009062076892971008 and 0 1.0780612802674239 which cannot be handled by above if elif. 
@@ -322,32 +371,42 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True): doc = [] if len(parts) > 0: while count < len(parts): - """ + """ if section is to deal with formats of type 2 0.034 so if count reaches index of 2 and since int(2) == float(2) so if block is executed now there is one extra element afer 2, so count + 1 access should not give an error else section handles formats of type 20.034 now count is there on index of 20.034 since float(20.034) != int(20.034) so else block - is executed + is executed """ if float(parts[count]) == int(parts[count]): if float(parts[count + 1]) > eps: - doc.append((int(parts[count]), float(parts[count + 1]))) + doc.append( + (int( + parts[count]), float( + parts[ + count + 1]))) count += 2 else: - if float(parts[count]) - int(parts[count]) > eps: - doc.append((int(parts[count]) % 10, float(parts[count]) - int(parts[count]))) + if float(parts[count]) - \ + int(parts[count]) > eps: + doc.append((int(parts[count]) % 10, float( + parts[count]) - int(parts[count]))) count += 1 else: - raise RuntimeError("invalid doc topics format at line %i in %s" % (lineno + 1, fname)) + raise RuntimeError( + "invalid doc topics format at line %i in %s" % + (lineno + 1, fname)) if renorm: - # explicitly normalize weights to sum up to 1.0, just to be sure... + # explicitly normalize weights to sum up to 1.0, just to be + # sure... total_weight = float(sum([weight for _, weight in doc])) if total_weight: - doc = [(id_, float(weight) / total_weight) for id_, weight in doc] + doc = [(id_, float(weight) / total_weight) + for id_, weight in doc] yield doc diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py index 7ae040293c..8717318b84 100644 --- a/gensim/models/wrappers/ldavowpalwabbit.py +++ b/gensim/models/wrappers/ldavowpalwabbit.py @@ -76,6 +76,7 @@ class LdaVowpalWabbit(utils.SaveLoad): between Vowpal Wabbit and Python takes place by passing around data files on disk and calling the 'vw' binary with the subprocess module. 
""" + def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None, chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5, offset=1, gamma_threshold=0.001, random_seed=None, @@ -303,7 +304,7 @@ def load(cls, fname, *args, **kwargs): LOG.debug("Writing model bytes to '%s'", lda_vw._model_filename) with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle: fhandle.write(lda_vw._model_data) - lda_vw._model_data = None # no need to keep in memory after this + lda_vw._model_data = None # no need to keep in memory after this if lda_vw._topics_data: LOG.debug("Writing topic bytes to '%s'", lda_vw._topics_filename) @@ -327,11 +328,11 @@ def _init_temp_dir(self, prefix='tmp'): def _get_vw_predict_command(self, corpus_size): """Get list of command line arguments for running prediction.""" cmd = [self.vw_path, - '--testonly', # don't update model with this data + '--testonly', # don't update model with this data '--lda_D', str(corpus_size), - '-i', self._model_filename, # load existing binary model + '-i', self._model_filename, # load existing binary model '-d', self._corpus_filename, - '--learning_rate', '0', # possibly not needed, but harmless + '--learning_rate', '0', # possibly not needed, but harmless '-p', self._predict_filename] if self.random_seed is not None: @@ -355,7 +356,7 @@ def _get_vw_train_command(self, corpus_size, update=False): '--cache_file', self._cache_filename, '--lda_epsilon', str(self.gamma_threshold), '--readable_model', self._topics_filename, - '-k', # clear cache + '-k', # clear cache '-f', self._model_filename] if update: @@ -486,7 +487,8 @@ def _predict_filename(self): def __str__(self): fields = ['num_terms', 'num_topics', 'chunksize', 'alpha', 'eta'] - kv = ["{0}={1}".format(field, getattr(self, field)) for field in fields] + kv = ["{0}={1}".format(field, getattr(self, field)) + for field in fields] return "{0}({1})".format(self.__class__.__name__, ', '.join(kv)) @@ -561,6 +563,7 @@ def _bit_length(num): """Return number of bits needed to encode given number.""" return len(bin(num).lstrip('-0b')) + def vwmodel2ldamodel(vw_model, iterations=50): """ Function to convert vowpal wabbit model to gensim LdaModel. This works by @@ -577,8 +580,15 @@ def vwmodel2ldamodel(vw_model, iterations=50): model_gensim : LdaModel instance; copied gensim LdaModel. 
""" model_gensim = LdaModel( - num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize, - passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay, - offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold) + num_topics=vw_model.num_topics, + id2word=vw_model.id2word, + chunksize=vw_model.chunksize, + passes=vw_model.passes, + alpha=vw_model.alpha, + eta=vw_model.eta, + decay=vw_model.decay, + offset=vw_model.offset, + iterations=iterations, + gamma_threshold=vw_model.gamma_threshold) model_gensim.expElogbeta[:] = vw_model._get_topics() return model_gensim diff --git a/gensim/nosy.py b/gensim/nosy.py index 3536965b68..b8621bf996 100644 --- a/gensim/nosy.py +++ b/gensim/nosy.py @@ -24,7 +24,7 @@ EXTENSIONS = ['*.py'] EXECUTABLE = 'nosetests test/' -DEFAULTARGS = '--with-color -exe'# -w tests' +DEFAULTARGS = '--with-color -exe' # -w tests' def checkSum(): @@ -46,7 +46,7 @@ def checkSum(): if checkSum() != val: val = checkSum() os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS, - ' '.join(sys.argv[1:]))) + ' '.join(sys.argv[1:]))) print(datetime.datetime.now().__str__()) print('=' * 77) time.sleep(1) diff --git a/gensim/parsing/porter.py b/gensim/parsing/porter.py index 539271f58c..e60367a93d 100644 --- a/gensim/parsing/porter.py +++ b/gensim/parsing/porter.py @@ -36,6 +36,7 @@ class PorterStemmer(object): + def __init__(self): """The main part of the stemming algorithm starts here. b is a buffer holding a word to be stemmed. The letters are in b[0], @@ -89,7 +90,7 @@ def _m(self): i += 1 i += 1 n += 1 - while 1: + while True: if i > self.j: return n if not self._cons(i): @@ -103,7 +104,7 @@ def _vowelinstem(self): def _doublec(self, j): """True <=> j,(j-1) contain a double consonant.""" - return j > 0 and self.b[j] == self.b[j-1] and self._cons(j) + return j > 0 and self.b[j] == self.b[j - 1] and self._cons(j) def _cvc(self, i): """True <=> i-2,i-1,i has the form consonant - vowel - consonant @@ -113,25 +114,26 @@ def _cvc(self, i): cav(e), lov(e), hop(e), crim(e), but snow, box, tray. 
""" - if i < 2 or not self._cons(i) or self._cons(i-1) or not self._cons(i-2): + if i < 2 or not self._cons(i) or self._cons( + i - 1) or not self._cons(i - 2): return False return self.b[i] not in "wxy" def _ends(self, s): """True <=> 0,...k ends with the string s.""" - if s[-1] != self.b[self.k]: # tiny speed-up + if s[-1] != self.b[self.k]: # tiny speed-up return 0 length = len(s) if length > (self.k + 1): return 0 - if self.b[self.k-length+1:self.k+1] != s: + if self.b[self.k - length + 1:self.k + 1] != s: return 0 self.j = self.k - length return 1 def _setto(self, s): """Set (j+1),...k to the characters in the string s, adjusting k.""" - self.b = self.b[:self.j+1] + s + self.b = self.b[:self.j + 1] + s self.k = len(self.b) - 1 def _r(self, s): @@ -171,9 +173,12 @@ def _step1ab(self): self.k -= 1 elif (self._ends("ed") or self._ends("ing")) and self._vowelinstem(): self.k = self.j - if self._ends("at"): self._setto("ate") - elif self._ends("bl"): self._setto("ble") - elif self._ends("iz"): self._setto("ize") + if self._ends("at"): + self._setto("ate") + elif self._ends("bl"): + self._setto("ble") + elif self._ends("iz"): + self._setto("ize") elif self._doublec(self.k): if self.b[self.k - 1] not in "lsz": self.k -= 1 @@ -193,87 +198,133 @@ def _step2(self): """ ch = self.b[self.k - 1] if ch == 'a': - if self._ends("ational"): self._r("ate") - elif self._ends("tional"): self._r("tion") + if self._ends("ational"): + self._r("ate") + elif self._ends("tional"): + self._r("tion") elif ch == 'c': - if self._ends("enci"): self._r("ence") - elif self._ends("anci"): self._r("ance") + if self._ends("enci"): + self._r("ence") + elif self._ends("anci"): + self._r("ance") elif ch == 'e': - if self._ends("izer"): self._r("ize") + if self._ends("izer"): + self._r("ize") elif ch == 'l': - if self._ends("bli"): self._r("ble") # --DEPARTURE-- + if self._ends("bli"): + self._r("ble") # --DEPARTURE-- # To match the published algorithm, replace this phrase with # if self._ends("abli"): self._r("able") - elif self._ends("alli"): self._r("al") - elif self._ends("entli"): self._r("ent") - elif self._ends("eli"): self._r("e") - elif self._ends("ousli"): self._r("ous") + elif self._ends("alli"): + self._r("al") + elif self._ends("entli"): + self._r("ent") + elif self._ends("eli"): + self._r("e") + elif self._ends("ousli"): + self._r("ous") elif ch == 'o': - if self._ends("ization"): self._r("ize") - elif self._ends("ation"): self._r("ate") - elif self._ends("ator"): self._r("ate") + if self._ends("ization"): + self._r("ize") + elif self._ends("ation"): + self._r("ate") + elif self._ends("ator"): + self._r("ate") elif ch == 's': - if self._ends("alism"): self._r("al") - elif self._ends("iveness"): self._r("ive") - elif self._ends("fulness"): self._r("ful") - elif self._ends("ousness"): self._r("ous") + if self._ends("alism"): + self._r("al") + elif self._ends("iveness"): + self._r("ive") + elif self._ends("fulness"): + self._r("ful") + elif self._ends("ousness"): + self._r("ous") elif ch == 't': - if self._ends("aliti"): self._r("al") - elif self._ends("iviti"): self._r("ive") - elif self._ends("biliti"): self._r("ble") - elif ch == 'g': # --DEPARTURE-- - if self._ends("logi"): self._r("log") + if self._ends("aliti"): + self._r("al") + elif self._ends("iviti"): + self._r("ive") + elif self._ends("biliti"): + self._r("ble") + elif ch == 'g': # --DEPARTURE-- + if self._ends("logi"): + self._r("log") # To match the published algorithm, delete this phrase def _step3(self): """Deal with -ic-, -full, -ness etc. 
Similar strategy to _step2.""" ch = self.b[self.k] if ch == 'e': - if self._ends("icate"): self._r("ic") - elif self._ends("ative"): self._r("") - elif self._ends("alize"): self._r("al") + if self._ends("icate"): + self._r("ic") + elif self._ends("ative"): + self._r("") + elif self._ends("alize"): + self._r("al") elif ch == 'i': - if self._ends("iciti"): self._r("ic") + if self._ends("iciti"): + self._r("ic") elif ch == 'l': - if self._ends("ical"): self._r("ic") - elif self._ends("ful"): self._r("") + if self._ends("ical"): + self._r("ic") + elif self._ends("ful"): + self._r("") elif ch == 's': - if self._ends("ness"): self._r("") + if self._ends("ness"): + self._r("") def _step4(self): """_step4() takes off -ant, -ence etc., in context vcvc.""" ch = self.b[self.k - 1] if ch == 'a': - if not self._ends("al"): return + if not self._ends("al"): + return elif ch == 'c': - if not self._ends("ance") and not self._ends("ence"): return + if not self._ends("ance") and not self._ends("ence"): + return elif ch == 'e': - if not self._ends("er"): return + if not self._ends("er"): + return elif ch == 'i': - if not self._ends("ic"): return + if not self._ends("ic"): + return elif ch == 'l': - if not self._ends("able") and not self._ends("ible"): return + if not self._ends("able") and not self._ends("ible"): + return elif ch == 'n': - if self._ends("ant"): pass - elif self._ends("ement"): pass - elif self._ends("ment"): pass - elif self._ends("ent"): pass - else: return + if self._ends("ant"): + pass + elif self._ends("ement"): + pass + elif self._ends("ment"): + pass + elif self._ends("ent"): + pass + else: + return elif ch == 'o': - if self._ends("ion") and self.b[self.j] in "st": pass - elif self._ends("ou"): pass + if self._ends("ion") and self.b[self.j] in "st": + pass + elif self._ends("ou"): + pass # takes care of -ous - else: return + else: + return elif ch == 's': - if not self._ends("ism"): return + if not self._ends("ism"): + return elif ch == 't': - if not self._ends("ate") and not self._ends("iti"): return + if not self._ends("ate") and not self._ends("iti"): + return elif ch == 'u': - if not self._ends("ous"): return + if not self._ends("ous"): + return elif ch == 'v': - if not self._ends("ive"): return + if not self._ends("ive"): + return elif ch == 'z': - if not self._ends("ize"): return + if not self._ends("ize"): + return else: return if self._m() > 1: @@ -295,7 +346,7 @@ def stem(self, w): w = w.lower() k = len(w) - 1 if k <= 1: - return w # --DEPARTURE-- + return w # --DEPARTURE-- # With this line, strings of length 1 or 2 don't go through the # stemming process, although no mention is made of this in the @@ -311,7 +362,7 @@ def stem(self, w): self._step3() self._step4() self._step5() - return self.b[:self.k+1] + return self.b[:self.k + 1] def stem_sentence(self, txt): return " ".join(map(self.stem, txt.split())) diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index 367f0b02ad..7ca951b9fb 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -45,6 +45,8 @@ def remove_stopwords(s): RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE) + + def strip_punctuation(s): s = utils.to_unicode(s) return RE_PUNCT.sub(" ", s) @@ -58,9 +60,11 @@ def strip_punctuation(s): RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE) + + def strip_tags(s): s = utils.to_unicode(s) - return RE_TAGS.sub("",s) + return RE_TAGS.sub("", s) def strip_short(s, minsize=3): @@ -69,18 +73,24 @@ def strip_short(s, minsize=3): RE_NUMERIC = 
re.compile(r"[0-9]+", re.UNICODE) + + def strip_numeric(s): s = utils.to_unicode(s) return RE_NUMERIC.sub("", s) RE_NONALPHA = re.compile(r"\W", re.UNICODE) + + def strip_non_alphanum(s): s = utils.to_unicode(s) return RE_NONALPHA.sub(" ", s) RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE) + + def strip_multiple_whitespaces(s): s = utils.to_unicode(s) return RE_WHITESPACE.sub(" ", s) @@ -88,6 +98,8 @@ def strip_multiple_whitespaces(s): RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE) RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE) + + def split_alphanum(s): s = utils.to_unicode(s) s = RE_AL_NUM.sub(r"\1 \2", s) @@ -103,8 +115,15 @@ def stem_text(text): return ' '.join(p.stem(word) for word in text.split()) stem = stem_text -DEFAULT_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, - strip_numeric, remove_stopwords, strip_short, stem_text] +DEFAULT_FILTERS = [ + lambda x: x.lower(), + strip_tags, + strip_punctuation, + strip_multiple_whitespaces, + strip_numeric, + remove_stopwords, + strip_short, + stem_text] def preprocess_string(s, filters=DEFAULT_FILTERS): diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py index 4d8d219447..908dd66a8a 100644 --- a/gensim/scripts/glove2word2vec.py +++ b/gensim/scripts/glove2word2vec.py @@ -40,7 +40,11 @@ def get_glove_info(glove_file_name): def glove2word2vec(glove_input_file, word2vec_output_file): """Convert `glove_input_file` in GloVe format into `word2vec_output_file in word2vec format.""" num_lines, num_dims = get_glove_info(glove_input_file) - logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file) + logger.info( + "converting %i vectors from %s to %s", + num_lines, + glove_input_file, + word2vec_output_file) if sys.version_info < (3,): with smart_open(word2vec_output_file, 'wb') as fout: fout.write("%s %s\n" % (num_lines, num_dims)) @@ -58,7 +62,9 @@ def glove2word2vec(glove_input_file, word2vec_output_file): if __name__ == "__main__": - logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.basicConfig( + format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', + level=logging.INFO) logging.root.setLevel(level=logging.INFO) logger.info("running %s", ' '.join(sys.argv)) @@ -79,28 +85,47 @@ def glove2word2vec(glove_input_file, word2vec_output_file): # do the actual conversion num_lines, num_dims = glove2word2vec(args.input, args.output) - logger.info('Converted model with %i vectors and %i dimensions', num_lines, num_dims) + logger.info( + 'Converted model with %i vectors and %i dimensions', + num_lines, + num_dims) # test that the converted model loads successfully - model = gensim.models.Word2Vec.load_word2vec_format(args.output, binary=False) + model = gensim.models.Word2Vec.load_word2vec_format( + args.output, binary=False) logger.info('Model %s successfully loaded', model) try: logger.info('testing the model....') if sys.version_info < (3,): with smart_open(args.output, 'rb') as f: - seed_word1, seed_word2 = random.sample([line.split()[0] for line in f], 2) + seed_word1, seed_word2 = random.sample( + [line.split()[0] for line in f], 2) else: with smart_open(args.output, 'r') as f: - seed_word1, seed_word2 = random.sample([line.split()[0] for line in f], 2) - logger.info('top-10 most similar words to "%s" are: %s', seed_word1, model.most_similar(positive=[seed_word1], topn=10)) - logger.info('similarity score between %s and %s: 
%s', seed_word1, seed_word2, model.similarity(seed_word1, seed_word2)) + seed_word1, seed_word2 = random.sample( + [line.split()[0] for line in f], 2) + logger.info( + 'top-10 most similar words to "%s" are: %s', + seed_word1, + model.most_similar( + positive=[seed_word1], + topn=10)) + logger.info( + 'similarity score between %s and %s: %s', + seed_word1, + seed_word2, + model.similarity( + seed_word1, + seed_word2)) except: - logger.error('error encountered. checking for model file creation now....') + logger.error( + 'error encountered. checking for model file creation now....') if os.path.isfile(os.path.join(args.output)): - logger.info('model file %s was created but could not be loaded.', args.output) + logger.info( + 'model file %s was created but could not be loaded.', + args.output) else: logger.info('model file %s creation failed. ') logger.info('please check the parameters and input file format.') raise logger.info("finished running %s", program) - diff --git a/gensim/scripts/make_wiki_online_nodebug.py b/gensim/scripts/make_wiki_online_nodebug.py index 26ca5d83ff..2d8e2bccbe 100755 --- a/gensim/scripts/make_wiki_online_nodebug.py +++ b/gensim/scripts/make_wiki_online_nodebug.py @@ -64,7 +64,8 @@ inp, outp = sys.argv[1:3] if not os.path.isdir(os.path.dirname(outp)): - raise SystemExit("Error: The output directory does not exist. Create the directory and try again.") + raise SystemExit( + "Error: The output directory does not exist. Create the directory and try again.") if len(sys.argv) > 3: keep_words = int(sys.argv[3]) @@ -76,23 +77,34 @@ if online: dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies + dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) - # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` - dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) + # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) + # with HashDictionary, the token->id mapping is only fully instantiated + # now, after `serialize` + dictionary.filter_extremes( + no_below=20, + no_above=0.1, + keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') wiki.save(outp + '_corpus.pkl.bz2') dictionary.allow_update = False else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + wiki = WikiCorpus(inp, lemmatize=lemmatize) # only keep the most frequent words (out of total ~8.2m unique tokens) - wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) + wiki.dictionary.filter_extremes( + no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h + MmCorpus.serialize( + outp + '_bow.mm', + wiki, + progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') # load back the id->word mapping directly from file - # this seems to save more memory, compared to keeping the wiki.dictionary object from above + # this 
seems to save more memory, compared to keeping the + # wiki.dictionary object from above dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') del wiki diff --git a/gensim/scripts/make_wikicorpus.py b/gensim/scripts/make_wikicorpus.py index 26ca5d83ff..2d8e2bccbe 100755 --- a/gensim/scripts/make_wikicorpus.py +++ b/gensim/scripts/make_wikicorpus.py @@ -64,7 +64,8 @@ inp, outp = sys.argv[1:3] if not os.path.isdir(os.path.dirname(outp)): - raise SystemExit("Error: The output directory does not exist. Create the directory and try again.") + raise SystemExit( + "Error: The output directory does not exist. Create the directory and try again.") if len(sys.argv) > 3: keep_words = int(sys.argv[3]) @@ -76,23 +77,34 @@ if online: dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies + dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) - # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` - dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) + # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) + # with HashDictionary, the token->id mapping is only fully instantiated + # now, after `serialize` + dictionary.filter_extremes( + no_below=20, + no_above=0.1, + keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') wiki.save(outp + '_corpus.pkl.bz2') dictionary.allow_update = False else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + wiki = WikiCorpus(inp, lemmatize=lemmatize) # only keep the most frequent words (out of total ~8.2m unique tokens) - wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) + wiki.dictionary.filter_extremes( + no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h + MmCorpus.serialize( + outp + '_bow.mm', + wiki, + progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') # load back the id->word mapping directly from file - # this seems to save more memory, compared to keeping the wiki.dictionary object from above + # this seems to save more memory, compared to keeping the + # wiki.dictionary object from above dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') del wiki diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py index c36d611ef7..853421192c 100755 --- a/gensim/scripts/word2vec_standalone.py +++ b/gensim/scripts/word2vec_standalone.py @@ -57,7 +57,8 @@ logger = logging.getLogger(__name__) -from gensim.models.word2vec import Word2Vec, LineSentence # avoid referencing __main__ in pickle +# avoid referencing __main__ in pickle +from gensim.models.word2vec import Word2Vec, LineSentence if __name__ == "__main__": @@ -75,21 +76,80 @@ seterr(all='raise') # don't ignore numpy errors parser = argparse.ArgumentParser() - parser.add_argument("-train", help="Use text data from file TRAIN to train the 
model", required=True) - parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") - parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) - parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; " - "default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) - parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3) - parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", type=float) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") + parser.add_argument( + "-train", + help="Use text data from file TRAIN to train the model", + required=True) + parser.add_argument( + "-output", + help="Use file OUTPUT to save the resulting word vectors") + parser.add_argument( + "-window", + help="Set max skip length WINDOW between words; default is 5", + type=int, + default=5) + parser.add_argument( + "-size", + help="Set size of word vectors; default is 100", + type=int, + default=100) + parser.add_argument( + "-sample", + help="Set threshold for occurrence of words. 
Those that appear with higher frequency in the training data will be randomly down-sampled; " + "default is 1e-3, useful range is (0, 1e-5)", + type=float, + default=1e-3) + parser.add_argument( + "-hs", + help="Use Hierarchical Softmax; default is 0 (not used)", + type=int, + default=0, + choices=[ + 0, + 1]) + parser.add_argument( + "-negative", + help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", + type=int, + default=5) + parser.add_argument( + "-threads", + help="Use THREADS threads (default 3)", + type=int, + default=3) + parser.add_argument( + "-iter", + help="Run more training iterations (default 5)", + type=int, + default=5) + parser.add_argument( + "-min_count", + help="This will discard words that appear less than MIN_COUNT times; default is 5", + type=int, + default=5) + parser.add_argument( + "-alpha", + help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", + type=float) + parser.add_argument( + "-cbow", + help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", + type=int, + default=1, + choices=[ + 0, + 1]) + parser.add_argument( + "-binary", + help="Save the resulting vectors in binary mode; default is 0 (off)", + type=int, + default=0, + choices=[ + 0, + 1]) + parser.add_argument( + "-accuracy", + help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index 8a6b119ff0..f5fff0d0ae 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -68,7 +68,8 @@ try: import multiprocessing # by default, don't parallelize queries. uncomment the following line if you want that. -# PARALLEL_SHARDS = multiprocessing.cpu_count() # use #parallel processes = #CPus +# PARALLEL_SHARDS = multiprocessing.cpu_count() # use #parallel processes +# = #CPus except ImportError: pass @@ -82,6 +83,7 @@ class Shard(utils.SaveLoad): request (query). """ + def __init__(self, fname, index): self.dirname, self.fname = os.path.split(fname) self.length = len(index) @@ -104,7 +106,8 @@ def __getstate__(self): return result def __str__(self): - return ("%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname())) + return ("%s Shard(%i documents in %s)" % + (self.cls.__name__, len(self), self.fullname())) def get_index(self): if not hasattr(self, 'index'): @@ -127,15 +130,21 @@ def __getitem__(self, query): index.num_best = self.num_best index.normalize = self.normalize except: - raise ValueError("num_best and normalize have to be set before querying a proxy Shard object") + raise ValueError( + "num_best and normalize have to be set before querying a proxy Shard object") return index[query] def query_shard(args): - query, shard = args # simulate starmap (not part of multiprocessing in older Pythons) - logger.debug("querying shard %s num_best=%s in process %s", shard, shard.num_best, os.getpid()) + # simulate starmap (not part of multiprocessing in older Pythons) + query, shard = args + logger.debug("querying shard %s num_best=%s in process %s", + shard, shard.num_best, os.getpid()) result = shard[query] - logger.debug("finished querying shard %s in process %s", shard, os.getpid()) + logger.debug( + "finished querying shard %s in process %s", + shard, + os.getpid()) return result @@ -149,7 +158,16 @@ class Similarity(interfaces.SimilarityABC): The shards themselves are simply stored as files to disk and mmap'ed back as needed. 
""" - def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'): + + def __init__( + self, + output_prefix, + corpus, + num_features, + num_best=None, + chunksize=256, + shardsize=32768, + norm='l2'): """ Construct the index from `corpus`. The index can be later extended by calling the `add_documents` method. **Note**: documents are split (internally, transparently) @@ -187,7 +205,8 @@ def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize """ if output_prefix is None: - # undocumented feature: set output_prefix=None to create the server in temp + # undocumented feature: set output_prefix=None to create the server + # in temp self.output_prefix = utils.randfname(prefix='simserver') else: self.output_prefix = output_prefix @@ -204,7 +223,8 @@ def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize self.add_documents(corpus) def __len__(self): - return len(self.fresh_docs) + sum([len(shard) for shard in self.shards]) + return len(self.fresh_docs) + \ + sum([len(shard) for shard in self.shards]) def __str__(self): return ("Similarity index with %i documents in %i shards (stored under %s)" % @@ -219,7 +239,8 @@ def add_documents(self, corpus): """ min_ratio = 1.0 # 0.5 to only reopen shards that are <50% complete if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize: - # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard + # The last shard was incomplete (<; load it back and add the + # documents there, don't start a new shard self.reopen_shard() for doc in corpus: if isinstance(doc, numpy.ndarray): @@ -229,15 +250,20 @@ def add_documents(self, corpus): else: doclen = len(doc) if doclen < 0.3 * self.num_features: - doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm) + doc = matutils.unitvec(matutils.corpus2csc( + [doc], self.num_features).T, self.norm) else: - doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm) + doc = matutils.unitvec( + matutils.sparse2full( + doc, self.num_features), self.norm) self.fresh_docs.append(doc) self.fresh_nnz += doclen if len(self.fresh_docs) >= self.shardsize: self.close_shard() if len(self.fresh_docs) % 10000 == 0: - logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs)) + logger.info( + "PROGRESS: fresh_shard size=%i", len( + self.fresh_docs)) def shardid2filename(self, shardid): if self.output_prefix.endswith('.'): @@ -258,13 +284,19 @@ def close_shard(self): return shardid = len(self.shards) # consider the shard sparse if its density is < 30% - issparse = 0.3 > 1.0 * self.fresh_nnz / (len(self.fresh_docs) * self.num_features) + issparse = 0.3 > 1.0 * self.fresh_nnz / \ + (len(self.fresh_docs) * self.num_features) if issparse: - index = SparseMatrixSimilarity(self.fresh_docs, num_terms=self.num_features, - num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz) + index = SparseMatrixSimilarity( + self.fresh_docs, num_terms=self.num_features, num_docs=len( + self.fresh_docs), num_nnz=self.fresh_nnz) else: - index = MatrixSimilarity(self.fresh_docs, num_features=self.num_features) - logger.info("creating %s shard #%s", 'sparse' if issparse else 'dense', shardid) + index = MatrixSimilarity( + self.fresh_docs, num_features=self.num_features) + logger.info( + "creating %s shard #%s", + 'sparse' if issparse else 'dense', + shardid) shard = Shard(self.shardid2filename(shardid), index) shard.num_best = self.num_best shard.num_nnz 
= self.fresh_nnz @@ -274,14 +306,18 @@ def close_shard(self): def reopen_shard(self): assert self.shards if self.fresh_docs: - raise ValueError("cannot reopen a shard with fresh documents in index") + raise ValueError( + "cannot reopen a shard with fresh documents in index") last_shard = self.shards[-1] last_index = last_shard.get_index() - logger.info("reopening an incomplete shard of %i documents", len(last_shard)) + logger.info( + "reopening an incomplete shard of %i documents", + len(last_shard)) self.fresh_docs = list(last_index.index) self.fresh_nnz = last_shard.num_nnz - del self.shards[-1] # remove the shard from index, *but its file on disk is not deleted* + # remove the shard from index, *but its file on disk is not deleted* + del self.shards[-1] logger.debug("reopen complete") def query_shards(self, query): @@ -296,7 +332,12 @@ def query_shards(self, query): if PARALLEL_SHARDS and PARALLEL_SHARDS > 1: logger.debug("spawning %i query processes", PARALLEL_SHARDS) pool = multiprocessing.Pool(PARALLEL_SHARDS) - result = pool.imap(query_shard, args, chunksize=1 + len(args) / PARALLEL_SHARDS) + result = pool.imap( + query_shard, + args, + chunksize=1 + + len(args) / + PARALLEL_SHARDS) else: # serial processing, one shard after another pool = None @@ -314,7 +355,8 @@ def __getitem__(self, query): """ self.close_shard() # no-op if no documents added to index since last query - # reset num_best and normalize parameters, in case they were changed dynamically + # reset num_best and normalize parameters, in case they were changed + # dynamically for shard in self.shards: shard.num_best = self.num_best shard.normalize = self.norm @@ -330,25 +372,34 @@ def __getitem__(self, query): result = numpy.hstack(shard_results) else: # the following uses a lot of lazy evaluation and (optionally) parallel - # processing, to improve query latency and minimize memory footprint. + # processing, to improve query latency and minimize memory + # footprint. 
offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards]) - convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) - for doc_index, sim in doc] + convert = lambda doc, shard_no: [ + (doc_index + offsets[shard_no], sim) for doc_index, sim in doc] is_corpus, query = utils.is_corpus(query) - is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1 + is_corpus = is_corpus or hasattr( + query, 'ndim') and query.ndim > 1 and query.shape[0] > 1 if not is_corpus: - # user asked for num_best most similar and query is a single doc - results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results)) - result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1]) + # user asked for num_best most similar and query is a single + # doc + results = (convert(result, shard_no) + for shard_no, result in enumerate(shard_results)) + result = heapq.nlargest( + self.num_best, itertools.chain( + *results), key=lambda item: item[1]) else: - # the trickiest combination: returning num_best results when query was a corpus + # the trickiest combination: returning num_best results when + # query was a corpus results = [] for shard_no, result in enumerate(shard_results): shard_result = [convert(doc, shard_no) for doc in result] results.append(shard_result) result = [] for parts in izip(*results): - merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1]) + merged = heapq.nlargest( + self.num_best, itertools.chain( + *parts), key=lambda item: item[1]) result.append(merged) if pool: # gc doesn't seem to collect the Pools, eventually leading to @@ -368,8 +419,9 @@ def vector_by_id(self, docpos): if docpos < pos: break if not self.shards or docpos < 0 or docpos >= pos: - raise ValueError("invalid document position: %s (must be 0 <= x < %s)" % - (docpos, len(self))) + raise ValueError( + "invalid document position: %s (must be 0 <= x < %s)" % + (docpos, len(self))) result = shard.get_document_id(docpos - pos + len(shard)) return result @@ -389,7 +441,8 @@ def __iter__(self): For each index document, compute cosine similarity against all other documents in the index and yield the result. """ - # turn off query normalization (vectors in the index are already normalized, save some CPU) + # turn off query normalization (vectors in the index are already + # normalized, save some CPU) norm, self.norm = self.norm, False for chunk in self.iter_chunks(): @@ -412,7 +465,8 @@ def iter_chunks(self, chunksize=None): self.close_shard() if chunksize is None: - # if not explicitly specified, use the chunksize from the constructor + # if not explicitly specified, use the chunksize from the + # constructor chunksize = self.chunksize for shard in self.shards: @@ -456,7 +510,7 @@ def destroy(self): for fname in glob.glob(self.output_prefix + '*'): logger.info("deleting %s", fname) os.remove(fname) -#endclass Similarity +# endclass Similarity class MatrixSimilarity(interfaces.SimilarityABC): @@ -473,7 +527,15 @@ class MatrixSimilarity(interfaces.SimilarityABC): See also `Similarity` and `SparseMatrixSimilarity` in this module. 
""" - def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None): + + def __init__( + self, + corpus, + num_best=None, + dtype=numpy.float32, + num_features=None, + chunksize=256, + corpus_len=None): """ `num_features` is the number of features in the corpus (will be determined automatically by scanning the corpus if not specified). See `Similarity` @@ -481,7 +543,8 @@ class for description of the other parameters. """ if num_features is None: - logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)") + logger.warning( + "scanning corpus to determine the number of features (consider setting `num_features` explicitly)") num_features = 1 + utils.get_max_id(corpus) self.num_features = num_features @@ -493,14 +556,23 @@ class for description of the other parameters. if corpus is not None: if self.num_features <= 0: - raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)") - logger.info("creating matrix with %i documents and %i features", corpus_len, num_features) - self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype) + raise ValueError( + "cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)") + logger.info( + "creating matrix with %i documents and %i features", + corpus_len, + num_features) + self.index = numpy.empty( + shape=( + corpus_len, + num_features), + dtype=dtype) # iterate over corpus, populating the numpy index matrix with (normalized) # document vectors for docno, vector in enumerate(corpus): if docno % 1000 == 0: - logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len) + logger.debug( + "PROGRESS: at document #%i/%i", docno, corpus_len) # individual documents in fact may be in numpy.scipy.sparse format as well. # it's not documented because other it's not fully supported throughout. # the user better know what he's doing (no normalization, must @@ -510,7 +582,9 @@ class for description of the other parameters. elif scipy.sparse.issparse(vector): vector = vector.toarray().flatten() else: - vector = matutils.unitvec(matutils.sparse2full(vector, num_features)) + vector = matutils.unitvec( + matutils.sparse2full( + vector, num_features)) self.index[docno] = vector def __len__(self): @@ -539,7 +613,8 @@ def get_similarities(self, query): elif isinstance(query, numpy.ndarray): pass else: - # default case: query is a single vector in sparse gensim format + # default case: query is a single vector in sparse gensim + # format query = matutils.sparse2full(query, self.num_features) query = numpy.asarray(query, dtype=self.index.dtype) @@ -549,8 +624,10 @@ def get_similarities(self, query): return result # XXX: removed casting the result from array to list; does anyone care? def __str__(self): - return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.index.shape[1]) -#endclass MatrixSimilarity + return "%s<%i docs, %i features>" % ( + self.__class__.__name__, len(self), self.index.shape[1]) +# endclass MatrixSimilarity + class WmdSimilarity(interfaces.SimilarityABC): """ @@ -577,7 +654,14 @@ class WmdSimilarity(interfaces.SimilarityABC): >>> query = 'Very good, you should seat outdoor.' 
>>> sims = instance[query] """ - def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=True, chunksize=256): + + def __init__( + self, + corpus, + w2v_model, + num_best=None, + normalize_w2v_and_replace=True, + chunksize=256): """ corpus: List of lists of strings, as in gensim.models.word2vec. w2v_model: A trained word2vec model. @@ -590,7 +674,8 @@ def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=T self.num_best = num_best self.chunksize = chunksize - # Normalization of features is not possible, as corpus is a list (of lists) of strings. + # Normalization of features is not possible, as corpus is a list (of + # lists) of strings. self.normalize = False # index is simply an array from 0 to size of corpus. @@ -618,9 +703,13 @@ def get_similarities(self, query): result = [] for qidx in range(n_queries): # Compute similarity for each query. - qresult = [self.w2v_model.wmdistance(document, query[qidx]) for document in self.corpus] + qresult = [ + self.w2v_model.wmdistance( + document, + query[qidx]) for document in self.corpus] qresult = numpy.array(qresult) - qresult = 1./(1.+qresult) # Similarity is the negative of the distance. + # Similarity is the negative of the distance. + qresult = 1. / (1. + qresult) # Append single query result to list of all results. result.append(qresult) @@ -634,8 +723,10 @@ def get_similarities(self, query): return result def __str__(self): - return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.syn0.shape[1]) -#endclass WmdSimilarity + return "%s<%i docs, %i features>" % ( + self.__class__.__name__, len(self), self.w2v_model.syn0.shape[1]) +# endclass WmdSimilarity + class SparseMatrixSimilarity(interfaces.SimilarityABC): """ @@ -654,8 +745,18 @@ class SparseMatrixSimilarity(interfaces.SimilarityABC): See also `Similarity` and `MatrixSimilarity` in this module. 
""" - def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, - num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): + + def __init__( + self, + corpus, + num_features=None, + num_terms=None, + num_docs=None, + num_nnz=None, + num_best=None, + chunksize=500, + dtype=numpy.float32, + maintain_sparsity=False): self.num_best = num_best self.normalize = True self.chunksize = chunksize @@ -675,18 +776,26 @@ def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num # num_* params in constructor) pass if num_features is not None: - # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity + # num_terms is just an alias for num_features, for + # compatibility with MatrixSimilarity num_terms = num_features if num_terms is None: - raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly") - corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else - (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else - matutils.unitvec(v)) for v in corpus) + raise ValueError( + "refusing to guess the number of sparse features: specify num_features explicitly") + corpus = ( + matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else ( + matutils.full2sparse(v) if isinstance( + v, numpy.ndarray) else matutils.unitvec(v)) for v in corpus) self.index = matutils.corpus2csc( - corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz, - dtype=dtype, printprogress=10000).T - - # convert to Compressed Sparse Row for efficient row slicing and multiplications + corpus, + num_terms=num_terms, + num_docs=num_docs, + num_nnz=num_nnz, + dtype=dtype, + printprogress=10000).T + + # convert to Compressed Sparse Row for efficient row slicing and + # multiplications self.index = self.index.tocsr() # currently no-op, CSC.T is already CSR logger.info("created %r", self.index) @@ -707,19 +816,24 @@ def get_similarities(self, query): """ is_corpus, query = utils.is_corpus(query) if is_corpus: - query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype) + query = matutils.corpus2csc( + query, self.index.shape[1], dtype=self.index.dtype) else: if scipy.sparse.issparse(query): query = query.T # convert documents=rows to documents=columns elif isinstance(query, numpy.ndarray): if query.ndim == 1: query.shape = (1, len(query)) - query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T + query = scipy.sparse.csr_matrix( + query, dtype=self.index.dtype).T else: - # default case: query is a single vector, in sparse gensim format - query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype) + # default case: query is a single vector, in sparse gensim + # format + query = matutils.corpus2csc( + [query], self.index.shape[1], dtype=self.index.dtype) - # compute cosine similarity against every other document in the collection + # compute cosine similarity against every other document in the + # collection result = self.index * query.tocsc() # N x T * T x C = N x C if result.shape[1] == 1 and not is_corpus: # for queries of one document, return a 1d array @@ -731,4 +845,4 @@ def get_similarities(self, query): # otherwise, return a 2d matrix (#queries x #index) result = result.toarray().T return result -#endclass SparseMatrixSimilarity +# endclass SparseMatrixSimilarity diff --git a/gensim/similarities/index.py b/gensim/similarities/index.py index eb130e0d3d..6434b2d2c8 100644 --- a/gensim/similarities/index.py +++ 
b/gensim/similarities/index.py @@ -16,7 +16,8 @@ try: from annoy import AnnoyIndex except ImportError: - raise ImportError("Annoy has not been installed, if you wish to use the annoy indexer, please run `pip install annoy`") + raise ImportError( + "Annoy has not been installed, if you wish to use the annoy indexer, please run `pip install annoy`") class AnnoyIndexer(object): @@ -33,20 +34,25 @@ def __init__(self, model=None, num_trees=None): elif isinstance(self.model, Word2Vec): self.build_from_word2vec() else: - raise ValueError("Only a Word2Vec or Doc2Vec instance can be used") + raise ValueError( + "Only a Word2Vec or Doc2Vec instance can be used") def save(self, fname, protocol=2): fname_dict = fname + '.d' self.index.save(fname) - d = {'f': self.model.vector_size, 'num_trees': self.num_trees, 'labels': self.labels} + d = { + 'f': self.model.vector_size, + 'num_trees': self.num_trees, + 'labels': self.labels} with smart_open(fname_dict, 'wb') as fout: _pickle.dump(d, fout, protocol=protocol) def load(self, fname): - fname_dict = fname+'.d' + fname_dict = fname + '.d' if not (os.path.exists(fname) and os.path.exists(fname_dict)): raise IOError( - "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." % (fname, fname_dict)) + "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." % + (fname, fname_dict)) else: with smart_open(fname_dict) as f: d = _pickle.loads(f.read()) @@ -59,8 +65,10 @@ def build_from_word2vec(self): """Build an Annoy index using word vectors from a Word2Vec model""" self.model.init_sims() - return self._build_from_model(self.model.syn0norm, self.model.index2word - , self.model.vector_size) + return self._build_from_model( + self.model.syn0norm, + self.model.index2word, + self.model.vector_size) def build_from_doc2vec(self): """Build an Annoy index using document vectors from a Doc2Vec model""" @@ -68,7 +76,10 @@ def build_from_doc2vec(self): docvecs = self.model.docvecs docvecs.init_sims() labels = [docvecs.index_to_doctag(i) for i in range(0, docvecs.count)] - return self._build_from_model(docvecs.doctag_syn0norm, labels, self.model.vector_size) + return self._build_from_model( + docvecs.doctag_syn0norm, + labels, + self.model.vector_size) def _build_from_model(self, vectors, labels, num_features): index = AnnoyIndex(num_features) @@ -86,4 +97,5 @@ def most_similar(self, vector, num_neighbors): ids, distances = self.index.get_nns_by_vector( vector, num_neighbors, include_distances=True) - return [(self.labels[ids[i]], 1 - distances[i] / 2) for i in range(len(ids))] + return [(self.labels[ids[i]], 1 - distances[i] / 2) + for i in range(len(ids))] diff --git a/gensim/summarization/__init__.py b/gensim/summarization/__init__.py index 57c9a7c815..c7efb84d4a 100644 --- a/gensim/summarization/__init__.py +++ b/gensim/summarization/__init__.py @@ -1,4 +1,4 @@ # bring model classes directly into package namespace, to save some typing from .summarizer import summarize, summarize_corpus -from .keywords import keywords \ No newline at end of file +from .keywords import keywords diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 6704146d54..f9aecbb43b 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -18,7 +18,8 @@ class BM25(object): def __init__(self, corpus): self.corpus_size = len(corpus) - self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size + self.avgdl = sum(map(lambda x: float(len(x)), corpus) + ) / self.corpus_size self.corpus = corpus self.f 
= [] self.df = {} @@ -40,16 +41,27 @@ def initialize(self): self.df[word] += 1 for word, freq in iteritems(self.df): - self.idf[word] = math.log(self.corpus_size-freq+0.5) - math.log(freq+0.5) + self.idf[word] = math.log( + self.corpus_size - freq + 0.5) - math.log(freq + 0.5) def get_score(self, document, index, average_idf): score = 0 for word in document: if word not in self.f[index]: continue - idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf - score += (idf*self.f[index][word]*(PARAM_K1+1) - / (self.f[index][word] + PARAM_K1*(1 - PARAM_B+PARAM_B*self.corpus_size / self.avgdl))) + idf = self.idf[word] if self.idf[ + word] >= 0 else EPSILON * average_idf + score += (idf * + self.f[index][word] * + (PARAM_K1 + + 1) / + (self.f[index][word] + + PARAM_K1 * + (1 - + PARAM_B + + PARAM_B * + self.corpus_size / + self.avgdl))) return score def get_scores(self, document, average_idf): @@ -62,7 +74,8 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): bm25 = BM25(corpus) - average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) + average_idf = sum(map(lambda k: float( + bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) weights = [] for doc in corpus: diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 1c467098f9..4f19196066 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -16,5 +16,6 @@ def build_graph(sequence): def remove_unreachable_nodes(graph): for node in graph.nodes(): - if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: + if sum(graph.edge_weight((node, other)) + for other in graph.neighbors(node)) == 0: graph.del_node(node) diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index bfed410b5e..7922ac7c9c 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -161,10 +161,13 @@ def __init__(self): def has_edge(self, edge): u, v = edge - return (u, v) in self.edge_properties and (v, u) in self.edge_properties + return ( + u, v) in self.edge_properties and ( + v, u) in self.edge_properties def edge_weight(self, edge): - return self.get_edge_properties(edge).setdefault(self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) + return self.get_edge_properties(edge).setdefault( + self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT) def neighbors(self, node): return self.node_neighbors[node] @@ -218,7 +221,9 @@ def add_edge_attribute(self, edge, attr): self.edge_attr[edge] = self.edge_attributes(edge) + [attr] if edge[0] != edge[1]: - self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] + self.edge_attr[ + (edge[1], edge[0])] = self.edge_attributes( + (edge[1], edge[0])) + [attr] def edge_attributes(self, edge): try: @@ -229,7 +234,8 @@ def edge_attributes(self, edge): def set_edge_properties(self, edge, **properties): self.edge_properties.setdefault(edge, {}).update(properties) if edge[0] != edge[1]: - self.edge_properties.setdefault((edge[1], edge[0]), {}).update(properties) + self.edge_properties.setdefault( + (edge[1], edge[0]), {}).update(properties) def del_edge(self, edge): u, v = edge diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 3bb7cee100..7e9a919909 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -37,7 +37,8 @@ def _get_words_for_graph(tokens, pos_filter): include_filters = set(pos_filter) exclude_filters = frozenset([]) if include_filters and 
exclude_filters: - raise ValueError("Can't use both include and exclude filters, should use only one") + raise ValueError( + "Can't use both include and exclude filters, should use only one") result = [] for word, unit in iteritems(tokens): @@ -58,7 +59,8 @@ def _set_graph_edge(graph, tokens, word_a, word_b): lemma_b = tokens[word_b].token edge = (lemma_a, lemma_b) - if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge): + if graph.has_node(lemma_a) and graph.has_node( + lemma_b) and not graph.has_edge(edge): graph.add_edge(edge) @@ -161,10 +163,12 @@ def _get_combined_keywords(_keywords, split_text): if word in _keywords: combined_word = [word] if i + 1 == len_text: - result.append(word) # appends last word if keyword and doesn't iterate + # appends last word if keyword and doesn't iterate + result.append(word) for j in xrange(i + 1, len_text): other_word = _strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: + if other_word in _keywords and other_word == split_text[ + j] and not other_word in combined_word: combined_word.append(other_word) else: for keyword in combined_word: @@ -189,15 +193,27 @@ def _format_results(_keywords, combined_keywords, split, scores): :param keywords:dict of keywords:scores :param combined_keywords:list of word/s """ - combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) + combined_keywords.sort( + key=lambda w: _get_average_score( + w, _keywords), reverse=True) if scores: - return [(word, _get_average_score(word, _keywords)) for word in combined_keywords] + return [(word, _get_average_score(word, _keywords)) + for word in combined_keywords] if split: return combined_keywords return "\n".join(combined_keywords) -def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False): +def keywords( + text, + ratio=0.2, + words=None, + split=False, + scores=False, + pos_filter=[ + 'NN', + 'JJ'], + lemmatize=False): # Gets a dict of word -> lemma text = to_unicode(text) tokens = _clean_text_by_word(text) @@ -210,10 +226,12 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= _remove_unreachable_nodes(graph) - # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score + # Ranks the tokens using the PageRank algorithm. 
Returns dict of lemma -> + # score pagerank_scores = _pagerank(graph) - extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words) + extracted_lemmas = _extract_tokens( + graph.nodes(), pagerank_scores, ratio, words) # The results can be polluted by many variations of the same word if lemmatize: @@ -225,7 +243,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word) - # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined + # text.split() to keep numbers and punctuation marks, so separeted + # concepts are not combined combined_keywords = _get_combined_keywords(keywords, text.split()) return _format_results(keywords, combined_keywords, split, scores) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 1978c6e1c7..f2e97049e4 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -19,9 +19,11 @@ def pagerank_weighted(graph, damping=0.85): adjacency_matrix = build_adjacency_matrix(graph) probability_matrix = build_probability_matrix(graph) - pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix + pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * \ + probability_matrix - vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors? + # TODO raise an error if matrix has complex eigenvectors? + vals, vecs = eigs(pagerank_matrix.T, k=1) return process_results(graph, vecs.real) @@ -35,7 +37,8 @@ def build_adjacency_matrix(graph): for i in xrange(length): current_node = nodes[i] - neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node)) + neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) + for neighbor in graph.neighbors(current_node)) for j in xrange(length): edge_weight = float(graph.edge_weight((current_node, nodes[j]))) if i != j and edge_weight != 0.0: diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 0779011999..1c8fc4f219 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -72,7 +72,8 @@ def _get_similarity(doc1, doc2, vec1, vec2): length_1 = _get_doc_length(doc1) length_2 = _get_doc_length(doc2) - denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 + denominator = _log10( + length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 return numerator / denominator if denominator != 0 else 0 @@ -86,7 +87,8 @@ def _build_corpus(sentences): def _get_important_sentences(sentences, corpus, important_docs): hashable_corpus = _build_hasheable_corpus(corpus) sentences_by_corpus = dict(zip(hashable_corpus, sentences)) - return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs] + return [sentences_by_corpus[tuple(important_doc)] + for important_doc in important_docs] def _get_sentences_with_word_count(sentences, word_count): @@ -101,7 +103,12 @@ def _get_sentences_with_word_count(sentences, word_count): # Checks if the inclusion of the sentence gives a better approximation # to the word parameter. 
- if abs(word_count - length - words_in_sentence) > abs(word_count - length): + if abs( + word_count - + length - + words_in_sentence) > abs( + word_count - + length): return selected_sentences selected_sentences.append(sentence) @@ -110,12 +117,18 @@ def _get_sentences_with_word_count(sentences, word_count): return selected_sentences -def _extract_important_sentences(sentences, corpus, important_docs, word_count): - important_sentences = _get_important_sentences(sentences, corpus, important_docs) +def _extract_important_sentences( + sentences, + corpus, + important_docs, + word_count): + important_sentences = _get_important_sentences( + sentences, corpus, important_docs) # If no "word_count" option is provided, the number of sentences is # reduced by the provided ratio. Else, the ratio is ignored. - return important_sentences if word_count is None else _get_sentences_with_word_count(important_sentences, word_count) + return important_sentences if word_count is None else _get_sentences_with_word_count( + important_sentences, word_count) def _format_results(extracted_sentences, split): @@ -152,20 +165,27 @@ def summarize_corpus(corpus, ratio=0.2): # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: - logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.") + logger.warning( + "Input corpus is expected to have at least " + + str(INPUT_MIN_LENGTH) + + " documents.") graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) - # Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends. + # Cannot calculate eigenvectors if number of unique words in text < 3. + # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: - logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") + logger.warning( + "Please add more sentences to the text. The number of reachable nodes is below 3") return pagerank_scores = _pagerank(graph) - hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) + hashable_corpus.sort( + key=lambda doc: pagerank_scores.get( + doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]] @@ -198,20 +218,26 @@ def summarize(text, ratio=0.2, word_count=None, split=False): logger.warning("Input text is empty.") return - # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError). + # If only one sentence is present, the function raises an error (Avoids + # ZeroDivisionError). if len(sentences) == 1: raise ValueError("input must have more than one sentence") - + # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: - logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") + logger.warning( + "Input text is expected to have at least " + + str(INPUT_MIN_LENGTH) + + " sentences.") corpus = _build_corpus(sentences) - most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1) + most_important_docs = summarize_corpus( + corpus, ratio=ratio if word_count is None else 1) # Extracts the most important sentences with the selected criterion. 
- extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count) + extracted_sentences = _extract_important_sentences( + sentences, corpus, most_important_docs, word_count) # Sorts the extracted sentences by apparition order in the original text. extracted_sentences.sort(key=lambda s: s.index) diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py index 89842e1122..5a84eca139 100644 --- a/gensim/summarization/syntactic_unit.py +++ b/gensim/summarization/syntactic_unit.py @@ -14,7 +14,8 @@ def __init__(self, text, token=None, tag=None): self.score = -1 def __str__(self): - return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" + return "Original unit: '" + self.text + "' *-*-*-* " + \ + "Processed unit: '" + self.token + "'" def __repr__(self): return str(self) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 7609da469a..eafbb706b0 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -17,22 +17,29 @@ logger.info("'pattern' package found; tag filters are available for English") HAS_PATTERN = True except ImportError: - logger.info("'pattern' package not found; tag filters are not available for English") + logger.info( + "'pattern' package not found; tag filters are not available for English") HAS_PATTERN = False SEPARATOR = r"@" -RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +# backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE) AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE) AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE) -UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE) +UNDO_AB_SENIOR = re.compile( + "([A-Z][a-z]{1,2}\.)" + + SEPARATOR + + "(\w)", + re.UNICODE) UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE) def split_sentences(text): processed = replace_abbreviations(text) - return [undo_replacement(sentence) for sentence in get_sentences(processed)] + return [undo_replacement(sentence) + for sentence in get_sentences(processed)] def replace_abbreviations(text): @@ -40,7 +47,9 @@ def replace_abbreviations(text): def undo_replacement(sentence): - return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) + return replace_with_separator( + sentence, r" ", [ + UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) def replace_with_separator(text, separator, regexs): @@ -81,7 +90,8 @@ def clean_text_by_sentences(text): """ Tokenizes a given text into sentences, applying filters and lemmatizing them. Returns a SyntacticUnit list. """ original_sentences = split_sentences(text) - filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)] + filtered_sentences = [ + join_words(sentence) for sentence in preprocess_documents(original_sentences)] return merge_syntactic_units(original_sentences, filtered_sentences) @@ -89,11 +99,18 @@ def clean_text_by_sentences(text): def clean_text_by_word(text): """ Tokenizes a given text into words, applying filters and lemmatizing them. Returns a dict of word -> syntacticUnit. 
""" - text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) - original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True)) - filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)] + text_without_acronyms = replace_with_separator( + text, "", [AB_ACRONYM_LETTERS]) + original_words = list( + tokenize( + text_without_acronyms, + to_lower=True, + deacc=True)) + filtered_words = [join_words(word_list, "") + for word_list in preprocess_documents(original_words)] if HAS_PATTERN: - tags = tag(join_words(original_words)) # tag needs the context of the words in the text + # tag needs the context of the words in the text + tags = tag(join_words(original_words)) else: tags = None units = merge_syntactic_units(original_words, filtered_words, tags) @@ -101,5 +118,6 @@ def clean_text_by_word(text): def tokenize_by_word(text): - text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) + text_without_acronyms = replace_with_separator( + text, "", [AB_ACRONYM_LETTERS]) return tokenize(text_without_acronyms, to_lower=True, deacc=True) diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 7c345d8812..2338c47cc2 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -14,6 +14,7 @@ logger = logging.getLogger(__name__) + def arithmetic_mean(confirmed_measures): """ This functoin performs the arithmetic mean aggregation on the output obtained from diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 83227822e9..9478d8887c 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -15,7 +15,11 @@ EPSILON = 1e-12 # Should be small. Value as suggested in paper. -def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): + +def log_conditional_probability( + segmented_topics, + per_topic_postings, + num_docs): """ This function calculates the log-conditional-probability measure which is used by coherence measures such as U_mass. @@ -37,12 +41,18 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): w_prime_docs = per_topic_postings[w_prime] w_star_docs = per_topic_postings[w_star] co_docs = w_prime_docs.intersection(w_star_docs) - m_lc_i = np.log(((len(co_docs) / float(num_docs)) + EPSILON) / (len(w_star_docs) / float(num_docs))) + m_lc_i = np.log(((len(co_docs) / float(num_docs)) + + EPSILON) / (len(w_star_docs) / float(num_docs))) m_lc.append(m_lc_i) return m_lc -def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False): + +def log_ratio_measure( + segmented_topics, + per_topic_postings, + num_docs, + normalize=False): """ If normalize=False: Popularly known as PMI. 
@@ -73,13 +83,15 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize= co_docs = w_prime_docs.intersection(w_star_docs) if normalize: # For normalized log ratio measure - numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0] + numerator = log_ratio_measure( + [[(w_prime, w_star)]], per_topic_postings, num_docs)[0] co_doc_prob = len(co_docs) / float(num_docs) m_lr_i = numerator / (-np.log(co_doc_prob + EPSILON)) else: # For log ratio measure without normalization numerator = (len(co_docs) / float(num_docs)) + EPSILON - denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs)) + denominator = (len(w_prime_docs) / float(num_docs) + ) * (len(w_star_docs) / float(num_docs)) m_lr_i = np.log(numerator / denominator) m_lr.append(m_lr_i) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index c68206a372..ac0bd22f4e 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -48,7 +48,15 @@ def _present(w_prime_star, w, w_backtrack): return -1 return index -def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs): + +def _make_seg( + w_prime, + w, + per_topic_postings, + measure, + gamma, + backtrack, + num_docs): """ Internal helper function to return context vectors for segmentations. """ @@ -57,7 +65,8 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc for w_j in w: for w_i in w_prime: if (w_i, w_j) not in backtrack: - backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], per_topic_postings, num_docs, measure[1])[0] + backtrack[(w_i, w_j)] = measure[0]( + [[(w_i, w_j)]], per_topic_postings, num_docs, measure[1])[0] if w_j not in context_vectors: context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma else: @@ -65,11 +74,19 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc else: for w_j in w: if (w_prime, w_j) not in backtrack: - backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], per_topic_postings, num_docs, measure[1])[0] + backtrack[(w_prime, w_j)] = measure[0]( + [[(w_prime, w_j)]], per_topic_postings, num_docs, measure[1])[0] context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma return (context_vectors, backtrack) -def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs): + +def cosine_similarity( + topics, + segmented_topics, + per_topic_postings, + measure, + gamma, + num_docs): """ This function calculates the indirect cosine measure. Given context vectors _ _ _ _ @@ -98,8 +115,11 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam # make normalized log ratio measure tuple measure = (direct_confirmation_measure.log_ratio_measure, True) else: - raise ValueError("The direct confirmation measure you entered is not currently supported.") - backtrack = {} # Backtracking dictionary for storing measure values of topic id tuples eg. (1, 2). + raise ValueError( + "The direct confirmation measure you entered is not currently supported.") + # Backtracking dictionary for storing measure values of topic id tuples + # eg. (1, 2). 
+ backtrack = {} """ For backtracking context vectors, we will create a list called w_backtrack to store (w_prime, w) or (w_star, w) tuples and a corresponding list context_vector_backtrack which will create a @@ -114,9 +134,11 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam # Step 2. If yes, return corresponding context vector w_prime_index = _present(w_prime, top_words, w_backtrack) if w_backtrack and w_prime_index != -1: - w_prime_context_vectors = context_vector_backtrack[w_prime_index] + w_prime_context_vectors = context_vector_backtrack[ + w_prime_index] else: - w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + w_prime_context_vectors, backtrack_i = _make_seg( + w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) backtrack.update(backtrack_i) # Update backtracking lists w_backtrack.append((w_prime, top_words)) @@ -128,13 +150,16 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam if w_backtrack and w_star_index != -1: w_star_context_vectors = context_vector_backtrack[w_star_index] else: - w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + w_star_context_vectors, backtrack_i = _make_seg( + w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) backtrack.update(backtrack_i) # Update all backtracking lists w_backtrack.append((w_star, top_words)) context_vector_backtrack.append(w_star_context_vectors) - s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items()) + s_cos_sim_i = cossim( + w_prime_context_vectors.items(), + w_star_context_vectors.items()) s_cos_sim.append(s_cos_sim_i) return s_cos_sim diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index a76f40db4c..877a77eac2 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -17,6 +17,7 @@ logger = logging.getLogger(__name__) + def _ret_top_ids(segmented_topics): """ Helper function to return a set of all the unique topic ids in segmented topics. @@ -31,6 +32,7 @@ def _ret_top_ids(segmented_topics): top_ids.add(id) return top_ids + def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -58,6 +60,7 @@ def p_boolean_document(corpus, segmented_topics): num_docs = len(corpus) return (per_topic_postings, num_docs) + def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ This function performs the boolean sliding window probability estimation. Boolean sliding window @@ -81,7 +84,13 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id = 0 # Each window assigned a window id. 
per_topic_postings = {} token2id_dict = dictionary.token2id - def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict): + + def add_topic_posting( + top_ids, + window, + per_topic_postings, + window_id, + token2id_dict): for word in window: word_id = token2id_dict[word] if word_id in top_ids: @@ -95,9 +104,11 @@ def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_d for document in texts: it = iter(document) window = tuple(islice(it, window_size)) - window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) + window_id, per_topic_postings = add_topic_posting( + top_ids, window, per_topic_postings, window_id, token2id_dict) for elem in it: window = window[1:] + (elem,) - window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) + window_id, per_topic_postings = add_topic_posting( + top_ids, window, per_topic_postings, window_id, token2id_dict) return per_topic_postings, window_id diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 9a2a58b060..1b4b05c6b4 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -13,6 +13,7 @@ logger = logging.getLogger(__name__) + def s_one_pre(topics): """ This function performs s_one_pre segmentation on a list of topics. @@ -43,6 +44,7 @@ def s_one_pre(topics): return s_one_pre + def s_one_one(topics): """ This function performs s_one_one segmentation on a list of topics. @@ -76,6 +78,7 @@ def s_one_one(topics): return s_one_one + def s_one_set(topics): """ This function performs s_one_set segmentation on a list of topics. diff --git a/gensim/utils.py b/gensim/utils.py index 606060bb38..71d60bd869 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -10,7 +10,8 @@ from __future__ import with_statement -import logging, warnings +import logging +import warnings logger = logging.getLogger(__name__) @@ -48,7 +49,8 @@ try: from smart_open import smart_open except ImportError: - logger.info("smart_open library not found; falling back to local-filesystem-only") + logger.info( + "smart_open library not found; falling back to local-filesystem-only") def make_closing(base, **attrs): """ @@ -80,7 +82,6 @@ def smart_open(fname, mode='rb'): RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) - def synchronous(tlockname): """ A decorator to place an instance-based lock around a method. 
@@ -91,24 +92,34 @@ def _synched(func): @wraps(func) def _synchronizer(self, *args, **kwargs): tlock = getattr(self, tlockname) - logger.debug("acquiring lock %r for %s" % (tlockname, func.__name__)) - - with tlock: # use lock as a context manager to perform safe acquire/release pairs - logger.debug("acquired lock %r for %s" % (tlockname, func.__name__)) + logger.debug( + "acquiring lock %r for %s" % + (tlockname, func.__name__)) + + with tlock: # use lock as a context manager to perform safe acquire/release pairs + logger.debug( + "acquired lock %r for %s" % + (tlockname, func.__name__)) result = func(self, *args, **kwargs) - logger.debug("releasing lock %r for %s" % (tlockname, func.__name__)) + logger.debug( + "releasing lock %r for %s" % + (tlockname, func.__name__)) return result return _synchronizer return _synched class NoCM(object): + def acquire(self): pass + def release(self): pass + def __enter__(self): pass + def __exit__(self, type, value, traceback): pass nocm = NoCM() @@ -161,7 +172,13 @@ def copytree_hardlink(source, dest): shutil.copy2 = copy2 -def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False): +def tokenize( + text, + lowercase=False, + deacc=False, + errors="strict", + to_lower=False, + lower=False): """ Iteratively yield tokens as unicode strings, removing accent marks and optionally lowercasing the unidoce string by assigning True @@ -250,7 +267,6 @@ def load(cls, fname, mmap=None): logger.info("loaded %s", fname) return obj - def _load_specials(self, fname, mmap, compress, subname): """ Loads any attributes that were stored specially, and gives the same @@ -265,7 +281,13 @@ def _load_specials(self, fname, mmap, compress, subname): cfname = '.'.join((fname, attrib)) logger.info("loading %s recursively from %s.* with mmap=%s" % ( attrib, cfname, mmap)) - getattr(self, attrib)._load_specials(cfname, mmap, compress, subname) + getattr( + self, + attrib)._load_specials( + cfname, + mmap, + compress, + subname) for attrib in getattr(self, '__numpys', []): logger.info("loading %s from %s with mmap=%s" % ( @@ -294,9 +316,24 @@ def _load_specials(self, fname, mmap, compress, subname): sparse.indptr = f['indptr'] sparse.indices = f['indices'] else: - sparse.data = numpy.load(subname(fname, attrib, 'data'), mmap_mode=mmap) - sparse.indptr = numpy.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) - sparse.indices = numpy.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) + sparse.data = numpy.load( + subname( + fname, + attrib, + 'data'), + mmap_mode=mmap) + sparse.indptr = numpy.load( + subname( + fname, + attrib, + 'indptr'), + mmap_mode=mmap) + sparse.indices = numpy.load( + subname( + fname, + attrib, + 'indices'), + mmap_mode=mmap) setattr(self, attrib, sparse) @@ -304,7 +341,6 @@ def _load_specials(self, fname, mmap, compress, subname): logger.info("setting ignored attribute %s to None" % (attrib)) setattr(self, attrib, None) - @staticmethod def _adapt_by_suffix(fname): """Give appropriate compress setting and filename formula""" @@ -316,7 +352,6 @@ def _adapt_by_suffix(fname): subname = lambda *args: '.'.join(list(args) + ['npy']) return (compress, subname) - def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): """ @@ -345,8 +380,14 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, compress, subname = SaveLoad._adapt_by_suffix(fname) - restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, - compress, subname) 
+ restores = self._save_specials( + fname, + separately, + sep_limit, + ignore, + pickle_protocol, + compress, + subname) try: pickle(self, fname, protocol=pickle_protocol) finally: @@ -356,8 +397,15 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, setattr(obj, attrib, val) logger.info("saved %s", fname) - - def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): + def _save_specials( + self, + fname, + separately, + sep_limit, + ignore, + pickle_protocol, + compress, + subname): """ Save aside any attributes that need to be handled separately, including by recursion any attributes that are themselves SaveLoad instances. @@ -377,7 +425,8 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit: separately.append(attrib) - # whatever's in `separately` or `ignore` at this point won't get pickled + # whatever's in `separately` or `ignore` at this point won't get + # pickled for attrib in separately + list(ignore): if hasattr(self, attrib): asides[attrib] = getattr(self, attrib) @@ -386,11 +435,20 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, recursive_saveloads = [] restores = [] for attrib, val in iteritems(self.__dict__): - if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading + if hasattr( + val, + '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading recursive_saveloads.append(attrib) - cfname = '.'.join((fname,attrib)) - restores.extend(val._save_specials(cfname, None, sep_limit, ignore, - pickle_protocol, compress, subname)) + cfname = '.'.join((fname, attrib)) + restores.extend( + val._save_specials( + cfname, + None, + sep_limit, + ignore, + pickle_protocol, + compress, + subname)) try: numpys, scipys, ignoreds = [], [], [] @@ -401,9 +459,17 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, attrib, subname(fname, attrib))) if compress: - numpy.savez_compressed(subname(fname, attrib), val=numpy.ascontiguousarray(val)) + numpy.savez_compressed( + subname( + fname, + attrib), + val=numpy.ascontiguousarray(val)) else: - numpy.save(subname(fname, attrib), numpy.ascontiguousarray(val)) + numpy.save( + subname( + fname, + attrib), + numpy.ascontiguousarray(val)) elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: scipys.append(attrib) @@ -411,21 +477,36 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, attrib, subname(fname, attrib))) if compress: - numpy.savez_compressed(subname(fname, attrib, 'sparse'), - data=val.data, - indptr=val.indptr, - indices=val.indices) + numpy.savez_compressed( + subname( + fname, + attrib, + 'sparse'), + data=val.data, + indptr=val.indptr, + indices=val.indices) else: numpy.save(subname(fname, attrib, 'data'), val.data) - numpy.save(subname(fname, attrib, 'indptr'), val.indptr) - numpy.save(subname(fname, attrib, 'indices'), val.indices) + numpy.save( + subname( + fname, + attrib, + 'indptr'), + val.indptr) + numpy.save( + subname( + fname, + attrib, + 'indices'), + val.indices) data, indptr, indices = val.data, val.indptr, val.indices val.data, val.indptr, val.indices = None, None, None try: # store array-less object - pickle(val, subname(fname, attrib), protocol=pickle_protocol) + pickle(val, subname(fname, attrib), + protocol=pickle_protocol) finally: val.data, val.indptr, val.indices = 
data, indptr, indices else: @@ -443,7 +524,6 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, raise return restores + [(self, asides)] - def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): """ @@ -477,7 +557,7 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, except TypeError: # `fname_or_handle` does not have write attribute self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) -#endclass SaveLoad +# endclass SaveLoad def identity(p): @@ -494,7 +574,8 @@ def get_max_id(corpus): """ maxid = -1 for document in corpus: - maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) # [-1] to avoid exceptions from max(empty) + # [-1] to avoid exceptions from max(empty) + maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) return maxid @@ -507,14 +588,13 @@ class FakeDict(object): is a waste of memory. """ + def __init__(self, num_terms): self.num_terms = num_terms - def __str__(self): return "FakeDict(num_terms=%s)" % self.num_terms - def __getitem__(self, val): if 0 <= val < self.num_terms: return str(val) @@ -587,9 +667,11 @@ def is_corpus(obj): obj = itertools.chain([doc1], obj) else: doc1 = next(iter(obj)) # empty corpus is resolved to False here - if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...) + if len( + doc1) == 0: # sparse documents must have a __len__ function (list, tuple...) return True, obj # the first document is empty=>assume this is a corpus - id1, val1 = next(iter(doc1)) # if obj is a numpy array, it resolves to False here + # if obj is a numpy array, it resolves to False here + id1, val1 = next(iter(doc1)) id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float) except Exception: return False, obj @@ -618,7 +700,8 @@ def get_my_ip(): try: # see what ifconfig says about our default interface import commands - result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:] + result = commands.getoutput("ifconfig").split("\n")[ + 1].split()[1][5:] if len(result.split('.')) != 4: raise Exception() except: @@ -632,6 +715,7 @@ class RepeatCorpus(SaveLoad): Used in the tutorial on distributed computing and likely not useful anywhere else. """ + def __init__(self, corpus, reps): """ Wrap a `corpus` as another corpus of length `reps`. This is achieved by @@ -650,6 +734,7 @@ def __init__(self, corpus, reps): def __iter__(self): return itertools.islice(itertools.cycle(self.corpus), self.reps) + class RepeatCorpusNTimes(SaveLoad): def __init__(self, corpus, n): @@ -668,7 +753,9 @@ def __iter__(self): for document in self.corpus: yield document + class ClippedCorpus(SaveLoad): + def __init__(self, corpus, max_docs=None): """ Return a corpus that is the "head" of input iterable `corpus`. @@ -687,7 +774,9 @@ def __iter__(self): def __len__(self): return min(self.max_docs, len(self.corpus)) + class SlicedCorpus(SaveLoad): + def __init__(self, corpus, slice_): """ Return a corpus that is the slice of input iterable `corpus`. @@ -724,6 +813,7 @@ def __len__(self): return self.length + def safe_unichr(intval): try: return unichr(intval) @@ -733,6 +823,7 @@ def safe_unichr(intval): # return UTF16 surrogate pair return s.decode('unicode-escape') + def decode_htmlentities(text): """ Decode HTML entities in text, coded as hex, decimal or named. 
@@ -788,18 +879,21 @@ def chunkize_serial(iterable, chunksize, as_numpy=False): if as_numpy: # convert each document to a 2d numpy array (~6x faster when transmitting # chunk data over the wire, in Pyro) - wrapped_chunk = [[numpy.array(doc) for doc in itertools.islice(it, int(chunksize))]] + wrapped_chunk = [ + [numpy.array(doc) for doc in itertools.islice(it, int(chunksize))]] else: wrapped_chunk = [list(itertools.islice(it, int(chunksize)))] if not wrapped_chunk[0]: break - # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference + # memory opt: wrap the chunk and then pop(), to avoid leaving behind a + # dangling reference yield wrapped_chunk.pop() grouper = chunkize_serial class InputQueue(multiprocessing.Process): + def __init__(self, q, corpus, chunksize, maxsize, as_numpy): super(InputQueue, self).__init__() self.q = q @@ -810,7 +904,7 @@ def __init__(self, q, corpus, chunksize, maxsize, as_numpy): def run(self): if self.as_numpy: - import numpy # don't clutter the global namespace with a dependency on numpy + import numpy # don't clutter the global namespace with a dependency on numpy it = iter(self.corpus) while True: chunk = itertools.islice(it, self.chunksize) @@ -831,9 +925,9 @@ def run(self): except NotImplementedError: qsize = '?' logger.debug("prepared another chunk of %i documents (qsize=%s)" % - (len(wrapped_chunk[0]), qsize)) + (len(wrapped_chunk[0]), qsize)) self.q.put(wrapped_chunk.pop(), block=True) -#endclass InputQueue +# endclass InputQueue if os.name == 'nt': @@ -869,7 +963,12 @@ def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): if maxsize > 0: q = multiprocessing.Queue(maxsize=maxsize) - worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy) + worker = InputQueue( + q, + corpus, + chunksize, + maxsize=maxsize, + as_numpy=as_numpy) worker.daemon = True worker.start() while True: @@ -908,7 +1007,8 @@ def pickle(obj, fname, protocol=2): def unpickle(fname): """Load pickled object from `fname`""" with smart_open(fname) as f: - # Because of loading from S3 load can't be used (missing readline in smart_open) + # Because of loading from S3 load can't be used (missing readline in + # smart_open) return _pickle.loads(f.read()) @@ -934,11 +1034,11 @@ def toptexts(query, texts, index, n=10): Return a list of 3-tuples (docid, doc's similarity to the query, texts[docid]). """ - sims = index[query] # perform a similarity query against the corpus + sims = index[query] # perform a similarity query against the corpus sims = sorted(enumerate(sims), key=lambda item: -item[1]) result = [] - for topid, topcosine in sims[:n]: # only consider top-n most similar docs + for topid, topcosine in sims[:n]: # only consider top-n most similar docs result.append((topid, topcosine, texts[topid])) return result @@ -983,7 +1083,13 @@ def getNS(host=None, port=None, broadcast=True, hmac_key=None): raise RuntimeError("Pyro name server not found") -def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf={}): +def pyro_daemon( + name, + obj, + random_suffix=False, + ip=None, + port=None, + ns_conf={}): """ Register object with name server (starting the name server if not running yet) and block until the daemon is terminated. 
The object is registered under @@ -999,7 +1105,9 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf={}): uri = daemon.register(obj, name) ns.remove(name) ns.register(name, uri) - logger.info("%s registered with nameserver (URI '%s')" % (name, uri)) + logger.info( + "%s registered with nameserver (URI '%s')" % + (name, uri)) daemon.requestLoop() @@ -1012,12 +1120,13 @@ def has_pattern(): from pattern.en import parse pattern = True except ImportError: - warnings.warn("Pattern library is not installed, lemmatization won't be available.") + warnings.warn( + "Pattern library is not installed, lemmatization won't be available.") return pattern def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, - stopwords=frozenset(), min_length=2, max_length=15): + stopwords=frozenset(), min_length=2, max_length=15): """ This function is only available when the optional 'pattern' package is installed. @@ -1056,7 +1165,8 @@ def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, result = [] for sentence in parsed: for token, tag, _, _, lemma in sentence: - if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords: + if min_length <= len(lemma) <= max_length and not lemma.startswith( + '_') and lemma not in stopwords: if allowed_tags.match(tag): lemma += "/" + tag[:2] result.append(lemma.encode('utf8')) @@ -1097,7 +1207,11 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): result = 0 old_len = len(vocab) for w in list(vocab): # make a copy of dict's keys - if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce: + if not keep_vocab_item( + w, + vocab[w], + min_reduce, + trim_rule): # vocab[w] <= min_reduce: result += vocab[w] del vocab[w] logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", @@ -1132,6 +1246,7 @@ def keep_vocab_item(word, count, min_count, trim_rule=None): else: return default_res + def check_output(*popenargs, **kwargs): r"""Run command with arguments and return its output as a byte string. Backported from Python 2.7 as it's implemented as pure python on stdlib. @@ -1140,7 +1255,8 @@ def check_output(*popenargs, **kwargs): Added extra KeyboardInterrupt handling """ try: - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) + process = subprocess.Popen( + stdout=subprocess.PIPE, *popenargs, **kwargs) output, unused_err = process.communicate() retcode = process.poll() if retcode: @@ -1155,11 +1271,15 @@ def check_output(*popenargs, **kwargs): process.terminate() raise + def sample_dict(d, n=10, use_random=True): - """ - Pick `n` items from dictionary `d` and return them as a list. - The items are picked randomly if `use_random` is True, otherwise picked - according to natural dict iteration. - """ - selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n) - return [(key, d[key]) for key in selected_keys] + """ + Pick `n` items from dictionary `d` and return them as a list. + The items are picked randomly if `use_random` is True, otherwise picked + according to natural dict iteration. 
+ """ + selected_keys = random.sample( + list(d), min( + len(d), n)) if use_random else itertools.islice( + iterkeys(d), n) + return [(key, d[key]) for key in selected_keys] diff --git a/setup.py b/setup.py index 9243ba703f..2534b34d1b 100644 --- a/setup.py +++ b/setup.py @@ -81,24 +81,23 @@ def build_extension(self, ext): e = sys.exc_info()[1] sys.stdout.write('%s\n' % str(e)) warnings.warn( - self.warning_message + - "The %s extension module" % (name,) + - "The output above this warning shows how the compilation failed.") + self.warning_message + "The %s extension module" % + (name,) + "The output above this warning shows how the compilation failed.") # the following is needed to be able to add numpy's include dirs... without # importing numpy directly in this script, before it's actually installed! # http://stackoverflow.com/questions/19919905/how-to-bootstrap-numpy-installation-in-setup-py def finalize_options(self): - build_ext.finalize_options(self) - # Prevent numpy from thinking it is still in its setup process: - # https://docs.python.org/2/library/__builtin__.html#module-__builtin__ - if isinstance(__builtins__, dict): - __builtins__["__NUMPY_SETUP__"] = False - else: - __builtins__.__NUMPY_SETUP__ = False + build_ext.finalize_options(self) + # Prevent numpy from thinking it is still in its setup process: + # https://docs.python.org/2/library/__builtin__.html#module-__builtin__ + if isinstance(__builtins__, dict): + __builtins__["__NUMPY_SETUP__"] = False + else: + __builtins__.__NUMPY_SETUP__ = False - import numpy - self.include_dirs.append(numpy.get_include()) + import numpy + self.include_dirs.append(numpy.get_include()) def readfile(fname): @@ -129,11 +128,11 @@ def readfile(fname): ext_modules=[ Extension('gensim.models.word2vec_inner', - sources=['./gensim/models/word2vec_inner.c'], - include_dirs=[model_dir]), + sources=['./gensim/models/word2vec_inner.c'], + include_dirs=[model_dir]), Extension('gensim.models.doc2vec_inner', - sources=['./gensim/models/doc2vec_inner.c'], - include_dirs=[model_dir]) + sources=['./gensim/models/doc2vec_inner.c'], + include_dirs=[model_dir]) ], cmdclass=cmdclass, packages=find_packages(), @@ -181,7 +180,7 @@ def readfile(fname): python_2_6_backports, ], tests_require=[ - 'testfixtures', + 'testfixtures', ], extras_require={ 'distributed': ['Pyro4 >= 4.27'],