From 541f7db56e626a3d01fefe81c3b99e650820bc13 Mon Sep 17 00:00:00 2001 From: Ankush Chander Date: Wed, 9 Aug 2023 16:22:28 +0530 Subject: [PATCH] update scrubber documentation --- examples/sample.ipynb | 2020 +++++++++++++++++++++++------------------ 1 file changed, 1148 insertions(+), 872 deletions(-) diff --git a/examples/sample.ipynb b/examples/sample.ipynb index 3612465..3a5e9ab 100644 --- a/examples/sample.ipynb +++ b/examples/sample.ipynb @@ -97,8 +97,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[1m\n", - "============================= Pipeline Overview =============================\u001B[0m\n", + "\u001b[1m\n", + "============================= Pipeline Overview =============================\u001b[0m\n", "\n", "# Component Assigns Requires Scores Retokenizes\n", "- --------------- ------------------- -------- ---------------- -----------\n", @@ -124,7 +124,7 @@ " \n", "6 textrank False \n", "\n", - "\u001B[38;5;2m✔ No problems found.\u001B[0m\n" + "\u001b[38;5;2m✔ No problems found.\u001b[0m\n" ] }, { @@ -173,16 +173,16 @@ " 'lemmatizer': [],\n", " 'ner': [],\n", " 'textrank': []},\n", - " 'attrs': {'token.dep': {'assigns': ['parser'], 'requires': []},\n", - " 'token.ent_type': {'assigns': ['ner'], 'requires': []},\n", + " 'attrs': {'token.is_sent_start': {'assigns': ['parser'], 'requires': []},\n", + " 'doc.sents': {'assigns': ['parser'], 'requires': []},\n", " 'token.head': {'assigns': ['parser'], 'requires': []},\n", - " 'token.is_sent_start': {'assigns': ['parser'], 'requires': []},\n", " 'token.ent_iob': {'assigns': ['ner'], 'requires': []},\n", + " 'token.tag': {'assigns': ['tagger'], 'requires': []},\n", + " 'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},\n", " 'doc.tensor': {'assigns': ['tok2vec'], 'requires': []},\n", " 'doc.ents': {'assigns': ['ner'], 'requires': []},\n", - " 'doc.sents': {'assigns': ['parser'], 'requires': []},\n", - " 'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},\n", - " 'token.tag': {'assigns': ['tagger'], 'requires': []}}}" + " 'token.dep': {'assigns': ['parser'], 'requires': []},\n", + " 'token.ent_type': {'assigns': ['ner'], 'requires': []}}}" ] }, "execution_count": 4, @@ -276,7 +276,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| tr.elapsed_time: 7.581949234008789\n" + "ic| tr.elapsed_time: 2.915620803833008\n" ] } ], @@ -350,6 +350,10 @@ " phrase.count: 1\n", " phrase.text: 'a minimal supporting set'\n", "ic| phrase.chunks: [a minimal supporting set]\n", + "ic| phrase.rank: 0.08444534702772151\n", + " phrase.count: 1\n", + " phrase.text: 'linear'\n", + "ic| phrase.chunks: [linear]\n", "ic| phrase.rank: 0.08243620500315359\n", " phrase.count: 1\n", " phrase.text: 'a system'\n", @@ -441,16 +445,40 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='words', chunks=[words, words], count=2, rank=0.16137018222637944)\n", - "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.13367291641220508)\n", - "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.1095023226326187)\n", - "ic| phrase: Phrase(text='et al', chunks=[et al], count=1, rank=0.10745197034799042)\n", - "ic| phrase: Phrase(text='Barrios et al', chunks=[Barrios et al], count=1, rank=0.10502825160040344)\n", - "ic| phrase: Phrase(text='the remaining words', chunks=[the remaining words], count=1, rank=0.09559863808781449)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, 
rank=0.09162794519014893)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.08555365347028678)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.07894442579092492)\n", - "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.07747520663125698)\n" + "ic| phrase: Phrase(text='words', chunks=[words, words], count=2, rank=0.16404428603296545)\n", + "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.1287826954552565)\n", + "ic| phrase: Phrase(text='Mihalcea et al',\n", + " chunks=[Mihalcea et al],\n", + " count=1,\n", + " rank=0.11278365769540494)\n", + "ic| phrase: Phrase(text='Barrios et al',\n", + " chunks=[Barrios et al],\n", + " count=1,\n", + " rank=0.10760811592357011)\n", + "ic| phrase: Phrase(text='the remaining words',\n", + " chunks=[the remaining words],\n", + " count=1,\n", + " rank=0.09737893962520337)\n", + "ic| phrase: Phrase(text='text summarization',\n", + " chunks=[text summarization],\n", + " count=1,\n", + " rank=0.08861074217386355)\n", + "ic| phrase: Phrase(text='ranking webpages',\n", + " chunks=[ranking webpages],\n", + " count=1,\n", + " rank=0.07685260919250497)\n", + "ic| phrase: Phrase(text='Okapi BM25 function',\n", + " chunks=[Okapi BM25 function],\n", + " count=1,\n", + " rank=0.0756013984034083)\n", + "ic| phrase: Phrase(text='gensim implements',\n", + " chunks=[gensim implements],\n", + " count=1,\n", + " rank=0.0748386557231912)\n", + "ic| phrase: Phrase(text='every other sentence',\n", + " chunks=[every other sentence],\n", + " count=1,\n", + " rank=0.07031782290622991)\n" ] } ], @@ -484,16 +512,43 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.1490464677880926)\n", - "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.117318519527749)\n", - "ic| phrase: Phrase(text='et al', chunks=[et al], count=1, rank=0.11512161354108796)\n", - "ic| phrase: Phrase(text='Barrios et al', chunks=[Barrios et al], count=1, rank=0.11252482346188267)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09816426515530181)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09165889278462461)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08457790386936588)\n", - "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.08300479194058319)\n", - "ic| phrase: Phrase(text='every other sentence', chunks=[every other sentence], count=1, rank=0.08179233228776425)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07919192237459494)\n" + "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.14407118792073048)\n", + "ic| phrase: Phrase(text='Mihalcea et al',\n", + " chunks=[Mihalcea et al],\n", + " count=1,\n", + " rank=0.12123026637064825)\n", + "ic| phrase: Phrase(text='Barrios et al',\n", + " chunks=[Barrios et al],\n", + " count=1,\n", + " rank=0.11566772028535821)\n", + "ic| phrase: Phrase(text='text summarization',\n", + " chunks=[text summarization],\n", + " count=1,\n", + " rank=0.09524776232834677)\n", + "ic| phrase: Phrase(text='ranking webpages',\n", + " chunks=[ranking webpages],\n", + " count=1,\n", + " rank=0.08260919223940909)\n", + "ic| phrase: Phrase(text='Okapi BM25 function',\n", + " 
chunks=[Okapi BM25 function],\n", + " count=1,\n", + " rank=0.08125840606728206)\n", + "ic| phrase: Phrase(text='gensim implements',\n", + " chunks=[gensim implements],\n", + " count=1,\n", + " rank=0.08043607214961235)\n", + "ic| phrase: Phrase(text='every other sentence',\n", + " chunks=[every other sentence],\n", + " count=1,\n", + " rank=0.07915141312258998)\n", + "ic| phrase: Phrase(text='original TextRank',\n", + " chunks=[original TextRank],\n", + " count=1,\n", + " rank=0.07013026654397199)\n", + "ic| phrase: Phrase(text='TextRank',\n", + " chunks=[TextRank, TextRank, TextRank, TextRank, TextRank],\n", + " count=5,\n", + " rank=0.06686718957926076)\n" ] } ], @@ -512,7 +567,9 @@ "id": "experimental-scott", "metadata": {}, "source": [ - "For each entry, you'll need to add a key that is the *lemma* and a value that's a list of its *part-of-speech* tags." + "For each entry, you'll need to add a key that is the *lemma_* and a value that's a list of its *part-of-speech* tags.\n", + "\n", + "Note: the [lemma_](https://spacy.io/api/token#attributes) of a token is the base form of the token, with no inflectional suffixes. It is usually rendered in lower case, with the exception of proper nouns and named entities. For example, words like *ran*, *runs*, and *running* will be lemmatized to *run*, while *London* will be lemmatized to *London* without lowercasing. It is suggested to check the designated lemma value for a token before setting it in the stopwords config. " ] }, {
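Since the `stopwords` config is keyed on lemmas, it helps to confirm which lemma spaCy will actually assign before adding an entry. A minimal sketch, assuming the same `en_core_web_sm` pipeline used throughout this notebook (the sample sentence is only illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# inspect the lemma_ value spaCy assigns to each token,
# before using it as a key in the "stopwords" config
for token in nlp("She runs while London sleeps."):
    print(token.text, "->", token.lemma_)

# e.g. "runs" maps to "run", while the proper noun "London"
# keeps its capitalization
```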
@@ -535,16 +592,51 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='sentences', chunks=[sentences, the sentences], count=2, rank=0.1490464677880926)\n", - "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.117318519527749)\n", - "ic| phrase: Phrase(text='et al', chunks=[et al], count=1, rank=0.11512161354108796)\n", - "ic| phrase: Phrase(text='Barrios et al', chunks=[Barrios et al], count=1, rank=0.11252482346188267)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09816426515530181)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09165889278462461)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08457790386936588)\n", - "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.08300479194058319)\n", - "ic| phrase: Phrase(text='sentence', chunks=[every sentence, every other sentence], count=2, rank=0.08179233228776425)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07919192237459494)\n" + "ic| phrase: Phrase(text='sentence',\n", + " chunks=[sentences,\n", + " every sentence,\n", + " every other sentence,\n", + " the two sentences,\n", + " two sentences,\n", + " the sentences],\n", + " count=6,\n", + " rank=0.14407118792073048)\n", + "ic| phrase: Phrase(text='Mihalcea et al',\n", + " chunks=[Mihalcea et al],\n", + " count=1,\n", + " rank=0.12123026637064825)\n", + "ic| phrase: Phrase(text='Barrios et al',\n", + " chunks=[Barrios et al],\n", + " count=1,\n", + " rank=0.11566772028535821)\n", + "ic| phrase: Phrase(text='text summarization',\n", + " chunks=[text summarization],\n", + " count=1,\n", + " rank=0.09524776232834677)\n", + "ic| phrase: Phrase(text='rank webpage',\n", + " chunks=[ranking webpages],\n", + " count=1,\n", + " rank=0.08260919223940909)\n", + "ic| phrase: Phrase(text='Okapi BM25 function',\n", + " chunks=[Okapi BM25 function],\n", + " count=1,\n", + " rank=0.08125840606728206)\n", + "ic| phrase: Phrase(text='gensim implement',\n", + " chunks=[gensim implements],\n", + " count=1,\n", + " rank=0.08043607214961235)\n", + "ic| phrase: Phrase(text='original TextRank',\n", + " chunks=[original TextRank],\n", + " count=1,\n", + " rank=0.07013026654397199)\n", + "ic| phrase: Phrase(text='TextRank',\n", + " chunks=[TextRank, TextRank, TextRank, TextRank],\n", + " count=4,\n", + " rank=0.06686718957926076)\n", + "ic| phrase: Phrase(text='Olavur Mortensen',\n", + " chunks=[Olavur Mortensen, Olavur Mortensen],\n", + " count=2,\n", + " rank=0.06548020385220721)\n" ] } ], "source": [ "@spacy.registry.misc(\"prefix_scrubber\")\n", "def prefix_scrubber():\n", "\tdef scrubber_func(span: Span) -> str:\n", - "\t\twhile len(span) > 1 and span[0].text in (\"a\", \"the\", \"their\", \"every\", \"other\"):\n", + "\t\twhile len(span) > 1 and span[0].text in (\"a\", \"the\", \"their\", \"every\", \"other\", \"two\"):\n", "\t\t\tspan = span[1:]\n", - "\t\treturn span.text\n", + "\t\treturn span.lemma_\n", "\treturn scrubber_func\n", "\n", "nlp.add_pipe(\"textrank\", config={ \"stopwords\": { \"word\": [\"NOUN\"] }, \"scrubber\": {\"@misc\": \"prefix_scrubber\"}})\n", @@ -582,7 +674,7 @@ "id": "a123e0fa-c594-4b8c-9ca3-4d3205c2662a", "metadata": {}, "source": [ "As the scrubber takes in `Spans`, we can also use `token.pos_` or any other spaCy `Token` or `Span` attribute in the scrubbing. The variations of \"sentences\" have different DETs (determiners), so we could achieve a similar result with the following scrubber."
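A sketch of one way to write such a scrubber, testing `token.pos_` instead of a hard-coded word list; the registry name `det_scrubber` is illustrative, and the pipeline setup mirrors the earlier cells:

```python
import spacy
from spacy.tokens import Span
import pytextrank

@spacy.registry.misc("det_scrubber")
def det_scrubber():
    def scrubber_func(span: Span) -> str:
        # strip leading determiners ("a", "the", "every", ...) by POS tag
        while len(span) > 1 and span[0].pos_ == "DET":
            span = span[1:]
        return span.text
    return scrubber_func

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank", config={"scrubber": {"@misc": "det_scrubber"}})
```

With this variant, "every other sentence" scrubs to "other sentence" and "the sentences" merges with "sentences", matching the output shown below.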
] }, { @@ -613,16 +705,46 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='sentences', chunks=[sentences, the sentences], count=2, rank=0.1490464677880926)\n", - "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.117318519527749)\n", - "ic| phrase: Phrase(text='et al', chunks=[et al], count=1, rank=0.11512161354108796)\n", - "ic| phrase: Phrase(text='Barrios et al', chunks=[Barrios et al], count=1, rank=0.11252482346188267)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09816426515530181)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09165889278462461)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08457790386936588)\n", - "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.08300479194058319)\n", - "ic| phrase: Phrase(text='other sentence', chunks=[every other sentence], count=1, rank=0.08179233228776425)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07919192237459494)\n" + "ic| phrase: Phrase(text='sentences',\n", + " chunks=[sentences, the sentences],\n", + " count=2,\n", + " rank=0.14407118792073048)\n", + "ic| phrase: Phrase(text='Mihalcea et al',\n", + " chunks=[Mihalcea et al],\n", + " count=1,\n", + " rank=0.12123026637064825)\n", + "ic| phrase: Phrase(text='Barrios et al',\n", + " chunks=[Barrios et al],\n", + " count=1,\n", + " rank=0.11566772028535821)\n", + "ic| phrase: Phrase(text='text summarization',\n", + " chunks=[text summarization],\n", + " count=1,\n", + " rank=0.09524776232834677)\n", + "ic| phrase: Phrase(text='ranking webpages',\n", + " chunks=[ranking webpages],\n", + " count=1,\n", + " rank=0.08260919223940909)\n", + "ic| phrase: Phrase(text='Okapi BM25 function',\n", + " chunks=[Okapi BM25 function],\n", + " count=1,\n", + " rank=0.08125840606728206)\n", + "ic| phrase: Phrase(text='gensim implements',\n", + " chunks=[gensim implements],\n", + " count=1,\n", + " rank=0.08043607214961235)\n", + "ic| phrase: Phrase(text='other sentence',\n", + " chunks=[every other sentence],\n", + " count=1,\n", + " rank=0.07915141312258998)\n", + "ic| phrase: Phrase(text='original TextRank',\n", + " chunks=[original TextRank],\n", + " count=1,\n", + " rank=0.07013026654397199)\n", + "ic| phrase: Phrase(text='TextRank',\n", + " chunks=[TextRank, TextRank, TextRank, TextRank, TextRank],\n", + " count=5,\n", + " rank=0.06686718957926076)\n" ] } ], @@ -671,15 +793,39 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.1490464677880926)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09816426515530181)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09165889278462461)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08457790386936588)\n", - "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.08300479194058319)\n", - "ic| phrase: Phrase(text='every other sentence', chunks=[every other sentence], count=1, rank=0.08179233228776425)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07919192237459494)\n", - "ic| phrase: Phrase(text='original TextRank', chunks=[original TextRank], 
count=1, rank=0.07346227481329015)\n", - "ic| phrase: Phrase(text='TextRank', chunks=[TextRank], count=1, rank=0.07058237377923389)\n" + "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.14407118792073048)\n", + "ic| phrase: Phrase(text='Barrios et al',\n", + " chunks=[Barrios et al],\n", + " count=1,\n", + " rank=0.11566772028535821)\n", + "ic| phrase: Phrase(text='text summarization',\n", + " chunks=[text summarization],\n", + " count=1,\n", + " rank=0.09524776232834677)\n", + "ic| phrase: Phrase(text='ranking webpages',\n", + " chunks=[ranking webpages],\n", + " count=1,\n", + " rank=0.08260919223940909)\n", + "ic| phrase: Phrase(text='gensim implements',\n", + " chunks=[gensim implements],\n", + " count=1,\n", + " rank=0.08043607214961235)\n", + "ic| phrase: Phrase(text='every other sentence',\n", + " chunks=[every other sentence],\n", + " count=1,\n", + " rank=0.07915141312258998)\n", + "ic| phrase: Phrase(text='original TextRank',\n", + " chunks=[original TextRank],\n", + " count=1,\n", + " rank=0.07013026654397199)\n", + "ic| phrase: Phrase(text='every sentence',\n", + " chunks=[every sentence],\n", + " count=1,\n", + " rank=0.06654363130280233)\n", + "ic| phrase: Phrase(text='the sentences',\n", + " chunks=[the sentences],\n", + " count=1,\n", + " rank=0.06654363130280233)\n" ] } ], @@ -725,7 +871,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "-rw-rw-r-- 1 ankushchander ankushchander 17K Jun 4 11:56 lemma_graph.dot\r\n" + "-rw-rw-r-- 1 ankush ankush 18K Aug 9 15:06 lemma_graph.dot\n" ] } ], @@ -745,7 +891,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: graphviz in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (0.19.1)\r\n" + "Requirement already satisfied: graphviz in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (0.20.1)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], @@ -784,22 +933,22 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "%3\n", - "\n", + "\n", "\n", "\n", "('quick', 'ADJ')\n", "\n", - "('quick', 'ADJ') (0.0044)\n", + "('quick', 'ADJ') (0.0042)\n", "\n", "\n", "\n", "('description', 'NOUN')\n", "\n", - "('description', 'NOUN') (0.0056)\n", + "('description', 'NOUN') (0.0055)\n", "\n", "\n", "\n", @@ -811,7 +960,7 @@ "\n", "('popular', 'ADJ')\n", "\n", - "('popular', 'ADJ') (0.0100)\n", + "('popular', 'ADJ') (0.0097)\n", "\n", "\n", "\n", @@ -823,7 +972,7 @@ "\n", "('algorithm', 'NOUN')\n", "\n", - "('algorithm', 'NOUN') (0.0276)\n", + "('algorithm', 'NOUN') (0.0135)\n", "\n", "\n", "\n", @@ -847,7 +996,7 @@ "\n", "('implementation', 'NOUN')\n", "\n", - "('implementation', 'NOUN') (0.0097)\n", + "('implementation', 'NOUN') (0.0091)\n", "\n", "\n", "\n", @@ -871,7 +1020,7 @@ "\n", "('text', 'NOUN')\n", "\n", - "('text', 'NOUN') (0.0157)\n", + "('text', 'NOUN') (0.0189)\n", "\n", "\n", "\n", @@ -880,28 +1029,28 @@ "\n", "\n", "\n", - "\n", + "\n", "('PageRank', 'PROPN')\n", - "\n", - "('PageRank', 'PROPN') (0.0157)\n", + "\n", + "('PageRank', 'PROPN') (0.0153)\n", "\n", "\n", "\n", "('popular', 'ADJ')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", 
+ "\n", "('Google', 'PROPN')\n", - "\n", - "('Google', 'PROPN') (0.0103)\n", + "\n", + "('Google', 'PROPN') (0.0097)\n", "\n", "\n", "\n", "('popular', 'ADJ')->('Google', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", @@ -910,7 +1059,7 @@ "\n", "\n", "\n", - "\n", + "\n", "('algorithm', 'NOUN')->('text', 'NOUN')\n", "\n", "\n", @@ -919,88 +1068,52 @@ "\n", "('summarization', 'NOUN')\n", "\n", - "('summarization', 'NOUN') (0.0221)\n", + "('summarization', 'NOUN') (0.0219)\n", "\n", "\n", - "\n", + "\n", "('algorithm', 'NOUN')->('summarization', 'NOUN')\n", "\n", "\n", "\n", - "\n", - "\n", - "('base', 'VERB')\n", - "\n", - "('base', 'VERB') (0.0124)\n", - "\n", - "\n", - "\n", - "('algorithm', 'NOUN')->('base', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('weight', 'VERB')\n", - "\n", - "('weight', 'VERB') (0.0144)\n", - "\n", - "\n", - "\n", - "('algorithm', 'NOUN')->('weight', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('graph', 'NOUN')\n", - "\n", - "('graph', 'NOUN') (0.0213)\n", - "\n", - "\n", - "\n", - "('algorithm', 'NOUN')->('graph', 'NOUN')\n", - "\n", - "\n", - "\n", "\n", - "\n", + "\n", "('algorithm', 'NOUN')->('Google', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('use', 'VERB')\n", - "\n", - "('use', 'VERB') (0.0146)\n", + "\n", + "('use', 'VERB') (0.0149)\n", "\n", "\n", - "\n", + "\n", "('algorithm', 'NOUN')->('use', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('rank', 'VERB')\n", - "\n", - "('rank', 'VERB') (0.0121)\n", + "\n", + "('rank', 'VERB') (0.0114)\n", "\n", "\n", - "\n", + "\n", "('algorithm', 'NOUN')->('rank', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('implementation', 'NOUN')->('text', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('implementation', 'NOUN')->('summarization', 'NOUN')\n", "\n", "\n", @@ -1009,22 +1122,22 @@ "\n", "('exist', 'VERB')\n", "\n", - "('exist', 'VERB') (0.0125)\n", + "('exist', 'VERB') (0.0126)\n", "\n", "\n", - "\n", + "\n", "('implementation', 'NOUN')->('exist', 'VERB')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('summarization', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('exist', 'VERB')\n", "\n", "\n", @@ -1033,64 +1146,64 @@ "\n", "('today', 'NOUN')\n", "\n", - "('today', 'NOUN') (0.0132)\n", + "('today', 'NOUN') (0.0136)\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('today', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('remove', 'VERB')\n", - "\n", - "('remove', 'VERB') (0.0102)\n", + "\n", + "('remove', 'VERB') (0.0132)\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('remove', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stop', 'VERB')\n", - "\n", - "('stop', 'VERB') (0.0115)\n", + "\n", + "('stop', 'VERB') (0.0140)\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('stop', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stem', 'VERB')\n", - "\n", - "('stem', 'VERB') (0.0144)\n", + "\n", + "('stem', 'VERB') (0.0166)\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('stem', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('summarization', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('exist', 'VERB')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('today', 'NOUN')\n", "\n", "\n", @@ -1102,7 +1215,7 @@ "('module', 'NOUN') (0.0148)\n", "\n", "\n", - "\n", + "\n", 
"('summarization', 'NOUN')->('module', 'NOUN')\n", "\n", "\n", @@ -1114,7 +1227,7 @@ "('gensim', 'NOUN') (0.0154)\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('gensim', 'NOUN')\n", "\n", "\n", @@ -1123,58 +1236,58 @@ "\n", "('implement', 'NOUN')\n", "\n", - "('implement', 'NOUN') (0.0160)\n", + "('implement', 'NOUN') (0.0137)\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('implement', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('exist', 'VERB')->('summarization', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('exist', 'VERB')->('today', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('exist', 'VERB')->('module', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('today', 'NOUN')->('summarization', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('today', 'NOUN')->('module', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('today', 'NOUN')->('gensim', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('module', 'NOUN')->('gensim', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('module', 'NOUN')->('implement', 'NOUN')\n", "\n", "\n", @@ -1183,22 +1296,22 @@ "\n", "('TextRank', 'PROPN')\n", "\n", - "('TextRank', 'PROPN') (0.0199)\n", + "('TextRank', 'PROPN') (0.0179)\n", "\n", "\n", - "\n", + "\n", "('module', 'NOUN')->('TextRank', 'PROPN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('gensim', 'NOUN')->('implement', 'NOUN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('gensim', 'NOUN')->('TextRank', 'PROPN')\n", "\n", "\n", @@ -1207,1195 +1320,1249 @@ "\n", "('unsupervised', 'ADJ')\n", "\n", - "('unsupervised', 'ADJ') (0.0148)\n", + "('unsupervised', 'ADJ') (0.0120)\n", "\n", "\n", - "\n", + "\n", "('gensim', 'NOUN')->('unsupervised', 'ADJ')\n", "\n", "\n", "\n", - "\n", - "\n", - "('implement', 'NOUN')->('algorithm', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('gensim', 'NOUN')->('use', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('Okapi', 'PROPN')\n", + "\n", + "('Okapi', 'PROPN') (0.0107)\n", + "\n", + "\n", + "\n", + "('gensim', 'NOUN')->('Okapi', 'PROPN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('implement', 'NOUN')->('TextRank', 'PROPN')\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "('implement', 'NOUN')->('unsupervised', 'ADJ')\n", "\n", "\n", "\n", - "\n", - "\n", - "('TextRank', 'PROPN')->('algorithm', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('algorithm', 'PROPN')\n", + "\n", + "('algorithm', 'PROPN') (0.0166)\n", + "\n", + "\n", + "\n", + "('implement', 'NOUN')->('algorithm', 'PROPN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('unsupervised', 'ADJ')\n", "\n", "\n", "\n", + "\n", + "\n", + "('TextRank', 'PROPN')->('algorithm', 'PROPN')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('base', 'VERB')\n", + "\n", + "('base', 'VERB') (0.0129)\n", + "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('base', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('work', 'VERB')\n", - "\n", - "('work', 'VERB') (0.0059)\n", - "\n", - "\n", - "\n", - "('TextRank', 'PROPN')->('work', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('follow', 'VERB')\n", - "\n", - "('follow', 'VERB') (0.0109)\n", - "\n", - "\n", - "\n", - "('TextRank', 'PROPN')->('follow', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')\n", - "\n", - "('sentence', 'NOUN') (0.0889)\n", + "\n", + "('sentence', 'NOUN') (0.0830)\n", "\n", 
"\n", - "\n", + "\n", "('TextRank', 'PROPN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')\n", - "\n", - "('edge', 'NOUN') (0.0421)\n", + "\n", + "('edge', 'NOUN') (0.0400)\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')\n", - "\n", - "('weight', 'NOUN') (0.0071)\n", + "\n", + "('weight', 'NOUN') (0.0071)\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('weight', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('Okapi', 'PROPN')\n", - "\n", - "('Okapi', 'PROPN') (0.0096)\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('Okapi', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('BM25', 'PROPN')\n", - "\n", - "('BM25', 'PROPN') (0.0111)\n", + "\n", + "('BM25', 'PROPN') (0.0115)\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('BM25', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('unsupervised', 'ADJ')->('algorithm', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('unsupervised', 'ADJ')->('algorithm', 'PROPN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('unsupervised', 'ADJ')->('base', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('weight', 'VERB')\n", + "\n", + "('weight', 'VERB') (0.0148)\n", "\n", "\n", - "\n", + "\n", "('unsupervised', 'ADJ')->('weight', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('algorithm', 'PROPN')->('base', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('algorithm', 'PROPN')->('weight', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('graph', 'NOUN')\n", + "\n", + "('graph', 'NOUN') (0.0237)\n", + "\n", + "\n", + "\n", + "('algorithm', 'PROPN')->('graph', 'NOUN')\n", + "\n", + "\n", "\n", "\n", "\n", "('base', 'VERB')->('weight', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('base', 'VERB')->('graph', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')\n", - "\n", - "('paper', 'NOUN') (0.0162)\n", + "\n", + "('paper', 'NOUN') (0.0173)\n", "\n", "\n", "\n", "('base', 'VERB')->('paper', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('weight', 'VERB')->('graph', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('weight', 'VERB')->('paper', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mihalcea', 'PROPN')\n", - "\n", - "('Mihalcea', 'PROPN') (0.0138)\n", + "\n", + "('Mihalcea', 'PROPN') (0.0149)\n", "\n", "\n", "\n", "('weight', 'VERB')->('Mihalcea', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('graph', 'NOUN')->('paper', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('graph', 'NOUN')->('Mihalcea', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('et', 'PROPN')\n", - "\n", - "('et', 'PROPN') (0.0224)\n", + "\n", + "('et', 'PROPN') (0.0241)\n", "\n", "\n", "\n", "('graph', 'NOUN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('vertex', 'NOUN')\n", - "\n", - "('vertex', 'NOUN') (0.0086)\n", + "\n", + "('vertex', 'NOUN') (0.0095)\n", "\n", "\n", "\n", "('graph', 'NOUN')->('vertex', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('be', 'VERB')\n", - "\n", - "('be', 'VERB') (0.0123)\n", - "\n", - "\n", - "\n", - "('graph', 'NOUN')->('be', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('graph', 
'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')->('Mihalcea', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('al', 'PROPN')\n", - "\n", - "('al', 'PROPN') (0.0372)\n", + "\n", + "('al', 'PROPN') (0.0393)\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')->('al', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('Barrios', 'PROPN')\n", - "\n", - "('Barrios', 'PROPN') (0.0079)\n", + "\n", + "('Barrios', 'PROPN') (0.0079)\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')->('Barrios', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mihalcea', 'PROPN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mihalcea', 'PROPN')->('al', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('et', 'PROPN')->('al', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('add', 'VERB')\n", - "\n", - "('add', 'VERB') (0.0044)\n", + "\n", + "('add', 'VERB') (0.0042)\n", "\n", "\n", - "\n", + "\n", "('incubator', 'NOUN')\n", - "\n", - "('incubator', 'NOUN') (0.0056)\n", + "\n", + "('incubator', 'NOUN') (0.0055)\n", "\n", "\n", - "\n", + "\n", "('add', 'VERB')->('incubator', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('student', 'NOUN')\n", - "\n", - "('student', 'NOUN') (0.0072)\n", + "\n", + "('student', 'NOUN') (0.0070)\n", "\n", "\n", - "\n", + "\n", "('add', 'VERB')->('student', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Olavur', 'PROPN')\n", - "\n", - "('Olavur', 'PROPN') (0.0092)\n", + "\n", + "('Olavur', 'PROPN') (0.0090)\n", "\n", "\n", - "\n", + "\n", "('add', 'VERB')->('Olavur', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('incubator', 'NOUN')->('student', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('incubator', 'NOUN')->('Olavur', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mortensen', 'PROPN')\n", - "\n", - "('Mortensen', 'PROPN') (0.0106)\n", + "\n", + "('Mortensen', 'PROPN') (0.0103)\n", "\n", "\n", - "\n", + "\n", "('incubator', 'NOUN')->('Mortensen', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('student', 'NOUN')->('Olavur', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('student', 'NOUN')->('Mortensen', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')\n", - "\n", - "('see', 'VERB') (0.0215)\n", + "\n", + "('see', 'VERB') (0.0217)\n", "\n", "\n", - "\n", + "\n", "('student', 'NOUN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Olavur', 'PROPN')->('Mortensen', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Olavur', 'PROPN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('previous', 'ADJ')\n", - "\n", - "('previous', 'ADJ') (0.0136)\n", + "\n", + "('previous', 'ADJ') (0.0134)\n", "\n", "\n", - "\n", + "\n", "('Olavur', 'PROPN')->('previous', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mortensen', 'PROPN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mortensen', 'PROPN')->('previous', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('post', 
'NOUN')\n", - "\n", - "('post', 'NOUN') (0.0168)\n", + "\n", + "('post', 'NOUN') (0.0165)\n", "\n", "\n", - "\n", + "\n", "('Mortensen', 'PROPN')->('post', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('previous', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('post', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('blog', 'NOUN')\n", - "\n", - "('blog', 'NOUN') (0.0281)\n", + "\n", + "('blog', 'NOUN') (0.0277)\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('blog', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('similar', 'ADJ')\n", - "\n", - "('similar', 'ADJ') (0.0229)\n", + "\n", + "('similar', 'ADJ') (0.0227)\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('previous', 'ADJ')->('post', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('previous', 'ADJ')->('blog', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('post', 'NOUN')->('blog', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('build', 'VERB')\n", - "\n", - "('build', 'VERB') (0.0044)\n", + "\n", + "('build', 'VERB') (0.0042)\n", "\n", "\n", - "\n", + "\n", "('build', 'VERB')->('popular', 'ADJ')\n", - "\n", + "\n", "\n", "\n", "\n", - "\n", + "\n", "('top', 'NOUN')\n", - "\n", - "('top', 'NOUN') (0.0056)\n", + "\n", + "('top', 'NOUN') (0.0055)\n", "\n", "\n", - "\n", + "\n", "('build', 'VERB')->('top', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('build', 'VERB')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('top', 'NOUN')->('popular', 'ADJ')\n", - "\n", + "\n", "\n", "\n", "\n", - "\n", + "\n", "('top', 'NOUN')->('algorithm', 'NOUN')\n", - "\n", + "\n", "\n", "\n", "\n", - "\n", + "\n", "('top', 'NOUN')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('PageRank', 'PROPN')->('algorithm', 'NOUN')\n", - "\n", + "\n", "\n", "\n", + "\n", + "\n", + "('PageRank', 'PROPN')->('algorithm', 'PROPN')\n", + "\n", + "\n", + "\n", "\n", "\n", "('PageRank', 'PROPN')->('graph', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('PageRank', 'PROPN')->('Google', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('PageRank', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('score', 'NOUN')\n", - "\n", - "('score', 'NOUN') (0.0112)\n", + "\n", + "('score', 'NOUN') (0.0109)\n", "\n", "\n", "\n", "('PageRank', 'PROPN')->('score', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('Google', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('Google', 'PROPN')->('rank', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('webpage', 'NOUN')\n", - "\n", - "('webpage', 'NOUN') (0.0201)\n", + "\n", + "('webpage', 'NOUN') (0.0193)\n", "\n", "\n", "\n", "('Google', 'PROPN')->('webpage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('use', 'VERB')->('rank', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('use', 'VERB')->('webpage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('use', 'VERB')->('Okapi', 'PROPN')\n", - "\n", - "\n", + "\n", + 
"\n", "\n", "\n", "\n", "('use', 'VERB')->('BM25', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('function', 'NOUN')\n", - "\n", - "('function', 'NOUN') (0.0127)\n", + "\n", + "('function', 'NOUN') (0.0131)\n", "\n", "\n", "\n", "('use', 'VERB')->('function', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('rank', 'VERB')->('webpage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", + "\n", + "\n", + "('textrank', 'NOUN')\n", + "\n", + "('textrank', 'NOUN') (0.0042)\n", + "\n", + "\n", + "\n", + "('work', 'VERB')\n", + "\n", + "('work', 'VERB') (0.0055)\n", + "\n", + "\n", "\n", - "('work', 'VERB')->('follow', 'VERB')\n", - "\n", - "\n", + "('textrank', 'NOUN')->('work', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('follow', 'VERB')\n", + "\n", + "('follow', 'VERB') (0.0070)\n", + "\n", + "\n", + "\n", + "('textrank', 'NOUN')->('follow', 'VERB')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Pre', 'NOUN')\n", - "\n", - "('Pre', 'NOUN') (0.0044)\n", + "\n", + "('Pre', 'NOUN') (0.0090)\n", "\n", - "\n", + "\n", + "\n", + "('textrank', 'NOUN')->('Pre', 'NOUN')\n", + "\n", + "\n", + "\n", + "\n", "\n", - "('Pre', 'NOUN')->('text', 'NOUN')\n", - "\n", - "\n", + "('work', 'VERB')->('follow', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('work', 'VERB')->('Pre', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('-', 'NOUN')\n", - "\n", - "('-', 'NOUN') (0.0056)\n", + "\n", + "('-', 'NOUN') (0.0103)\n", "\n", - "\n", - "\n", - "('Pre', 'NOUN')->('-', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('work', 'VERB')->('-', 'NOUN')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('follow', 'VERB')->('Pre', 'NOUN')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('follow', 'VERB')->('-', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('process', 'VERB')\n", - "\n", - "('process', 'VERB') (0.0072)\n", + "\n", + "('process', 'VERB') (0.0117)\n", + "\n", + "\n", + "\n", + "('follow', 'VERB')->('process', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('Pre', 'NOUN')->('text', 'NOUN')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('Pre', 'NOUN')->('-', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Pre', 'NOUN')->('process', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('-', 'NOUN')->('text', 'NOUN')\n", - "\n", + "\n", "\n", "\n", "\n", - "\n", + "\n", "('-', 'NOUN')->('process', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('-', 'NOUN')->('remove', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('process', 'VERB')->('text', 'NOUN')\n", - "\n", + "\n", "\n", "\n", "\n", - "\n", + "\n", "('process', 'VERB')->('remove', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('process', 'VERB')->('stop', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('remove', 'VERB')->('stop', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('remove', 'VERB')->('stem', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('remain', 'VERB')\n", - "\n", - "('remain', 'VERB') (0.0243)\n", + "\n", + "('remain', 'VERB') (0.0280)\n", "\n", "\n", - "\n", + "\n", "('remove', 'VERB')->('remain', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stop', 'VERB')->('stem', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stop', 'VERB')->('remain', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + 
"\n", "('stem', 'VERB')->('remain', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('create', 'VERB')\n", - "\n", - "('create', 'VERB') (0.0044)\n", + "\n", + "('create', 'VERB') (0.0042)\n", "\n", "\n", - "\n", + "\n", "('create', 'VERB')->('graph', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('create', 'VERB')->('vertex', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('create', 'VERB')->('be', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertex', 'NOUN')->('be', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", + "('create', 'VERB')->('sentence', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('vertex', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('be', 'VERB')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('other', 'ADJ')\n", - "\n", - "('other', 'ADJ') (0.0182)\n", + "\n", + "('other', 'ADJ') (0.0172)\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('other', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('percentage', 'NOUN')\n", - "\n", - "('percentage', 'NOUN') (0.0251)\n", + "\n", + "('percentage', 'NOUN') (0.0238)\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('percentage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('appear', 'VERB')\n", - "\n", - "('appear', 'VERB') (0.0454)\n", + "\n", + "('appear', 'VERB') (0.0431)\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('appear', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('connect', 'VERB')\n", - "\n", - "('connect', 'VERB') (0.0044)\n", + "\n", + "('connect', 'VERB') (0.0042)\n", "\n", "\n", - "\n", + "\n", "('connect', 'VERB')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('connect', 'VERB')->('other', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('other', 'ADJ')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('other', 'ADJ')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')->('percentage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')->('appear', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')->('percentage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('similar', 'ADJ')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('run', 'VERB')\n", - "\n", - "('run', 'VERB') (0.0044)\n", + "\n", + "('run', 'VERB') (0.0042)\n", "\n", - "\n", - "\n", - "('run', 'VERB')->('algorithm', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('run', 
'VERB')->('algorithm', 'PROPN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('run', 'VERB')->('graph', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('run', 'VERB')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('pick', 'VERB')\n", - "\n", - "('pick', 'VERB') (0.0044)\n", + "\n", + "('pick', 'VERB') (0.0042)\n", "\n", "\n", - "\n", + "\n", "('pick', 'VERB')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertices(sentences', 'PROPN')\n", - "\n", - "('vertices(sentences', 'PROPN') (0.0056)\n", + "\n", + "\n", + "('vertices(sentence', 'NOUN')\n", + "\n", + "('vertices(sentence', 'NOUN') (0.0055)\n", "\n", - "\n", - "\n", - "('pick', 'VERB')->('vertices(sentences', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", + "('pick', 'VERB')->('vertices(sentence', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('high', 'ADJ')\n", - "\n", - "('high', 'ADJ') (0.0072)\n", + "\n", + "('high', 'ADJ') (0.0070)\n", "\n", "\n", - "\n", + "\n", "('pick', 'VERB')->('high', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertices(sentences', 'PROPN')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", + "('vertices(sentence', 'NOUN')->('PageRank', 'PROPN')\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertices(sentences', 'PROPN')->('high', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", + "('vertices(sentence', 'NOUN')->('high', 'ADJ')\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertices(sentences', 'PROPN')->('score', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('vertices(sentence', 'NOUN')->('score', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('high', 'ADJ')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('high', 'ADJ')->('score', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('original', 'ADJ')\n", - "\n", - "('original', 'ADJ') (0.0044)\n", + "\n", + "('original', 'ADJ') (0.0042)\n", "\n", "\n", - "\n", + "\n", "('original', 'ADJ')->('TextRank', 'PROPN')\n", - "\n", + "\n", "\n", "\n", "\n", - "\n", + "\n", "('original', 'ADJ')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('original', 'ADJ')->('weight', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('percentage', 'NOUN')->('appear', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('Gensim', 'PROPN')\n", - "\n", - "('Gensim', 'PROPN') (0.0044)\n", - "\n", - "\n", - "\n", - "('Gensim', 'PROPN')->('TextRank', 'PROPN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('Gensim', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('Gensim', 'PROPN')->('Okapi', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Okapi', 'PROPN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Okapi', 'PROPN')->('BM25', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Okapi', 'PROPN')->('function', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('BM25', 'PROPN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('BM25', 'PROPN')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('BM25', 'PROPN')->('function', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('function', 'NOUN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", 
"('function', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('function', 'NOUN')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('improvement', 'NOUN')\n", - "\n", - "('improvement', 'NOUN') (0.0044)\n", + "\n", + "('improvement', 'NOUN') (0.0042)\n", "\n", "\n", - "\n", + "\n", "('improvement', 'NOUN')->('paper', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('improvement', 'NOUN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('improvement', 'NOUN')->('Barrios', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Barrios', 'PROPN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Barrios', 'PROPN')->('al', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 19, @@ -2440,21 +2607,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: altair in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (4.2.0)\n", - "Requirement already satisfied: jsonschema>=3.0 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (4.4.0)\n", - "Requirement already satisfied: jinja2 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (3.0.3)\n", - "Requirement already satisfied: numpy in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (1.22.3)\n", - "Requirement already satisfied: entrypoints in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (0.4)\n", - "Requirement already satisfied: toolz in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (0.11.2)\n", - "Requirement already satisfied: pandas>=0.18 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (1.4.1)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from jsonschema>=3.0->altair) (0.18.1)\n", - "Requirement already satisfied: attrs>=17.4.0 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from jsonschema>=3.0->altair) (21.4.0)\n", - "Requirement already satisfied: importlib-resources>=1.4.0; python_version < \"3.9\" in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from jsonschema>=3.0->altair) (5.4.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from jinja2->altair) (2.1.0)\n", - "Requirement already satisfied: pytz>=2020.1 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from pandas>=0.18->altair) (2021.3)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from pandas>=0.18->altair) (2.8.2)\n", - "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from importlib-resources>=1.4.0; python_version < \"3.9\"->jsonschema>=3.0->altair) (3.7.0)\n", - "Requirement already satisfied: six>=1.5 in 
/home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from python-dateutil>=2.8.1->pandas>=0.18->altair) (1.16.0)\n" + "Requirement already satisfied: altair in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (5.0.1)\n", + "Requirement already satisfied: numpy in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from altair) (1.25.2)\n", + "Requirement already satisfied: jsonschema>=3.0 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from altair) (4.19.0)\n", + "Requirement already satisfied: toolz in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from altair) (0.12.0)\n", + "Requirement already satisfied: typing-extensions>=4.0.1 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from altair) (4.7.1)\n", + "Requirement already satisfied: pandas>=0.18 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from altair) (2.0.3)\n", + "Requirement already satisfied: jinja2 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from altair) (3.1.2)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from jsonschema>=3.0->altair) (2023.7.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from jsonschema>=3.0->altair) (0.30.2)\n", + "Requirement already satisfied: attrs>=22.2.0 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from jsonschema>=3.0->altair) (23.1.0)\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from jsonschema>=3.0->altair) (0.9.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from pandas>=0.18->altair) (2023.3)\n", + "Requirement already satisfied: tzdata>=2022.1 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from pandas>=0.18->altair) (2023.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from pandas>=0.18->altair) (2.8.2)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from jinja2->altair) (2.1.3)\n", + "Requirement already satisfied: six>=1.5 in /home/ankush/workplace/os_repos/pytextrank/venv/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=0.18->altair) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], @@ -2472,19 +2643,30 @@ "data": { "text/html": [ "\n", - "
\n", + "\n", + "
\n", ""
    ],
    "text/plain": [
     "<IPython.core.display.HTML object>"
    ]
   },
@@ -2570,10 +2752,10 @@
    "output_type": "stream",
    "text": [
     "ic| sent: First, a quick description of some popular algorithms & implementations for text summarization that exist today: the summarization module in gensim implements TextRank, an unsupervised algorithm based on weighted-graphs from a paper by Mihalcea et al.\n",
-    "ic| sent: Gensim’s TextRank uses Okapi BM25 function to see how similar the sentences are.\n",
+    "ic| sent: It is an improvement from a paper by Barrios et al.\n",
     "ic| sent: It is built on top of the popular PageRank algorithm that Google used for ranking webpages.\n",
     "ic| sent: Create a graph where vertices are sentences.\n",
-    "ic| sent: Run the PageRank algorithm on the graph.\n"
+    "ic| sent: In original TextRank the weights of an edge between two sentences is the percentage of words appearing in both of them.\n"
    ]
   }
  ],
@@ -2663,16 +2845,37 @@
    "name": "stderr",
    "output_type": "stream",
    "text": [
-    "ic| phrase: Phrase(text='Salomon Rondón', chunks=[Salomon Rondón, Salomón Rondón, Rondón], count=3, rank=0.07866221348202057)\n",
-    "ic| phrase: Phrase(text='Chelsea', chunks=[Chelsea, Chelsea, Chelsea], count=3, rank=0.06832817272016853)\n",
-    "ic| phrase: Phrase(text='Olivier Giroud', chunks=[Olivier Giroud, Giroud], count=2, rank=0.05574966582168716)\n",
-    "ic| phrase: Phrase(text='deadline day', chunks=[deadline day, deadline day], count=2, rank=0.05008120527495589)\n",
+    "ic| phrase: Phrase(text='Salomon Rondón',\n",
+    "                    chunks=[Salomon Rondón, Salomón Rondón, Rondón],\n",
+    "                    count=3,\n",
+    "                    rank=0.07866221348202057)\n",
+    "ic| phrase: Phrase(text='Chelsea',\n",
+    "                    chunks=[Chelsea, Chelsea, Chelsea],\n",
+    "                    count=3,\n",
+    "                    rank=0.06832817272016853)\n",
+    "ic| phrase: Phrase(text='Olivier Giroud',\n",
+    "                    chunks=[Olivier Giroud, Giroud],\n",
+    "                    count=2,\n",
+    "                    rank=0.05574966582168716)\n",
+    "ic| phrase: Phrase(text='deadline day',\n",
+    "                    chunks=[deadline day, deadline day],\n",
+    "                    count=2,\n",
+    "                    rank=0.05008120527495589)\n",
     "ic| phrase: Phrase(text='Leicester', chunks=[Leicester], count=1, rank=0.039067778208486274)\n",
     "ic| phrase: Phrase(text='club', chunks=[club], count=1, rank=0.037625206033098234)\n",
-    "ic| phrase: Phrase(text='Edinson Cavani', chunks=[Edinson Cavani], count=1, rank=0.03759951959121995)\n",
+    "ic| phrase: Phrase(text='Edinson Cavani',\n",
+    "                    chunks=[Edinson Cavani],\n",
+    "                    count=1,\n",
+    "                    rank=0.03759951959121995)\n",
     "ic| phrase: Phrase(text='draw', chunks=[draw], count=1, rank=0.037353607917351345)\n",
-    "ic| phrase: Phrase(text='Manchester United', chunks=[Manchester United], count=1, rank=0.035757812045215435)\n",
-    "ic| phrase: Phrase(text='Dalian Yifang', chunks=[Dalian Yifang], count=1, rank=0.03570018233618092)\n"
+    "ic| phrase: Phrase(text='Manchester United',\n",
+    "                    chunks=[Manchester United],\n",
+    "                    count=1,\n",
+    "                    rank=0.035757812045215435)\n",
+    "ic| phrase: Phrase(text='Dalian Yifang',\n",
+    "                    chunks=[Dalian Yifang],\n",
+    "                    count=1,\n",
+    "                    rank=0.03570018233618092)\n"
    ]
   }
  ],
@@ -2727,16 +2930,43 @@
    "name": "stderr",
    "output_type": "stream",
    "text": [
-    "ic| phrase: Phrase(text='Leicester', chunks=[Leicester, Leicester], count=2, rank=0.26184834028994514)\n",
-    "ic| phrase: Phrase(text='Saturday', chunks=[Saturday, Saturday], count=2, rank=0.13938186779355857)\n",
-    "ic| phrase: Phrase(text='the last 13 Premier League matches', chunks=[the last 13 Premier League matches], count=1, rank=0.12502820319236171)\n",
+    "ic| phrase: Phrase(text='Leicester',\n",
+    "                    chunks=[Leicester, Leicester],\n",
+    "                    count=2,\n",
+    "                    rank=0.26184834028994514)\n",
+    "ic| phrase: Phrase(text='Saturday',\n",
+    "                    chunks=[Saturday, Saturday],\n",
+    "                    count=2,\n",
+    "                    rank=0.13938186779355857)\n",
+    "ic| phrase: Phrase(text='the last 13 Premier League matches',\n",
+    "                    chunks=[the last 13 Premier League matches],\n",
+    "                    count=1,\n",
+    "                    rank=0.12502820319236171)\n",
     "ic| phrase: Phrase(text='none', chunks=[none], count=1, rank=1.9498221604845646e-07)\n",
-    "ic| phrase: Phrase(text='Moussa Dembele', chunks=[Moussa Dembele, Moussa Dembele], count=2, rank=8.640024414329197e-08)\n",
-    "ic| phrase: Phrase(text='Dries Mertens', chunks=[Dries Mertens, Dries Mertens], count=2, rank=5.152284728493906e-08)\n",
-    "ic| phrase: Phrase(text='Edinson Cavani', chunks=[Edinson Cavani], count=1, rank=3.076049036231119e-08)\n",
-    "ic| phrase: Phrase(text='a new centre', chunks=[a new centre], count=1, rank=2.7737546970070932e-08)\n",
-    "ic| phrase: Phrase(text='deadline day', chunks=[deadline day, deadline day], count=2, rank=1.3752326412669907e-08)\n",
-    "ic| phrase: Phrase(text='their long search', chunks=[their long search], count=1, rank=1.1267201943238505e-08)\n"
+    "ic| phrase: Phrase(text='Moussa Dembele',\n",
+    "                    chunks=[Moussa Dembele, Moussa Dembele],\n",
+    "                    count=2,\n",
+    "                    rank=8.640024414329197e-08)\n",
+    "ic| phrase: Phrase(text='Dries Mertens',\n",
+    "                    chunks=[Dries Mertens, Dries Mertens],\n",
+    "                    count=2,\n",
+    "                    rank=5.152284728493906e-08)\n",
+    "ic| phrase: Phrase(text='Edinson Cavani',\n",
+    "                    chunks=[Edinson Cavani],\n",
+    "                    count=1,\n",
+    "                    rank=3.076049036231119e-08)\n",
+    "ic| phrase: Phrase(text='a new centre',\n",
+    "                    chunks=[a new centre],\n",
+    "                    count=1,\n",
+    "                    rank=2.7737546970070932e-08)\n",
+    "ic| phrase: Phrase(text='the Blues targeted Edinson Cavani',\n",
+    "                    chunks=[the Blues targeted Edinson Cavani],\n",
+    "                    count=1,\n",
+    "                    rank=1.9405864014707633e-08)\n",
+    "ic| phrase: Phrase(text='deadline day',\n",
+    "                    chunks=[deadline day, deadline day],\n",
+    "                    count=2,\n",
+    "                    rank=1.3752326412669907e-08)\n"
    ]
   }
  ],
@@ -2752,29 +2982,29 @@
  },
  {
   "cell_type": "markdown",
-   "source": [
-    "The top-ranked phrases from *Biased TextRank* are closely related to the \"focus\" item: `Leicester`"
-   ],
+   "id": "bdbbce88",
   "metadata": {
-    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
-   }
+   },
+   "source": [
+    "The top-ranked phrases from *Biased TextRank* are closely related to the \"focus\" item: `Leicester`"
+   ]
  },
  {
   "cell_type": "markdown",
-   "source": [
-    "## Using PositionRank\n",
-    "\n",
-    "The *PositionRank* enhanced algorithm is simple to use in the `spaCy` pipeline and it supports the other features described above:"
-   ],
+   "id": "82178f5a",
   "metadata": {
-    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
-   }
+   },
+   "source": [
+    "## Using PositionRank\n",
+    "\n",
+    "The *PositionRank* enhanced algorithm is simple to use in the `spaCy` pipeline and it supports the other features described above:"
+   ]
  },
  {
   "cell_type": "markdown",
@@ -2810,16 +3040,37 @@
    "name": "stderr",
    "output_type": "stream",
    "text": [
-    "ic| phrase: Phrase(text='deadline day', chunks=[deadline day, deadline day], count=2, rank=0.1671249044190727)\n",
-    "ic| phrase: Phrase(text='Salomon Rondón', chunks=[Salomon Rondón, Salomon Rondón], count=2, rank=0.14836718147498046)\n",
-    "ic| phrase: Phrase(text='Salomón Rondón', chunks=[Salomón Rondón, Salomón Rondón], count=2, rank=0.14169986334846618)\n",
-    "ic| phrase: Phrase(text='Chelsea', chunks=[Chelsea, Chelsea, Chelsea, Chelsea, Chelsea, Chelsea], count=6, rank=0.13419811872859874)\n",
-    "ic| phrase: Phrase(text='Rondón', chunks=[Rondón, Rondón], count=2, rank=0.12722264594603172)\n",
-    "ic| phrase: Phrase(text='a new centre', chunks=[a new centre], count=1, rank=0.09181159181129885)\n",
+    "ic| phrase: Phrase(text='deadline day',\n",
+    "                    chunks=[deadline day, deadline day],\n",
+    "                    count=2,\n",
+    "                    rank=0.1671249044190727)\n",
+    "ic| phrase: Phrase(text='Salomon Rondón',\n",
+    "                    chunks=[Salomon Rondón, Salomon Rondón],\n",
+    "                    count=2,\n",
+    "                    rank=0.14836718147498046)\n",
+    "ic| phrase: Phrase(text='Salomón Rondón',\n",
+    "                    chunks=[Salomón Rondón, Salomón Rondón],\n",
+    "                    count=2,\n",
+    "                    rank=0.14169986334846618)\n",
+    "ic| phrase: Phrase(text='Chelsea',\n",
+    "                    chunks=[Chelsea, Chelsea, Chelsea, Chelsea],\n",
+    "                    count=4,\n",
+    "                    rank=0.13419811872859874)\n",
+    "ic| phrase: Phrase(text='Rondón', chunks=[Rondón], count=1, rank=0.12722264594603172)\n",
+    "ic| phrase: Phrase(text='a new centre',\n",
+    "                    chunks=[a new centre],\n",
+    "                    count=1,\n",
+    "                    rank=0.09181159181129885)\n",
     "ic| phrase: Phrase(text='Giroud', chunks=[Giroud, Giroud], count=2, rank=0.0783201596831592)\n",
-    "ic| phrase: Phrase(text='Olivier Giroud', chunks=[Olivier Giroud, Olivier Giroud], count=2, rank=0.07805316118093475)\n",
+    "ic| phrase: Phrase(text='Olivier Giroud',\n",
+    "                    chunks=[Olivier Giroud, Olivier Giroud],\n",
+    "                    count=2,\n",
+    "                    rank=0.07805316118093475)\n",
     "ic| phrase: Phrase(text='none', chunks=[none], count=1, rank=0.07503538984105931)\n",
-    "ic| phrase: Phrase(text='their long search', chunks=[their long search], count=1, rank=0.07449683199895643)\n"
+    "ic| phrase: Phrase(text='their long search',\n",
+    "                    chunks=[their long search],\n",
+    "                    count=1,\n",
+    "                    rank=0.07449683199895643)\n"
    ]
   }
  ],
@@ -2861,16 +3112,41 @@
    "name": "stderr",
    "output_type": "stream",
    "text": [
-    "ic| phrase: Phrase(text='Shanghai Shenhua striker Odion Ighalo', chunks=[Shanghai Shenhua striker Odion Ighalo], count=1, rank=0.11863090071749424)\n",
-    "ic| phrase: Phrase(text='Odion Ighalo', chunks=[Odion Ighalo], count=1, rank=0.10925286108900635)\n",
+    "ic| phrase: Phrase(text='Shanghai Shenhua striker Odion Ighalo',\n",
+    "                    chunks=[Shanghai Shenhua striker Odion Ighalo,\n",
+    "                            Shanghai Shenhua striker Odion Ighalo],\n",
+    "                    count=2,\n",
+    "                    rank=0.11863090071749424)\n",
     "ic| phrase: Phrase(text='none', chunks=[none], count=1, rank=0.09802416183300769)\n",
-    "ic| phrase: Phrase(text='Moussa Dembele', chunks=[Moussa Dembele, Moussa Dembele], count=2, rank=0.09341044332809736)\n",
-    "ic| phrase: Phrase(text='deadline day', chunks=[deadline day, deadline day], count=2, rank=0.09046182507994752)\n",
-    "ic| phrase: Phrase(text='Dries Mertens', chunks=[Dries Mertens, Dries Mertens], count=2, rank=0.08919649435994934)\n",
-    "ic| phrase: Phrase(text='Edinson Cavani', chunks=[Edinson Cavani], count=1, rank=0.08418633972470349)\n",
-    "ic| phrase: Phrase(text='Shanghai Shenhua', chunks=[Shanghai Shenhua], count=1, rank=0.08254442709505862)\n",
-    "ic| phrase: Phrase(text='Salomon Rondón', chunks=[Salomon Rondón, Salomon Rondón], count=2, rank=0.08228367707127111)\n",
-    "ic| phrase: Phrase(text='Salomón Rondón', chunks=[Salomón Rondón, Salomón Rondón], count=2, rank=0.08228367707127111)\n"
+    "ic| phrase: Phrase(text='Moussa Dembele',\n",
+    "                    chunks=[Moussa Dembele, Moussa Dembele],\n",
+    "                    count=2,\n",
+    "                    rank=0.09341044332809736)\n",
+    "ic| phrase: Phrase(text='deadline day',\n",
+    "                    chunks=[deadline day, deadline day],\n",
+    "                    count=2,\n",
+    "                    rank=0.09046182507994752)\n",
+    "ic| phrase: Phrase(text='Dries Mertens',\n",
+    "                    chunks=[Dries Mertens, Dries Mertens],\n",
+    "                    count=2,\n",
+    "                    rank=0.08919649435994934)\n",
+    "ic| phrase: Phrase(text='Edinson Cavani',\n",
+    "                    chunks=[Edinson Cavani],\n",
+    "                    count=1,\n",
+    "                    rank=0.08418633972470349)\n",
+    "ic| phrase: Phrase(text='Salomon Rondón',\n",
+    "                    chunks=[Salomon Rondón, Salomon Rondón],\n",
+    "                    count=2,\n",
+    "                    rank=0.08228367707127111)\n",
+    "ic| phrase: Phrase(text='Salomón Rondón',\n",
+    "                    chunks=[Salomón Rondón, Salomón Rondón],\n",
+    "                    count=2,\n",
+    "                    rank=0.08228367707127111)\n",
+    "ic| phrase: Phrase(text='Rondón', chunks=[Rondón], count=1, rank=0.0750732870664833)\n",
+    "ic| phrase: Phrase(text='Dalian Yifang',\n",
+    "                    chunks=[Dalian Yifang, Dalian Yifang],\n",
+    "                    count=2,\n",
+    "                    rank=0.06681675615287698)\n"
    ]
   }
  ],
@@ -2908,7 +3184,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.12"
   }
  },
 "nbformat": 4,