"""Extractive text summarizer.

Fetches a web page, scores each sentence by the mean TF-IDF of its
(stemmed, non-stopword) terms, and keeps the sentences scoring above
1.15x the average score.
"""

# IMPORTS
import math
import re
import urllib.request

from bs4 import BeautifulSoup
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


def generate_text(link):
    """Download *link* and return the text of all <p> elements.

    Fixed: paragraphs are now joined with a space — the original used
    bare concatenation, which fused the last word of one paragraph with
    the first word of the next and corrupted tokenization downstream.
    """
    page = urllib.request.urlopen(link)
    soup = BeautifulSoup(page, "lxml")
    return " ".join(paragraph.text for paragraph in soup.find_all('p'))


def create_frequency_matrix(sentences):
    """Map each sentence to a {stemmed word: count} table.

    Stopwords are dropped; remaining words are lowercased and stemmed.
    Each sentence is keyed by its first 15 characters.

    NOTE(review): two sentences sharing the same first 15 characters
    collide on one key, so the later one overwrites the earlier — verify
    this is acceptable for the intended inputs.
    """
    frequency_matrix = {}
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        for word in word_tokenize(sent):
            word = ps.stem(word.lower())
            if word in stop_words:
                continue
            freq_table[word] = freq_table.get(word, 0) + 1
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


def create_tf_matrix(freq_matrix):
    """Term Frequency matrix: each word's count divided by the number of
    distinct (non-stopword) words in its sentence.

    Fixed: a sentence consisting only of stopwords has an empty frequency
    table; the original divided by zero for such sentences.
    """
    tf_matrix = {}
    for sent, f_table in freq_matrix.items():
        count_words_in_sentence = len(f_table)
        if count_words_in_sentence == 0:
            tf_matrix[sent] = {}
            continue
        tf_matrix[sent] = {
            word: count / count_words_in_sentence
            for word, count in f_table.items()
        }
    return tf_matrix


def create_documents_per_words(freq_matrix):
    """Count, for each word, how many sentences ('documents') contain it."""
    word_per_doc_table = {}
    for f_table in freq_matrix.values():
        for word in f_table:
            word_per_doc_table[word] = word_per_doc_table.get(word, 0) + 1
    return word_per_doc_table


def create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    """Inverse Document Frequency matrix:
    log10(total sentences / sentences containing the word)."""
    idf_matrix = {}
    for sent, f_table in freq_matrix.items():
        idf_matrix[sent] = {
            word: math.log10(total_documents / float(count_doc_per_words[word]))
            for word in f_table
        }
    return idf_matrix


def create_tf_idf_matrix(tf_matrix, idf_matrix):
    """Multiply TF and IDF entry-wise.

    Both matrices were built from the same frequency matrix, so they share
    sentence keys and, per sentence, word keys.
    """
    tf_idf_matrix = {}
    for sent, tf_table in tf_matrix.items():
        idf_table = idf_matrix[sent]
        tf_idf_matrix[sent] = {
            word: tf_value * idf_table[word]
            for word, tf_value in tf_table.items()
        }
    return tf_idf_matrix


def score_sentences(tf_idf_matrix) -> dict:
    """Score each sentence as the mean TF-IDF of its words.

    Fixed: sentences with an empty word table (stopwords only) now score
    0.0 instead of raising ZeroDivisionError.
    """
    sentenceValue = {}
    for sent, f_table in tf_idf_matrix.items():
        if not f_table:
            sentenceValue[sent] = 0.0
            continue
        sentenceValue[sent] = sum(f_table.values()) / len(f_table)
    return sentenceValue


def find_average_score(sentenceValue) -> float:
    """Return the mean sentence score, used as the base summary threshold.

    Fixed: the original annotated the return as ``int`` although true
    division always yields a float, and it crashed on an empty mapping.
    """
    if not sentenceValue:
        return 0.0
    return sum(sentenceValue.values()) / len(sentenceValue)


def generate_summary(sentences, sentenceValue, threshold):
    """Concatenate (space-separated) every sentence whose score meets
    *threshold*; sentences are looked up by their first-15-character key."""
    # The original also kept an unused sentence counter; removed.
    summary = ''
    for sentence in sentences:
        key = sentence[:15]
        if key in sentenceValue and sentenceValue[key] >= threshold:
            summary += " " + sentence
    return summary


def summarize(text):
    """Produce an extractive summary of *text*.

    Pipeline: sentence-tokenize, build word-frequency tables, compute TF,
    document-per-word counts, IDF, TF-IDF, score sentences, then keep the
    sentences scoring at least 1.15x the average score.
    """
    # 1 Sentence tokenize
    sentences = sent_tokenize(text)
    total_documents = len(sentences)

    # 2 Frequency matrix of the words in each sentence
    freq_matrix = create_frequency_matrix(sentences)

    # 3 Term Frequency matrix
    tf_matrix = create_tf_matrix(freq_matrix)

    # 4 Documents-per-word table
    count_doc_per_words = create_documents_per_words(freq_matrix)

    # 5 Inverse Document Frequency matrix
    idf_matrix = create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

    # 6 Combined TF-IDF matrix
    tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)

    # 7 Score the sentences
    sentence_scores = score_sentences(tf_idf_matrix)

    # 8 Threshold = 1.15x the average score
    threshold = find_average_score(sentence_scores)

    # 9 Generate the summary
    return generate_summary(sentences, sentence_scores, 1.15 * threshold)


def clean_text(summary):
    """Clean the summary text: drop citation markers like ``[12]``,
    replace characters outside [a-zA-Z0-9+.] with spaces, and collapse
    runs of whitespace.

    The original ran the whitespace-collapse pass twice; reordering so it
    runs last makes a single pass sufficient.
    """
    summary = re.sub(r'\[[0-9]*\]', ' ', summary)
    summary = re.sub(r'[^a-zA-Z0-9+.]', ' ', summary)
    summary = re.sub(r'\s+', ' ', summary)
    return summary


if __name__ == '__main__':
    text_str = generate_text("https://en.wikipedia.org/wiki/Greek_mythology")
    result = summarize(text_str)
    print(clean_text(result))
+ + + +