indexer.py
import json
import operator

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
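
# NOTE: nltk's English stopword list must be installed; if it is missing,
# run nltk.download('stopwords') once before using this class.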


class Indexer:
    """Builds an inverted index over a JSON collection of documents."""

    def __init__(self, json_docs) -> None:
        self.json_docs = json_docs
        self.__docs = self.__getDocs(json_docs)

    def __getDocs(self, json_file):
        # read the document collection from the given JSON file
        with open(json_file) as f:
            return json.load(f)

    def update(self, json_docs=None):
        # reload the current collection, or switch to a new JSON file
        if json_docs is None:
            self.__init__(self.json_docs)
        else:
            self.__init__(json_docs)

    # ? change this paradigm to a better approach
    @classmethod
    def preprocess(cls, text):
        # tokenize the text, then clean the resulting tokens
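        # illustrative example (made-up input, not from the collection):
        #   Indexer.preprocess("Boundary layer effects") -> ['boundari', 'layer', 'effect']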
        def clean(tokens):
            # remove stopwords, stem the remaining words, and drop words
            # shorter than 3 letters
            stemmer = PorterStemmer()
            # lowercase the tokens so they match nltk's lowercase stopword list
            lowered = [w.lower() for w in tokens]
            # load stop words as a set for fast membership tests
            stop_words = set(stopwords.words('english'))
            # remove stopwords before stemming, otherwise stemmed forms
            # (e.g. "wa" from "was") would no longer match the stopword list
            filtered = [w for w in lowered if w not in stop_words]
            # stem the remaining words
            stemmed = [stemmer.stem(w) for w in filtered]
            # drop words shorter than 3 letters
            return [w for w in stemmed if len(w) > 2]

        # tokenizer that keeps only word characters (drops punctuation)
        tokenizer = nltk.RegexpTokenizer(r"\w+")
        tokenized = tokenizer.tokenize(text)
        # clean the tokens
        return clean(tokenized)

    def print(self):
        # initialize counters
        tokens_count = 0  # total number of tokens across all documents
        words = {}  # maps each word to its occurrence count
        # process the documents
        for doc in tqdm(self.__docs, "printing info", bar_format='{desc:<20}{percentage:3.0f}%|{bar:70}{r_bar}'):
            tokens = self.preprocess(doc["body"] + " " + doc["title"])
            tokens_count += len(tokens)
            for token in tokens:
                if token in words:
                    words[token] += 1
                else:
                    words[token] = 1
        # sort the words by occurrence count, ascending
        sorted_words = sorted(words.items(), key=operator.itemgetter(1))
        # collect the words that appear only once (they come first in the ascending sort)
        once = []
        for word, count in sorted_words:
            if count > 1:
                break
            once.append(word)
        # print the collection statistics
        print("Number of tokens:", tokens_count)
        print("The number of unique words:", len(words))
        print("The number of words that occur only once:", len(once))
        print("The 30 most frequent words:\n", sorted_words[-30:])
        print("The average number of word tokens per document:", tokens_count / len(self.__docs))

    def create_schema_file(self, output_filename="schema.json"):
        schema = {}  # I will use an inverted index schema
        # I used this structure: {"word": {docId: tf}}
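        # e.g. one entry might look like {"flow": {"12": 3, "486": 1, "idf": ...}}
        # (doc ids and counts here are made up for illustration; "idf" is filled in below)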
        for doc in tqdm(self.__docs, "indexing documents", bar_format='{desc:<20}{percentage:3.0f}%|{bar:70}{r_bar}'):
            tokens = self.preprocess(doc["body"] + " " + doc["title"])
            for token in tokens:
                if token in schema:
                    if doc["id"] in schema[token]:
                        schema[token][doc["id"]] += 1
                    else:
                        schema[token][doc["id"]] = 1
                else:
                    schema[token] = {doc["id"]: 1}
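        # the "idf" value stored below is the raw ratio N / df (collection size over
        # document frequency), not the log-scaled variant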
        for term in schema:
            schema[term]["idf"] = len(self.__docs) / len(schema[term])
        # write the finished index to disk
        with open(output_filename, "w") as f:
            json.dump(schema, f)
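

# Minimal usage sketch (not part of the original file). It assumes the collection
# lives in a JSON file such as "docs.json" containing a list of objects with
# "id", "title", and "body" fields, which is the structure the methods above expect;
# the file name itself is hypothetical.
if __name__ == "__main__":
    indexer = Indexer("docs.json")
    indexer.print()                            # print collection statistics
    indexer.create_schema_file("schema.json")  # write the inverted index to disk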