Skip to content

Commit

Permalink
Merge pull request #96 from xavierfigueroav/improve-tfidf
Browse files Browse the repository at this point in the history
perf(TFIDF): Lower the similarity score threshold to keep matches

Reviewed-by: [email protected]
Tested-by: [email protected]
  • Loading branch information
GMishx authored Mar 28, 2022
2 parents 6cdd410 + 82e89a8 commit df80386
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions atarashi/agents/tfidf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -115,15 +115,15 @@ def __tfidfcosinesim(self, inputFile):
startTime = time.time()

all_documents = self.licenseList['processed_text'].tolist()
- sklearn_tfidf = TfidfVectorizer(min_df=0, use_idf=True, smooth_idf=True,
+ sklearn_tfidf = TfidfVectorizer(min_df=0, max_df=0.10, use_idf=True, smooth_idf=True,
sublinear_tf=True, tokenizer=tokenize)

all_documents_matrix = sklearn_tfidf.fit_transform(all_documents).toarray()
search_martix = sklearn_tfidf.transform([processedData1]).toarray()[0]

for counter, value in enumerate(all_documents_matrix, start=0):
sim_score = self.__cosine_similarity(value, search_martix)
- if sim_score >= 0.3:
+ if sim_score >= 0.16:
matches.append({
'shortname': self.licenseList.iloc[counter]['shortname'],
'sim_type': "TF-IDF Cosine Sim",
Expand Down

0 comments on commit df80386

Please sign in to comment.