Skip to content

Commit

Permalink
Improve performance by 20%+
Browse files Browse the repository at this point in the history
  • Loading branch information
zachbateman committed Jan 5, 2021
1 parent a1264bb commit d3f3183
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ PyLookup.egg-info/

build/
dist/

*.prof
8 changes: 4 additions & 4 deletions pylookup/pylookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
PyLookup module designed for simple, intelligent matching and populating between two tables.
'''
from typing import Union
from rapidfuzz import process
from rapidfuzz import fuzz, process
import pandas
from statistics import mean
from collections import defaultdict
import copy
import click
Expand Down Expand Up @@ -123,11 +122,12 @@ def matchable_columns(main_table, reference_table, main_cols_for_matching) -> di
main_col_matches[main_col].append(ref_col)
break
try:
score = process.extract(main_val, ref_vals, limit=1)[0][1]
score = process.extractOne(main_val, ref_vals, scorer=fuzz.QRatio)[1] # fuzz.QRatio appears faster than default fuzz.WRatio
except TypeError:
score = 0
scores = sorted((score, *scores), reverse=True)
if score > 97 or mean(scores[:3]) > 85: # exit as soon as a good or reasonably good matches are found
# The second condition below is a faster approximation of mean(scores[:3]) > 85: it averages only the highest and third-highest scores (and requires at least three scores). Not exactly the same, but close enough for this check.
if score > 97 or (len(scores) > 2 and (scores[0] + scores[2]) / 2 > 85): # exit as soon as a good or reasonably good matches are found
main_col_matches[main_col].append(ref_col)
break
if not main_col_matches:
Expand Down
4 changes: 2 additions & 2 deletions tests/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def test_add_matched_column(self):
reference = pandas.read_csv('reference_table.csv')

print(main)
pylookup.pylookup('TYPE', main, reference)
pylookup.pylookup('ANIMAL2', main, reference, force_name=True)
main = pylookup.pylookup('TYPE', main, reference)
main = pylookup.pylookup('ANIMAL2', main, reference, force_name=True)
print(main)

self.assertTrue('TYPE' in main.columns and 'ANIMAL2' in main.columns)
Expand Down
24 changes: 24 additions & 0 deletions tests/profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

import sys
sys.path.insert(1, '..')
import pylookup
import pandas
import cProfile


def performance_test():
    """Run a fixed pylookup workload intended to be executed under a profiler.

    Reads 'main_table.csv' and 'reference_table.csv' (relative paths, so
    they resolve against the current working directory), calls
    pylookup.pylookup once for 'TYPE' and once for 'ANIMAL2', then repeats
    the 'TYPE' lookup 30 more times. The main table is printed before the
    first lookup and again after the last one.
    """
    table = pandas.read_csv('main_table.csv')
    lookup_source = pandas.read_csv('reference_table.csv')
    print(table)

    # Initial pass: one lookup per column.
    table = pylookup.pylookup('TYPE', table, lookup_source)
    table = pylookup.pylookup('ANIMAL2', table, lookup_source, force_name=True)

    # Repeat the same lookup so the matching code dominates the profile.
    repeats_left = 30
    while repeats_left > 0:
        table = pylookup.pylookup('TYPE', table, lookup_source)
        repeats_left -= 1

    print(table)



if __name__ == '__main__':
    # Profile the workload and save the raw stats to 'prof.prof'
    # (the commit's .gitignore change excludes *.prof files).
    cProfile.run(statement='performance_test()', filename='prof.prof')

0 comments on commit d3f3183

Please sign in to comment.