Improve performance by 20%+

zachbateman · Jan 5, 2021 · d3f3183
1 parent a1264bb
commit d3f3183
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@ PyLookup.egg-info/
 
 build/
 dist/
+
+*.prof
diff --git a/pylookup/pylookup.py b/pylookup/pylookup.py
@@ -2,9 +2,8 @@
 PyLookup module designed for simple, intelligent matching and populating between two tables.
 '''
 from typing import Union
-from rapidfuzz import process
+from rapidfuzz import fuzz, process
 import pandas
-from statistics import mean
 from collections import defaultdict
 import copy
 import click
@@ -123,11 +122,12 @@ def matchable_columns(main_table, reference_table, main_cols_for_matching) -> di
                     main_col_matches[main_col].append(ref_col)
                     break
                 try:
-                    score = process.extract(main_val, ref_vals, limit=1)[0][1]
+                    score = process.extractOne(main_val, ref_vals, scorer=fuzz.QRatio)[1]  # fuzz.QRatio appears faster than default fuzz.WRatio
                 except TypeError:
                     score = 0
                 scores = sorted((score, *scores), reverse=True)
-                if score > 97 or mean(scores[:3]) > 85:  # exit as soon as a good or reasonably good matches are found
+                # Second conditional of next statement is a faster version instead of mean(scores[:3]) > 85... not the exact same, but close enough for this check
+                if score > 97 or (len(scores) > 2 and (scores[0] + scores[2]) / 2 > 85):  # exit as soon as a good or reasonably good matches are found
                     main_col_matches[main_col].append(ref_col)
                     break
     if not main_col_matches:

diff --git a/tests/basics.py b/tests/basics.py
@@ -12,8 +12,8 @@ def test_add_matched_column(self):
         reference = pandas.read_csv('reference_table.csv')
 
         print(main)
-        pylookup.pylookup('TYPE', main, reference)
-        pylookup.pylookup('ANIMAL2', main, reference, force_name=True)
+        main = pylookup.pylookup('TYPE', main, reference)
+        main = pylookup.pylookup('ANIMAL2', main, reference, force_name=True)
         print(main)
 
         self.assertTrue('TYPE' in main.columns and 'ANIMAL2' in main.columns)

diff --git a/tests/profiling.py b/tests/profiling.py
@@ -0,0 +1,24 @@
+
+import sys
+sys.path.insert(1, '..')
+import pylookup
+import pandas
+import cProfile
+
+
+def performance_test():
+    main = pandas.read_csv('main_table.csv')
+    reference = pandas.read_csv('reference_table.csv')
+
+    print(main)
+    main = pylookup.pylookup('TYPE', main, reference)
+    main = pylookup.pylookup('ANIMAL2', main, reference, force_name=True)
+
+    for _ in range(30):
+        main = pylookup.pylookup('TYPE', main, reference)
+    print(main)
+
+
+
+if __name__ == '__main__':
+    cProfile.run('performance_test()', 'prof.prof')
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,3 +6,5 @@ PyLookup.egg-info/
6	6	 
7	7	 build/
8	8	 dist/
	9	+
	10	+*.prof