From a6458c830399eaac038ad2bd541d3896984cfa1b Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 19:29:21 -0600 Subject: [PATCH 01/18] remove six from tests --- tests/test_count.py | 21 ++++++++++----------- tests/test_detect.py | 26 ++++++++++++++------------ tests/test_matrix.py | 16 ++++++++-------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/tests/test_count.py b/tests/test_count.py index 7f41b23..fefb439 100644 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -2,7 +2,6 @@ from collections import Counter import numpy as np -from six import iteritems import coupled_biased_random_walks.count as cnt @@ -26,7 +25,7 @@ def test_insert(self): 'expected_dict': {'a': 0, 'c': 1, 'b': 2, 'd': 3} } } - for test_name, test in iteritems(table): + for test_name, test in table.items(): self.setUp() for key in test['keys_to_insert']: self.d.insert(key) @@ -61,7 +60,7 @@ class TestObservationCounter(unittest.TestCase): # keep a set of all feature_name, feature_val pairs for testing all_index_keys = set() for observation in observations: - for item in iteritems(observation): + for item in observation.items(): all_index_keys.add(item) def setUp(self): @@ -75,7 +74,7 @@ def test_update(self): 'feature_b': 2, 'feature_c': 2 } - for feature_name, count in iteritems(self.oc.n_obs): + for feature_name, count in self.oc.n_obs.items(): self.assertEqual(count, expected_counts[feature_name]) # test index @@ -103,7 +102,7 @@ def test_update(self): ] } } - for feature, test in iteritems(table): + for feature, test in table.items(): counts = self.oc.counts[feature] expected = sorted(test['expected']) self.assertListEqual(sorted(list(counts.items())), expected, feature) @@ -137,7 +136,7 @@ def test_get_count(self): 'expected': 0 }, } - for test_name, test in iteritems(table): + for test_name, test in table.items(): count = self.oc.get_count(test['feature tuple']) expected = test['expected'] self.assertEqual(count, expected, test_name) @@ -156,7 +155,7 @@ class TestObservationCounterWithMissingData(unittest.TestCase): # keep a set of all feature_name, feature_val pairs for testing all_index_keys = set() for observation in observations: - for item in iteritems(observation): + for item in observation.items(): if not cnt.isnan(cnt.get_feature_value(item)): all_index_keys.add(item) @@ -171,7 +170,7 @@ def test_update(self): 'feature_b': 1, 'feature_c': 1 } - for feature_name, count in iteritems(self.oc.n_obs): + for feature_name, count in self.oc.n_obs.items(): self.assertEqual(count, expected_counts[feature_name]) # test index @@ -201,7 +200,7 @@ def test_update(self): 'expected': [] } } - for feature, test in iteritems(table): + for feature, test in table.items(): counts = self.oc.counts.get(feature, {}) expected = sorted(test['expected']) self.assertListEqual(sorted(list(counts.items())), expected, feature) @@ -250,7 +249,7 @@ def test_isnan(self): 'expected': False }, } - for test_name, test in iteritems(table): + for test_name, test in table.items(): isnan_result = cnt.isnan(test['test']) self.assertEqual(isnan_result, test['expected'], test_name) @@ -281,7 +280,7 @@ def test_get_mode(self): 'expected': 2 }, } - for test_name, test in iteritems(table): + for test_name, test in table.items(): mode = cnt.get_mode(test['counter']) self.assertEqual(mode, test['expected'], test_name) diff --git a/tests/test_detect.py b/tests/test_detect.py index d9fc77c..66dee41 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -3,11 +3,13 @@ import numpy as np from scipy.sparse import 
csr_matrix -from six import itervalues from coupled_biased_random_walks.count import isnan -from coupled_biased_random_walks.detect import (CBRW, CBRWFitError, - CBRWScoreError) +from coupled_biased_random_walks.detect import ( + CBRW, + CBRWFitError, + CBRWScoreError, +) class TestCBRW(unittest.TestCase): @@ -231,7 +233,7 @@ def test_value_scores(self): value_scores = self.cbrw.value_scores(to_be_scored) value_scores = value_scores[0] self.assertListEqual(sorted(value_scores.keys()), sorted(to_be_scored.keys())) - for vs in itervalues(value_scores): + for vs in value_scores.values(): self.assertGreaterEqual(vs, 0) self.assertLessEqual(vs, 1) @@ -272,8 +274,8 @@ def test_value_scores_unknown_features_default(self): value_scores = self.cbrw.value_scores(to_be_scored) valid_scores = value_scores[0] invalid_scores = value_scores[1] - self.assertTrue(all(not isnan(valid_score) for valid_score in itervalues(valid_scores))) - self.assertTrue(any(isnan(invalid_score) for invalid_score in itervalues(invalid_scores))) + self.assertTrue(all(not isnan(valid_score) for valid_score in valid_scores.values())) + self.assertTrue(any(isnan(invalid_score) for invalid_score in invalid_scores.values())) def test_value_scores_unknown_features_ignore(self): self.cbrw = CBRW(ignore_unknown=True) @@ -293,7 +295,7 @@ def test_value_scores_unknown_features_ignore(self): } value_scores = self.cbrw.value_scores(to_be_scored)[0] actual_value_scores = self.cbrw.value_scores(actually_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) self.assertEqual(value_scores['feature_a'], 0) self.assertEqual(value_scores['feature_b'], actual_value_scores['feature_b']) self.assertEqual(value_scores['feature_c'], actual_value_scores['feature_c']) @@ -311,7 +313,7 @@ def test_value_scores_unknown_features_ignore(self): } value_scores = self.cbrw.value_scores(to_be_scored)[0] actual_value_scores = self.cbrw.value_scores(actually_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) self.assertEqual(value_scores['feature_x'], 0) self.assertEqual(value_scores['feature_b'], actual_value_scores['feature_b']) self.assertEqual(value_scores['feature_c'], actual_value_scores['feature_c']) @@ -324,8 +326,8 @@ def test_value_scores_unknown_features_ignore(self): 'feature_z': 'z_val_1' } value_scores = self.cbrw.value_scores(to_be_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) - self.assertTrue(all(vs == 0 for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) + self.assertTrue(all(vs == 0 for vs in value_scores.values())) def test_value_scores_with_nans_default(self): obs = deepcopy(self.observations) @@ -369,7 +371,7 @@ def test_value_scores_with_nans_ignore(self): self.cbrw.fit() value_scores = self.cbrw.value_scores(to_be_scored)[0] actual_value_scores = self.cbrw.value_scores(actually_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) self.assertEqual(value_scores['feature_a'], 0) self.assertEqual(value_scores['feature_b'], actual_value_scores['feature_b']) self.assertEqual(value_scores['feature_c'], actual_value_scores['feature_c']) @@ -380,7 +382,7 @@ def test_value_scores_with_nans_ignore(self): self.cbrw.fit() value_scores = 
self.cbrw.value_scores(to_be_scored)[0] actual_value_scores = self.cbrw.value_scores(actually_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) self.assertEqual(value_scores['feature_a'], 0) self.assertEqual(value_scores['feature_b'], actual_value_scores['feature_b']) self.assertEqual(value_scores['feature_c'], actual_value_scores['feature_c']) diff --git a/tests/test_matrix.py b/tests/test_matrix.py index 5e2496e..2a9a152 100644 --- a/tests/test_matrix.py +++ b/tests/test_matrix.py @@ -2,12 +2,12 @@ import numpy as np from scipy.sparse import csr_matrix -from six import iteritems -from six.moves import zip -from coupled_biased_random_walks.matrix import (dict_to_csr_matrix, - random_walk, - row_normalize_csr_matrix) +from coupled_biased_random_walks.matrix import ( + dict_to_csr_matrix, + random_walk, + row_normalize_csr_matrix, +) np.random.seed(0) @@ -85,7 +85,7 @@ def test_dict_to_csr_matrix(self): } } - for test_name, params in iteritems(table): + for test_name, params in table.items(): data_dict = params['data_dict'] shape = params['shape'] expected = params['expected'] @@ -128,13 +128,13 @@ def test_valid_row_normalize(self): } } - for test_name, test in iteritems(valid_table): + for test_name, test in valid_table.items(): matrix = construct_2x2_csr_matrix(test['data']) normalized = row_normalize_csr_matrix(matrix) row_sums = normalized.sum(axis=1) self.assertAlmostEqual(row_sums[0], test['expected_row_0'], 3, test_name) self.assertAlmostEqual(row_sums[1], test['expected_row_1'], 3, test_name) - for test_name, test in iteritems(invalid_table): + for test_name, test in invalid_table.items(): with self.assertRaises(test['exception']): _ = row_normalize_csr_matrix(test['input']) From ff0fdfaec5f4bf89d533b781fa818e054cefc49a Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 19:48:46 -0600 Subject: [PATCH 02/18] remove six and __future__ --- coupled_biased_random_walks/count.py | 14 +++----------- coupled_biased_random_walks/detect.py | 15 ++++++--------- coupled_biased_random_walks/matrix.py | 6 +----- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/coupled_biased_random_walks/count.py b/coupled_biased_random_walks/count.py index c64b588..f40aec5 100644 --- a/coupled_biased_random_walks/count.py +++ b/coupled_biased_random_walks/count.py @@ -1,15 +1,7 @@ from collections import Counter, defaultdict +from collections.abc import Mapping from itertools import combinations, tee -from six import iteritems - -try: - # python 2 - from collections import Mapping -except ImportError: - # python 3 - from collections.abc import Mapping - class IncrementingDict(Mapping): @@ -91,9 +83,9 @@ def update(self, observation_iterable): # feature name with value NaN represents a missing feature in the # observation (e.g., a missing value is NaN-filled in a pandas DataFrame) so # we remove any such features from the observation to avoid including in counts - obs = {key: value for key, value in iteritems(observation) if not isnan(value)} + obs = {key: value for key, value in observation.items() if not isnan(value)} # create iterators of obs for updating counts - obs1, obs2 = tee(iteritems(obs), 2) + obs1, obs2 = tee(obs.items(), 2) self._update_counts(obs1) self._update_joint_counts(obs2) diff --git a/coupled_biased_random_walks/detect.py b/coupled_biased_random_walks/detect.py index 8f3b61f..a9764be 100644 --- a/coupled_biased_random_walks/detect.py +++ 
b/coupled_biased_random_walks/detect.py @@ -1,9 +1,6 @@ -from __future__ import division - from collections import defaultdict import numpy as np -from six import iteritems, itervalues from coupled_biased_random_walks.count import ( ObservationCounter, @@ -71,7 +68,7 @@ def fit(self): stationary_prob = {} feature_relevance = defaultdict(int) - for feature, idx in iteritems(self._counter.index): + for feature, idx in self._counter.index.items(): prob = pi[idx] stationary_prob[feature] = prob feature_relevance[get_feature_name(feature)] += prob @@ -99,7 +96,7 @@ def _score(self, observation): Compute the weighted anomaly score (object_score in the paper) for an observation :param observation: dict of the form {feature_name: feature_value, ...} """ - return sum(itervalues(self._value_scores(observation))) + return sum(self._value_scores(observation).values()) def value_scores(self, observation_iterable): """ @@ -125,7 +122,7 @@ def _value_scores(self, observation): get_feature_name(item): self._get_feature_relevance(item) * self._stationary_prob.get(item, self._unknown_feature_score) - for item in iteritems(observation) + for item in observation.items() } def _get_feature_relevance(self, feature_tuple): @@ -144,7 +141,7 @@ def _compute_biased_transition_matrix(self): bias_dict = self._compute_biases() - for (feature1, feature2), joint_count in iteritems(self._counter.joint_counts): + for (feature1, feature2), joint_count in self._counter.joint_counts.items(): # get index for features feature1_idx = self._counter.index[feature1] @@ -178,12 +175,12 @@ def _compute_biases(self): Computes bias for random walk for each feature tuple """ bias_dict = {} - for feature_name, value_counts in iteritems(self._counter.counts): + for feature_name, value_counts in self._counter.counts.items(): mode = get_mode(value_counts) base = 1 - (mode / self._counter.n_obs[feature_name]) bias = { feature_val: (1 - (count / mode) + base) / 2 - for feature_val, count in iteritems(value_counts) + for feature_val, count in value_counts.items() } bias_dict.update(bias) return bias_dict diff --git a/coupled_biased_random_walks/matrix.py b/coupled_biased_random_walks/matrix.py index 5ecf663..8c475e8 100644 --- a/coupled_biased_random_walks/matrix.py +++ b/coupled_biased_random_walks/matrix.py @@ -1,8 +1,5 @@ -from __future__ import division - import numpy as np from scipy.sparse import csr_matrix -from six.moves import range def random_walk(transition_matrix, alpha, err_tol, max_iter): @@ -41,8 +38,7 @@ def dict_to_csr_matrix(data_dict, shape): if isinstance(shape, int): shape = (shape, shape) - # csr_matrix cannot accept iterators so cast to lists for python 3 - data = list(data_dict.values()) + data = list(data_dict.values()) # csr_matrix cannot accept iterator for data idx = zip(*list(data_dict.keys())) return csr_matrix((data, idx), shape=shape) From 87ed0d11479b1ebf6864af87ca4be02d62bb9358 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:10:02 -0600 Subject: [PATCH 03/18] remove six and use fstring in example code --- data/loading.py | 4 +--- example.py | 10 ++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/data/loading.py b/data/loading.py index 8f1b119..6c5afe4 100644 --- a/data/loading.py +++ b/data/loading.py @@ -1,8 +1,6 @@ from csv import DictReader from functools import partial -from six import iteritems - def load_from_csv(path_to_csv, exclude_cols=None): """ @@ -28,4 +26,4 @@ def filter_keys(record, fields): :param fields: set of strings indicating fields to drop 
:return: """ - return {k: v for k, v in iteritems(record) if k not in fields} + return {k: v for k, v in record.items() if k not in fields} diff --git a/example.py b/example.py index d2ff1aa..1725720 100644 --- a/example.py +++ b/example.py @@ -1,7 +1,5 @@ import os -from six import iteritems - from coupled_biased_random_walks import CBRW from data.loading import load_from_csv @@ -14,7 +12,7 @@ def round_dict_values(input_dict, digits=4): """ Helper function for printing dicts with float values """ - return {key: round(val, digits) for key, val in iteritems(input_dict)} + return {key: round(val, digits) for key, val in input_dict.items()} if __name__ == '__main__': @@ -35,15 +33,15 @@ def round_dict_values(input_dict, digits=4): # display results print('Detector fit with {} observations:'.format(len(observations))) for i, obs in enumerate(observations): - print('Observation ID {}: {}'.format(i+1, obs)) + print(f'Observation ID {i+1}: {obs}') print('\nFeature weights:') print(round_dict_values(detector.feature_weights, 4)) print('\nScores:') for i, score in enumerate(scores): - print('Observation ID {}: {}'.format(i+1, round(score, 4))) + print(f'Observation ID {i+1}: {round(score, 4)}') print('\nValue scores per attribute:') for i, value_score in enumerate(value_scores): - print('Observation ID {}: {}'.format(i+1, round_dict_values(value_score, 4))) + print(f'Observation ID {i+1}: {round_dict_values(value_score, 4)}') From 7e2cf6e63578a7beb8953d9513172c4fc4e2e7e0 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:18:32 -0600 Subject: [PATCH 04/18] no longer inherit from object --- coupled_biased_random_walks/detect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coupled_biased_random_walks/detect.py b/coupled_biased_random_walks/detect.py index a9764be..deadf9f 100644 --- a/coupled_biased_random_walks/detect.py +++ b/coupled_biased_random_walks/detect.py @@ -14,7 +14,7 @@ ) -class CBRW(object): +class CBRW: """ Class implementing Coupled Biased Random Walks algorithm """ From d148f9d0f15e53ddb5c4f8ed14cd8beca7829225 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:20:07 -0600 Subject: [PATCH 05/18] update dependency versions --- CHANGELOG.md | 4 ++++ requirements.txt | 5 ++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99b002a..b2bfe33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 2.0.0 / 2020-08-26 +* [Changed] removed support for Python2 +* [Changed] updated dependencies to latest versions + ## 1.1.0 / 2020-08-23 * [Added] `CBRW.value_scores()` function to return individual value scores of an observation diff --git a/requirements.txt b/requirements.txt index cb1a0b9..33b7f09 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ -numpy==1.14.5 -scipy==1.1.0 -six==1.11.0 +numpy==1.19.1 +scipy==1.5.2 From 092ba6471f03fee48b649077f226978c56820943 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:20:40 -0600 Subject: [PATCH 06/18] python3 only --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a5e6560..f71390a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,10 +2,10 @@ language: python matrix: include: - - python: 2.7 - python: 3.5 - python: 3.6 - python: 3.7 + - python: 3.8 install: - pip install -r requirements.txt From 1965d09353f6d812aa5c3948117065cec3ec950d Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:20:57 -0600 Subject: [PATCH 07/18] version 
bump to 2.0.0 --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0f36f40..b33f038 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='coupled_biased_random_walks', - version='1.1.0', + version='2.0.0', author='Daniel Kaslovsky', author_email='dkaslovsky@gmail.com', license='MIT', @@ -26,10 +26,9 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', ] ) From a3c7c399d00bee57122027460306859e83f91d9b Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:25:37 -0600 Subject: [PATCH 08/18] remove python <3.6 --- .travis.yml | 1 - CHANGELOG.md | 2 +- setup.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index f71390a..9f50e43 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,6 @@ language: python matrix: include: - - python: 3.5 - python: 3.6 - python: 3.7 - python: 3.8 diff --git a/CHANGELOG.md b/CHANGELOG.md index b2bfe33..ae89d6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ ## 2.0.0 / 2020-08-26 -* [Changed] removed support for Python2 +* [Changed] removed support for Python 2 and <3.6 * [Changed] updated dependencies to latest versions ## 1.1.0 / 2020-08-23 diff --git a/setup.py b/setup.py index b33f038..692b8d4 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', From aaf7bcc76f17a14ce19b2ad4502bb30d463115f8 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:32:33 -0600 Subject: [PATCH 09/18] update readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2d6cc35..48167ea 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,10 @@ Outlier detection for categorical data [![Build Status](https://travis-ci.org/dkaslovsky/Coupled-Biased-Random-Walks.svg?branch=master)](https://travis-ci.org/dkaslovsky/Coupled-Biased-Random-Walks) [![Coverage Status](https://coveralls.io/repos/github/dkaslovsky/Coupled-Biased-Random-Walks/badge.svg?branch=master)](https://coveralls.io/github/dkaslovsky/Coupled-Biased-Random-Walks?branch=master) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Coupled-Biased-Random-Walks) ### Overview -Python [2.7, 3.x] implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. +Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. 
However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation.
@@ -14,7 +15,7 @@ If one is working with data previously loaded into a DataFrame, simply use the r
 ### Installation
 This package is hosted on PyPI and can be installed via `pip`:
 ```
-$ pip install coupled_biased_random_walks
+$ pip install coupled-biased-random-walks
 ```
 To instead install from source:
 ```

From 8668213cc5ef97e38f56416d7aabf92f97854833 Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Wed, 26 Aug 2020 20:37:46 -0600
Subject: [PATCH 10/18] use fstring

---
 example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example.py b/example.py
index 1725720..13c0c56 100644
--- a/example.py
+++ b/example.py
@@ -31,7 +31,7 @@ def round_dict_values(input_dict, digits=4):
     value_scores = detector.value_scores(observations)
 
     # display results
-    print('Detector fit with {} observations:'.format(len(observations)))
+    print(f'Detector fit with {len(observations)} observations:')
     for i, obs in enumerate(observations):
         print(f'Observation ID {i+1}: {obs}')

From b535c7f817f315f03730d4654d6dc9ccbd707d7c Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Wed, 26 Aug 2020 20:38:17 -0600
Subject: [PATCH 11/18] img for readme

---
 README.md                                  | 2 +-
 example_table.png => img/example_table.png | Bin
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename example_table.png => img/example_table.png (100%)

diff --git a/README.md b/README.md
index 48167ea..e1e331a 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ $ python setup.py install
 ### Example
 Let's run the CBRW detection algorithm on the authors' example data set from the paper:
-<img src="./example_table.png">
+<img src="./img/example_table.png">
 
 This data is saved as a [CSV file](./data/CBRW_paper_example.csv) in this repository and is loaded into memory as a list of dicts by [example.py](./example.py). Note that we drop the `Cheat?` column when loading the data, as this is essentially the target variable indicating the anomalous activity to be detected.
The detector is instantiated and observations are added as follows: ``` diff --git a/example_table.png b/img/example_table.png similarity index 100% rename from example_table.png rename to img/example_table.png From 9ec07cf26b2b6c4631c3c0d1589af9c4994ed31c Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Thu, 27 Aug 2020 06:43:51 -0600 Subject: [PATCH 12/18] python3 and minor readability improvements --- coupled_biased_random_walks/count.py | 2 +- coupled_biased_random_walks/detect.py | 10 ++++------ data/loading.py | 14 ++++++++------ tests/test_count.py | 9 +++------ tests/test_detect.py | 2 +- tests/test_matrix.py | 22 ++++++++++------------ 6 files changed, 27 insertions(+), 32 deletions(-) diff --git a/coupled_biased_random_walks/count.py b/coupled_biased_random_walks/count.py index f40aec5..466d6e4 100644 --- a/coupled_biased_random_walks/count.py +++ b/coupled_biased_random_walks/count.py @@ -39,7 +39,7 @@ def __repr__(self): return self._d.__repr__() -class ObservationCounter(object): +class ObservationCounter: """ Counts single and joint occurrences of key/value pairs in a dict with diff --git a/coupled_biased_random_walks/detect.py b/coupled_biased_random_walks/detect.py index deadf9f..f805571 100644 --- a/coupled_biased_random_walks/detect.py +++ b/coupled_biased_random_walks/detect.py @@ -32,7 +32,7 @@ def __init__(self, rw_params=None, ignore_unknown=False): or values based only on features seen during training; if False, score such an observation as nan (default) """ - self.rw_params = rw_params if rw_params else self.PRESET_RW_PARAMS + self.rw_params = rw_params or self.PRESET_RW_PARAMS self._unknown_feature_score = 0 if ignore_unknown else np.nan self._counter = ObservationCounter() @@ -178,11 +178,9 @@ def _compute_biases(self): for feature_name, value_counts in self._counter.counts.items(): mode = get_mode(value_counts) base = 1 - (mode / self._counter.n_obs[feature_name]) - bias = { - feature_val: (1 - (count / mode) + base) / 2 - for feature_val, count in value_counts.items() - } - bias_dict.update(bias) + for feature_val, count in value_counts.items(): + bias = (1 - (count / mode) + base) / 2 + bias_dict[feature_val] = bias return bias_dict diff --git a/data/loading.py b/data/loading.py index 6c5afe4..f82c2b8 100644 --- a/data/loading.py +++ b/data/loading.py @@ -10,13 +10,15 @@ def load_from_csv(path_to_csv, exclude_cols=None): :param exclude_cols: iterable of columns to exclude (often the target variable) """ with open(path_to_csv, 'r') as csvfile: + # use list to load into memory before closing data = list(DictReader(csvfile)) - if exclude_cols is not None: - if isinstance(exclude_cols, str): - exclude_cols = {exclude_cols} - filt = partial(filter_keys, fields=set(exclude_cols)) - return [filt(rec) for rec in data] - return data + if exclude_cols is None: + return data + # filter based on exclude cols + if isinstance(exclude_cols, str): + exclude_cols = {exclude_cols} + filt = partial(filter_keys, fields=set(exclude_cols)) + return [filt(rec) for rec in data] def filter_keys(record, fields): diff --git a/tests/test_count.py b/tests/test_count.py index fefb439..7e6290f 100644 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -104,8 +104,7 @@ def test_update(self): } for feature, test in table.items(): counts = self.oc.counts[feature] - expected = sorted(test['expected']) - self.assertListEqual(sorted(list(counts.items())), expected, feature) + self.assertCountEqual(counts.items(), test['expected'], feature) # test joint_counts expected_joint_counts = { @@ -138,8 
+137,7 @@ def test_get_count(self): } for test_name, test in table.items(): count = self.oc.get_count(test['feature tuple']) - expected = test['expected'] - self.assertEqual(count, expected, test_name) + self.assertEqual(count, test['expected'], test_name) class TestObservationCounterWithMissingData(unittest.TestCase): @@ -202,8 +200,7 @@ def test_update(self): } for feature, test in table.items(): counts = self.oc.counts.get(feature, {}) - expected = sorted(test['expected']) - self.assertListEqual(sorted(list(counts.items())), expected, feature) + self.assertCountEqual(counts.items(), test['expected'], feature) # test joint_counts expected_joint_counts = { diff --git a/tests/test_detect.py b/tests/test_detect.py index 66dee41..5098b7a 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -232,7 +232,7 @@ def test_value_scores(self): to_be_scored = self.observations[0] value_scores = self.cbrw.value_scores(to_be_scored) value_scores = value_scores[0] - self.assertListEqual(sorted(value_scores.keys()), sorted(to_be_scored.keys())) + self.assertCountEqual(value_scores.keys(), to_be_scored.keys()) for vs in value_scores.values(): self.assertGreaterEqual(vs, 0) self.assertLessEqual(vs, 1) diff --git a/tests/test_matrix.py b/tests/test_matrix.py index 2a9a152..5beda1b 100644 --- a/tests/test_matrix.py +++ b/tests/test_matrix.py @@ -21,12 +21,13 @@ def construct_2x2_csr_matrix(data): matrix_data = [] matrix_idx = [] for ix, datum in zip(idx, data): - if datum != 0: - matrix_data.append(datum) - matrix_idx.append(ix) - if matrix_data: - return csr_matrix((matrix_data, zip(*matrix_idx)), shape=(2, 2)) - return csr_matrix(([], ([], [])), shape=(2, 2)) + if datum == 0: + continue + matrix_data.append(datum) + matrix_idx.append(ix) + if not matrix_data: + return csr_matrix(([], ([], [])), shape=(2, 2)) + return csr_matrix((matrix_data, zip(*matrix_idx)), shape=(2, 2)) def csr_matrix_equality(c1, c2): @@ -85,12 +86,9 @@ def test_dict_to_csr_matrix(self): } } - for test_name, params in table.items(): - data_dict = params['data_dict'] - shape = params['shape'] - expected = params['expected'] - result = dict_to_csr_matrix(data_dict, shape) - self.assertTrue(csr_matrix_equality(result, expected), test_name) + for test_name, test in table.items(): + result = dict_to_csr_matrix(test['data_dict'], test['shape']) + self.assertTrue(csr_matrix_equality(result, test['expected']), test_name) class TestRowNormalizeCSRMatrix(unittest.TestCase): From adef7540bca6bd4789f4313e05ac7906be11230f Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Thu, 27 Aug 2020 06:44:19 -0600 Subject: [PATCH 13/18] add python3 note to readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e1e331a..b1900f5 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ Outlier detection for categorical data ### Overview Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. +*NOTE*: Only Python>=3.6 is supported as of version 2.0.0. + This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation. 
If one is working with data previously loaded into a DataFrame, simply use the result of `pandas.DataFrame.to_dict(orient='records')` instead of the DataFrame itself to add observations to the model. Note that because it is common for a DataFrame to fill missing values with `nan`, the detector will ignore features with value `nan` in any observation record. Therefore, there is no need to further preprocess the DataFrame before using its `to_dict` method to create records. From e7d0970e7b2e35157936eb7a66e160ad87fbd322 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Thu, 27 Aug 2020 06:45:22 -0600 Subject: [PATCH 14/18] bold note in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b1900f5..40090fc 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Outlier detection for categorical data ### Overview Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. -*NOTE*: Only Python>=3.6 is supported as of version 2.0.0. +_NOTE_: Only Python>=3.6 is supported as of version 2.0.0. This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation. From 941f952821765236684c0d85332ae102c7520b8c Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Thu, 27 Aug 2020 06:47:19 -0600 Subject: [PATCH 15/18] fix bold note in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 40090fc..78ff124 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Outlier detection for categorical data ### Overview Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. -_NOTE_: Only Python>=3.6 is supported as of version 2.0.0. +__NOTE__: Only Python>=3.6 is supported as of version 2.0.0. This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation. 
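Before the type-hint patches that follow, it helps to know the shape of the data they describe. The alias definitions below are taken verbatim from the new coupled_biased_random_walks/types.py introduced in PATCH 16/18; the record values are illustrative only, borrowing feature names from the test fixtures:

```python
from typing import Dict, Tuple

# aliases as defined in coupled_biased_random_walks/types.py (PATCH 16/18)
observation_type = Dict[str, str]  # a record: {feature_name: feature_value}
obs_item_type = Tuple[str, str]    # one item of a record: (feature_name, feature_value)

# illustrative values using feature names from the test suite
observation: observation_type = {'feature_a': 'a_val_1', 'feature_b': 'b_val_1'}
item: obs_item_type = ('feature_a', 'a_val_1')
```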
From 671c997de5706932b9af68099a7017cb2bf23 Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Fri, 28 Aug 2020 20:17:15 -0600
Subject: [PATCH 16/18] type hints

---
 CHANGELOG.md                          |  3 +-
 coupled_biased_random_walks/count.py  | 29 +++++++++---------
 coupled_biased_random_walks/detect.py | 42 +++++++++++++++++----------
 coupled_biased_random_walks/matrix.py | 18 ++++++++++--
 coupled_biased_random_walks/types.py  |  4 +++
 5 files changed, 64 insertions(+), 32 deletions(-)
 create mode 100644 coupled_biased_random_walks/types.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae89d6c..0a852bf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,6 @@
 ## 2.0.0 / 2020-08-26
-* [Changed] removed support for Python 2 and <3.6
+* [Added] type hints
+* [Changed] removed support for Python 2 and <3.7
 * [Changed] updated dependencies to latest versions
 
 ## 1.1.0 / 2020-08-23
diff --git a/coupled_biased_random_walks/count.py b/coupled_biased_random_walks/count.py
index 466d6e4..cd832eb 100644
--- a/coupled_biased_random_walks/count.py
+++ b/coupled_biased_random_walks/count.py
@@ -1,6 +1,9 @@
 from collections import Counter, defaultdict
 from collections.abc import Mapping
 from itertools import combinations, tee
+from typing import Any, Dict, Hashable, Iterable, Tuple
+
+from coupled_biased_random_walks.types import obs_item_type, observation_type
 
 
 class IncrementingDict(Mapping):
@@ -16,7 +19,7 @@ def __init__(self):
         self._d = {}
         self._next_val = 0
 
-    def insert(self, key):
+    def insert(self, key: Hashable) -> None:
         """
         Inserts a (strictly new) key
         :param key: any hashable object to be used as a key
@@ -61,18 +64,18 @@ def __init__(self):
         self._index = IncrementingDict()
 
     @property
-    def counts(self):
+    def counts(self) -> Dict[str, Counter]:
         return dict(self._counts)
 
     @property
-    def joint_counts(self):
+    def joint_counts(self) -> Dict[Tuple[obs_item_type, obs_item_type], int]:
         return dict(self._joint_counts)
 
     @property
-    def index(self):
+    def index(self) -> IncrementingDict:
         return self._index
 
-    def update(self, observation_iterable):
+    def update(self, observation_iterable: Iterable[observation_type]) -> None:
         """
         Update counts with new observation(s)
         :param observation_iterable: list of dicts
@@ -89,7 +92,7 @@ def update(self, observation_iterable):
         self._update_counts(obs1)
         self._update_joint_counts(obs2)
 
-    def get_count(self, item):
+    def get_count(self, item: obs_item_type) -> int:
         """
         Getter to safely retrieve count from internal data structure of defaultdict(Counter)
         :param item: tuple of the form ('feature_name', 'feature_value')
@@ -103,7 +106,7 @@ def get_count(self, item):
             # meaning there is no count for the feature_name
             return 0
 
-    def _update_counts(self, observation):
+    def _update_counts(self, observation: Iterable[obs_item_type]) -> None:
         """
         Update single counts
         :param observation: iterable of tuples of the form ('feature_name', 'feature_value')
@@ -114,18 +117,18 @@ def _update_counts(self, observation):
             self._index.insert(item)
             self.n_obs.update([feature_name])
 
-    def _update_joint_counts(self, observation):
+    def _update_joint_counts(self, observations: Iterable[obs_item_type]) -> None:
         """
         Update joint counts
         :param observations: iterable of tuples of the form ('feature_name', 'feature_value')
         """
-        pairs = combinations(sorted(observation), 2)
+        pairs = combinations(sorted(observations), 2)
         self._joint_counts.update(pairs)
 
 
 # Helper functions
 
-def get_feature_name(feature_tuple):
+def get_feature_name(feature_tuple: obs_item_type) -> str:
     """
     Helper function to return feature name from tuple representation
     :param feature_tuple: tuple of the form (feature_name, feature_value)
     """
     return feature_tuple[0]
 
 
-def get_feature_value(feature_tuple):
+def get_feature_value(feature_tuple: obs_item_type) -> str:
     """
     Helper function to return feature value from tuple representation
     :param feature_tuple: tuple of the form (feature_name, feature_value)
     """
     return feature_tuple[1]
 
 
-def get_mode(counter):
+def get_mode(counter: Counter) -> int:
     """
     Helper function to return the count of the most common element
     from an instance of Counter()
@@ -155,7 +158,7 @@ def get_mode(counter):
     return mode[0][1]
 
 
-def isnan(x):
+def isnan(x: Any) -> bool:
     """
     Return True if x is NaN where x can be of any type
     :param x: any object for which (in)equality can be checked
diff --git a/coupled_biased_random_walks/detect.py b/coupled_biased_random_walks/detect.py
index f805571..9048e08 100644
--- a/coupled_biased_random_walks/detect.py
+++ b/coupled_biased_random_walks/detect.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
 from collections import defaultdict
+from typing import Dict, Iterable, List, Optional
 
 import numpy as np
+from scipy.sparse import csr_matrix
 
 from coupled_biased_random_walks.count import (
     ObservationCounter,
@@ -12,6 +16,7 @@
     random_walk,
     row_normalize_csr_matrix,
 )
+from coupled_biased_random_walks.types import obs_item_type, observation_type
 
 
 class CBRW:
@@ -25,7 +30,11 @@ class CBRW:
         'max_iter': 100  # max number of steps to take
     }
 
-    def __init__(self, rw_params=None, ignore_unknown=False):
+    def __init__(
+        self,
+        rw_params: Optional[Dict[str, float]] = None,
+        ignore_unknown: bool = False,
+    ):
         """
         :param rw_params: random walk parameters to override defaults
         :param ignore_unknown: if True, score an observation containing unknown feature names
@@ -36,14 +45,14 @@ def __init__(self, rw_params=None, ignore_unknown=False):
         self._unknown_feature_score = 0 if ignore_unknown else np.nan
 
         self._counter = ObservationCounter()
-        self._stationary_prob = None
-        self._feature_relevance = None
+        self._stationary_prob = None  # type: Optional[Dict[obs_item_type, float]]
+        self._feature_relevance = None  # type: Optional[Dict[str, float]]
 
     @property
-    def feature_weights(self):
+    def feature_weights(self) -> Optional[Dict[str, float]]:
         return self._feature_relevance
 
-    def add_observations(self, observation_iterable):
+    def add_observations(self, observation_iterable: Iterable[observation_type]) -> CBRW:
         """
         Add observations to be modeled
         :param observation_iterable: list of dicts with each dict representing an observation
@@ -52,7 +61,7 @@ def add_observations(self, observation_iterable):
         self._counter.update(observation_iterable)
         return self
 
-    def fit(self):
+    def fit(self) -> CBRW:
         """
         Compute model based on current observations in state
         """
@@ -79,7 +88,7 @@ def fit(self):
         self._feature_relevance = dict(feature_relevance)
         return self
 
-    def score(self, observation_iterable):
+    def score(self, observation_iterable: Iterable[observation_type]) -> np.ndarray:
        """
         Compute an anomaly score for each observation in observation_iterable
         :param observation_iterable: iterable of dict observations with each dict
@@ -91,14 +100,17 @@ def score(self, observation_iterable):
             observation_iterable = [observation_iterable]
         return np.array([self._score(obs) for obs in observation_iterable])
 
-    def _score(self, observation):
+    def _score(self, observation: observation_type) -> float:
         """
         Compute the weighted
anomaly score (object_score in the paper) for an observation :param observation: dict of the form {feature_name: feature_value, ...} """ return sum(self._value_scores(observation).values()) - def value_scores(self, observation_iterable): + def value_scores( + self, + observation_iterable: Iterable[observation_type], + ) -> List[Dict[str, float]]: """ Compute an anomaly sub-score for each value of each observation in observation_iterable :param observation_iterable: iterable of dict observations with each dict @@ -113,7 +125,7 @@ def value_scores(self, observation_iterable): observation_iterable = [observation_iterable] return [self._value_scores(obs) for obs in observation_iterable] - def _value_scores(self, observation): + def _value_scores(self, observation: observation_type) -> Dict[str, float]: """ Compute the weighted value scores for each feature value of an observation :param observation: dict of the form {feature_name: feature_value, ...} @@ -125,7 +137,7 @@ def _value_scores(self, observation): for item in observation.items() } - def _get_feature_relevance(self, feature_tuple): + def _get_feature_relevance(self, feature_tuple: obs_item_type) -> float: """ Getter for the relevance (weight) of a feature (category) :param feature_tuple: tuple of the form (feature_name, feature_value) @@ -133,11 +145,11 @@ def _get_feature_relevance(self, feature_tuple): feature_name = get_feature_name(feature_tuple) return self._feature_relevance.get(feature_name, 0) - def _compute_biased_transition_matrix(self): + def _compute_biased_transition_matrix(self) -> csr_matrix: """ Computes biased probability transition matrix of conditional probabilities """ - prob_idx = {} + prob_idx = {} # type: Dict[obs_item_type, float] bias_dict = self._compute_biases() @@ -170,11 +182,11 @@ def _compute_biased_transition_matrix(self): trans_matrix = dict_to_csr_matrix(prob_idx, shape=n_features) return row_normalize_csr_matrix(trans_matrix) - def _compute_biases(self): + def _compute_biases(self) -> Dict[obs_item_type, float]: """ Computes bias for random walk for each feature tuple """ - bias_dict = {} + bias_dict = {} # type: Dict[obs_item_type, float] for feature_name, value_counts in self._counter.counts.items(): mode = get_mode(value_counts) base = 1 - (mode / self._counter.n_obs[feature_name]) diff --git a/coupled_biased_random_walks/matrix.py b/coupled_biased_random_walks/matrix.py index 8c475e8..23fbee9 100644 --- a/coupled_biased_random_walks/matrix.py +++ b/coupled_biased_random_walks/matrix.py @@ -1,8 +1,17 @@ +from typing import Dict, Tuple, Union + import numpy as np from scipy.sparse import csr_matrix +from coupled_biased_random_walks.types import obs_item_type + -def random_walk(transition_matrix, alpha, err_tol, max_iter): +def random_walk( + transition_matrix: csr_matrix, + alpha: float, + err_tol: float, + max_iter: int +) -> np.ndarray: """ Run random walk to compute stationary probabilities :param transition_matrix: scipy.sparse.csr_matrix defining the random walk @@ -27,7 +36,10 @@ def random_walk(transition_matrix, alpha, err_tol, max_iter): return pi -def dict_to_csr_matrix(data_dict, shape): +def dict_to_csr_matrix( + data_dict: Dict[obs_item_type, float], + shape: Union[int, Tuple[int, int]], +) -> csr_matrix: """ Converts dict of index -> value to csr_matrix :param data_dict: dict mapping matrix index tuple to corresponding matrix value @@ -43,7 +55,7 @@ def dict_to_csr_matrix(data_dict, shape): return csr_matrix((data, idx), shape=shape) -def row_normalize_csr_matrix(matrix): +def 
row_normalize_csr_matrix(matrix: csr_matrix) -> csr_matrix:
     """
     Row normalize a csr matrix without mutating the input
     :param matrix: scipy.sparse.csr_matrix instance
diff --git a/coupled_biased_random_walks/types.py b/coupled_biased_random_walks/types.py
new file mode 100644
index 0000000..84684c7
--- /dev/null
+++ b/coupled_biased_random_walks/types.py
@@ -0,0 +1,4 @@
+from typing import Dict, Tuple
+
+observation_type = Dict[str, str]
+obs_item_type = Tuple[str, str]

From 6b913a19294e75d72c8b7d70c6b687ae3216756f Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Fri, 28 Aug 2020 20:30:05 -0600
Subject: [PATCH 17/18] type hints

---
 data/loading.py      | 10 ++++++++--
 tests/test_matrix.py |  9 ++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/data/loading.py b/data/loading.py
index f82c2b8..0f25aa7 100644
--- a/data/loading.py
+++ b/data/loading.py
@@ -1,8 +1,14 @@
 from csv import DictReader
 from functools import partial
+from typing import Iterable, List, Optional, Set
 
+from coupled_biased_random_walks.types import observation_type
 
-def load_from_csv(path_to_csv, exclude_cols=None):
+
+def load_from_csv(
+    path_to_csv: str,
+    exclude_cols: Optional[Iterable[str]] = None,
+) -> List[observation_type]:
     """
     Load a CSV and return a list of dicts, one dict for each row of the form
     {column_header1: value1, column_header2: value2, ...}
@@ -21,7 +27,7 @@ def load_from_csv(path_to_csv, exclude_cols=None):
     return [filt(rec) for rec in data]
 
 
-def filter_keys(record, fields):
+def filter_keys(record: observation_type, fields: Set[str]) -> observation_type:
     """
     Filter keys from a dict
     :param record: dict
diff --git a/tests/test_matrix.py b/tests/test_matrix.py
index 5beda1b..c91040e 100644
--- a/tests/test_matrix.py
+++ b/tests/test_matrix.py
@@ -1,4 +1,5 @@
 import unittest
+from typing import List
 
 import numpy as np
 from scipy.sparse import csr_matrix
@@ -12,7 +13,7 @@
 np.random.seed(0)
 
 
-def construct_2x2_csr_matrix(data):
+def construct_2x2_csr_matrix(data: List[float]) -> csr_matrix:
     """
     Construct a 2x2 csr_matrix
     :param data: list of length 4 of data for csr matrix corresponding to idx position
@@ -30,9 +31,11 @@ def construct_2x2_csr_matrix(data):
     return csr_matrix((matrix_data, zip(*matrix_idx)), shape=(2, 2))
 
 
-def csr_matrix_equality(c1, c2):
+def csr_matrix_equality(c1: csr_matrix, c2: csr_matrix) -> bool:
     """
-    Test 2 csr matrices for equality
+    Test two csr matrices for equality
+    :param c1: csr_matrix to compare
+    :param c2: csr_matrix to compare
     """
     if c1.shape != c2.shape:
         return False

From f0e6495051d108bee3e00d387ee32c81c7f8faa1 Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Fri, 28 Aug 2020 20:32:08 -0600
Subject: [PATCH 18/18] remove python3.6 to use annotations

---
 .travis.yml | 1 -
 README.md   | 2 +-
 setup.py    | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9f50e43..4f51ef7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,6 @@ language: python
 
 matrix:
   include:
-    - python: 3.6
     - python: 3.7
     - python: 3.8
 
diff --git a/README.md b/README.md
index 78ff124..9ea1376 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Outlier detection for categorical data
 
 ### Overview
 Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf.
 
-__NOTE__: Only Python>=3.6 is supported as of version 2.0.0.
+__NOTE__: Only Python>=3.7 is supported as of version 2.0.0.
 
 This implementation operates on Python dicts rather than Pandas DataFrames.
This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation. diff --git a/setup.py b/setup.py index 692b8d4..8ecb9d7 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ]
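With the full series applied, the public API is unchanged from 1.1.0 and the package is Python-3-only. A minimal end-to-end sketch of the resulting usage, assuming the repository layout shown in these patches (the CSV path and the dropped `Cheat?` target column follow the example described in the README):

```python
from coupled_biased_random_walks import CBRW
from data.loading import load_from_csv

# load the paper's example data, dropping the target column
observations = load_from_csv('./data/CBRW_paper_example.csv', exclude_cols='Cheat?')

# pass ignore_unknown=True to instead score unseen feature values
# using only the features observed during training
detector = CBRW()
detector.add_observations(observations)
detector.fit()

scores = detector.score(observations)               # np.ndarray, one score per record
value_scores = detector.value_scores(observations)  # list of per-feature score dicts
print(detector.feature_weights)                     # learned feature relevance
```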