From a6458c830399eaac038ad2bd541d3896984cfa1b Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 19:29:21 -0600 Subject: [PATCH 01/18] remove six from tests --- tests/test_count.py | 21 ++++++++++----------- tests/test_detect.py | 26 ++++++++++++++------------ tests/test_matrix.py | 16 ++++++++-------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/tests/test_count.py b/tests/test_count.py index 7f41b23..fefb439 100644 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -2,7 +2,6 @@ from collections import Counter import numpy as np -from six import iteritems import coupled_biased_random_walks.count as cnt @@ -26,7 +25,7 @@ def test_insert(self): 'expected_dict': {'a': 0, 'c': 1, 'b': 2, 'd': 3} } } - for test_name, test in iteritems(table): + for test_name, test in table.items(): self.setUp() for key in test['keys_to_insert']: self.d.insert(key) @@ -61,7 +60,7 @@ class TestObservationCounter(unittest.TestCase): # keep a set of all feature_name, feature_val pairs for testing all_index_keys = set() for observation in observations: - for item in iteritems(observation): + for item in observation.items(): all_index_keys.add(item) def setUp(self): @@ -75,7 +74,7 @@ def test_update(self): 'feature_b': 2, 'feature_c': 2 } - for feature_name, count in iteritems(self.oc.n_obs): + for feature_name, count in self.oc.n_obs.items(): self.assertEqual(count, expected_counts[feature_name]) # test index @@ -103,7 +102,7 @@ def test_update(self): ] } } - for feature, test in iteritems(table): + for feature, test in table.items(): counts = self.oc.counts[feature] expected = sorted(test['expected']) self.assertListEqual(sorted(list(counts.items())), expected, feature) @@ -137,7 +136,7 @@ def test_get_count(self): 'expected': 0 }, } - for test_name, test in iteritems(table): + for test_name, test in table.items(): count = self.oc.get_count(test['feature tuple']) expected = test['expected'] self.assertEqual(count, expected, test_name) @@ -156,7 +155,7 @@ class TestObservationCounterWithMissingData(unittest.TestCase): # keep a set of all feature_name, feature_val pairs for testing all_index_keys = set() for observation in observations: - for item in iteritems(observation): + for item in observation.items(): if not cnt.isnan(cnt.get_feature_value(item)): all_index_keys.add(item) @@ -171,7 +170,7 @@ def test_update(self): 'feature_b': 1, 'feature_c': 1 } - for feature_name, count in iteritems(self.oc.n_obs): + for feature_name, count in self.oc.n_obs.items(): self.assertEqual(count, expected_counts[feature_name]) # test index @@ -201,7 +200,7 @@ def test_update(self): 'expected': [] } } - for feature, test in iteritems(table): + for feature, test in table.items(): counts = self.oc.counts.get(feature, {}) expected = sorted(test['expected']) self.assertListEqual(sorted(list(counts.items())), expected, feature) @@ -250,7 +249,7 @@ def test_isnan(self): 'expected': False }, } - for test_name, test in iteritems(table): + for test_name, test in table.items(): isnan_result = cnt.isnan(test['test']) self.assertEqual(isnan_result, test['expected'], test_name) @@ -281,7 +280,7 @@ def test_get_mode(self): 'expected': 2 }, } - for test_name, test in iteritems(table): + for test_name, test in table.items(): mode = cnt.get_mode(test['counter']) self.assertEqual(mode, test['expected'], test_name) diff --git a/tests/test_detect.py b/tests/test_detect.py index d9fc77c..66dee41 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -3,11 +3,13 @@ import numpy as np from scipy.sparse import 
csr_matrix -from six import itervalues from coupled_biased_random_walks.count import isnan -from coupled_biased_random_walks.detect import (CBRW, CBRWFitError, - CBRWScoreError) +from coupled_biased_random_walks.detect import ( + CBRW, + CBRWFitError, + CBRWScoreError, +) class TestCBRW(unittest.TestCase): @@ -231,7 +233,7 @@ def test_value_scores(self): value_scores = self.cbrw.value_scores(to_be_scored) value_scores = value_scores[0] self.assertListEqual(sorted(value_scores.keys()), sorted(to_be_scored.keys())) - for vs in itervalues(value_scores): + for vs in value_scores.values(): self.assertGreaterEqual(vs, 0) self.assertLessEqual(vs, 1) @@ -272,8 +274,8 @@ def test_value_scores_unknown_features_default(self): value_scores = self.cbrw.value_scores(to_be_scored) valid_scores = value_scores[0] invalid_scores = value_scores[1] - self.assertTrue(all(not isnan(valid_score) for valid_score in itervalues(valid_scores))) - self.assertTrue(any(isnan(invalid_score) for invalid_score in itervalues(invalid_scores))) + self.assertTrue(all(not isnan(valid_score) for valid_score in valid_scores.values())) + self.assertTrue(any(isnan(invalid_score) for invalid_score in invalid_scores.values())) def test_value_scores_unknown_features_ignore(self): self.cbrw = CBRW(ignore_unknown=True) @@ -293,7 +295,7 @@ def test_value_scores_unknown_features_ignore(self): } value_scores = self.cbrw.value_scores(to_be_scored)[0] actual_value_scores = self.cbrw.value_scores(actually_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) self.assertEqual(value_scores['feature_a'], 0) self.assertEqual(value_scores['feature_b'], actual_value_scores['feature_b']) self.assertEqual(value_scores['feature_c'], actual_value_scores['feature_c']) @@ -311,7 +313,7 @@ def test_value_scores_unknown_features_ignore(self): } value_scores = self.cbrw.value_scores(to_be_scored)[0] actual_value_scores = self.cbrw.value_scores(actually_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) self.assertEqual(value_scores['feature_x'], 0) self.assertEqual(value_scores['feature_b'], actual_value_scores['feature_b']) self.assertEqual(value_scores['feature_c'], actual_value_scores['feature_c']) @@ -324,8 +326,8 @@ def test_value_scores_unknown_features_ignore(self): 'feature_z': 'z_val_1' } value_scores = self.cbrw.value_scores(to_be_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) - self.assertTrue(all(vs == 0 for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) + self.assertTrue(all(vs == 0 for vs in value_scores.values())) def test_value_scores_with_nans_default(self): obs = deepcopy(self.observations) @@ -369,7 +371,7 @@ def test_value_scores_with_nans_ignore(self): self.cbrw.fit() value_scores = self.cbrw.value_scores(to_be_scored)[0] actual_value_scores = self.cbrw.value_scores(actually_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) self.assertEqual(value_scores['feature_a'], 0) self.assertEqual(value_scores['feature_b'], actual_value_scores['feature_b']) self.assertEqual(value_scores['feature_c'], actual_value_scores['feature_c']) @@ -380,7 +382,7 @@ def test_value_scores_with_nans_ignore(self): self.cbrw.fit() value_scores = 
self.cbrw.value_scores(to_be_scored)[0] actual_value_scores = self.cbrw.value_scores(actually_scored)[0] - self.assertTrue(all(not isnan(vs) for vs in itervalues(value_scores))) + self.assertTrue(all(not isnan(vs) for vs in value_scores.values())) self.assertEqual(value_scores['feature_a'], 0) self.assertEqual(value_scores['feature_b'], actual_value_scores['feature_b']) self.assertEqual(value_scores['feature_c'], actual_value_scores['feature_c']) diff --git a/tests/test_matrix.py b/tests/test_matrix.py index 5e2496e..2a9a152 100644 --- a/tests/test_matrix.py +++ b/tests/test_matrix.py @@ -2,12 +2,12 @@ import numpy as np from scipy.sparse import csr_matrix -from six import iteritems -from six.moves import zip -from coupled_biased_random_walks.matrix import (dict_to_csr_matrix, - random_walk, - row_normalize_csr_matrix) +from coupled_biased_random_walks.matrix import ( + dict_to_csr_matrix, + random_walk, + row_normalize_csr_matrix, +) np.random.seed(0) @@ -85,7 +85,7 @@ def test_dict_to_csr_matrix(self): } } - for test_name, params in iteritems(table): + for test_name, params in table.items(): data_dict = params['data_dict'] shape = params['shape'] expected = params['expected'] @@ -128,13 +128,13 @@ def test_valid_row_normalize(self): } } - for test_name, test in iteritems(valid_table): + for test_name, test in valid_table.items(): matrix = construct_2x2_csr_matrix(test['data']) normalized = row_normalize_csr_matrix(matrix) row_sums = normalized.sum(axis=1) self.assertAlmostEqual(row_sums[0], test['expected_row_0'], 3, test_name) self.assertAlmostEqual(row_sums[1], test['expected_row_1'], 3, test_name) - for test_name, test in iteritems(invalid_table): + for test_name, test in invalid_table.items(): with self.assertRaises(test['exception']): _ = row_normalize_csr_matrix(test['input']) From ff0fdfaec5f4bf89d533b781fa818e054cefc49a Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 19:48:46 -0600 Subject: [PATCH 02/18] remove six and __future__ --- coupled_biased_random_walks/count.py | 14 +++----------- coupled_biased_random_walks/detect.py | 15 ++++++--------- coupled_biased_random_walks/matrix.py | 6 +----- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/coupled_biased_random_walks/count.py b/coupled_biased_random_walks/count.py index c64b588..f40aec5 100644 --- a/coupled_biased_random_walks/count.py +++ b/coupled_biased_random_walks/count.py @@ -1,15 +1,7 @@ from collections import Counter, defaultdict +from collections.abc import Mapping from itertools import combinations, tee -from six import iteritems - -try: - # python 2 - from collections import Mapping -except ImportError: - # python 3 - from collections.abc import Mapping - class IncrementingDict(Mapping): @@ -91,9 +83,9 @@ def update(self, observation_iterable): # feature name with value NaN represents a missing feature in the # observation (e.g., a missing value is NaN-filled in a pandas DataFrame) so # we remove any such features from the observation to avoid including in counts - obs = {key: value for key, value in iteritems(observation) if not isnan(value)} + obs = {key: value for key, value in observation.items() if not isnan(value)} # create iterators of obs for updating counts - obs1, obs2 = tee(iteritems(obs), 2) + obs1, obs2 = tee(obs.items(), 2) self._update_counts(obs1) self._update_joint_counts(obs2) diff --git a/coupled_biased_random_walks/detect.py b/coupled_biased_random_walks/detect.py index 8f3b61f..a9764be 100644 --- a/coupled_biased_random_walks/detect.py +++ 
b/coupled_biased_random_walks/detect.py @@ -1,9 +1,6 @@ -from __future__ import division - from collections import defaultdict import numpy as np -from six import iteritems, itervalues from coupled_biased_random_walks.count import ( ObservationCounter, @@ -71,7 +68,7 @@ def fit(self): stationary_prob = {} feature_relevance = defaultdict(int) - for feature, idx in iteritems(self._counter.index): + for feature, idx in self._counter.index.items(): prob = pi[idx] stationary_prob[feature] = prob feature_relevance[get_feature_name(feature)] += prob @@ -99,7 +96,7 @@ def _score(self, observation): Compute the weighted anomaly score (object_score in the paper) for an observation :param observation: dict of the form {feature_name: feature_value, ...} """ - return sum(itervalues(self._value_scores(observation))) + return sum(self._value_scores(observation).values()) def value_scores(self, observation_iterable): """ @@ -125,7 +122,7 @@ def _value_scores(self, observation): get_feature_name(item): self._get_feature_relevance(item) * self._stationary_prob.get(item, self._unknown_feature_score) - for item in iteritems(observation) + for item in observation.items() } def _get_feature_relevance(self, feature_tuple): @@ -144,7 +141,7 @@ def _compute_biased_transition_matrix(self): bias_dict = self._compute_biases() - for (feature1, feature2), joint_count in iteritems(self._counter.joint_counts): + for (feature1, feature2), joint_count in self._counter.joint_counts.items(): # get index for features feature1_idx = self._counter.index[feature1] @@ -178,12 +175,12 @@ def _compute_biases(self): Computes bias for random walk for each feature tuple """ bias_dict = {} - for feature_name, value_counts in iteritems(self._counter.counts): + for feature_name, value_counts in self._counter.counts.items(): mode = get_mode(value_counts) base = 1 - (mode / self._counter.n_obs[feature_name]) bias = { feature_val: (1 - (count / mode) + base) / 2 - for feature_val, count in iteritems(value_counts) + for feature_val, count in value_counts.items() } bias_dict.update(bias) return bias_dict diff --git a/coupled_biased_random_walks/matrix.py b/coupled_biased_random_walks/matrix.py index 5ecf663..8c475e8 100644 --- a/coupled_biased_random_walks/matrix.py +++ b/coupled_biased_random_walks/matrix.py @@ -1,8 +1,5 @@ -from __future__ import division - import numpy as np from scipy.sparse import csr_matrix -from six.moves import range def random_walk(transition_matrix, alpha, err_tol, max_iter): @@ -41,8 +38,7 @@ def dict_to_csr_matrix(data_dict, shape): if isinstance(shape, int): shape = (shape, shape) - # csr_matrix cannot accept iterators so cast to lists for python 3 - data = list(data_dict.values()) + data = list(data_dict.values()) # csr_matrix cannot accept iterator for data idx = zip(*list(data_dict.keys())) return csr_matrix((data, idx), shape=shape) From 87ed0d11479b1ebf6864af87ca4be02d62bb9358 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:10:02 -0600 Subject: [PATCH 03/18] remove six and use fstring in example code --- data/loading.py | 4 +--- example.py | 10 ++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/data/loading.py b/data/loading.py index 8f1b119..6c5afe4 100644 --- a/data/loading.py +++ b/data/loading.py @@ -1,8 +1,6 @@ from csv import DictReader from functools import partial -from six import iteritems - def load_from_csv(path_to_csv, exclude_cols=None): """ @@ -28,4 +26,4 @@ def filter_keys(record, fields): :param fields: set of strings indicating fields to drop 
:return: """ - return {k: v for k, v in iteritems(record) if k not in fields} + return {k: v for k, v in record.items() if k not in fields} diff --git a/example.py b/example.py index d2ff1aa..1725720 100644 --- a/example.py +++ b/example.py @@ -1,7 +1,5 @@ import os -from six import iteritems - from coupled_biased_random_walks import CBRW from data.loading import load_from_csv @@ -14,7 +12,7 @@ def round_dict_values(input_dict, digits=4): """ Helper function for printing dicts with float values """ - return {key: round(val, digits) for key, val in iteritems(input_dict)} + return {key: round(val, digits) for key, val in input_dict.items()} if __name__ == '__main__': @@ -35,15 +33,15 @@ def round_dict_values(input_dict, digits=4): # display results print('Detector fit with {} observations:'.format(len(observations))) for i, obs in enumerate(observations): - print('Observation ID {}: {}'.format(i+1, obs)) + print(f'Observation ID {i+1}: {obs}') print('\nFeature weights:') print(round_dict_values(detector.feature_weights, 4)) print('\nScores:') for i, score in enumerate(scores): - print('Observation ID {}: {}'.format(i+1, round(score, 4))) + print(f'Observation ID {i+1}: {round(score, 4)}') print('\nValue scores per attribute:') for i, value_score in enumerate(value_scores): - print('Observation ID {}: {}'.format(i+1, round_dict_values(value_score, 4))) + print(f'Observation ID {i+1}: {round_dict_values(value_score, 4)}') From 7e2cf6e63578a7beb8953d9513172c4fc4e2e7e0 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:18:32 -0600 Subject: [PATCH 04/18] no longer inherit from object --- coupled_biased_random_walks/detect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coupled_biased_random_walks/detect.py b/coupled_biased_random_walks/detect.py index a9764be..deadf9f 100644 --- a/coupled_biased_random_walks/detect.py +++ b/coupled_biased_random_walks/detect.py @@ -14,7 +14,7 @@ ) -class CBRW(object): +class CBRW: """ Class implementing Coupled Biased Random Walks algorithm """ From d148f9d0f15e53ddb5c4f8ed14cd8beca7829225 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:20:07 -0600 Subject: [PATCH 05/18] update dependency versions --- CHANGELOG.md | 4 ++++ requirements.txt | 5 ++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99b002a..b2bfe33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 2.0.0 / 2020-08-26 +* [Changed] removed support for Python2 +* [Changed] updated dependencies to latest versions + ## 1.1.0 / 2020-08-23 * [Added] `CBRW.value_scores()` function to return individual value scores of an observation diff --git a/requirements.txt b/requirements.txt index cb1a0b9..33b7f09 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ -numpy==1.14.5 -scipy==1.1.0 -six==1.11.0 +numpy==1.19.1 +scipy==1.5.2 From 092ba6471f03fee48b649077f226978c56820943 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:20:40 -0600 Subject: [PATCH 06/18] python3 only --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a5e6560..f71390a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,10 +2,10 @@ language: python matrix: include: - - python: 2.7 - python: 3.5 - python: 3.6 - python: 3.7 + - python: 3.8 install: - pip install -r requirements.txt From 1965d09353f6d812aa5c3948117065cec3ec950d Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:20:57 -0600 Subject: [PATCH 07/18] version 
bump to 2.0.0 --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0f36f40..b33f038 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='coupled_biased_random_walks', - version='1.1.0', + version='2.0.0', author='Daniel Kaslovsky', author_email='dkaslovsky@gmail.com', license='MIT', @@ -26,10 +26,9 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', ] ) From a3c7c399d00bee57122027460306859e83f91d9b Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:25:37 -0600 Subject: [PATCH 08/18] remove python <3.6 --- .travis.yml | 1 - CHANGELOG.md | 2 +- setup.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index f71390a..9f50e43 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,6 @@ language: python matrix: include: - - python: 3.5 - python: 3.6 - python: 3.7 - python: 3.8 diff --git a/CHANGELOG.md b/CHANGELOG.md index b2bfe33..ae89d6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ ## 2.0.0 / 2020-08-26 -* [Changed] removed support for Python2 +* [Changed] removed support for Python 2 and <3.6 * [Changed] updated dependencies to latest versions ## 1.1.0 / 2020-08-23 diff --git a/setup.py b/setup.py index b33f038..692b8d4 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', From aaf7bcc76f17a14ce19b2ad4502bb30d463115f8 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Wed, 26 Aug 2020 20:32:33 -0600 Subject: [PATCH 09/18] update readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2d6cc35..48167ea 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,10 @@ Outlier detection for categorical data [![Build Status](https://travis-ci.org/dkaslovsky/Coupled-Biased-Random-Walks.svg?branch=master)](https://travis-ci.org/dkaslovsky/Coupled-Biased-Random-Walks) [![Coverage Status](https://coveralls.io/repos/github/dkaslovsky/Coupled-Biased-Random-Walks/badge.svg?branch=master)](https://coveralls.io/github/dkaslovsky/Coupled-Biased-Random-Walks?branch=master) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Coupled-Biased-Random-Walks) ### Overview -Python [2.7, 3.x] implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. +Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. 
However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation.
@@ -14,7 +15,7 @@ If one is working with data previously loaded into a DataFrame, simply use the r
 ### Installation
 This package is hosted on PyPI and can be installed via `pip`:
 ```
-$ pip install coupled_biased_random_walks
+$ pip install coupled-biased-random-walks
 ```
 To instead install from source:
 ```

From 8668213cc5ef97e38f56416d7aabf92f97854833 Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Wed, 26 Aug 2020 20:37:46 -0600
Subject: [PATCH 10/18] use fstring

---
 example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example.py b/example.py
index 1725720..13c0c56 100644
--- a/example.py
+++ b/example.py
@@ -31,7 +31,7 @@ def round_dict_values(input_dict, digits=4):
     value_scores = detector.value_scores(observations)
 
     # display results
-    print('Detector fit with {} observations:'.format(len(observations)))
+    print(f'Detector fit with {len(observations)} observations:')
     for i, obs in enumerate(observations):
         print(f'Observation ID {i+1}: {obs}')

From b535c7f817f315f03730d4654d6dc9ccbd707d7c Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Wed, 26 Aug 2020 20:38:17 -0600
Subject: [PATCH 11/18] img for readme

---
 README.md                                  | 2 +-
 example_table.png => img/example_table.png | Bin
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename example_table.png => img/example_table.png (100%)

diff --git a/README.md b/README.md
index 48167ea..e1e331a 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ $ python setup.py install
 ### Example
 Let's run the CBRW detection algorithm on the authors' example data set from the paper:
-<img src="./example_table.png">
+<img src="./img/example_table.png">
 
 This data is saved as a [CSV file](./data/CBRW_paper_example.csv) in this repository and is loaded into memory as a list of dicts by [example.py](./example.py). Note that we drop the `Cheat?` column when loading the data, as this is essentially the target variable indicating the anomalous activity to be detected.
The detector is instantiated and observations are added as follows: ``` diff --git a/example_table.png b/img/example_table.png similarity index 100% rename from example_table.png rename to img/example_table.png From 9ec07cf26b2b6c4631c3c0d1589af9c4994ed31c Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Thu, 27 Aug 2020 06:43:51 -0600 Subject: [PATCH 12/18] python3 and minor readability improvements --- coupled_biased_random_walks/count.py | 2 +- coupled_biased_random_walks/detect.py | 10 ++++------ data/loading.py | 14 ++++++++------ tests/test_count.py | 9 +++------ tests/test_detect.py | 2 +- tests/test_matrix.py | 22 ++++++++++------------ 6 files changed, 27 insertions(+), 32 deletions(-) diff --git a/coupled_biased_random_walks/count.py b/coupled_biased_random_walks/count.py index f40aec5..466d6e4 100644 --- a/coupled_biased_random_walks/count.py +++ b/coupled_biased_random_walks/count.py @@ -39,7 +39,7 @@ def __repr__(self): return self._d.__repr__() -class ObservationCounter(object): +class ObservationCounter: """ Counts single and joint occurrences of key/value pairs in a dict with diff --git a/coupled_biased_random_walks/detect.py b/coupled_biased_random_walks/detect.py index deadf9f..f805571 100644 --- a/coupled_biased_random_walks/detect.py +++ b/coupled_biased_random_walks/detect.py @@ -32,7 +32,7 @@ def __init__(self, rw_params=None, ignore_unknown=False): or values based only on features seen during training; if False, score such an observation as nan (default) """ - self.rw_params = rw_params if rw_params else self.PRESET_RW_PARAMS + self.rw_params = rw_params or self.PRESET_RW_PARAMS self._unknown_feature_score = 0 if ignore_unknown else np.nan self._counter = ObservationCounter() @@ -178,11 +178,9 @@ def _compute_biases(self): for feature_name, value_counts in self._counter.counts.items(): mode = get_mode(value_counts) base = 1 - (mode / self._counter.n_obs[feature_name]) - bias = { - feature_val: (1 - (count / mode) + base) / 2 - for feature_val, count in value_counts.items() - } - bias_dict.update(bias) + for feature_val, count in value_counts.items(): + bias = (1 - (count / mode) + base) / 2 + bias_dict[feature_val] = bias return bias_dict diff --git a/data/loading.py b/data/loading.py index 6c5afe4..f82c2b8 100644 --- a/data/loading.py +++ b/data/loading.py @@ -10,13 +10,15 @@ def load_from_csv(path_to_csv, exclude_cols=None): :param exclude_cols: iterable of columns to exclude (often the target variable) """ with open(path_to_csv, 'r') as csvfile: + # use list to load into memory before closing data = list(DictReader(csvfile)) - if exclude_cols is not None: - if isinstance(exclude_cols, str): - exclude_cols = {exclude_cols} - filt = partial(filter_keys, fields=set(exclude_cols)) - return [filt(rec) for rec in data] - return data + if exclude_cols is None: + return data + # filter based on exclude cols + if isinstance(exclude_cols, str): + exclude_cols = {exclude_cols} + filt = partial(filter_keys, fields=set(exclude_cols)) + return [filt(rec) for rec in data] def filter_keys(record, fields): diff --git a/tests/test_count.py b/tests/test_count.py index fefb439..7e6290f 100644 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -104,8 +104,7 @@ def test_update(self): } for feature, test in table.items(): counts = self.oc.counts[feature] - expected = sorted(test['expected']) - self.assertListEqual(sorted(list(counts.items())), expected, feature) + self.assertCountEqual(counts.items(), test['expected'], feature) # test joint_counts expected_joint_counts = { @@ -138,8 
+137,7 @@ def test_get_count(self): } for test_name, test in table.items(): count = self.oc.get_count(test['feature tuple']) - expected = test['expected'] - self.assertEqual(count, expected, test_name) + self.assertEqual(count, test['expected'], test_name) class TestObservationCounterWithMissingData(unittest.TestCase): @@ -202,8 +200,7 @@ def test_update(self): } for feature, test in table.items(): counts = self.oc.counts.get(feature, {}) - expected = sorted(test['expected']) - self.assertListEqual(sorted(list(counts.items())), expected, feature) + self.assertCountEqual(counts.items(), test['expected'], feature) # test joint_counts expected_joint_counts = { diff --git a/tests/test_detect.py b/tests/test_detect.py index 66dee41..5098b7a 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -232,7 +232,7 @@ def test_value_scores(self): to_be_scored = self.observations[0] value_scores = self.cbrw.value_scores(to_be_scored) value_scores = value_scores[0] - self.assertListEqual(sorted(value_scores.keys()), sorted(to_be_scored.keys())) + self.assertCountEqual(value_scores.keys(), to_be_scored.keys()) for vs in value_scores.values(): self.assertGreaterEqual(vs, 0) self.assertLessEqual(vs, 1) diff --git a/tests/test_matrix.py b/tests/test_matrix.py index 2a9a152..5beda1b 100644 --- a/tests/test_matrix.py +++ b/tests/test_matrix.py @@ -21,12 +21,13 @@ def construct_2x2_csr_matrix(data): matrix_data = [] matrix_idx = [] for ix, datum in zip(idx, data): - if datum != 0: - matrix_data.append(datum) - matrix_idx.append(ix) - if matrix_data: - return csr_matrix((matrix_data, zip(*matrix_idx)), shape=(2, 2)) - return csr_matrix(([], ([], [])), shape=(2, 2)) + if datum == 0: + continue + matrix_data.append(datum) + matrix_idx.append(ix) + if not matrix_data: + return csr_matrix(([], ([], [])), shape=(2, 2)) + return csr_matrix((matrix_data, zip(*matrix_idx)), shape=(2, 2)) def csr_matrix_equality(c1, c2): @@ -85,12 +86,9 @@ def test_dict_to_csr_matrix(self): } } - for test_name, params in table.items(): - data_dict = params['data_dict'] - shape = params['shape'] - expected = params['expected'] - result = dict_to_csr_matrix(data_dict, shape) - self.assertTrue(csr_matrix_equality(result, expected), test_name) + for test_name, test in table.items(): + result = dict_to_csr_matrix(test['data_dict'], test['shape']) + self.assertTrue(csr_matrix_equality(result, test['expected']), test_name) class TestRowNormalizeCSRMatrix(unittest.TestCase): From adef7540bca6bd4789f4313e05ac7906be11230f Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Thu, 27 Aug 2020 06:44:19 -0600 Subject: [PATCH 13/18] add python3 note to readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e1e331a..b1900f5 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ Outlier detection for categorical data ### Overview Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. +*NOTE*: Only Python>=3.6 is supported as of version 2.0.0. + This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation. 
If one is working with data previously loaded into a DataFrame, simply use the result of `pandas.DataFrame.to_dict(orient='records')` instead of the DataFrame itself to add observations to the model. Note that because it is common for a DataFrame to fill missing values with `nan`, the detector will ignore features with value `nan` in any observation record. Therefore, there is no need to further preprocess the DataFrame before using its `to_dict` method to create records. From e7d0970e7b2e35157936eb7a66e160ad87fbd322 Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Thu, 27 Aug 2020 06:45:22 -0600 Subject: [PATCH 14/18] bold note in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b1900f5..40090fc 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Outlier detection for categorical data ### Overview Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. -*NOTE*: Only Python>=3.6 is supported as of version 2.0.0. +_NOTE_: Only Python>=3.6 is supported as of version 2.0.0. This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation. From 941f952821765236684c0d85332ae102c7520b8c Mon Sep 17 00:00:00 2001 From: dkaslovsky Date: Thu, 27 Aug 2020 06:47:19 -0600 Subject: [PATCH 15/18] fix bold note in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 40090fc..78ff124 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Outlier detection for categorical data ### Overview Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf. -_NOTE_: Only Python>=3.6 is supported as of version 2.0.0. +__NOTE__: Only Python>=3.6 is supported as of version 2.0.0. This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation. 
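Before the type-hint patches that follow, it helps to know the shape of the data they describe. The alias definitions below are taken verbatim from the new coupled_biased_random_walks/types.py introduced in PATCH 16/18; the record values are illustrative only, borrowing feature names from the test fixtures:

```python
from typing import Dict, Tuple

# aliases as defined in coupled_biased_random_walks/types.py (PATCH 16/18)
observation_type = Dict[str, str]  # a record: {feature_name: feature_value}
obs_item_type = Tuple[str, str]    # one item of a record: (feature_name, feature_value)

# illustrative values using feature names from the test suite
observation: observation_type = {'feature_a': 'a_val_1', 'feature_b': 'b_val_1'}
item: obs_item_type = ('feature_a', 'a_val_1')
```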
From 671c997de5706932b9af68099a7017cb2bf23 Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Fri, 28 Aug 2020 20:17:15 -0600
Subject: [PATCH 16/18] type hints

---
 CHANGELOG.md                          |  3 +-
 coupled_biased_random_walks/count.py  | 29 +++++++++---------
 coupled_biased_random_walks/detect.py | 42 +++++++++++++++++----------
 coupled_biased_random_walks/matrix.py | 18 ++++++++++--
 coupled_biased_random_walks/types.py  |  4 +++
 5 files changed, 64 insertions(+), 32 deletions(-)
 create mode 100644 coupled_biased_random_walks/types.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae89d6c..0a852bf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,6 @@
 ## 2.0.0 / 2020-08-26
-* [Changed] removed support for Python 2 and <3.6
+* [Added] type hints
+* [Changed] removed support for Python 2 and <3.7
 * [Changed] updated dependencies to latest versions
 
 ## 1.1.0 / 2020-08-23
diff --git a/coupled_biased_random_walks/count.py b/coupled_biased_random_walks/count.py
index 466d6e4..cd832eb 100644
--- a/coupled_biased_random_walks/count.py
+++ b/coupled_biased_random_walks/count.py
@@ -1,6 +1,9 @@
 from collections import Counter, defaultdict
 from collections.abc import Mapping
 from itertools import combinations, tee
+from typing import Any, Dict, Hashable, Iterable, Tuple
+
+from coupled_biased_random_walks.types import obs_item_type, observation_type
 
 
 class IncrementingDict(Mapping):
@@ -16,7 +19,7 @@ def __init__(self):
         self._d = {}
         self._next_val = 0
 
-    def insert(self, key):
+    def insert(self, key: Hashable) -> None:
         """
         Inserts a (strictly new) key
         :param key: any hashable object to be used as a key
@@ -61,18 +64,18 @@ def __init__(self):
         self._index = IncrementingDict()
 
     @property
-    def counts(self):
+    def counts(self) -> Dict[str, Counter]:
         return dict(self._counts)
 
     @property
-    def joint_counts(self):
+    def joint_counts(self) -> Dict[Tuple[obs_item_type, obs_item_type], int]:
         return dict(self._joint_counts)
 
     @property
-    def index(self):
+    def index(self) -> IncrementingDict:
         return self._index
 
-    def update(self, observation_iterable):
+    def update(self, observation_iterable: Iterable[observation_type]) -> None:
         """
         Update counts with new observation(s)
         :param observation_iterable: list of dicts
@@ -89,7 +92,7 @@ def update(self, observation_iterable):
         self._update_counts(obs1)
         self._update_joint_counts(obs2)
 
-    def get_count(self, item):
+    def get_count(self, item: obs_item_type) -> int:
         """
         Getter to safely retrieve count from internal data structure of defaultdict(Counter)
         :param item: tuple of the form ('feature_name', 'feature_value')
@@ -103,7 +106,7 @@ def get_count(self, item):
             # meaning there is no count for the feature_name
             return 0
 
-    def _update_counts(self, observation):
+    def _update_counts(self, observation: Iterable[obs_item_type]) -> None:
         """
         Update single counts
         :param observation: iterable of tuples of the form ('feature_name', 'feature_value')
@@ -114,18 +117,18 @@ def _update_counts(self, observation):
             self._index.insert(item)
             self.n_obs.update([feature_name])
 
-    def _update_joint_counts(self, observation):
+    def _update_joint_counts(self, observations: Iterable[obs_item_type]) -> None:
         """
         Update joint counts
         :param observations: iterable of tuples of the form ('feature_name', 'feature_value')
         """
-        pairs = combinations(sorted(observation), 2)
+        pairs = combinations(sorted(observations), 2)
         self._joint_counts.update(pairs)
 
 
 # Helper functions
 
-def get_feature_name(feature_tuple):
+def get_feature_name(feature_tuple: obs_item_type) -> str:
     """
     Helper function to return feature name from tuple representation
     :param feature_tuple: tuple of the form (feature_name, feature_value)
     """
     return feature_tuple[0]
 
 
-def get_feature_value(feature_tuple):
+def get_feature_value(feature_tuple: obs_item_type) -> str:
     """
     Helper function to return feature value from tuple representation
     :param feature_tuple: tuple of the form (feature_name, feature_value)
     """
     return feature_tuple[1]
 
 
-def get_mode(counter):
+def get_mode(counter: Counter) -> int:
     """
     Helper function to return the count of the most common element
     from an instance of Counter()
@@ -155,7 +158,7 @@ def get_mode(counter):
     return mode[0][1]
 
 
-def isnan(x):
+def isnan(x: Any) -> bool:
     """
     Return True if x is NaN where x can be of any type
     :param x: any object for which (in)equality can be checked
diff --git a/coupled_biased_random_walks/detect.py b/coupled_biased_random_walks/detect.py
index f805571..9048e08 100644
--- a/coupled_biased_random_walks/detect.py
+++ b/coupled_biased_random_walks/detect.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
 from collections import defaultdict
+from typing import Dict, Iterable, List, Optional
 
 import numpy as np
+from scipy.sparse import csr_matrix
 
 from coupled_biased_random_walks.count import (
     ObservationCounter,
@@ -12,6 +16,7 @@
     random_walk,
     row_normalize_csr_matrix,
 )
+from coupled_biased_random_walks.types import obs_item_type, observation_type
 
 
 class CBRW:
@@ -25,7 +30,11 @@ class CBRW:
         'max_iter': 100  # max number of steps to take
     }
 
-    def __init__(self, rw_params=None, ignore_unknown=False):
+    def __init__(
+        self,
+        rw_params: Optional[Dict[str, float]] = None,
+        ignore_unknown: bool = False,
+    ):
         """
         :param rw_params: random walk parameters to override defaults
         :param ignore_unknown: if True, score an observation containing unknown feature names
@@ -36,14 +45,14 @@ def __init__(self, rw_params=None, ignore_unknown=False):
         self._unknown_feature_score = 0 if ignore_unknown else np.nan
 
         self._counter = ObservationCounter()
-        self._stationary_prob = None
-        self._feature_relevance = None
+        self._stationary_prob = None  # type: Optional[Dict[obs_item_type, float]]
+        self._feature_relevance = None  # type: Optional[Dict[str, float]]
 
     @property
-    def feature_weights(self):
+    def feature_weights(self) -> Optional[Dict[str, float]]:
         return self._feature_relevance
 
-    def add_observations(self, observation_iterable):
+    def add_observations(self, observation_iterable: Iterable[observation_type]) -> CBRW:
         """
         Add observations to be modeled
         :param observation_iterable: list of dicts with each dict representing an observation
@@ -52,7 +61,7 @@ def add_observations(self, observation_iterable):
         self._counter.update(observation_iterable)
         return self
 
-    def fit(self):
+    def fit(self) -> CBRW:
         """
         Compute model based on current observations in state
         """
@@ -79,7 +88,7 @@ def fit(self):
         self._feature_relevance = dict(feature_relevance)
         return self
 
-    def score(self, observation_iterable):
+    def score(self, observation_iterable: Iterable[observation_type]) -> np.ndarray:
        """
         Compute an anomaly score for each observation in observation_iterable
         :param observation_iterable: iterable of dict observations with each dict
@@ -91,14 +100,17 @@ def score(self, observation_iterable):
             observation_iterable = [observation_iterable]
         return np.array([self._score(obs) for obs in observation_iterable])
 
-    def _score(self, observation):
+    def _score(self, observation: observation_type) -> float:
         """
         Compute the weighted
anomaly score (object_score in the paper) for an observation :param observation: dict of the form {feature_name: feature_value, ...} """ return sum(self._value_scores(observation).values()) - def value_scores(self, observation_iterable): + def value_scores( + self, + observation_iterable: Iterable[observation_type], + ) -> List[Dict[str, float]]: """ Compute an anomaly sub-score for each value of each observation in observation_iterable :param observation_iterable: iterable of dict observations with each dict @@ -113,7 +125,7 @@ def value_scores(self, observation_iterable): observation_iterable = [observation_iterable] return [self._value_scores(obs) for obs in observation_iterable] - def _value_scores(self, observation): + def _value_scores(self, observation: observation_type) -> Dict[str, float]: """ Compute the weighted value scores for each feature value of an observation :param observation: dict of the form {feature_name: feature_value, ...} @@ -125,7 +137,7 @@ def _value_scores(self, observation): for item in observation.items() } - def _get_feature_relevance(self, feature_tuple): + def _get_feature_relevance(self, feature_tuple: obs_item_type) -> float: """ Getter for the relevance (weight) of a feature (category) :param feature_tuple: tuple of the form (feature_name, feature_value) @@ -133,11 +145,11 @@ def _get_feature_relevance(self, feature_tuple): feature_name = get_feature_name(feature_tuple) return self._feature_relevance.get(feature_name, 0) - def _compute_biased_transition_matrix(self): + def _compute_biased_transition_matrix(self) -> csr_matrix: """ Computes biased probability transition matrix of conditional probabilities """ - prob_idx = {} + prob_idx = {} # type: Dict[obs_item_type, float] bias_dict = self._compute_biases() @@ -170,11 +182,11 @@ def _compute_biased_transition_matrix(self): trans_matrix = dict_to_csr_matrix(prob_idx, shape=n_features) return row_normalize_csr_matrix(trans_matrix) - def _compute_biases(self): + def _compute_biases(self) -> Dict[obs_item_type, float]: """ Computes bias for random walk for each feature tuple """ - bias_dict = {} + bias_dict = {} # type: Dict[obs_item_type, float] for feature_name, value_counts in self._counter.counts.items(): mode = get_mode(value_counts) base = 1 - (mode / self._counter.n_obs[feature_name]) diff --git a/coupled_biased_random_walks/matrix.py b/coupled_biased_random_walks/matrix.py index 8c475e8..23fbee9 100644 --- a/coupled_biased_random_walks/matrix.py +++ b/coupled_biased_random_walks/matrix.py @@ -1,8 +1,17 @@ +from typing import Dict, Tuple, Union + import numpy as np from scipy.sparse import csr_matrix +from coupled_biased_random_walks.types import obs_item_type + -def random_walk(transition_matrix, alpha, err_tol, max_iter): +def random_walk( + transition_matrix: csr_matrix, + alpha: float, + err_tol: float, + max_iter: int +) -> np.ndarray: """ Run random walk to compute stationary probabilities :param transition_matrix: scipy.sparse.csr_matrix defining the random walk @@ -27,7 +36,10 @@ def random_walk(transition_matrix, alpha, err_tol, max_iter): return pi -def dict_to_csr_matrix(data_dict, shape): +def dict_to_csr_matrix( + data_dict: Dict[obs_item_type, float], + shape: Union[int, Tuple[int, int]], +) -> csr_matrix: """ Converts dict of index -> value to csr_matrix :param data_dict: dict mapping matrix index tuple to corresponding matrix value @@ -43,7 +55,7 @@ def dict_to_csr_matrix(data_dict, shape): return csr_matrix((data, idx), shape=shape) -def row_normalize_csr_matrix(matrix): +def 
row_normalize_csr_matrix(matrix: csr_matrix) -> csr_matrix:
     """
     Row normalize a csr matrix without mutating the input
     :param matrix: scipy.sparse.csr_matrix instance
diff --git a/coupled_biased_random_walks/types.py b/coupled_biased_random_walks/types.py
new file mode 100644
index 0000000..84684c7
--- /dev/null
+++ b/coupled_biased_random_walks/types.py
@@ -0,0 +1,4 @@
+from typing import Dict, Tuple
+
+observation_type = Dict[str, str]
+obs_item_type = Tuple[str, str]

From 6b913a19294e75d72c8b7d70c6b687ae3216756f Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Fri, 28 Aug 2020 20:30:05 -0600
Subject: [PATCH 17/18] type hints

---
 data/loading.py      | 10 ++++++++--
 tests/test_matrix.py |  9 ++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/data/loading.py b/data/loading.py
index f82c2b8..0f25aa7 100644
--- a/data/loading.py
+++ b/data/loading.py
@@ -1,8 +1,14 @@
 from csv import DictReader
 from functools import partial
+from typing import Iterable, List, Optional, Set
 
+from coupled_biased_random_walks.types import observation_type
 
-def load_from_csv(path_to_csv, exclude_cols=None):
+
+def load_from_csv(
+    path_to_csv: str,
+    exclude_cols: Optional[Iterable[str]] = None,
+) -> List[observation_type]:
     """
     Load a CSV and return a list of dicts, one dict for each row of the form
     {column_header1: value1, column_header2: value2, ...}
@@ -21,7 +27,7 @@ def load_from_csv(path_to_csv, exclude_cols=None):
     return [filt(rec) for rec in data]
 
 
-def filter_keys(record, fields):
+def filter_keys(record: observation_type, fields: Set[str]) -> observation_type:
     """
     Filter keys from a dict
     :param record: dict
diff --git a/tests/test_matrix.py b/tests/test_matrix.py
index 5beda1b..c91040e 100644
--- a/tests/test_matrix.py
+++ b/tests/test_matrix.py
@@ -1,4 +1,5 @@
 import unittest
+from typing import List
 
 import numpy as np
 from scipy.sparse import csr_matrix
@@ -12,7 +13,7 @@
 np.random.seed(0)
 
 
-def construct_2x2_csr_matrix(data):
+def construct_2x2_csr_matrix(data: List[float]) -> csr_matrix:
     """
     Construct a 2x2 csr_matrix
     :param data: list of length 4 of data for csr matrix corresponding to idx position
@@ -30,9 +31,11 @@ def construct_2x2_csr_matrix(data):
     return csr_matrix((matrix_data, zip(*matrix_idx)), shape=(2, 2))
 
 
-def csr_matrix_equality(c1, c2):
+def csr_matrix_equality(c1: csr_matrix, c2: csr_matrix) -> bool:
     """
-    Test 2 csr matrices for equality
+    Test two csr matrices for equality
+    :param c1: csr_matrix to compare
+    :param c2: csr_matrix to compare
     """
     if c1.shape != c2.shape:
         return False

From f0e6495051d108bee3e00d387ee32c81c7f8faa1 Mon Sep 17 00:00:00 2001
From: dkaslovsky
Date: Fri, 28 Aug 2020 20:32:08 -0600
Subject: [PATCH 18/18] remove python3.6 to use annotations

---
 .travis.yml | 1 -
 README.md   | 2 +-
 setup.py    | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9f50e43..4f51ef7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,6 @@ language: python
 
 matrix:
   include:
-    - python: 3.6
     - python: 3.7
     - python: 3.8
 
diff --git a/README.md b/README.md
index 78ff124..9ea1376 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Outlier detection for categorical data
 
 ### Overview
 Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf.
 
-__NOTE__: Only Python>=3.6 is supported as of version 2.0.0.
+__NOTE__: Only Python>=3.7 is supported as of version 2.0.0.
 
 This implementation operates on Python dicts rather than Pandas DataFrames.
This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation. diff --git a/setup.py b/setup.py index 692b8d4..8ecb9d7 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ]
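With the full series applied, the public API is unchanged from 1.1.0 and the package is Python-3-only. A minimal end-to-end sketch of the resulting usage, assuming the repository layout shown in these patches (the CSV path and the dropped `Cheat?` target column follow the example described in the README):

```python
from coupled_biased_random_walks import CBRW
from data.loading import load_from_csv

# load the paper's example data, dropping the target column
observations = load_from_csv('./data/CBRW_paper_example.csv', exclude_cols='Cheat?')

# pass ignore_unknown=True to instead score unseen feature values
# using only the features observed during training
detector = CBRW()
detector.add_observations(observations)
detector.fit()

scores = detector.score(observations)               # np.ndarray, one score per record
value_scores = detector.value_scores(observations)  # list of per-feature score dicts
print(detector.feature_weights)                     # learned feature relevance
```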