tidy formatting

flaxandteal · Jun 19, 2023 · 81023ba
1 parent d3bf983
commit 81023ba
Show file tree

Hide file tree

Showing 11 changed files with 39 additions and 30 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "bonn"
-version = "0.1.3"
+version = "0.1.4"
 edition = "2021"
 
 [lib]

diff --git a/bonn/__init__.py b/bonn/__init__.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "bonn"
-version = "0.1.3"
+version = "0.1.4"
 description = "Created for ONS. Proof-of-concept mmap'd Rust word2vec implementation linked with category matching"
 readme = "README.md"
 license = { "file" = "LICENSE.md" }
@@ -27,3 +27,7 @@ classifiers = [
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
 ]
+
+[tool.maturin]
+python-source = "python"
+module-name = "bonn._bonn"
diff --git a/python/bonn/__init__.py b/python/bonn/__init__.py
diff --git a/bonn/category_manager.py → python/bonn/category_manager.py b/bonn/category_manager.py → python/bonn/category_manager.py
@@ -45,8 +45,12 @@ def __init__(self, key, bow, model, weighting):
         self._set_words()
 
     def _set_vector(self, model):
-        vector = np.mean([self._weighting[code] * model[w] for code, w in self.bow], axis=0)
-        self.vector = vector / sum([self._weighting[code] for code, _ in self.bow])
+        vector = np.mean([
+            self._weighting[code] * model[w] for code, w in self.bow
+        ], axis=0)
+        self.vector = vector / sum(
+            [self._weighting[code] for code, _ in self.bow]
+        )
 
     def _set_words(self):
         self.words = [w for _, w in self.bow]
@@ -61,7 +65,10 @@ class CategoryManager:
     def __init__(self, word_model, settings):
         self._categories = SortedDict()
         self._model = WModel(word_model)
-        stopwords_language = settings.get("STOPWORDS_LANGUAGE", DEFAULT_STOPWORDS_LANGUAGE)
+        stopwords_language = settings.get(
+            "STOPWORDS_LANGUAGE",
+            DEFAULT_STOPWORDS_LANGUAGE
+        )
         extra_stopwords = (
             settings
                 .get("EXTRA_STOPWORDS", {})
@@ -80,7 +87,8 @@ def __init__(self, word_model, settings):
 
     def set_all_words(self, all_words):
         total = sum(all_words.values())
-        scale = lambda c: 0.25 + math.exp(1000 * (1 - c) / total) * 0.75
+        def scale(c):
+            return 0.25 + math.exp(1000 * (1 - c) / total) * 0.75
         self.all_words = {w: scale(c) for w, c in all_words.items()}
 
     def _scale_by_frequency(self, word):
@@ -93,7 +101,8 @@ def _scale_by_frequency(self, word):
 
     def add_categories_from_bow(self, name, classifier_bow):
         self._categories[name] = SortedDict(
-            (k, Category(k, bow, self._model, self._weighting)) for k, bow in classifier_bow.items()
+            (k, Category(k, bow, self._model, self._weighting))
+            for k, bow in classifier_bow.items()
         )
 
     def closest(self, text, cat, classifier_bow_vec):
@@ -139,7 +148,7 @@ def test_category(self, sentence, category, category_group="dtcats"):
         if not clean:
             return []
 
-        classifiers = {w: WEIGHTING[code] * self._model[w] for code, w in cat.bow}
+        classifiers = {w: self._weighting[code] * self._model[w] for code, w in cat.bow}
 
         tags = {}
         for words in clean:
@@ -160,7 +169,7 @@ def test_category(self, sentence, category, category_group="dtcats"):
             "tags": tags,
             "vector": np.linalg.norm(cat.vector),
             "significance": self._significance_for_vector(cat.vector),
-            "weightings": {w: WEIGHTING[code] for code, w in cat.bow},
+            "weightings": {w: self._weighting[code] for code, w in cat.bow},
         }
 
     @staticmethod

diff --git a/bonn/extract.py → python/bonn/extract.py b/bonn/extract.py → python/bonn/extract.py
@@ -9,7 +9,7 @@
 from tqdm import tqdm
 from nltk.stem.wordnet import WordNetLemmatizer
 
-from bonn import FfModel
+from ._bonn import FfModel
 from .category_manager import CategoryManager
 from .taxonomy import get_taxonomy, taxonomy_to_categories, categories_to_classifier_bow
 
@@ -41,38 +41,38 @@ def get_datasets(cm, classifier_bow, settings):
     expecting = s.count()
     size = 50
     s = s.params(size=size)
-    n = 0
     all_words = Counter()
     with tqdm(total=expecting) as pbar:
         for hit in s.scan():
             try:
-                datasets[hit.description.title] = {
+                title = hit.description.title
+                datasets[title] = {
                     "category": tuple(hit.uri.split("/")[1:4]),
-                    "text": f"{hit.description.title} {hit.description.metaDescription}",
+                    "text": f"{title} {hit.description.metaDescription}",
                 }
-                cat = datasets[hit.description.title]["category"]
+                cat = datasets[title]["category"]
                 if cat not in classifier_bow_vec and cat[:-1] in classifier_bow_vec:
                     cat = cat[:-1]
-                    datasets[hit.description.title]["category"] = cat
+                    datasets[title]["category"] = cat
 
-                datasets[hit.description.title]["bow"] = cm.closest(
-                    datasets[hit.description.title]["text"], cat, classifier_bow_vec
+                datasets[title]["bow"] = cm.closest(
+                    datasets[title]["text"], cat, classifier_bow_vec
                 )
                 document = (
-                    hit.description.title
+                    title
                     + " "
-                    + datasets[hit.description.title]["text"]
+                    + datasets[title]["text"]
                 )
                 all_words.update(
                     {
                         ltzr.lemmatize(v)
                         for v in set(sum(cm.strip_document(document), []))
                     }
                 )
-                datasets[hit.description.title]["bow"] = cm.closest(
+                datasets[title]["bow"] = cm.closest(
                     document, cat, classifier_bow_vec
                 )
-            except AttributeError as e:
+            except AttributeError:
                 pass
             pbar.update(1)
 

diff --git a/bonn/settings.py → python/bonn/settings.py b/bonn/settings.py → python/bonn/settings.py
diff --git a/bonn/taxonomy.py → python/bonn/taxonomy.py b/bonn/taxonomy.py → python/bonn/taxonomy.py
@@ -1,6 +1,5 @@
 from sortedcontainers import SortedDict
 import json
-import numpy as np
 
 
 def get_taxonomy(taxonomy_location):

diff --git a/bonn/utils.py → python/bonn/utils.py b/bonn/utils.py → python/bonn/utils.py
@@ -16,7 +16,7 @@ def filter_by_snr(scored_list, snr):
     scored_list_sum = sum(s for s, _ in scored_list)
     scored_list_len = len(scored_list)
     return [
-        (s, l)
-        for s, l in scored_list
-        if _get_snr(s, scored_list_len, scored_list_sum) > snr
+        (score, item)
+        for score, item in scored_list
+        if _get_snr(score, scored_list_len, scored_list_sum) > snr
     ]
diff --git a/src/lib.rs b/src/lib.rs
@@ -93,6 +93,7 @@ fn build_model(input_path: String, output_path: String) -> PyResult<()> {
 }
 
 #[pymodule]
+#[pyo3(name = "_bonn")]
 fn bonn(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
     m.add_class::<FfModel>()?;
     m.add_function(wrap_pyfunction!(build_model, m)?)?;