Skip to content

Commit

Permalink
tidy formatting
Browse files (browse the repository at this point in the history)
  • Loading branch information
philtweir committed Jun 19, 2023
1 parent d3bf983 commit 81023ba
Show file tree
Hide file tree
Showing 11 changed files with 39 additions and 30 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bonn"
version = "0.1.3"
version = "0.1.4"
edition = "2021"

[lib]
Expand Down
4 changes: 0 additions & 4 deletions bonn/__init__.py

This file was deleted.

6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "bonn"
version = "0.1.3"
version = "0.1.4"
description = "Created for ONS. Proof-of-concept mmap'd Rust word2vec implementation linked with category matching"
readme = "README.md"
license = { "file" = "LICENSE.md" }
Expand All @@ -27,3 +27,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]

[tool.maturin]
python-source = "python"
module-name = "bonn._bonn"
Empty file added python/bonn/__init__.py
Empty file.
23 changes: 16 additions & 7 deletions bonn/category_manager.py → python/bonn/category_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,12 @@ def __init__(self, key, bow, model, weighting):
self._set_words()

def _set_vector(self, model):
vector = np.mean([self._weighting[code] * model[w] for code, w in self.bow], axis=0)
self.vector = vector / sum([self._weighting[code] for code, _ in self.bow])
vector = np.mean([
self._weighting[code] * model[w] for code, w in self.bow
], axis=0)
self.vector = vector / sum(
[self._weighting[code] for code, _ in self.bow]
)

def _set_words(self):
self.words = [w for _, w in self.bow]
Expand All @@ -61,7 +65,10 @@ class CategoryManager:
def __init__(self, word_model, settings):
self._categories = SortedDict()
self._model = WModel(word_model)
stopwords_language = settings.get("STOPWORDS_LANGUAGE", DEFAULT_STOPWORDS_LANGUAGE)
stopwords_language = settings.get(
"STOPWORDS_LANGUAGE",
DEFAULT_STOPWORDS_LANGUAGE
)
extra_stopwords = (
settings
.get("EXTRA_STOPWORDS", {})
Expand All @@ -80,7 +87,8 @@ def __init__(self, word_model, settings):

def set_all_words(self, all_words):
total = sum(all_words.values())
scale = lambda c: 0.25 + math.exp(1000 * (1 - c) / total) * 0.75
def scale(c):
return 0.25 + math.exp(1000 * (1 - c) / total) * 0.75
self.all_words = {w: scale(c) for w, c in all_words.items()}

def _scale_by_frequency(self, word):
Expand All @@ -93,7 +101,8 @@ def _scale_by_frequency(self, word):

def add_categories_from_bow(self, name, classifier_bow):
self._categories[name] = SortedDict(
(k, Category(k, bow, self._model, self._weighting)) for k, bow in classifier_bow.items()
(k, Category(k, bow, self._model, self._weighting))
for k, bow in classifier_bow.items()
)

def closest(self, text, cat, classifier_bow_vec):
Expand Down Expand Up @@ -139,7 +148,7 @@ def test_category(self, sentence, category, category_group="dtcats"):
if not clean:
return []

classifiers = {w: WEIGHTING[code] * self._model[w] for code, w in cat.bow}
classifiers = {w: self._weighting[code] * self._model[w] for code, w in cat.bow}

tags = {}
for words in clean:
Expand All @@ -160,7 +169,7 @@ def test_category(self, sentence, category, category_group="dtcats"):
"tags": tags,
"vector": np.linalg.norm(cat.vector),
"significance": self._significance_for_vector(cat.vector),
"weightings": {w: WEIGHTING[code] for code, w in cat.bow},
"weightings": {w: self._weighting[code] for code, w in cat.bow},
}

@staticmethod
Expand Down
24 changes: 12 additions & 12 deletions bonn/extract.py → python/bonn/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from tqdm import tqdm
from nltk.stem.wordnet import WordNetLemmatizer

from bonn import FfModel
from ._bonn import FfModel
from .category_manager import CategoryManager
from .taxonomy import get_taxonomy, taxonomy_to_categories, categories_to_classifier_bow

Expand Down Expand Up @@ -41,38 +41,38 @@ def get_datasets(cm, classifier_bow, settings):
expecting = s.count()
size = 50
s = s.params(size=size)
n = 0
all_words = Counter()
with tqdm(total=expecting) as pbar:
for hit in s.scan():
try:
datasets[hit.description.title] = {
title = hit.description.title
datasets[title] = {
"category": tuple(hit.uri.split("/")[1:4]),
"text": f"{hit.description.title} {hit.description.metaDescription}",
"text": f"{title} {hit.description.metaDescription}",
}
cat = datasets[hit.description.title]["category"]
cat = datasets[title]["category"]
if cat not in classifier_bow_vec and cat[:-1] in classifier_bow_vec:
cat = cat[:-1]
datasets[hit.description.title]["category"] = cat
datasets[title]["category"] = cat

datasets[hit.description.title]["bow"] = cm.closest(
datasets[hit.description.title]["text"], cat, classifier_bow_vec
datasets[title]["bow"] = cm.closest(
datasets[title]["text"], cat, classifier_bow_vec
)
document = (
hit.description.title
title
+ " "
+ datasets[hit.description.title]["text"]
+ datasets[title]["text"]
)
all_words.update(
{
ltzr.lemmatize(v)
for v in set(sum(cm.strip_document(document), []))
}
)
datasets[hit.description.title]["bow"] = cm.closest(
datasets[title]["bow"] = cm.closest(
document, cat, classifier_bow_vec
)
except AttributeError as e:
except AttributeError:
pass
pbar.update(1)

Expand Down
File renamed without changes.
1 change: 0 additions & 1 deletion bonn/taxonomy.py → python/bonn/taxonomy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from sortedcontainers import SortedDict
import json
import numpy as np


def get_taxonomy(taxonomy_location):
Expand Down
6 changes: 3 additions & 3 deletions bonn/utils.py → python/bonn/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def filter_by_snr(scored_list, snr):
scored_list_sum = sum(s for s, _ in scored_list)
scored_list_len = len(scored_list)
return [
(s, l)
for s, l in scored_list
if _get_snr(s, scored_list_len, scored_list_sum) > snr
(score, item)
for score, item in scored_list
if _get_snr(score, scored_list_len, scored_list_sum) > snr
]
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ fn build_model(input_path: String, output_path: String) -> PyResult<()> {
}

#[pymodule]
#[pyo3(name = "_bonn")]
fn bonn(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_class::<FfModel>()?;
m.add_function(wrap_pyfunction!(build_model, m)?)?;
Expand Down

0 comments on commit 81023ba

Please sign in to comment.