From 8f5fcff248b85019cbb43631e88f70ddf2b8a33a Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 5 Jul 2019 15:16:54 +0300 Subject: [PATCH] The additional features in vw_ensemble turned out to be a bad idea - they introduce extra complexity without significantly improving results. Reverting the merge for now. The vw_ensemble backend needs more work, but it's better to have a simple version in master and build upon that. Revert "Merge pull request #288 from NatLibFi/issue235-more-features" This reverts commit 2ea1130aceeeb56926f728d416430e60a2c34633, reversing changes made to 85216528dbcf2c07efe79aff0c9bb5185ba2f3f4. --- annif/backend/vw_ensemble.py | 41 +++++++------------------------ tests/test_backend_vw_ensemble.py | 5 ++-- 2 files changed, 11 insertions(+), 35 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 132e26d69..94cf28fa7 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -44,10 +44,6 @@ class VWEnsembleBackend( # will make it more careful so that it will require more training data. DEFAULT_DISCOUNT_RATE = 0.01 - # score threshold for "zero features": scores lower than this will be - # considered zero and marked with a zero feature given to VW - ZERO_THRESHOLD = 0.001 - def _load_subject_freq(self): path = os.path.join(self.datadir, self.FREQ_FILE) if not os.path.exists(path): @@ -98,30 +94,17 @@ def _source_project_ids(self): sources = annif.util.parse_sources(self.params['sources']) return [project_id for project_id, _ in sources] - @staticmethod - def _format_value(true): + def _format_example(self, subject_id, scores, true=None): if true is None: - return '' + val = '' elif true: - return 1 + val = 1 else: - return -1 - - def _format_example(self, subject_id, scores, true=None): - features = " ".join(["{}:{:.6f}".format(proj, scores[proj_idx]) - for proj_idx, proj - in enumerate(self._source_project_ids)]) - zero_features = " ".join(["zero^{}".format(proj) - for proj_idx, proj - in enumerate(self._source_project_ids) - if scores[proj_idx] < self.ZERO_THRESHOLD]) - return "{} |raw {} {} |{} {} {}".format( - self._format_value(true), - features, - zero_features, - subject_id, - features, - zero_features) + val = -1 + ex = "{} |{}".format(val, subject_id) + for proj_idx, proj in enumerate(self._source_project_ids): + ex += " {}:{:.6f}".format(proj, scores[proj_idx]) + return ex def _doc_score_vector(self, doc, source_projects): score_vectors = [] @@ -136,8 +119,7 @@ def _doc_to_example(self, doc, project, source_projects): true = subjects.as_vector(project.subjects) score_vector = self._doc_score_vector(doc, source_projects) for subj_id in range(len(true)): - if true[subj_id] \ - or score_vector[:, subj_id].sum() >= self.ZERO_THRESHOLD: + if true[subj_id] or score_vector[:, subj_id].sum() > 0.0: ex = (subj_id, self._format_example( subj_id, score_vector[:, subj_id], @@ -154,11 +136,6 @@ def _create_examples(self, corpus, project): random.shuffle(examples) return examples - def _create_model(self, project): - # add interactions between raw (descriptor-invariant) features to - # the mix - super()._create_model(project, {'q': 'rr'}) - @staticmethod def _write_freq_file(subject_freq, filename): with open(filename, 'w') as freqfile: diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py index f22dbf75d..23b48ff53 100644 --- a/tests/test_backend_vw_ensemble.py +++ b/tests/test_backend_vw_ensemble.py @@ -126,7 +126,7 @@ def test_vw_ensemble_format_example(datadir): datadir=str(datadir)) ex = vw_ensemble._format_example(0, [0.5]) - assert ex == ' |raw dummy-en:0.500000 |0 dummy-en:0.500000 ' + assert ex == ' |0 dummy-en:0.500000' def test_vw_ensemble_format_example_avoid_sci_notation(datadir): @@ -137,5 +137,4 @@ def test_vw_ensemble_format_example_avoid_sci_notation(datadir): datadir=str(datadir)) ex = vw_ensemble._format_example(0, [7.24e-05]) - assert ex == ' |raw dummy-en:0.000072 zero^dummy-en' + \ - ' |0 dummy-en:0.000072 zero^dummy-en' + assert ex == ' |0 dummy-en:0.000072'