From 8f5fcff248b85019cbb43631e88f70ddf2b8a33a Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 5 Jul 2019 15:16:54 +0300
Subject: [PATCH] The additional features in vw_ensemble turned out to be a bad
 idea - they introduce extra complexity without significantly improving
 results. Reverting the merge for now. The vw_ensemble backend needs more
 work, but it's better to have a simple version in master and build upon that.

Revert "Merge pull request #288 from NatLibFi/issue235-more-features"

This reverts commit 2ea1130aceeeb56926f728d416430e60a2c34633, reversing
changes made to 85216528dbcf2c07efe79aff0c9bb5185ba2f3f4.
---
 annif/backend/vw_ensemble.py      | 41 +++++++------------------------
 tests/test_backend_vw_ensemble.py |  5 ++--
 2 files changed, 11 insertions(+), 35 deletions(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 132e26d69..94cf28fa7 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -44,10 +44,6 @@ class VWEnsembleBackend(
     # will make it more careful so that it will require more training data.
     DEFAULT_DISCOUNT_RATE = 0.01
 
-    # score threshold for "zero features": scores lower than this will be
-    # considered zero and marked with a zero feature given to VW
-    ZERO_THRESHOLD = 0.001
-
     def _load_subject_freq(self):
         path = os.path.join(self.datadir, self.FREQ_FILE)
         if not os.path.exists(path):
@@ -98,30 +94,17 @@ def _source_project_ids(self):
         sources = annif.util.parse_sources(self.params['sources'])
         return [project_id for project_id, _ in sources]
 
-    @staticmethod
-    def _format_value(true):
+    def _format_example(self, subject_id, scores, true=None):
         if true is None:
-            return ''
+            val = ''
         elif true:
-            return 1
+            val = 1
         else:
-            return -1
-
-    def _format_example(self, subject_id, scores, true=None):
-        features = " ".join(["{}:{:.6f}".format(proj, scores[proj_idx])
-                             for proj_idx, proj
-                             in enumerate(self._source_project_ids)])
-        zero_features = " ".join(["zero^{}".format(proj)
-                                  for proj_idx, proj
-                                  in enumerate(self._source_project_ids)
-                                  if scores[proj_idx] < self.ZERO_THRESHOLD])
-        return "{} |raw {} {} |{} {} {}".format(
-            self._format_value(true),
-            features,
-            zero_features,
-            subject_id,
-            features,
-            zero_features)
+            val = -1
+        ex = "{} |{}".format(val, subject_id)
+        for proj_idx, proj in enumerate(self._source_project_ids):
+            ex += " {}:{:.6f}".format(proj, scores[proj_idx])
+        return ex
 
     def _doc_score_vector(self, doc, source_projects):
         score_vectors = []
@@ -136,8 +119,7 @@ def _doc_to_example(self, doc, project, source_projects):
         true = subjects.as_vector(project.subjects)
         score_vector = self._doc_score_vector(doc, source_projects)
         for subj_id in range(len(true)):
-            if true[subj_id] \
-               or score_vector[:, subj_id].sum() >= self.ZERO_THRESHOLD:
+            if true[subj_id] or score_vector[:, subj_id].sum() > 0.0:
                 ex = (subj_id, self._format_example(
                     subj_id,
                     score_vector[:, subj_id],
@@ -154,11 +136,6 @@ def _create_examples(self, corpus, project):
         random.shuffle(examples)
         return examples
 
-    def _create_model(self, project):
-        # add interactions between raw (descriptor-invariant) features to
-        # the mix
-        super()._create_model(project, {'q': 'rr'})
-
     @staticmethod
     def _write_freq_file(subject_freq, filename):
         with open(filename, 'w') as freqfile:
diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py
index f22dbf75d..23b48ff53 100644
--- a/tests/test_backend_vw_ensemble.py
+++ b/tests/test_backend_vw_ensemble.py
@@ -126,7 +126,7 @@ def test_vw_ensemble_format_example(datadir):
         datadir=str(datadir))
 
     ex = vw_ensemble._format_example(0, [0.5])
-    assert ex == ' |raw dummy-en:0.500000  |0 dummy-en:0.500000 '
+    assert ex == ' |0 dummy-en:0.500000'
 
 
 def test_vw_ensemble_format_example_avoid_sci_notation(datadir):
@@ -137,5 +137,4 @@ def test_vw_ensemble_format_example_avoid_sci_notation(datadir):
         datadir=str(datadir))
 
     ex = vw_ensemble._format_example(0, [7.24e-05])
-    assert ex == ' |raw dummy-en:0.000072 zero^dummy-en' + \
-                 ' |0 dummy-en:0.000072 zero^dummy-en'
+    assert ex == ' |0 dummy-en:0.000072'