irit-melodi · moreymat · Jul 1, 2016 · Jul 1, 2016 · Jul 28, 2016 · Jul 29, 2016
diff --git a/attelo/args.py b/attelo/args.py
@@ -24,6 +24,8 @@ def add_common_args(psr):
                      help="EDU pair features (libsvm)")
     psr.add_argument("vocab", metavar="FILE",
                      help="feature vocabulary")
+    psr.add_argument("labels", metavar="FILE",
+                     help="labels")
     psr.add_argument("--quiet", action="store_true",
                      help="Supress all feedback")
 

diff --git a/attelo/cdu.py b/attelo/cdu.py
@@ -0,0 +1,13 @@
+"""Explicit representation of a CDU.
+
+As of 2016-07-28, this is WIP.
+"""
+
+from collections import namedtuple
+
+
+class CDU(namedtuple("CDU", "id members")):
+    """A class representing the CDU (id, [members])"""
+    pass
+
+
diff --git a/attelo/cmd/graph.py b/attelo/cmd/graph.py
@@ -29,9 +29,9 @@ def config_argparser(psr):
 
     input_grp = psr.add_mutually_exclusive_group(required=True)
     input_grp.add_argument("--gold", metavar="FILE",
-                           nargs=2,
+                           nargs=3,
                            help="gold predictions [pairings, "
-                           "features (targets only)]")
+                           "features (targets only), labels]")
     input_grp.add_argument("--predictions", metavar="FILE",
                            help="single predictions")
 

diff --git a/attelo/cmd/util.py b/attelo/cmd/util.py
@@ -15,10 +15,10 @@ def load_args_multipack(args):
     '''
     Load multipack specified via command line arguments
     '''
-    return load_multipack(args.edus,
-                          args.pairings,
+    return load_multipack(args.edus, args.pairings,
                           args.features,
-                          args.vocab,
+                          args.vocab, args.labels,
+                          file_split='corpus',  # WIP
                           verbose=not args.quiet)
 
 

diff --git a/attelo/decoding/eisner.py b/attelo/decoding/eisner.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 
+from ..edu import FAKE_ROOT
 from .interface import Decoder
 # temporary? imports
 from ..table import _edu_positions
@@ -46,6 +47,9 @@ def decode(self, dpack, nonfixed_pairs=None):
         """
         # whether the output tree should contain a unique real root
         unique_real_root = self._unique_real_root
+        # check that the first EDU is the fake root ; this is an
+        # important assumption for the following code
+        assert dpack.edus[0] == FAKE_ROOT
 
         # get number of EDUs and possible labels
         nb_edus = len(dpack.edus)

diff --git a/attelo/edu.py b/attelo/edu.py
@@ -53,3 +53,12 @@ def span(self):
 all groupings
 """
 # pylint: enable=pointless-string-statement
+
+
+# small helper for parsers
+def edu_id2num(edu_id):
+    """Get an EDU's number (trailing int of its id; 0 for the fake root)"""
+    edu_num = (int(edu_id.rsplit('_', 1)[1])
+               if edu_id != FAKE_ROOT_ID
+               else 0)
+    return edu_num
diff --git a/attelo/fold.py b/attelo/fold.py
@@ -1,21 +1,26 @@
-'''
+"""
 Group-aware n-fold evaluation.
 
 Attelo uses a variant of n-fold evaluation, where we (still)
-andomly partition the dataset into a set of folds of roughly even
+randomly partition the dataset into a set of folds of roughly even
 size, but respecting the additional constraint that any two data
-entries belonging in the same "group" (determined a single
+entries belonging in the same "group" (determined by a single
 distiguished feature, eg. the document id, the dialogue id, etc)
 are always in the same fold. Note that this makes it a bit harder
-to have perfectly evenly sized folds
+to have perfectly evenly sized folds.
 
 
 Created on Jun 20, 2012
 
 @author: stergos
 
 contribs: phil
-'''
+
+TODO
+----
+* [ ] refactor after `sklearn.model_selection._split`: encapsulate
+      into a class similar to GroupKFold.
+"""
 
 import random
 

diff --git a/attelo/harness/evaluate.py b/attelo/harness/evaluate.py
@@ -78,6 +78,18 @@ def _link_data_files(data_dir, eval_dir):
         eval_file = fp.join(eval_dir, fname)
         if fp.isfile(data_file) and not fp.exists(eval_file):
             os.link(data_file, eval_file)
+        elif fp.isdir(data_file) and not fp.exists(eval_file):
+            # 2016-09-01 add support for one file per doc:
+            # create hard links to data/Y/Z as eval-xxx/Y/Z ;
+            # folders cannot be hard linked so we create them afresh
+            os.makedirs(eval_file)
+            # not actually recursive: only the regular files that are
+            # immediate members of this subfolder are hard linked
+            for fname_sub in os.listdir(data_file):
+                data_file_sub = fp.join(data_file, fname_sub)
+                eval_file_sub = fp.join(eval_file, fname_sub)
+                if fp.isfile(data_file_sub) and not fp.exists(eval_file_sub):
+                    os.link(data_file_sub, eval_file_sub)
 
 
 def _link_model_files(old_dir, new_dir):
@@ -131,12 +143,26 @@ def _create_tstamped_dir(prefix, suffix):
         return True
 
 
-def prepare_dirs(runcfg, data_dir):
-    """
-    Return eval and scratch directory paths
+def prepare_dirs(runcfg, base_dir):
+    """Get eval and scratch directory paths.
+
+    Parameters
+    ----------
+    runcfg: attelo.harness.config.RuntimeConfig
+        Current runtime config
+    base_dir: filepath
+        Base directory for the experiment.
+
+    Returns
+    -------
+    eval_dir: filepath
+        Evaluation folder ; subfolder of base_dir.
+    scratch_dir: filepath
+        Scratch folder ; subfolder of base_dir.
     """
-    eval_prefix = fp.join(data_dir, "eval")
-    scratch_prefix = fp.join(data_dir, "scratch")
+    data_dir = os.path.join(base_dir, 'data')
+    eval_prefix = fp.join(base_dir, "eval")
+    scratch_prefix = fp.join(base_dir, "scratch")
 
     eval_current = eval_prefix + '-current'
     scratch_current = scratch_prefix + '-current'
@@ -230,19 +256,33 @@ def _load_harness_multipack(hconf, test_data=False):
         paths = stripped_paths
     else:
         paths = hconf.mpack_paths(test_data, stripped=False)
-    mpack = load_multipack(paths['edu_input'],
-                           paths['pairings'],
+    mpack = load_multipack(paths['edu_input'], paths['pairings'],
                            paths['features'],
-                           paths['vocab'],
-                           corpus_path=paths.get('corpus', None),  # WIP
+                           paths['vocab'], paths['labels'],
+                           # WIP additional files, used only for rst-dt
+                           # as of 2016-07-28
+                           cdu_file=paths.get('cdu_input', None),
+                           cdu_pairings_file=paths.get('cdu_pairings', None),
+                           cdu_feature_file=paths.get('cdu_features', None),
+                           corpus_path=paths.get('corpus', None),
+                           # end WIP
+                           file_split='corpus',  # WIP
                            verbose=True)
     return mpack
 
 
 def _init_corpus(hconf):
     """Start evaluation; generate folds if needed
 
-    :rtype: DataConfig or None
+    Parameters
+    ----------
+    hconf: Harness
+        Harness configuration for the experiment.
+
+    Returns
+    -------
+    dconf: DataConfig or None
+        Data configuration
     """
     can_skip_folds = fp.exists(hconf.fold_file)
     msg_skip_folds = ('Skipping generation of fold files '
@@ -281,8 +321,8 @@ def evaluate_corpus(hconf):
 
     dconf = _init_corpus(hconf)
     if hconf.runcfg.stage in [None, ClusterStage.main]:
-        foldset = hconf.runcfg.folds if hconf.runcfg.folds is not None\
-            else frozenset(dconf.folds.values())
+        foldset = (hconf.runcfg.folds if hconf.runcfg.folds is not None
+                   else frozenset(dconf.folds.values()))
         for fold in foldset:
             do_fold(hconf, dconf, fold)
 

diff --git a/attelo/harness/example.py b/attelo/harness/example.py
@@ -59,16 +59,16 @@ class TinyHarness(Harness):
                                      parser=_parser2)]
 
     def __init__(self):
-        self._datadir = mkdtemp()
+        self._basedir = mkdtemp()
         for cpath in glob.glob('doc/example-corpus/*'):
-            shutil.copy(cpath, self._datadir)
+            shutil.copy(cpath, self._basedir)
         super(TinyHarness, self).__init__('tiny', None)
 
     def run(self):
         """Run the evaluation
         """
         runcfg = RuntimeConfig.empty()
-        eval_dir, scratch_dir = prepare_dirs(runcfg, self._datadir)
+        eval_dir, scratch_dir = prepare_dirs(runcfg, self._basedir)
         self.load(runcfg, eval_dir, scratch_dir)
         evaluate_corpus(self)
 
@@ -89,13 +89,12 @@ def mpack_paths(self, _, stripped=False):
         The 2nd argument denoted by '_' is test_data, which is unused in
         this example.
         """
-        core_path = fp.join(self._datadir, 'tiny')
-        return {
-            'edu_input': core_path + '.edus',
-            'pairings': core_path + '.pairings',
-            'features': core_path + '.features.sparse',
-            'vocab': core_path + '.features.sparse.vocab'
-        }
+        core_path = fp.join(self._basedir, 'data', 'tiny')
+        return {'edu_input': core_path + '.edus',
+                'pairings': core_path + '.pairings',
+                'features': core_path + '.features.sparse',
+                'vocab': core_path + '.features.sparse.vocab',
+                'labels': core_path + '.labels'}
 
     def _model_basename(self, rconf, mtype, ext):
         "Basic filename for a model"

diff --git a/attelo/harness/graph.py b/attelo/harness/graph.py
@@ -50,20 +50,19 @@ def _mk_econf_graphs(hconf, edus, gold, econf, fold):
             raise Exception('Unknown diff mode {}'.format(diffmode))
 
         want_test = fold is None
-        suffix = 'test' if want_test\
-            else fp.basename(hconf.fold_dir_path(fold))
+        suffix = ('test' if want_test
+                  else fp.basename(hconf.fold_dir_path(fold)))
         output_dir = fp.join(hconf.report_dir_path(want_test, None),
                              output_bn_prefix + suffix,
                              econf.key)
 
         # settings
         to_hide = 'inter' if diffmode == GraphDiffMode.diff_intra else None
-        settings =\
-            GraphSettings(hide=to_hide,
-                          select=hconf.graph_docs,
-                          unrelated=False,
-                          timeout=15,
-                          quiet=False)
+        settings = GraphSettings(hide=to_hide,
+                                 select=hconf.graph_docs,
+                                 unrelated=False,
+                                 timeout=15,
+                                 quiet=False)
 
         if diffmode == GraphDiffMode.solo:
             yield delayed(graph_all)(edus,
@@ -84,12 +83,11 @@ def _mk_gold_graphs(hconf, dconf):
     output_dir = fp.join(hconf.report_dir_path(None),
                          'graphs-gold')
 
-    settings =\
-        GraphSettings(hide=None,
-                      select=hconf.graph_docs,
-                      unrelated=False,
-                      timeout=15,
-                      quiet=True)
+    settings = GraphSettings(hide=None,
+                             select=hconf.graph_docs,
+                             unrelated=False,
+                             timeout=15,
+                             quiet=True)
 
     predictions = to_predictions(dconf.pack)
     edus = concat_l(dpack.edus for dpack in dconf.pack.values())

diff --git a/attelo/harness/interface.py b/attelo/harness/interface.py
@@ -201,8 +201,9 @@ def mpack_paths(self, test_data, stripped=False):
         Usual keys are:
         * edu_input
         * pairings
         * features
         * vocab
+        * labels
 
         Parameters
         ----------

diff --git a/attelo/harness/report.py b/attelo/harness/report.py
@@ -309,7 +309,11 @@ def full_report(mpack, fold_dict, slices, metrics,
             edge_count[key].append(score_edges(fpack, predictions))
         # * on constituency tree spans
         if 'cspans' in metrics:
-            sc_cspans = score_cspans(dpacks, dpredictions)
+            try:
+                sc_cspans = score_cspans(dpacks, dpredictions)
+            except Exception:
+                print('Error in slice configuration', key)
+                raise
             cspan_count[key].append(sc_cspans)
         # * on EDUs
         if 'edus' in metrics: