From f55be6e7044b861ae129d276719d749fdb745d49 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 27 Jun 2016 17:21:16 +0200 Subject: [PATCH 01/74] WIP add comparative evaluations for different parsers --- evals/__init__.py | 0 evals/codra.py | 110 +++++++++++++++++++ evals/li2014.py | 117 ++++++++++++++++++++ evals/ours.py | 259 +++++++++++++++++++++++++++++++++++++++++++++ evals/showdown.py | 86 +++++++++++++++ evals/utils_wip.py | 248 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 820 insertions(+) create mode 100644 evals/__init__.py create mode 100644 evals/codra.py create mode 100644 evals/li2014.py create mode 100644 evals/ours.py create mode 100644 evals/showdown.py create mode 100644 evals/utils_wip.py diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/codra.py b/evals/codra.py new file mode 100644 index 0000000..800bdca --- /dev/null +++ b/evals/codra.py @@ -0,0 +1,110 @@ +"""Use the same evaluation procedure Evaluate the output of CODRA + +""" + +from __future__ import print_function + +import itertools +import os + +from educe.rst_dt.annotation import SimpleRSTTree, _binarize +from educe.rst_dt.codra import load_codra_output_files +from educe.rst_dt.corpus import (Reader as RstReader, + RstRelationConverter as RstRelationConverter) +from educe.rst_dt.deptree import RstDepTree + +from attelo.metrics.constituency import (parseval_detailed_report, + parseval_report) +from attelo.metrics.deptree import compute_uas_las + + +# RST corpus +CORPUS_DIR = os.path.abspath(os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '..', 'corpus', + 'RSTtrees-WSJ-main-1.0/')) +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree + + +def eval_codra_output(codra_out_dir): + """Load and evaluate the .dis files output by CODRA. + + This currently runs on the document-level files (.doc_dis). 
+ """ + # load reference trees + dtree_true = dict() # dependency trees + ctree_true = dict() # constituency trees + # FIXME: find ways to read the right (not necessarily TEST) section + # and only the required documents + rst_reader = RstReader(CD_TEST) + rst_corpus = rst_reader.slurp() + + for doc_id, rtree_true in sorted(rst_corpus.items()): + doc_name = doc_id.doc + + # transform into binary tree with coarse-grained labels + coarse_rtree_true = REL_CONV(rtree_true) + bin_rtree_true = _binarize(coarse_rtree_true) + ctree_true[doc_name] = bin_rtree_true + + # transform into dependency tree via SimpleRSTTree + bin_srtree_true = SimpleRSTTree.from_rst_tree(coarse_rtree_true) + dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) + dtree_true[doc_name] = dt_true + + # load predicted trees + data_pred = load_codra_output_files(codra_out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # gather predictions + dtree_pred = dict() # dependency trees + ctree_pred = dict() # constituency trees + + for doc_name, rst_ctree in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels + # 2016-06-27 useless, the files we have already contain the coarse + # labels + coarse_rtree_pred = REL_CONV(rst_ctree) + ctree_pred[doc_name] = coarse_rtree_pred + + # dependency tree + # conversion via SimpleRSTTree to RstDepTree + bin_srtree_pred = SimpleRSTTree.from_rst_tree(coarse_rtree_pred) + dt_pred = RstDepTree.from_simple_rst_tree(bin_srtree_pred) + dtree_pred[doc_name] = dt_pred + + # compare pred and true + common_doc_names = set(dtree_true.keys()) & set(dtree_pred.keys()) + + # dep scores + dtree_true_list = [dt for doc_name, dt in sorted(dtree_true.items()) + if doc_name in common_doc_names] + dtree_pred_list = [dt for doc_name, dt in sorted(dtree_pred.items()) + if doc_name in common_doc_names] + + score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, + dtree_pred_list) + print('UAS / LAS / LS : {:.4f} / {:.4f} / {:.4f}'.format( + score_uas, score_las, score_ls)) + + skipped_docs = set() + # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where + # needed + ctree_true = [ct for doc_name, ct in sorted(ctree_true.items()) + if doc_name not in skipped_docs] + ctree_pred = [ct for doc_name, ct in sorted(ctree_pred.items()) + if doc_name not in skipped_docs] + # compute and print PARSEVAL scores + print(parseval_report(ctree_true, ctree_pred, digits=4)) + # detailed report on S+N+R + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) diff --git a/evals/li2014.py b/evals/li2014.py new file mode 100644 index 0000000..d8c02a5 --- /dev/null +++ b/evals/li2014.py @@ -0,0 +1,117 @@ +"""Evaluation procedure used in the parser of (Li et al. 2014). + +This is a reimplementation of this evaluation procedure. +""" + +# FIXME legacy code brutally dumped here, broken +def twisted_eval_li2014(data_true, data_pred): + """Run Parseval on transformed gold trees, as in (Li et al., 2014). + + This applies a deterministic transform to the gold constituency tree + that basically re-orders attachments of a head EDU. + """ + # 1. ctrees_true -> dtrees_true or dtrees_twis (if the procedure + # is fishy) + # 2. dtrees_[true|twis] -> ctrees_twis + # RESUME HERE + # hint: ctrees_twis contain only NS nuclearity (...) 
+ + # TODO check exact conformance with the code of their parser: + # how rank and nuclearity are determined + data_true['rst_ctrees'] = [] + for dt_true in data_true['rst_dtrees']: + # FIXME map EDUs to sentences + dt_true.sent_idx = [edu_id2sent_idx[e.identifier()] + for e in dt_true.edus] + # TODO check that 'lllrrr' effectively corresponds to the strategy + # they apply + chn_bin_srtree_true = deptree_to_simple_rst_tree( + dt_true, MULTINUC_LBLS, strategy='lllrrr') + chn_bin_rtree_true = SimpleRSTTree.to_binary_rst_tree( + chn_bin_srtree_true) + bin_rtree_true = chn_bin_rtree_true + data_true['rst_ctrees'].append(bin_rtree_true) +# end FIXME + + +# FIXME currently broken, need to declare and fit classifiers for nuc and rank +# (nuc_classifier and rank_classifier) +# TODO move to ? +def eval_distortion_gold(corpus, nuc_strategy, rank_strategy, + prioritize_same_unit): + """Load an RstDepTree from the output of attelo. + + Parameters + ---------- + corpus: string + Path to the gold corpus to be evaluated + nuc_strategy: string + Strategy to predict nuclearity + rank_strategy: string + Strategy to predict attachment ranking + """ + # print parameters + print('corpus: {}\tnuc_strategy: {}\trank_strategy: {}'.format( + corpus, nuc_strategy, rank_strategy)) + + gold_orig = dict() + gold_twis = dict() + + # FIXME: find ways to read the right (not necessarily TEST) section + # and only the required documents + rst_reader = RstReader(corpus) + rst_corpus = rst_reader.slurp() + for doc_id, rtree_ref in sorted(rst_corpus.items()): + doc_name = doc_id.doc + + # original gold + # convert labels to coarse + coarse_rtree_ref = REL_CONV(rtree_ref) + # convert to binary tree + bin_rtree_ref = _binarize(coarse_rtree_ref) + gold_orig[doc_name] = bin_rtree_ref + + # distorted gold: forget nuclearity and order of attachment + # convert to RstDepTree via SimpleRSTTree + bin_srtree_ref = SimpleRSTTree.from_rst_tree(coarse_rtree_ref) + dt_ref = RstDepTree.from_simple_rst_tree(bin_srtree_ref) + # FIXME replace gold nuclearity and rank with predicted ones, + # using the given heuristics + # dt_ref.nucs = nuc_classifier.predict([dt_ref])[0] + # dt_ref.ranks = rank_classifier.predict([dt_ref])[0] + # end FIXME + # regenerate a binary RST tree + chn_bin_srtree_ref = deptree_to_simple_rst_tree(dt_ref) + chn_bin_rtree_ref = SimpleRSTTree.to_binary_rst_tree( + chn_bin_srtree_ref) + gold_twis[doc_name] = chn_bin_rtree_ref + + print(parseval_report(gold_orig, gold_twis, + metric_types=[x[0] for x in LBL_FNS], + digits=4)) + # detailed report on S+N+R + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) + + +def comparative_distortion_on_gold(): + """Evaluate the impact of forgetting nuclearity and rank in the gold. + + Quantify the distortion and loss when forgetting nuclearity and rank + in the gold and replacing them with deterministically-determined + values. + + Possible configurations are the cross-product of strategies to + heuristically determine rank and nuclearity. + """ + gold_corpus = CD_TRAIN # CD_TEST + nuc_strats = ["most_frequent_by_rel", + "unamb_else_most_frequent"] + rank_strats = ['lllrrr', + 'rrrlll', + 'lrlrlr', + 'rlrlrl'] + prioritize_same_units = [True, False] + for nuc_strat in nuc_strats: + for rank_strat in rank_strats: + eval_distortion_gold(gold_corpus, nuc_strat, rank_strat) diff --git a/evals/ours.py b/evals/ours.py new file mode 100644 index 0000000..750dd09 --- /dev/null +++ b/evals/ours.py @@ -0,0 +1,259 @@ +"""Evaluate our parsers. 
+ +""" + +from __future__ import print_function + +from collections import defaultdict +import os + +from educe.annotation import Span as EduceSpan +from educe.rst_dt.annotation import (EDU as EduceEDU, + SimpleRSTTree, _binarize) +from educe.rst_dt.corpus import (Reader as RstReader, + RstRelationConverter as RstRelationConverter) +from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, + DummyNuclearityClassifier, + InsideOutAttachmentRanker) +from educe.rst_dt.deptree import RstDepTree, RstDtException +# +from attelo.io import load_edus +from attelo.metrics.constituency import (parseval_detailed_report, + parseval_report) +from attelo.metrics.deptree import compute_uas_las +from attelo.table import UNRELATED # for load_attelo_output_file + + +# RST corpus +CORPUS_DIR = os.path.abspath(os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '..', 'corpus', + 'RSTtrees-WSJ-main-1.0/')) +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree + + +# move to attelo.datasets.attelo_out_format +def load_attelo_output_file(output_file): + """Load edges from an attelo output file. + + An attelo output file typically contains edges from several + documents. This function indexes edges by the name of their + document. + + Parameters + ---------- + output_file: string + Path to the attelo output file + + Returns + ------- + edges_pred: dict(string, [(string, string, string)]) + Predicted edges for each document, indexed by doc name + + Notes + ----- + See `attelo.io.load_predictions` that is almost equivalent to this + function. They are expected to converge some day into a better, + obvious in retrospect, function. + """ + edges_pred = defaultdict(list) + with open(output_file) as f: + for line in f: + src_id, tgt_id, lbl = line.strip().split('\t') + if lbl != UNRELATED: + # dirty hack: get doc name from EDU id + # e.g. (EDU id = wsj_0601_1) => (doc id = wsj_0601) + doc_name = tgt_id.rsplit('_', 1)[0] + edges_pred[doc_name].append((src_id, tgt_id, lbl)) + + return edges_pred + + +def load_deptrees_from_attelo_output(output_file, edus_file, + nuc_strategy, rank_strategy, + prioritize_same_unit=True, + skpd_docs=None): + """Load an RstDepTree from the output of attelo. 
+ + Parameters + ---------- + output_file: string + Path to the file that contains attelo's output + nuc_strategy: string + Strategy to predict nuclearity + rank_strategy: string + Strategy to predict attachment ranking + skpd_docs: set(string) + Names of documents that should be skipped to compute scores + + Returns + ------- + skipped_docs: set(string) + Names of documents that have been skipped to compute scores + """ + # load reference trees + dtree_true = dict() # dependency trees + ctree_true = dict() # constituency trees + # FIXME: find ways to read the right (not necessarily TEST) section + # and only the required documents + rst_reader = RstReader(CD_TEST) + rst_corpus = rst_reader.slurp() + for doc_id, rtree_true in sorted(rst_corpus.items()): + doc_name = doc_id.doc + + # transform into binary tree with coarse-grained labels + coarse_rtree_true = REL_CONV(rtree_true) + bin_rtree_true = _binarize(coarse_rtree_true) + ctree_true[doc_name] = bin_rtree_true + + # transform into dependency tree via SimpleRSTTree + bin_srtree_true = SimpleRSTTree.from_rst_tree(coarse_rtree_true) + dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) + dtree_true[doc_name] = dt_true + + # USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS + # load predicted trees + dtree_pred = dict() # predicted dtrees + ctree_pred = dict() # predicted ctrees + # load EDUs as they are known to attelo (sigh) + # and predicted edges on these EDUs + att_edus = load_edus(edus_file) + edges_pred = load_attelo_output_file(output_file) + # rebuild educe EDUs from their attelo description + # and group them by doc_name + educe_edus = defaultdict(list) + edu2sent_idx = defaultdict(dict) + gid2num = dict() + for att_edu in att_edus: + # doc name + doc_name = att_edu.grouping + # EDU info + edu_num = int(att_edu.id.rsplit('_', 1)[1]) + edu_span = EduceSpan(att_edu.start, att_edu.end) + edu_text = att_edu.text + educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text)) + # map global id of EDU to num of EDU inside doc + gid2num[att_edu.id] = edu_num + # map EDU to sentence + sent_idx = int(att_edu.subgrouping.split('_sent')[1]) + edu2sent_idx[doc_name][edu_num] = sent_idx + # sort EDUs by num + educe_edus = {doc_name: sorted(edus, key=lambda e: e.num) + for doc_name, edus in educe_edus.items()} + # rebuild educe-style edu2sent ; prepend 0 for the fake root + doc_name2edu2sent = {doc_name: ([0] + + [edu2sent_idx[doc_name][e.num] + for e in doc_educe_edus]) + for doc_name, doc_educe_edus in educe_edus.items()} + + # re-build predicted trees from predicted edges and educe EDUs + skipped_docs = set() # docs skipped because non-projective structures + + # classifiers for nuclearity and ranking + # FIXME declare, fit and predict upstream... 
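A sketch of the "upstream" fitting the FIXME above asks for: the same fitting as the stopgap code below, but run once on the TRAINING section rather than on the gold trees of the evaluated documents, using only names already imported in this module.

# sketch: fit the deterministic nuclearity / rank predictors on TRAINING
def fit_nuc_rank_on_train(nuc_strategy, rank_strategy, prioritize_same_unit):
    rst_corpus = RstReader(CD_TRAIN).slurp()
    X_train, y_nuc_train, y_rank_train = [], [], []
    for doc_id, rtree in sorted(rst_corpus.items()):
        # coarse labels, then dependency tree via SimpleRSTTree
        coarse_rtree = REL_CONV(rtree)
        dt = RstDepTree.from_simple_rst_tree(
            SimpleRSTTree.from_rst_tree(coarse_rtree))
        X_train.append(dt)
        y_nuc_train.append(dt.nucs)
        y_rank_train.append(dt.ranks)
    nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy)
    nuc_clf.fit(X_train, y_nuc_train)
    rank_clf = InsideOutAttachmentRanker(
        strategy=rank_strategy,
        prioritize_same_unit=prioritize_same_unit)
    rank_clf.fit(X_train, y_rank_train)
    return nuc_clf, rank_clf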
+ X_train = [] + y_nuc_train = [] + y_rank_train = [] + for doc_name, dt in sorted(dtree_true.items()): + X_train.append(dt) + y_nuc_train.append(dt.nucs) + y_rank_train.append(dt.ranks) + # nuclearity + nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy) + nuc_classifier.fit(X_train, y_nuc_train) + # ranking classifier + rank_classifier = InsideOutAttachmentRanker( + strategy=rank_strategy, + prioritize_same_unit=prioritize_same_unit) + rank_classifier.fit(X_train, y_rank_train) + + # rebuild RstDepTrees + for doc_name, es_pred in sorted(edges_pred.items()): + # get educe EDUs + doc_educe_edus = educe_edus[doc_name] + # create pred dtree + dt_pred = RstDepTree(doc_educe_edus) + for src_id, tgt_id, lbl in es_pred: + if src_id == 'ROOT': + if lbl == 'ROOT': + dt_pred.set_root(gid2num[tgt_id]) + else: + raise ValueError('Weird root label: {}'.format(lbl)) + else: + dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) + # NEW add nuclearity: heuristic baseline + if True: + dt_pred.nucs = nuc_classifier.predict([dt_pred])[0] + else: # EXPERIMENTAL use gold nuclearity + dt_pred.nucs = dtree_true[doc_name].nucs + # NEW add rank: some strategies require a mapping from EDU to sentence + # EXPERIMENTAL attach array of sentence index for each EDU in tree + edu2sent = doc_name2edu2sent[doc_name] + dt_pred.sent_idx = edu2sent + # end EXPERIMENTAL + if False: # DEBUG + print(doc_name) + dt_pred.ranks = rank_classifier.predict([dt_pred])[0] + # end NEW + dtree_pred[doc_name] = dt_pred + + # create pred ctree + try: + bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) + if False: # EXPERIMENTAL + # currently False to run on output that already has + # labels embedding nuclearity + bin_srtree_pred = SimpleRSTTree.incorporate_nuclearity_into_label( + bin_srtree_pred) + bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) + ctree_pred[doc_name] = bin_rtree_pred + except RstDtException as rst_e: + print(rst_e) + skipped_docs.add(doc_name) + if False: + print('\n'.join('{}: {}'.format(edu.text_span(), edu) + for edu in educe_edus[doc_name])) + # raise + # end USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS + + # compare gold with pred on doc_names + common_doc_names = set(dtree_true.keys()) & set(dtree_pred.keys()) + + # dep scores + dtree_true_list = [dt for doc_name, dt in sorted(dtree_true.items()) + if doc_name in common_doc_names] + dtree_pred_list = [dt for doc_name, dt in sorted(dtree_pred.items()) + if doc_name in common_doc_names] + + score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, + dtree_pred_list) + print('UAS / LAS / LS : {:.4f} / {:.4f} / {:.4f}'.format( + score_uas, score_las, score_ls)) + + # compute and print PARSEVAL scores + if skipped_docs: + print('Skipped {} docs over {}'.format(len(skipped_docs), + len(edges_pred))) + # also skip docs passed as argument + if skpd_docs is not None: + skipped_docs |= skpd_docs + # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where + # needed + ctree_true = [ct for doc_name, ct in sorted(ctree_true.items()) + if doc_name not in skipped_docs] + ctree_pred = [ct for doc_name, ct in sorted(ctree_pred.items()) + if doc_name not in skipped_docs] + + print(parseval_report(ctree_true, ctree_pred, + digits=4)) + # detailed report on S+N+R + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) + + return skipped_docs diff --git a/evals/showdown.py b/evals/showdown.py new file mode 100644 index 0000000..dfc81ee --- /dev/null +++ b/evals/showdown.py 
@@ -0,0 +1,86 @@ +"""This module evaluates the output of discourse parsers. + +Included are dependency and constituency tree metrics. +""" + +from __future__ import print_function + +import os + +# from educe.rst_dt.annotation import RSTTree, SimpleRSTTree, _binarize +from educe.rst_dt.corpus import RstRelationConverter # , Reader as RstReader + +# from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree) +# from educe.rst_dt.deptree import (RstDepTree, RstDtException) +# +# from attelo.metrics.constituency import (LBL_FNS, parseval_detailed_report, +# parseval_report) +# local to this package +from evals.codra import eval_codra_output +from evals.ours import load_deptrees_from_attelo_output + + +# RST corpus +CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.0/') +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree + + +# +# EVALUATIONS +# + +# * syntax: pred vs gold +EDUS_FILE = os.path.join('/home/mmorey/melodi', + 'irit-rst-dt/TMP/syn_gold_coarse', + 'TEST.relations.sparse.edu_input') +# outputs of parsers +EISNER_OUT_SYN_PRED = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/syn_pred_coarse', # lbl + 'scratch-current/combined', + 'output.maxent-iheads-global-AD.L-jnt-eisner') + +EISNER_OUT_SYN_GOLD = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/syn_gold_coarse', # lbl + 'scratch-current/combined', + 'output.maxent-iheads-global-AD.L-jnt-eisner') + +CODRA_OUT_DIR = '/home/mmorey/melodi/joty/Doc-level' + + + +# FIXME load gold trees here once and for all, pass them to each +# evaluation + +print('CODRA (Joty)') +eval_codra_output(CODRA_OUT_DIR) +print('=======================') + +print('Eisner, predicted syntax') +load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_strategy="unamb_else_most_frequent", + # nuc_strategy="most_frequent_by_rel", + rank_strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True) +print('======================') + +print('Eisner, gold syntax') +load_deptrees_from_attelo_output(EISNER_OUT_SYN_GOLD, EDUS_FILE, + nuc_strategy="unamb_else_most_frequent", + # nuc_strategy="most_frequent_by_rel", + rank_strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True) +print('======================') + + +# TODO use nuclearity classifier +# starting with baseline: DummyNuclearityClassifier, that assigns to each +# EDU the most frequent nuclearity of its (incoming) relation in the +# training corpus, i.e. 'S' for 'NS', 'N' for 'NN' diff --git a/evals/utils_wip.py b/evals/utils_wip.py new file mode 100644 index 0000000..bd1d1d0 --- /dev/null +++ b/evals/utils_wip.py @@ -0,0 +1,248 @@ +"""Various utility functions that are WIP. + +These functions are expected to move to educe or attelo when they +are mature. 
+""" + +from __future__ import print_function + +import os +import sys + +from educe.rst_dt.annotation import RSTTree +from educe.rst_dt.corpus import Reader as RstReader +from educe.rst_dt.dep2con import deptree_to_simple_rst_tree +from educe.rst_dt.deptree import RstDepTree, RstDtException +# +from evals.ours import load_attelo_output_file + + +# RST corpus +CORPUS_DIR = os.path.abspath(os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '..', 'corpus', + 'RSTtrees-WSJ-main-1.0/')) +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') + +# move to educe.rst_dt.datasets.rst_dis_format +STR_ROOT = '{nuc} (span {edu_span})' +STR_NODE = '{nuc} (span {edu_span}) (rel2par {rel})' +STR_LEAF = '{nuc} (leaf {edu_num}) (rel2par {rel}) (text _!{edu_txt}_!)' + + +def _str_node(tree): + """String for the top node of an RSTTree + + Parameters + ---------- + tree: educe.rst_dt.annotation.RSTTree + The tree whose top node we want to print + """ + node = tree.label() + # get fields + nuc = node.nuclearity + edu_span = node.edu_span + rel = node.rel + # leaf (in reality, we are at the pre-terminal) + if len(tree) == 1: + # get text from the real leaf (EDU) + txt = tree[0].text() + node_str = STR_LEAF.format(nuc=nuc, edu_num=edu_span[0], + rel=rel, edu_txt=txt) + # internal node + else: + edu_span_str = '{} {}'.format(str(edu_span[0]), str(edu_span[1])) + node_str = STR_NODE.format(nuc=nuc, edu_span=edu_span_str, + rel=rel) + + return node_str + + +def tree_str_gen(tree): + """Return a generator of strings, one per tree node""" + # init tree stack with the whole tree, nesting level 0 + tree_stack = [(tree, 0)] + + while tree_stack: + tree, lvl = tree_stack.pop() + yield '{lw}{node_str}'.format(lw=' ' * lvl, + node_str=_str_node(tree)) + tree_stack.extend(reversed([(subtree, lvl + 1) for subtree in tree + if isinstance(subtree, RSTTree)])) + # RESUME HERE: add opening (easy) and closing (trickier) parentheses + # TODO do not print relation (None) for ROOT + + +def _dump_rst_dis_file(out_file, ct_pred): + """Actually do dump. + + Parameters + ---------- + out_file: File + Output file + + ct_pred: RSTTree + Binary RST tree + """ + res_str = '\n'.join(tree_str_gen(ct_pred)) # or str(ct_pred) ? + out_file.write(res_str) + + +def dump_rst_dis_file(out_file, ctree): + """Dump a binary RST tree to a file. + + Parameters + ---------- + out_file: string + Path to the output file + + ctree: RSTTree + Binary RST tree + """ + with open(out_file, 'w') as f: + _dump_rst_dis_file(f, ctree) +# end educe.rst_dt.datasets.rst_dis_format + + +# move to educe.rst_dt.datasets.dep_dis_format ? +def dump_dep_dis_file(out_file, dtree): + """Dump a (RST) dependency tree to a file. + + Parameters + ---------- + out_file: string + Path to the output file + + dtree: RstDepTree + RST dependency tree + """ + with open(out_file, 'w') as f: + res = '\n'.join('{}\t{}'.format(hd, lbl) + for hd, lbl in zip(dtree.heads, dtree.labels)) + f.write(res) +# end attelo.datasets.dep_dis_format + + +# move to educe.rst_dt.attelo_out_format +# +# this function is only called by `convert_attelo_output_file_to_dis_files` +# +# FIXME: find ways to read the right (not necessarily TEST) section +# and only the required documents +def load_trees_from_attelo_output_file(att_output_file): + """Load predicted RST trees from attelo's output file. 
+ + Parameters + ---------- + att_output_file: string + Path to the file that contains attelo's output + + Returns + ------- + ctrees_pred: dict(string, SimpleRSTTree) + Predicted SimpleRSTTree for each document, indexed by its name + """ + # get predicted tree for each doc + # these currently come in the form of edges on attelo EDUs + edges_pred = load_attelo_output_file(att_output_file) + + # get educe EDUs + edus = dict() + # FIXME: parameterize this, cf. function-wide FIXME above + rst_reader = RstReader(CD_TEST) + rst_corpus = rst_reader.slurp() + for doc_id, rtree_true in sorted(rst_corpus.items()): + doc_name = doc_id.doc + edus[doc_name] = rtree_true.leaves() + + # re-build predicted trees from predicted edges and educe EDUs + dtree_pred = dict() # predicted dtrees + ctree_pred = dict() # predicted ctrees + skipped_docs = set() # docs skipped because non-projective structures + for doc_name, es_pred in sorted(edges_pred.items()): + # map from EDU id to EDU num + # EDU id should be common to educe and attelo + id2num = {edu.identifier(): edu.num for edu in edus[doc_name]} + # create pred dtree + dt_pred = RstDepTree(edus[doc_name]) + for src_id, tgt_id, lbl in es_pred: + if src_id == 'ROOT': + if lbl == 'ROOT': + dt_pred.set_root(id2num[tgt_id]) + else: + raise ValueError('Weird root label: {}'.format(lbl)) + else: + dt_pred.add_dependency(id2num[src_id], id2num[tgt_id], lbl) + dtree_pred[doc_name] = dt_pred + # create pred ctree + try: + ctree_pred[doc_name] = deptree_to_simple_rst_tree(dt_pred) + except RstDtException: + skipped_docs.add(doc_name) + if False: + print('\n'.join('{}: {}'.format(edu.text_span(), edu) + for edu in edus[doc_name])) + # raise + if skipped_docs: + print('Skipped {} docs over {}'.format(len(skipped_docs), + len(edges_pred))) + + return ctree_pred +# end educe.rst_dt.attelo_out_format + + +# move to educe.datasets.rst_dis_format +def convert_attelo_output_file_to_dis_files(output_dir, att_output_file): + """Convert attelo's output file to a set of dis files in output_dir. + + Parameters + ---------- + output_dir: string + Path of the directory for the dis files + output_file: string + Path to the file that contains attelo's output + + Returns + ------- + ctrees_pred: dict(string, SimpleRSTTree) + Predicted SimpleRSTTree for each document, indexed by its name + """ + if not os.path.exists(output_dir): + raise ValueError('Absent path: {}'.format(output_dir)) + + ctree_pred = load_trees_from_attelo_output_file(att_output_file) + # output each SimpleRSTTree to a dis file + for doc_name, ct_pred in ctree_pred.items(): + out_fname = os.path.join(output_dir, doc_name + '.dis') + dump_rst_dis_file(out_fname, ct_pred) + # DEBUG + sys.exit() +# end educe.datasets.rst_dis_format + + +# ?? +def load_gold(): + """Load gold structures from RST-WSJ/TEST. + + Returns + ------- + data: dictionary that should be akin to a sklearn Bunch, + with interesting keys 'filenames', 'doc_names', 'rst_ctrees', + 'rst_dtrees'. + """ + # TODO make this the only place where the gold is loaded + # shared between evals of both CODRA and attelo's outputs + filenames = [] # TODO + # load doc names and reference trees + rst_reader = RstReader(CD_TEST) + rst_corpus = rst_reader.slurp() + doc_names = [] + rst_ctrees = [] + for doc_id, rst_ctree in sorted(rst_corpus.items(), + key=lambda kv: kv[0].doc): + doc_names.append(doc_id.doc) + rst_ctrees.append(rst_ctree) + # RESUME HERE (or not) + raise NotImplementedError +# end ?? 
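The load_gold stub above ends on a NotImplementedError. A hedged sketch of a completion, consistent with how the other eval modules build their gold structures (coarse labels first, then dependency trees via SimpleRSTTree), could look like this; the fine-to-coarse converter is passed in because this module does not define REL_CONV itself.

from educe.rst_dt.annotation import SimpleRSTTree

def load_gold_sketch(rel_conv):
    """Sketch: gold test structures as the Bunch-like dict promised above.

    rel_conv is a fine-to-coarse label converter such as REL_CONV in
    evals/ours.py or evals/codra.py.
    """
    rst_corpus = RstReader(CD_TEST).slurp()
    doc_names, rst_ctrees, rst_dtrees = [], [], []
    for doc_id, rst_ctree in sorted(rst_corpus.items(),
                                    key=lambda kv: kv[0].doc):
        coarse_ctree = rel_conv(rst_ctree)
        doc_names.append(doc_id.doc)
        rst_ctrees.append(coarse_ctree)
        rst_dtrees.append(RstDepTree.from_simple_rst_tree(
            SimpleRSTTree.from_rst_tree(coarse_ctree)))
    return {'filenames': [],  # left as a TODO in the stub above
            'doc_names': doc_names,
            'rst_ctrees': rst_ctrees,
            'rst_dtrees': rst_dtrees}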
From 45709b001dd9ede90821e7775706f616c82e2d0e Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 1 Jul 2016 23:19:24 +0200 Subject: [PATCH 02/74] WIP special processing for same_unit --- irit_rst_dt/config/common.py | 23 +++++++++++++++++++++++ irit_rst_dt/harness.py | 3 ++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/irit_rst_dt/config/common.py b/irit_rst_dt/config/common.py index 54a6f2e..f224581 100644 --- a/irit_rst_dt/config/common.py +++ b/irit_rst_dt/config/common.py @@ -14,6 +14,8 @@ from attelo.learning.oracle import (AttachOracle, LabelOracle) from attelo.parser.full import (JointPipeline, PostlabelPipeline) +from attelo.parser.same_unit import (JointSameUnitPipeline, + SklearnSameUnitClassifier) def combined_key(*variants): @@ -115,6 +117,27 @@ def mk_joint(klearner, kdecoder): parser=Keyed(parser_key, parser)) +def mk_joint_su(klearner, kdecoder): + "return a joint decoding parser config with same-unit" + settings = _core_settings('AD.L-jnt_su', klearner) + parser_key = combined_key(settings, kdecoder) + key = combined_key(klearner, parser_key) + # su: use same kind of learner as "attach" + parser = JointSameUnitPipeline( + learner_attach=klearner.attach.payload, + learner_label=klearner.label.payload, + learner_su=( + SklearnSameUnitClassifier(klearner.attach.payload._learner) + if not isinstance(klearner.attach.payload, AttachOracle) + else klearner.attach.payload + ), + decoder=kdecoder.payload) + return EvaluationConfig(key=key, + settings=settings, + learner=klearner, + parser=Keyed(parser_key, parser)) + + def mk_post(klearner, kdecoder): "return a post label parser" settings = _core_settings('AD.L-pst', klearner) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index ef4fd60..eddf7d4 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -203,7 +203,8 @@ def _eval_model_path(subconf, mtype): else: return { 'attach': _eval_model_path(rconf, "attach"), - 'label': _eval_model_path(rconf, "relate") + 'label': _eval_model_path(rconf, "relate"), + 'su': _eval_model_path(rconf, "su"), } # ------------------------------------------------------ From 70385852fbfd893ce6697aaae00c86c7e6f245b3 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 29 Jul 2016 17:28:16 +0200 Subject: [PATCH 03/74] WIP comparative evals, first attempts at same-unit preproc --- evals/codra.py | 107 +++++++++++++++++++++++++++++++-- evals/ours.py | 45 +++++++++++++- evals/showdown.py | 31 ++++++++-- irit_rst_dt/cmd/gather.py | 111 +++++++++++++++++++++++++++-------- irit_rst_dt/config/common.py | 22 +++++++ irit_rst_dt/harness.py | 24 ++++---- irit_rst_dt/local.py | 42 ++++++++++--- 7 files changed, 327 insertions(+), 55 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index 800bdca..17dbacb 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -4,15 +4,23 @@ from __future__ import print_function +from collections import defaultdict import itertools import os +import numpy as np + from educe.rst_dt.annotation import SimpleRSTTree, _binarize from educe.rst_dt.codra import load_codra_output_files from educe.rst_dt.corpus import (Reader as RstReader, RstRelationConverter as RstRelationConverter) +from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, + DummyNuclearityClassifier, + InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree - +from educe.rst_dt.document_plus import align_edus_with_paragraphs +# +from attelo.io import load_edus from attelo.metrics.constituency import (parseval_detailed_report, parseval_report) from 
attelo.metrics.deptree import compute_uas_las @@ -32,7 +40,10 @@ REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree -def eval_codra_output(codra_out_dir): +def eval_codra_output(codra_out_dir, edus_file, + nuc_strategy, rank_strategy, + prioritize_same_unit=True, + detailed=False): """Load and evaluate the .dis files output by CODRA. This currently runs on the document-level files (.doc_dis). @@ -45,6 +56,22 @@ def eval_codra_output(codra_out_dir): rst_reader = RstReader(CD_TEST) rst_corpus = rst_reader.slurp() + # WIP 2016-06-29 sent_idx + att_edus = load_edus(edus_file) + edu2sent_idx = defaultdict(dict) + for att_edu in att_edus: + doc_name = att_edu.grouping + edu_num = int(att_edu.id.rsplit('_', 1)[1]) + sent_idx = int(att_edu.subgrouping.split('_sent')[1]) + edu2sent_idx[doc_name][edu_num] = sent_idx + # sort EDUs by num + # rebuild educe-style edu2sent ; prepend 0 for the fake root + doc_name2edu2sent = {doc_name: ([0] + + [s_idx for e_num, s_idx + in sorted(edu2sent.items())]) + for doc_name, edu2sent in edu2sent_idx.items()} + doc_name2edu2para = dict() + for doc_id, rtree_true in sorted(rst_corpus.items()): doc_name = doc_id.doc @@ -58,6 +85,32 @@ def eval_codra_output(codra_out_dir): dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) dtree_true[doc_name] = dt_true + # WIP 2016-06-29 para_idx + doc_edus = rtree_true.leaves() + doc_txt = doc_edus[0].context._text + # retrieve paragraph idx + doc_paras = doc_edus[0].context.paragraphs + if doc_paras is not None: + edu2para = align_edus_with_paragraphs( + doc_edus, doc_paras, doc_txt) + # yerk: interpolate values in edu2para where missing + edu2para_fix = [] + for edu_idx in edu2para: + if edu_idx is not None: + edu2para_fix.append(edu_idx) + else: + # interpolation strategy: copy the last regular value + # that has been seen + edu2para_fix.append(edu2para_fix[-1]) + edu2para = edu2para_fix + # end yerk: interpolate + edu2para = [0] + list(np.array(edu2para) + 1) + doc_name2edu2para[doc_name] = edu2para + else: + doc_name2edu2para[doc_name] = None + # end retrieve paragraph idx + + # load predicted trees data_pred = load_codra_output_files(codra_out_dir) # filenames = data_pred['filenames'] @@ -106,5 +159,51 @@ def eval_codra_output(codra_out_dir): # compute and print PARSEVAL scores print(parseval_report(ctree_true, ctree_pred, digits=4)) # detailed report on S+N+R - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) + if detailed: + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) + + if False: + # WIP 2016-06-29 use our deterministic classifiers for nuc and rank + # => estimate degradation on Joty's output => hint at ours + # FIXME declare, fit and predict upstream on the training corpus... 
+ # but currently fit is a no-op for both so this horror is in fact safe + X_train = [] + y_nuc_train = [] + y_rank_train = [] + for doc_name, dt in sorted(dtree_true.items()): + X_train.append(dt) + y_nuc_train.append(dt.nucs) + y_rank_train.append(dt.ranks) + # nuclearity + nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy) + nuc_classifier.fit(X_train, y_nuc_train) + # ranking classifier + rank_classifier = InsideOutAttachmentRanker( + strategy=rank_strategy, + prioritize_same_unit=prioritize_same_unit) + rank_classifier.fit(X_train, y_rank_train) + # rebuild ctrees + ctree_pred2 = dict() + for doc_name, dt_pred in sorted(dtree_pred.items()): + # set nuclearity + dt_pred.nucs = nuc_classifier.predict([dt_pred])[0] + # set ranking, needs sent_idx (WIP on para_idx) + edu2sent = doc_name2edu2sent[doc_name] + dt_pred.sent_idx = edu2sent + # 2016-06-28 same for edu2para + edu2para = doc_name2edu2para[doc_name] + dt_pred.para_idx = edu2para + dt_pred.ranks = rank_classifier.predict([dt_pred])[0] + # end NEW + bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) + bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) + ctree_pred2[doc_name] = bin_rtree_pred + # + skipped_docs = set() + ctree_pred2 = [ct for doc_name, ct in sorted(ctree_pred2.items()) + if doc_name not in skipped_docs] + print(parseval_report(ctree_true, ctree_pred2, digits=4)) + if detailed: + print(parseval_detailed_report(ctree_true, ctree_pred2, + metric_type='S+R')) diff --git a/evals/ours.py b/evals/ours.py index 750dd09..156c76a 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -7,6 +7,8 @@ from collections import defaultdict import os +import numpy as np + from educe.annotation import Span as EduceSpan from educe.rst_dt.annotation import (EDU as EduceEDU, SimpleRSTTree, _binarize) @@ -16,6 +18,7 @@ DummyNuclearityClassifier, InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree, RstDtException +from educe.rst_dt.document_plus import align_edus_with_paragraphs # from attelo.io import load_edus from attelo.metrics.constituency import (parseval_detailed_report, @@ -78,6 +81,7 @@ def load_attelo_output_file(output_file): def load_deptrees_from_attelo_output(output_file, edus_file, nuc_strategy, rank_strategy, prioritize_same_unit=True, + detailed=False, skpd_docs=None): """Load an RstDepTree from the output of attelo. 
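The paragraph-index block added to evals/codra.py above is duplicated in the evals/ours.py hunks below, and both copies carry a FIXME about refactoring it to a better place. A shared helper could be as small as this sketch; its only liberty is to fall back to paragraph 0 when the very first alignment value is missing, where the copy-the-last-value trick of the original block would fail.

import numpy as np
from educe.rst_dt.document_plus import align_edus_with_paragraphs

def edu2para_with_fake_root(doc_edus, doc_paras, doc_txt):
    """Return [0] + 1-based paragraph index per EDU, or None without paragraphs."""
    if doc_paras is None:
        return None
    edu2para = align_edus_with_paragraphs(doc_edus, doc_paras, doc_txt)
    edu2para_fix = []
    for para_idx in edu2para:
        if para_idx is not None:
            edu2para_fix.append(para_idx)
        else:
            # interpolation strategy: copy the last regular value seen
            edu2para_fix.append(edu2para_fix[-1] if edu2para_fix else 0)
    # shift by one and prepend 0 for the fake root EDU
    return [0] + list(np.array(edu2para_fix) + 1)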
@@ -97,6 +101,8 @@ def load_deptrees_from_attelo_output(output_file, edus_file, skipped_docs: set(string) Names of documents that have been skipped to compute scores """ + doc_name2edu2para = dict() + # load reference trees dtree_true = dict() # dependency trees ctree_true = dict() # constituency trees @@ -117,6 +123,36 @@ def load_deptrees_from_attelo_output(output_file, edus_file, dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) dtree_true[doc_name] = dt_true + # 2016-06-28 retrieve paragraph idx of each EDU + # FIXME refactor to get in a better way, in a better place + # currently, we take EDUs from the RSTTree and paragraphs from + # the RSTContext, so no left padding in either list ; + # the dtree contains the left padding EDU, so we compute the + # edu2paragraph alignment on real units only, shift by one, + # then prepend 0 + doc_edus = rtree_true.leaves() + doc_paras = doc_edus[0].context.paragraphs + doc_txt = doc_edus[0].context._text + if doc_paras is not None: + edu2para = align_edus_with_paragraphs( + doc_edus, doc_paras, doc_txt) + # yerk: interpolate values in edu2para where missing + edu2para_fix = [] + for edu_idx in edu2para: + if edu_idx is not None: + edu2para_fix.append(edu_idx) + else: + # interpolation strategy: copy the last regular value + # that has been seen + edu2para_fix.append(edu2para_fix[-1]) + edu2para = edu2para_fix + # end yerk: interpolate + edu2para = [0] + list(np.array(edu2para) + 1) + doc_name2edu2para[doc_name] = edu2para + else: + doc_name2edu2para[doc_name] = None + # end retrieve paragraph idx + # USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS # load predicted trees dtree_pred = dict() # predicted dtrees @@ -196,6 +232,10 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # EXPERIMENTAL attach array of sentence index for each EDU in tree edu2sent = doc_name2edu2sent[doc_name] dt_pred.sent_idx = edu2sent + # 2016-06-28 same for edu2para + edu2para = doc_name2edu2para[doc_name] + dt_pred.para_idx = edu2para + # assert len(edu2sent) == len(edu2para) # end EXPERIMENTAL if False: # DEBUG print(doc_name) @@ -253,7 +293,8 @@ def load_deptrees_from_attelo_output(output_file, edus_file, print(parseval_report(ctree_true, ctree_pred, digits=4)) # detailed report on S+N+R - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) + if detailed: + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) return skipped_docs diff --git a/evals/showdown.py b/evals/showdown.py index dfc81ee..14c5a2f 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -46,6 +46,12 @@ 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') +EISNER_OUT_SYN_PRED_SU = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/latest', # lbl + 'scratch-current/combined', + 'output.maxent-AD.L-jnt_su-eisner') + EISNER_OUT_SYN_GOLD = os.path.join( '/home/mmorey/melodi', 'irit-rst-dt/TMP/syn_gold_coarse', # lbl @@ -56,11 +62,18 @@ -# FIXME load gold trees here once and for all, pass them to each -# evaluation +# FIXME: +# * [ ] load gold trees here once and for all, pass them to each evaluation +# * [ ] create summary table with one system per row, one metric per column, +# keep only the f-score (because for binary trees with manual segmentation +# precision = recall = f-score). 
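A hypothetical sketch of the summary table the second FIXME above asks for; it assumes the eval functions are refactored to return their f-scores as a dict of metric name to value (they currently only print reports) and uses the S/N/R metric types mentioned in the detailed reports.

def print_summary_table(scores, metrics=('S', 'N', 'R')):
    """Print one system per row, one metric (f-score only) per column.

    scores maps a system name to a dict of metric name -> f-score.
    """
    print('system'.ljust(28) + ''.join(m.rjust(8) for m in metrics))
    for system, by_metric in sorted(scores.items()):
        print(system.ljust(28)
              + ''.join('{:8.4f}'.format(by_metric[m]) for m in metrics))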
print('CODRA (Joty)') -eval_codra_output(CODRA_OUT_DIR) +eval_codra_output(CODRA_OUT_DIR, EDUS_FILE, + nuc_strategy="unamb_else_most_frequent", + rank_strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True, + detailed=True) print('=======================') print('Eisner, predicted syntax') @@ -68,7 +81,17 @@ nuc_strategy="unamb_else_most_frequent", # nuc_strategy="most_frequent_by_rel", rank_strategy='closest-intra-rl-inter-rl', - prioritize_same_unit=True) + prioritize_same_unit=True, + detailed=True) +print('======================') + +print('Eisner, predicted syntax + same-unit') +load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + nuc_strategy="unamb_else_most_frequent", + # nuc_strategy="most_frequent_by_rel", + rank_strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True, + detailed=True) print('======================') print('Eisner, gold syntax') diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index f7fcba6..e4db2cd 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -6,17 +6,12 @@ """ from __future__ import print_function -from os import path as fp import os from attelo.harness.util import call, force_symlink -from ..local import (TEST_CORPUS, - TRAINING_CORPUS, - PTB_DIR, - FEATURE_SET, - CORENLP_OUT_DIR, - LECSIE_DATA_DIR) +from ..local import (FEATURE_SET, LABEL_SET, TEST_CORPUS, TRAINING_CORPUS, + SAME_UNIT, PTB_DIR, CORENLP_OUT_DIR, LECSIE_DATA_DIR) from ..util import (current_tmp, latest_tmp) NAME = 'gather' @@ -40,16 +35,19 @@ def config_argparser(psr): psr.add_argument('--skip-training', action='store_true', help='only gather test data') - psr.add_argument('--coarse', - action='store_true', - help='use coarse-grained labels') psr.add_argument('--fix_pseudo_rels', - action='store_true', - help='fix pseudo-relation labels') + action='store_true', + help='fix pseudo-relation labels') + # WIP frag pairs + psr.add_argument('--resume-frag-pairs', + action='store_true', + help='resume extraction at frag-pairs') + # end WIP frag pairs psr.set_defaults(func=main) -def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, +def extract_features(corpus, output_dir, fix_pseudo_rels, instances, + frag_edus=None, vocab_path=None, label_path=None): """Extract instances from a corpus, store them in files. @@ -64,10 +62,10 @@ def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, Path to the corpus. output_dir: filepath Path to the output folder. - coarse: boolean, False by default - Use coarse-grained relation labels. fix_pseudo_rels: boolean, False by default Rewrite pseudo-relations to improve consistency (WIP). + instances: one of {'same-unit', 'all-pairs'} + Selection of instances to extract. 
vocab_path: filepath Path to a fixed vocabulary mapping, for feature extraction (needed if extracting test data: the same vocabulary should be @@ -83,6 +81,7 @@ def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, PTB_DIR, # TODO make this optional and exclusive from CoreNLP output_dir, '--feature_set', FEATURE_SET, + '--instances', instances, ] # NEW 2016-05-19 rewrite pseudo-relations if fix_pseudo_rels: @@ -91,7 +90,7 @@ def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, ]) # NEW 2016-05-03 use coarse- or fine-grained relation labels # NB "coarse" was the previous default - if coarse: + if LABEL_SET == 'coarse': cmd.extend([ '--coarse' ]) @@ -103,6 +102,8 @@ def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, cmd.extend([ '--lecsie_data_dir', LECSIE_DATA_DIR, ]) + if frag_edus is not None: + cmd.extend(['--frag-edus', frag_edus]) if vocab_path is not None: cmd.extend(['--vocabulary', vocab_path]) if label_path is not None: @@ -117,22 +118,80 @@ def main(args): You shouldn't need to call this yourself if you're using `config_argparser` """ - if args.skip_training: + if args.skip_training or args.resume_frag_pairs: tdir = latest_tmp() else: tdir = current_tmp() - extract_features(TRAINING_CORPUS, tdir, args.coarse, - args.fix_pseudo_rels) + + fix_pseudo_rels = args.fix_pseudo_rels + + # same-unit + instances = 'same-unit' + su_prefix_train = '{}.{}'.format( + instances, os.path.basename(TRAINING_CORPUS)) + su_train_path = os.path.join(tdir, su_prefix_train) + su_label_path = su_train_path + '.relations.sparse' + su_vocab_path = su_label_path + '.vocab' if TEST_CORPUS is not None: - train_path = fp.join(tdir, fp.basename(TRAINING_CORPUS)) - label_path = train_path + '.relations.sparse' - vocab_path = label_path + '.vocab' - extract_features(TEST_CORPUS, tdir, args.coarse, - args.fix_pseudo_rels, + su_prefix_test = '{}.{}'.format( + instances, os.path.basename(TEST_CORPUS)) + su_test_path = os.path.join(tdir, su_prefix_test) + + if SAME_UNIT in ['joint', 'preproc'] and not args.resume_frag_pairs: + if not args.skip_training: + # * train + extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + instances) + if TEST_CORPUS is not None: + # * test + extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + instances, + vocab_path=su_vocab_path, + label_path=su_label_path) + + # all pairs + instances = 'all-pairs' + if not args.skip_training and not args.resume_frag_pairs: + extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + instances) + # path to the vocab and labelset gathered from the training set, + # we'll use these paths for the test set and for the frag-pairs + prefix_train = '{}.{}'.format( + instances, os.path.basename(TRAINING_CORPUS)) + train_path = os.path.join(tdir, prefix_train) + label_path = train_path + '.relations.sparse' + vocab_path = label_path + '.vocab' + if TEST_CORPUS is not None and not args.resume_frag_pairs: + extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + instances, vocab_path=vocab_path, label_path=label_path) + + # frag pairs: supplementary pairs from/to each fragmented EDU to + # the other fragmented EDUs and the EDUs that don't belong to any + # fragmented EDU + instances = 'frag-pairs' + # we use the vocabulary and labelset from "all-pairs" ; this is the + # simplest solution currently and it seems correct, but maybe we + # could extend "all-pairs" with these pairs when we learn the + # vocabulary? 
+ if not args.skip_training: + frag_edus_train = su_train_path + '.relations' + '.deps_true' + extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + instances, frag_edus=frag_edus_train, + vocab_path=vocab_path, + label_path=label_path) + if TEST_CORPUS is not None: + frag_edus_test = su_test_path + '.relations' + '.deps_true' + extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + instances, frag_edus=frag_edus_test, + vocab_path=vocab_path, + label_path=label_path) + # end frag pairs + with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream: call(["pip", "freeze"], stdout=stream) - if not args.skip_training: + + if not (args.skip_training or args.resume_frag_pairs): latest_dir = latest_tmp() - force_symlink(fp.basename(tdir), latest_dir) + force_symlink(os.path.basename(tdir), latest_dir) diff --git a/irit_rst_dt/config/common.py b/irit_rst_dt/config/common.py index f224581..2bd824f 100644 --- a/irit_rst_dt/config/common.py +++ b/irit_rst_dt/config/common.py @@ -15,6 +15,7 @@ from attelo.parser.full import (JointPipeline, PostlabelPipeline) from attelo.parser.same_unit import (JointSameUnitPipeline, + SameUnitJointPipeline, SklearnSameUnitClassifier) @@ -138,6 +139,27 @@ def mk_joint_su(klearner, kdecoder): parser=Keyed(parser_key, parser)) +def mk_su_joint(klearner, kdecoder): + "return a parser config with same-unit then joint decoding" + settings = _core_settings('su.AD.L-jnt', klearner) + parser_key = combined_key(settings, kdecoder) + key = combined_key(klearner, parser_key) + # su: use same kind of learner as "attach" + parser = SameUnitJointPipeline( + learner_su=( + SklearnSameUnitClassifier(klearner.attach.payload._learner) + if not isinstance(klearner.attach.payload, AttachOracle) + else klearner.attach.payload + ), + learner_attach=klearner.attach.payload, + learner_label=klearner.label.payload, + decoder=kdecoder.payload) + return EvaluationConfig(key=key, + settings=settings, + learner=klearner, + parser=Keyed(parser_key, parser)) + + def mk_post(klearner, kdecoder): "return a post label parser" settings = _core_settings('AD.L-pst', klearner) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index eddf7d4..9c81c1b 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -117,21 +117,18 @@ def mpack_paths(self, test_data, stripped=False): Returns ------- - path_to_edu_input : string - - path_to_pairings : string - - path_to_features : string - - path_to_vocab : string - - corpus_path : string - Path to corpus in order to access gold structures (WIP). + paths: tuple of file paths + Path to: edu_input, pairings, features, vocab, cdu_input, + cdu_pairings, cdu_features, corpus (to access gold + structures, WIP). 
""" - ext = 'relations.sparse' + ext = 'relations.edu-pairs.sparse' # path to data file in the evaluation dir dset = self.testset if test_data else self.dataset core_path = fp.join(self.eval_dir, "%s.%s" % (dset, ext)) + # 2016-07-28 pairs on fragmented EDUs + frag_ext = 'relations.frag-pairs.sparse' + frag_path = fp.join(self.eval_dir, "%s.%s" % (dset, frag_ext)) # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if test_data else TRAINING_CORPUS) @@ -140,6 +137,11 @@ def mpack_paths(self, test_data, stripped=False): core_path + '.pairings', (core_path + '.stripped') if stripped else core_path, core_path + '.vocab', + # fragmented EDUs + frag_path + '.cdu_input', + frag_path + '.pairings', + (frag_path + '.stripped') if stripped else frag_path, + # corpus corpus_path) def model_paths(self, rconf, fold, parser): diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index 4fad6a4..47b3022 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -49,6 +49,7 @@ decoder_last, decoder_local, mk_joint, + mk_joint_su, mk_post) # PATHS @@ -64,8 +65,8 @@ """Results over time we are making a point of saving""" # TRAINING_CORPUS = 'tiny' -# TRAINING_CORPUS = 'corpus/RSTtrees-WSJ-main-1.0/TRAINING' -TRAINING_CORPUS = 'corpus/RSTtrees-WSJ-double-1.0' +TRAINING_CORPUS = 'corpus/RSTtrees-WSJ-main-1.0/TRAINING' +# TRAINING_CORPUS = 'corpus/RSTtrees-WSJ-double-1.0' """Corpora for use in building/training models and running our incremental experiments. Later on we should consider using the held-out test data for something, but let's make a point of @@ -90,10 +91,11 @@ validation on the training data) """ -TEST_EVALUATION_KEY = None +# TEST_EVALUATION_KEY = None # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt-mst' # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt-eisner' -# TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt-eisner' +# TEST_EVALUATION_KEY = 'maxent-AD.L-jnt_su-eisner' +TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt_su-eisner' """Evaluation to use for testing. Leave this to None until you think it's OK to look at the test data. 
@@ -109,8 +111,9 @@ parsed/mrg/wsj) """ -CORENLP_OUT_DIR = None +# CORENLP_OUT_DIR = None # CORENLP_OUT_DIR = '/projets/melodi/corpus/rst-dt-corenlp-2015-01-29' +CORENLP_OUT_DIR = '/home/mmorey/corpora/rst-dt-corenlp-2015-01-29' """ Where to read parses from CoreNLP from """ @@ -126,6 +129,16 @@ Which feature set to use for feature extraction """ +LABEL_SET = 'coarse' # one of {'coarse', 'fine'} or a list of strings +""" +Which label set to use +""" + +SAME_UNIT = 'joint' # one of {'joint', 'preproc', 'no'} +""" +Whether to have a special processing for same-unit +""" + FIXED_FOLD_FILE = None # FIXED_FOLD_FILE = 'folds-TRAINING.json' """ @@ -253,6 +266,19 @@ def _core_parsers(klearner, unique_real_root=True): use_prob=True)), ] ] + # WIP with same-unit + if SAME_UNIT == 'joint': + joint.extend([ + mk_joint_su(klearner, d) for d in [ + # decoder_last(), + # DECODER_LOCAL, + # decoder_mst(), + Keyed('eisner', + EisnerDecoder(unique_real_root=unique_real_root, + use_prob=True)), + ] + ]) + # end WIP # postlabeling use_prob = klearner.attach.payload.can_predict_proba @@ -261,9 +287,9 @@ def _core_parsers(klearner, unique_real_root=True): # decoder_last() , # DECODER_LOCAL, # decoder_mst(), - Keyed('eisner', - EisnerDecoder(unique_real_root=unique_real_root, - use_prob=use_prob)), + # Keyed('eisner', + # EisnerDecoder(unique_real_root=unique_real_root, + # use_prob=use_prob)), ] ] From 5b74d1432939ab41aaf9ca0b805c67b570e639ff Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 29 Jul 2016 17:47:20 +0200 Subject: [PATCH 04/74] FIX update naming conventions for files --- irit_rst_dt/cmd/gather.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index e4db2cd..c808c44 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -64,7 +64,7 @@ def extract_features(corpus, output_dir, fix_pseudo_rels, instances, Path to the output folder. fix_pseudo_rels: boolean, False by default Rewrite pseudo-relations to improve consistency (WIP). - instances: one of {'same-unit', 'all-pairs'} + instances: one of {'same-unit', 'edu-pairs'} Selection of instances to extract. 
vocab_path: filepath Path to a fixed vocabulary mapping, for feature extraction @@ -128,13 +128,13 @@ def main(args): # same-unit instances = 'same-unit' su_prefix_train = '{}.{}'.format( - instances, os.path.basename(TRAINING_CORPUS)) + os.path.basename(TRAINING_CORPUS), instances) su_train_path = os.path.join(tdir, su_prefix_train) su_label_path = su_train_path + '.relations.sparse' su_vocab_path = su_label_path + '.vocab' if TEST_CORPUS is not None: su_prefix_test = '{}.{}'.format( - instances, os.path.basename(TEST_CORPUS)) + os.path.basename(TEST_CORPUS), instances) su_test_path = os.path.join(tdir, su_prefix_test) if SAME_UNIT in ['joint', 'preproc'] and not args.resume_frag_pairs: @@ -150,14 +150,14 @@ def main(args): label_path=su_label_path) # all pairs - instances = 'all-pairs' + instances = 'edu-pairs' if not args.skip_training and not args.resume_frag_pairs: extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, instances) # path to the vocab and labelset gathered from the training set, # we'll use these paths for the test set and for the frag-pairs prefix_train = '{}.{}'.format( - instances, os.path.basename(TRAINING_CORPUS)) + os.path.basename(TRAINING_CORPUS), instances) train_path = os.path.join(tdir, prefix_train) label_path = train_path + '.relations.sparse' vocab_path = label_path + '.vocab' @@ -171,9 +171,9 @@ def main(args): # the other fragmented EDUs and the EDUs that don't belong to any # fragmented EDU instances = 'frag-pairs' - # we use the vocabulary and labelset from "all-pairs" ; this is the + # we use the vocabulary and labelset from "edu-pairs" ; this is the # simplest solution currently and it seems correct, but maybe we - # could extend "all-pairs" with these pairs when we learn the + # could extend "edu-pairs" with these pairs when we learn the # vocabulary? if not args.skip_training: frag_edus_train = su_train_path + '.relations' + '.deps_true' From 20e8384b1e71cd804333b6aad882a1e38dab204f Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 29 Aug 2016 15:52:23 +0200 Subject: [PATCH 05/74] MAINT minor refactoring, same-unit --- irit_rst_dt/config/common.py | 2 +- irit_rst_dt/config/intra.py | 10 +-- irit_rst_dt/local.py | 130 ++++++++++++++++------------------- 3 files changed, 65 insertions(+), 77 deletions(-) diff --git a/irit_rst_dt/config/common.py b/irit_rst_dt/config/common.py index 2bd824f..abf3efd 100644 --- a/irit_rst_dt/config/common.py +++ b/irit_rst_dt/config/common.py @@ -145,7 +145,7 @@ def mk_su_joint(klearner, kdecoder): parser_key = combined_key(settings, kdecoder) key = combined_key(klearner, parser_key) # su: use same kind of learner as "attach" - parser = SameUnitJointPipeline( + parser = JointSameUnitPipeline( learner_su=( SklearnSameUnitClassifier(klearner.attach.payload._learner) if not isinstance(klearner.attach.payload, AttachOracle) diff --git a/irit_rst_dt/config/intra.py b/irit_rst_dt/config/intra.py index b130ab5..1921f7d 100644 --- a/irit_rst_dt/config/intra.py +++ b/irit_rst_dt/config/intra.py @@ -11,17 +11,17 @@ def combine_intra(econfs, kconf, primary='intra', verbose=False): Parameters ---------- econfs: IntraInterPair(EvaluationConfig) - + Evaluation configs for the intra and inter parsers. kconf: Keyed(parser constructor) - - primary: ['intra', 'inter'] - Treat the intra/inter config as the primary one for the key + Key for the whole intra/inter parser. + primary: one of {'intra', 'inter'} + Treat the intra or inter config as the primary one for the key. 
verbose: boolean, optional Verbosity of the intra/inter parser Returns ------- - econf : EvaluationConfig + econf: EvaluationConfig Evaluation configuration for the IntraInterParser. """ if primary == 'intra': diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index 47b3022..419c077 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -11,7 +11,13 @@ from os import path as fp import itertools as itr -from attelo.harness.config import (LearnerConfig, +from sklearn.linear_model import (LogisticRegression) +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier + +# attelo +from attelo.harness.config import (EvaluationConfig, + LearnerConfig, Keyed) # from attelo.decoding.astar import (AstarArgs, # AstarDecoder, @@ -26,12 +32,7 @@ FrontierToHeadParser, # SentOnlyParser, SoftParser) - -from sklearn.linear_model import (LogisticRegression) -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier - - +# this harness from .config.intra import (combine_intra) from .config.perceptron import (attach_learner_dp_pa, attach_learner_dp_perc, @@ -50,7 +51,10 @@ decoder_local, mk_joint, mk_joint_su, - mk_post) + mk_su_joint, + mk_post, + JointPipeline, + Settings) # PATHS @@ -278,6 +282,17 @@ def _core_parsers(klearner, unique_real_root=True): use_prob=True)), ] ]) + elif SAME_UNIT == 'preproc': + joint.extend([ + mk_su_joint(klearner, d) for d in [ + # decoder_last(), + # DECODER_LOCAL, + # decoder_mst(), + Keyed('eisner', + EisnerDecoder(unique_real_root=unique_real_root, + use_prob=True)), + ] + ]) # end WIP # postlabeling @@ -322,74 +337,26 @@ def _core_parsers(klearner, unique_real_root=True): HARNESS_NAME = 'irit-rst-dt' -# possibly obsolete -def _mk_basic_intras(klearner, kconf): - """Intra/inter parser based on a single core parser - """ - # NEW intra parsers are explicitly authorized to have more than one - # real root (necessary for the Eisner decoder, maybe other decoders too) - parsers = [IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(klearner, unique_real_root=False), - _core_parsers(klearner))] - return [combine_intra(p, kconf) for p in parsers] - - -def _mk_sorc_intras(klearner, kconf): - """Intra/inter parsers based on a single core parser - and a sentence oracle - """ - parsers = [IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(ORACLE, unique_real_root=False), - _core_parsers(klearner))] - return [combine_intra(p, kconf, primary='inter') for p in parsers] - - -def _mk_dorc_intras(klearner, kconf): - """Intra/inter parsers based on a single core parser - and a document oracle - """ - parsers = [IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(klearner, unique_real_root=False), - _core_parsers(ORACLE))] - return [combine_intra(p, kconf, primary='intra') for p in parsers] - - -def _mk_last_intras(klearner, kconf): - """Parsers using "last" for intra and a core decoder for inter. 
- """ - if ((not klearner.attach.payload.can_predict_proba or - not klearner.label.payload.can_predict_proba)): - return [] - - kconf = Keyed(key=combined_key('last', kconf), - payload=kconf.payload) - econf_last = mk_joint(klearner, decoder_last()) - parsers = [IntraInterPair(intra=econf_last, inter=y) for y in - _core_parsers(klearner)] - return [combine_intra(p, kconf, primary='inter') for p in parsers] -# end of possibly obsolete - - def _is_junk(econf): """ Any configuration for which this function returns True will be silently discarded """ # intrasential head to head mode only works with mst for now - has = econf.settings - kids = econf.settings.children - has_intra_oracle = has.intra and (kids.intra.oracle or kids.inter.oracle) - has_any_oracle = has.oracle or has_intra_oracle + has_intra_oracle = (econf.settings.intra + and (econf.settings.children.intra.oracle + or econf.settings.children.inter.oracle)) + has_any_oracle = econf.settings.oracle or has_intra_oracle - decoder_name = econf.parser.key[len(has.key) + 1:] + decoder_name = econf.parser.key[len(econf.settings.key) + 1:] # last with last-based intra decoders is a bit redundant - if has.intra and decoder_name == 'last': + if econf.settings.intra and decoder_name == 'last': return True # oracle would be redundant with sentence/doc oracles # FIXME the above is wrong for intra/inter parsers because gold edges # can fall out of the search space - if has.oracle and has_intra_oracle: + if econf.settings.oracle and has_intra_oracle: return True # FIXME should sometimes be False # toggle or comment to enable filtering in/out oracles @@ -404,6 +371,24 @@ def _evaluations(): res = [] # == one-step (global) parsers == + # WIP + # maxent, eisner, AD.L-jnt + maxent_klearner = LearnerConfig(attach=attach_learner_maxent(), + label=label_learner_maxent()) + res.append( + EvaluationConfig(key='maxent-AD.L-jnt-eisner-NEW', + settings=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None), + learner=maxent_klearner, + parser=Keyed('AD.L-jnt-eisner-NEW', + JointPipeline( + learner_attach=maxent_klearner.attach.payload, + learner_label=maxent_klearner.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)))) + ) + # end WIP learners = [] learners.extend(_LOCAL_LEARNERS) # current structured learners don't do probs, hence non-prob decoders @@ -412,8 +397,8 @@ def _evaluations(): # MST is disabled by default, as it does not output projective trees # nonprob_mst = MstDecoder(MstRootStrategy.fake_root, False) # learners.extend(l(nonprob_mst) for l in _STRUCTURED_LEARNERS) - global_parsers = itr.chain.from_iterable(_core_parsers(l) - for l in learners) + global_parsers = itr.chain.from_iterable( + _core_parsers(l) for l in learners) res.extend(global_parsers) # == two-step parsers: intra then inter-sentential == @@ -449,9 +434,12 @@ def _evaluations(): # NEW intra parsers are explicitly authorized (in fact, expected) # to have more than one real root ; this is necessary for the # Eisner decoder and probably others, with "hard" strategies - ii_pairs.extend(IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(intra_lnr, unique_real_root=True), # TODO add unique_real_root to hyperparameters in grid search - _core_parsers(inter_lnr, unique_real_root=True))) + # TODO add unique_real_root to hyperparameters in grid search + ii_pairs.extend( + IntraInterPair(intra=x, inter=y) for x, y in + zip(_core_parsers(intra_lnr, unique_real_root=True), + _core_parsers(inter_lnr, unique_real_root=True)) + ) # 
cross-product: pairs of parsers x intra-/inter- configs ii_parsers = [combine_intra(p, kconf, primary=('inter' if p.intra.settings.oracle @@ -484,9 +472,9 @@ def _want_details(econf): else: learners = [econf.learner] has_maxent = any('maxent' in l.key for l in learners) - has = econf.settings - kids = econf.settings.children - has_intra_oracle = has.intra and (kids.intra.oracle or kids.inter.oracle) + has_intra_oracle = (econf.settings.intra and + (econf.settings.children.intra.oracle or + econf.settings.children.inter.oracle)) return (has_maxent and ('mst' in econf.parser.key or 'astar' in econf.parser.key or 'eisner' in econf.parser.key) and From 9a153ada9cb55bae529dcc367874cb03e9f12d88 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sat, 3 Sep 2016 15:33:52 +0200 Subject: [PATCH 06/74] WIP one file per doc, same-unit --- irit_rst_dt/cmd/clean.py | 6 +-- irit_rst_dt/cmd/gather.py | 98 ++++++++++++++++++++++-------------- irit_rst_dt/config/common.py | 21 ++++---- irit_rst_dt/harness.py | 58 ++++++++++++--------- 4 files changed, 107 insertions(+), 76 deletions(-) diff --git a/irit_rst_dt/cmd/clean.py b/irit_rst_dt/cmd/clean.py index ad55823..2a9a019 100644 --- a/irit_rst_dt/cmd/clean.py +++ b/irit_rst_dt/cmd/clean.py @@ -34,10 +34,10 @@ def main(_): You shouldn't need to call this yourself if you're using `config_argparser` """ - for data_dir in sorted(subdirs(LOCAL_TMP)): - if fp.basename(data_dir) == "latest": + for base_dir in sorted(subdirs(LOCAL_TMP)): + if fp.basename(base_dir) == "latest": continue - for subdir in subdirs(data_dir): + for subdir in subdirs(base_dir): bname = fp.basename(subdir) if bname in ["eval-current", "eval-previous", "scratch-current", "scratch-previous"]: diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index c808c44..996f570 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -6,12 +6,17 @@ """ from __future__ import print_function +import itertools import os from attelo.harness.util import call, force_symlink +from attelo.learning.oracle import AttachOracle +from attelo.parser.intra import IntraInterParser +from attelo.parser.same_unit import SameUnitClassifierWrapper from ..local import (FEATURE_SET, LABEL_SET, TEST_CORPUS, TRAINING_CORPUS, - SAME_UNIT, PTB_DIR, CORENLP_OUT_DIR, LECSIE_DATA_DIR) + SAME_UNIT, PTB_DIR, CORENLP_OUT_DIR, LECSIE_DATA_DIR, + EVALUATIONS) from ..util import (current_tmp, latest_tmp) NAME = 'gather' @@ -125,26 +130,41 @@ def main(args): fix_pseudo_rels = args.fix_pseudo_rels + # 2016-09-01 put data files in {tdir}/data + tdir_data = os.path.join(tdir, 'data') + if not os.path.exists(tdir_data): + os.makedirs(tdir_data) # same-unit - instances = 'same-unit' - su_prefix_train = '{}.{}'.format( - os.path.basename(TRAINING_CORPUS), instances) - su_train_path = os.path.join(tdir, su_prefix_train) - su_label_path = su_train_path + '.relations.sparse' - su_vocab_path = su_label_path + '.vocab' - if TEST_CORPUS is not None: - su_prefix_test = '{}.{}'.format( - os.path.basename(TEST_CORPUS), instances) - su_test_path = os.path.join(tdir, su_prefix_test) - - if SAME_UNIT in ['joint', 'preproc'] and not args.resume_frag_pairs: + all_parsers = [] + for econf in EVALUATIONS: + parser = econf.parser[1] + if isinstance(parser, IntraInterParser): + all_parsers.extend(x[1] for x in itertools.chain( + parser._parsers.intra.steps, parser._parsers.inter.steps)) + else: + all_parsers.extend(x[1] for x in parser.steps) + same_unit_parsers = [x for x in all_parsers + if isinstance(x, SameUnitClassifierWrapper)] + 
same_unit_clfs = [x._learner_su for x in same_unit_parsers] + if same_unit_parsers and not args.resume_frag_pairs: + instances = 'same-unit' + su_prefix_train = '{}.relations.{}'.format( + os.path.basename(TRAINING_CORPUS), instances) + su_train_path = os.path.join(tdir_data, su_prefix_train) + su_label_path = su_train_path + '.labels' + su_vocab_path = su_train_path + '.sparse.vocab' + if TEST_CORPUS is not None: + su_prefix_test = '{}.{}'.format( + os.path.basename(TEST_CORPUS), instances) + su_test_path = os.path.join(tdir_data, su_prefix_test) + if not args.skip_training: # * train - extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, instances) if TEST_CORPUS is not None: # * test - extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, instances, vocab_path=su_vocab_path, label_path=su_label_path) @@ -152,17 +172,17 @@ def main(args): # all pairs instances = 'edu-pairs' if not args.skip_training and not args.resume_frag_pairs: - extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, instances) # path to the vocab and labelset gathered from the training set, # we'll use these paths for the test set and for the frag-pairs - prefix_train = '{}.{}'.format( + prefix_train = '{}.relations.{}'.format( os.path.basename(TRAINING_CORPUS), instances) - train_path = os.path.join(tdir, prefix_train) - label_path = train_path + '.relations.sparse' - vocab_path = label_path + '.vocab' + train_path = os.path.join(tdir_data, prefix_train) + label_path = train_path + '.labels' + vocab_path = train_path + '.sparse.vocab' if TEST_CORPUS is not None and not args.resume_frag_pairs: - extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, instances, vocab_path=vocab_path, label_path=label_path) @@ -171,25 +191,27 @@ def main(args): # the other fragmented EDUs and the EDUs that don't belong to any # fragmented EDU instances = 'frag-pairs' - # we use the vocabulary and labelset from "edu-pairs" ; this is the - # simplest solution currently and it seems correct, but maybe we - # could extend "edu-pairs" with these pairs when we learn the - # vocabulary? - if not args.skip_training: - frag_edus_train = su_train_path + '.relations' + '.deps_true' - extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, - instances, frag_edus=frag_edus_train, - vocab_path=vocab_path, - label_path=label_path) - if TEST_CORPUS is not None: - frag_edus_test = su_test_path + '.relations' + '.deps_true' - extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, - instances, frag_edus=frag_edus_test, - vocab_path=vocab_path, - label_path=label_path) + same_unit_types = set(('true' if isinstance(x, AttachOracle) + else 'pred') + for clf in same_unit_clfs) + for same_unit_type in sorted(same_unit_types): + # we use the vocabulary and labelset from "edu-pairs" ; this is the + # simplest solution currently and it seems correct, but maybe we + # could extend "edu-pairs" with these pairs when we learn the + # vocabulary? 
+ if not args.skip_training: + extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, + instances, frag_edus=same_unit_type, + vocab_path=vocab_path, + label_path=label_path) + if TEST_CORPUS is not None: + extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, + instances, frag_edus=same_unit_type, + vocab_path=vocab_path, + label_path=label_path) # end frag pairs - with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream: + with open(os.path.join(tdir_data, "versions-gather.txt"), "w") as stream: call(["pip", "freeze"], stdout=stream) if not (args.skip_training or args.resume_frag_pairs): diff --git a/irit_rst_dt/config/common.py b/irit_rst_dt/config/common.py index abf3efd..1ff9e9b 100644 --- a/irit_rst_dt/config/common.py +++ b/irit_rst_dt/config/common.py @@ -1,7 +1,9 @@ """Commonly used configuration options""" from collections import namedtuple +import copy import six + # from attelo.decoding.astar import (AstarArgs, # AstarDecoder, # Heuristic, @@ -15,8 +17,7 @@ from attelo.parser.full import (JointPipeline, PostlabelPipeline) from attelo.parser.same_unit import (JointSameUnitPipeline, - SameUnitJointPipeline, - SklearnSameUnitClassifier) + SameUnitJointPipeline) def combined_key(*variants): @@ -127,11 +128,9 @@ def mk_joint_su(klearner, kdecoder): parser = JointSameUnitPipeline( learner_attach=klearner.attach.payload, learner_label=klearner.label.payload, - learner_su=( - SklearnSameUnitClassifier(klearner.attach.payload._learner) - if not isinstance(klearner.attach.payload, AttachOracle) - else klearner.attach.payload - ), + # FIXME this copy does not really make sense here, but at least + # its type is correct + learner_su=copy.deepcopy(klearner.attach.payload), decoder=kdecoder.payload) return EvaluationConfig(key=key, settings=settings, @@ -146,11 +145,9 @@ def mk_su_joint(klearner, kdecoder): key = combined_key(klearner, parser_key) # su: use same kind of learner as "attach" parser = JointSameUnitPipeline( - learner_su=( - SklearnSameUnitClassifier(klearner.attach.payload._learner) - if not isinstance(klearner.attach.payload, AttachOracle) - else klearner.attach.payload - ), + # FIXME this copy does not really make sense here, but at least + # its type is correct + learner_su=copy.deepcopy(klearner.attach.payload), learner_attach=klearner.attach.payload, learner_label=klearner.label.payload, decoder=kdecoder.payload) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index 9c81c1b..3242a31 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -2,6 +2,7 @@ Paths to files used or generated by the test harness ''' from collections import Counter +from glob import glob from os import path as fp import sys @@ -42,13 +43,13 @@ def __init__(self): def run(self, runcfg): """Run the evaluation """ - data_dir = latest_tmp() - if not fp.exists(data_dir): + base_dir = latest_tmp() + if not fp.exists(base_dir): exit_ungathered() - eval_dir, scratch_dir = prepare_dirs(runcfg, data_dir) + eval_dir, scratch_dir = prepare_dirs(runcfg, base_dir) self.load(runcfg, eval_dir, scratch_dir) - evidence_of_gathered = self.mpack_paths(False)[0] - if not fp.exists(evidence_of_gathered): + evidence_of_gathered = self.mpack_paths(False)['edu_input'] + if not glob(evidence_of_gathered): exit_ungathered() evaluate_corpus(self) @@ -114,35 +115,46 @@ def mpack_paths(self, test_data, stripped=False): test_data: boolean If true, the returned paths point to self.testset else to self.dataset. 
+ stripped: boolean + TODO Returns ------- - paths: tuple of file paths - Path to: edu_input, pairings, features, vocab, cdu_input, - cdu_pairings, cdu_features, corpus (to access gold - structures, WIP). + paths: dict of file paths + Path to: edu_input, pairings, features, vocab, labels, + cdu_input, cdu_pairings, cdu_features, corpus (to access + gold structures, WIP). """ - ext = 'relations.edu-pairs.sparse' + base = 'relations.edu-pairs' + ext = base + '.sparse' # path to data file in the evaluation dir dset = self.testset if test_data else self.dataset - core_path = fp.join(self.eval_dir, "%s.%s" % (dset, ext)) + vocab_path = fp.join(self.eval_dir, "%s.%s.vocab" % (dset, ext)) + labels_path = fp.join(self.eval_dir, "%s.%s.labels" % (dset, base)) + core_path = fp.join(self.eval_dir, dset, "*.%s" % ext) # 2016-07-28 pairs on fragmented EDUs frag_ext = 'relations.frag-pairs.sparse' - frag_path = fp.join(self.eval_dir, "%s.%s" % (dset, frag_ext)) + frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if test_data else TRAINING_CORPUS) - # end WIP - return (core_path + '.edu_input', - core_path + '.pairings', - (core_path + '.stripped') if stripped else core_path, - core_path + '.vocab', - # fragmented EDUs - frag_path + '.cdu_input', - frag_path + '.pairings', - (frag_path + '.stripped') if stripped else frag_path, - # corpus - corpus_path) + # end gold RST trees + res = { + 'edu_input': core_path + '.edu_input', + 'pairings': core_path + '.pairings', + 'features': ((core_path + '.stripped') if stripped + else core_path), + 'vocab': vocab_path, + 'labels': labels_path, + # fragmented EDUs + 'cdu_input': frag_path + '.cdu_input', + 'cdu_pairings': frag_path + '.cdu_pairings', + 'cdu_features': ((frag_path + '.stripped') if stripped + else frag_path), + # corpus for gold RST trees + 'corpus': corpus_path, + } + return res def model_paths(self, rconf, fold, parser): """Paths to the learner(s) model(s). 
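Note on the new mpack_paths contract introduced above: with one data file per document, the 'edu_input', 'pairings' and 'features' entries of the returned dict are glob patterns rather than single file paths. A minimal sketch of how a caller could resolve them into per-document files follows; the helper name and the doc-name extraction are illustrative assumptions, not part of the harness.

    from glob import glob
    from os import path as fp

    def iter_doc_inputs(paths):
        """Yield (doc_name, edu_input, pairings, features) per document.

        `paths` is the dict returned by mpack_paths(); its 'edu_input'
        entry is now a glob pattern matching one file per document.
        """
        for edu_input in sorted(glob(paths['edu_input'])):
            # assumed layout: <doc_name>.relations.edu-pairs.sparse.edu_input
            doc_name = fp.basename(edu_input).rsplit('.', 4)[0]
            features = edu_input[:-len('.edu_input')]  # the .sparse file
            yield doc_name, edu_input, features + '.pairings', features
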
From 6888c259a4b137dbb7f825c292ab4e1a591cc4b3 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sun, 4 Sep 2016 19:04:48 +0200 Subject: [PATCH 07/74] WIP de-engineering local --- irit_rst_dt/config/intra.py | 6 +- irit_rst_dt/local.py | 259 ++++++++++++++++++++++++++---------- 2 files changed, 191 insertions(+), 74 deletions(-) diff --git a/irit_rst_dt/config/intra.py b/irit_rst_dt/config/intra.py index 1921f7d..7806d29 100644 --- a/irit_rst_dt/config/intra.py +++ b/irit_rst_dt/config/intra.py @@ -31,9 +31,9 @@ def combine_intra(econfs, kconf, primary='intra', verbose=False): else: raise ValueError("'primary' should be one of intra/inter: " + primary) - parsers = econfs.fmap(lambda e: e.parser.payload) - subsettings = econfs.fmap(lambda e: e.settings) - learners = econfs.fmap(lambda e: e.learner) + parsers = econfs.fmap(lambda e: e.parser.payload) # IntraInterPair + subsettings = econfs.fmap(lambda e: e.settings) # IntraInterPair + learners = econfs.fmap(lambda e: e.learner) # IntraInterPair settings = Settings(key=combined_key(kconf, econf.settings), intra=True, oracle=econf.settings.oracle, diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index 419c077..0cb9cb1 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -27,11 +27,15 @@ from attelo.decoding.mst import (MstDecoder, MstRootStrategy) from attelo.learning.local import (SklearnAttachClassifier, SklearnLabelClassifier) +from attelo.learning.oracle import AttachOracle from attelo.parser.intra import (IntraInterPair, HeadToHeadParser, FrontierToHeadParser, # SentOnlyParser, SoftParser) +from attelo.parser.same_unit import (JointSameUnitPipeline, + SameUnitJointPipeline) + # this harness from .config.intra import (combine_intra) from .config.perceptron import (attach_learner_dp_pa, @@ -252,7 +256,7 @@ def _structured(klearner): """ -def _core_parsers(klearner, unique_real_root=True): +def _core_parsers(klearner, unique_real_root=True, same_unit='no'): """Our basic parser configurations """ # joint @@ -271,7 +275,7 @@ def _core_parsers(klearner, unique_real_root=True): ] ] # WIP with same-unit - if SAME_UNIT == 'joint': + if same_unit == 'joint': joint.extend([ mk_joint_su(klearner, d) for d in [ # decoder_last(), @@ -282,7 +286,7 @@ def _core_parsers(klearner, unique_real_root=True): use_prob=True)), ] ]) - elif SAME_UNIT == 'preproc': + elif same_unit == 'preproc': joint.extend([ mk_su_joint(klearner, d) for d in [ # decoder_last(), @@ -376,78 +380,191 @@ def _evaluations(): maxent_klearner = LearnerConfig(attach=attach_learner_maxent(), label=label_learner_maxent()) res.append( - EvaluationConfig(key='maxent-AD.L-jnt-eisner-NEW', - settings=Settings(key='AD.L-jnt', - intra=False, - oracle=False, - children=None), - learner=maxent_klearner, - parser=Keyed('AD.L-jnt-eisner-NEW', - JointPipeline( - learner_attach=maxent_klearner.attach.payload, - learner_label=maxent_klearner.label.payload, - decoder=EisnerDecoder(unique_real_root=True, use_prob=True)))) + EvaluationConfig( + key='maxent-AD.L-jnt-eisner', + settings=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None), + learner=maxent_klearner, + parser=Keyed('AD.L-jnt-eisner', + JointPipeline( + learner_attach=maxent_klearner.attach.payload, + learner_label=maxent_klearner.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)))) + ) + + # maxent, eisner, AD.L-jnt then overwrite predicted "Same-Unit" + # FIXME "learner" might be wrong: this LearnerConfig has no mention of + # the same-unit classifier + maxent_su_learner = 
attach_learner_maxent() + # oracle_su_learner = Keyed('oracle', AttachOracle()) # alternative + res.append( + EvaluationConfig( + key='maxent-AD.L-jnt_su-eisner', + settings=Settings(key='AD.L-jnt_su', + intra=False, + oracle=False, + children=None), + # FIXME ("attach", "label"), lacks "same_unit" + learner=maxent_klearner, + parser=Keyed('AD.L-jnt_su-eisner', + JointSameUnitPipeline( + learner_attach=maxent_klearner.attach.payload, + learner_label=maxent_klearner.label.payload, + learner_su=maxent_su_learner.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)))) ) # end WIP - learners = [] - learners.extend(_LOCAL_LEARNERS) - # current structured learners don't do probs, hence non-prob decoders - nonprob_eisner = EisnerDecoder(use_prob=False) - learners.extend(l(nonprob_eisner) for l in _STRUCTURED_LEARNERS) - # MST is disabled by default, as it does not output projective trees - # nonprob_mst = MstDecoder(MstRootStrategy.fake_root, False) - # learners.extend(l(nonprob_mst) for l in _STRUCTURED_LEARNERS) - global_parsers = itr.chain.from_iterable( - _core_parsers(l) for l in learners) - res.extend(global_parsers) + + if False: # legacy code for one-step parsers + learners = [] + learners.extend(_LOCAL_LEARNERS) + # current structured learners don't do probs, hence non-prob decoders + nonprob_eisner = EisnerDecoder(use_prob=False) + learners.extend(l(nonprob_eisner) for l in _STRUCTURED_LEARNERS) + # MST is disabled by default, as it does not output projective trees + # nonprob_mst = MstDecoder(MstRootStrategy.fake_root, False) + # learners.extend(l(nonprob_mst) for l in _STRUCTURED_LEARNERS) + global_parsers = itr.chain.from_iterable( + _core_parsers(l, same_unit=SAME_UNIT) for l in learners) + res.extend(global_parsers) # == two-step parsers: intra then inter-sentential == - ii_learners = [] # (intra, inter) learners - ii_learners.extend((copy.deepcopy(klearner), copy.deepcopy(klearner)) - for klearner in _LOCAL_LEARNERS - if klearner != ORACLE) - # keep pointer to intra and inter oracles - ii_oracles = (copy.deepcopy(ORACLE), ORACLE_INTER) - ii_learners.append(ii_oracles) - # structured learners, cf. 
supra - intra_nonprob_eisner = EisnerDecoder(use_prob=False, - unique_real_root=True) - inter_nonprob_eisner = EisnerDecoder(use_prob=False, - unique_real_root=True) - ii_learners.extend((copy.deepcopy(l)(intra_nonprob_eisner), - copy.deepcopy(l)(inter_nonprob_eisner)) - for l in _STRUCTURED_LEARNERS) - # couples of learners with either sentence- or document-level oracle - sorc_ii_learners = [ - (ii_oracles[0], inter_lnr) for intra_lnr, inter_lnr in ii_learners - if (ii_oracles[0], inter_lnr) not in ii_learners - ] - dorc_ii_learners = [ - (intra_lnr, ii_oracles[1]) for intra_lnr, inter_lnr in ii_learners - if (intra_lnr, ii_oracles[1]) not in ii_learners - ] - # enumerate pairs of (intra, inter) parsers - ii_pairs = [] - for intra_lnr, inter_lnr in itr.chain(ii_learners, - sorc_ii_learners, - dorc_ii_learners): - # NEW intra parsers are explicitly authorized (in fact, expected) - # to have more than one real root ; this is necessary for the - # Eisner decoder and probably others, with "hard" strategies - # TODO add unique_real_root to hyperparameters in grid search - ii_pairs.extend( - IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(intra_lnr, unique_real_root=True), - _core_parsers(inter_lnr, unique_real_root=True)) - ) - # cross-product: pairs of parsers x intra-/inter- configs - ii_parsers = [combine_intra(p, kconf, - primary=('inter' if p.intra.settings.oracle - else 'intra'), - verbose=_VERBOSE_INTRA_INTER) - for p, kconf - in itr.product(ii_pairs, _INTRA_INTER_CONFIGS)] - res.extend(ii_parsers) + # WIP explicit declaration + maxent_team_intra = LearnerConfig(attach=attach_learner_maxent(), + label=label_learner_maxent()) + # FIXME ? maybe sel_inter='global' implies that + # maxent_team_inter = LearnerConfig(attach=maxent_klearner.attach, label=maxent_klearner.label) + maxent_team_inter = LearnerConfig(attach=attach_learner_maxent(), + label=label_learner_maxent()) + res.append( + EvaluationConfig( + key='maxent-iheads-global-AD.L-jnt-eisner', + settings=Settings(key='iheads-global-AD.L-jnt', + intra=True, + oracle=False, + children=IntraInterPair( + intra=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None), + inter=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None))), + learner=IntraInterPair(intra=maxent_team_intra, + inter=maxent_team_inter), + parser=Keyed('iheads-global-AD.L-jnt-eisner', + HeadToHeadParser( + IntraInterPair( + intra=JointPipeline( + learner_attach=maxent_team_intra.attach.payload, + learner_label=maxent_team_intra.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)), + inter=JointPipeline( + learner_attach=maxent_team_inter.attach.payload, + learner_label=maxent_team_inter.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True))), + sel_inter='global', + verbose=_VERBOSE_INTRA_INTER))) + ) + + # maxent-iheads-global-AD.L-jnt_su-eisner + maxent_su_learner_intra = attach_learner_maxent() # WIP + res.append( + EvaluationConfig( + key='maxent-iheads-global-AD.L-jnt_su-eisner', + settings=Settings(key='iheads-global-AD.L-jnt_su', + intra=True, + oracle=False, + children=IntraInterPair( + intra=Settings(key='AD.L-jnt_su', + intra=False, + oracle=False, + children=None), + inter=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None))), + learner=IntraInterPair(intra=maxent_team_intra, + inter=maxent_team_inter), + parser=Keyed('iheads-global-AD.L-jnt_su-eisner', + HeadToHeadParser( + IntraInterPair( + intra=JointSameUnitPipeline( + 
learner_attach=maxent_team_intra.attach.payload, + learner_label=maxent_team_intra.label.payload, + learner_su=maxent_su_learner_intra.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)), + inter=JointPipeline( + learner_attach=maxent_team_inter.attach.payload, + learner_label=maxent_team_inter.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True))), + sel_inter='global', + verbose=_VERBOSE_INTRA_INTER))) + ) + # end WIP + + if False: # disable legacy code for 2-step parsers + ii_learners = [] # (intra, inter) learners + ii_learners.extend((copy.deepcopy(klearner), copy.deepcopy(klearner)) + for klearner in _LOCAL_LEARNERS + if klearner != ORACLE) + # keep pointer to intra and inter oracles + ii_oracles = (copy.deepcopy(ORACLE), ORACLE_INTER) + ii_learners.append(ii_oracles) + # structured learners, cf. supra + intra_nonprob_eisner = EisnerDecoder(use_prob=False, + unique_real_root=True) + inter_nonprob_eisner = EisnerDecoder(use_prob=False, + unique_real_root=True) + + ii_learners.extend((copy.deepcopy(l)(intra_nonprob_eisner), + copy.deepcopy(l)(inter_nonprob_eisner)) + for l in _STRUCTURED_LEARNERS) + # couples of learners with either sentence- or document-level oracle + sorc_ii_learners = [ + (ii_oracles[0], inter_lnr) for intra_lnr, inter_lnr in ii_learners + if (ii_oracles[0], inter_lnr) not in ii_learners + ] + dorc_ii_learners = [ + (intra_lnr, ii_oracles[1]) for intra_lnr, inter_lnr in ii_learners + if (intra_lnr, ii_oracles[1]) not in ii_learners + ] + # enumerate pairs of (intra, inter) parsers + ii_pairs = [] + for intra_lnr, inter_lnr in itr.chain(ii_learners, + sorc_ii_learners, + dorc_ii_learners): + # NEW intra parsers are explicitly authorized (in fact, expected) + # to have more than one real root ; this is necessary for the + # Eisner decoder and probably others, with "hard" strategies + # TODO add unique_real_root to hyperparameters in grid search + intra_parsers = _core_parsers(intra_lnr, unique_real_root=True, + same_unit=SAME_UNIT) + # same-unit is undefined for inter, in the RST-DT corpus + # (at least in our implementation) + inter_parsers = _core_parsers(inter_lnr, unique_real_root=True, + same_unit='no') + if SAME_UNIT != 'no': + # inter_parsers would be twice less numerous than intra_parsers + # => dirty hack: double the inter parsers + inter_parsers = inter_parsers + inter_parsers + + ii_pairs.extend(IntraInterPair(intra=x, inter=y) for x, y + # FIXME should probably not be a zip(), cf dirty hack + # above + in zip(intra_parsers, inter_parsers) + ) + # cross-product: pairs of parsers x intra-/inter- configs + ii_parsers = [combine_intra(p, kconf, + primary=('inter' if p.intra.settings.oracle + else 'intra'), + verbose=_VERBOSE_INTRA_INTER) + for p, kconf + in itr.product(ii_pairs, _INTRA_INTER_CONFIGS)] + res.extend(ii_parsers) return [x for x in res if not _is_junk(x)] From fee674e89d71ebcc14ed80a4f1e3d0dcb36286f6 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 7 Sep 2016 16:23:07 +0200 Subject: [PATCH 08/74] NEW convert attelo predictions to disdep file --- evals/attelo_predictions_to_disdep.py | 101 ++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100755 evals/attelo_predictions_to_disdep.py diff --git a/evals/attelo_predictions_to_disdep.py b/evals/attelo_predictions_to_disdep.py new file mode 100755 index 0000000..231a233 --- /dev/null +++ b/evals/attelo_predictions_to_disdep.py @@ -0,0 +1,101 @@ +"""Small utility script to convert predictions from attelo to dis_dep files. 
+""" + +from __future__ import absolute_import, print_function + +from collections import defaultdict +from glob import glob +import os + +from attelo.io import load_edus, load_predictions +from attelo.metrics.util import barebones_rst_deptree +from attelo.table import UNRELATED +from educe.corpus import FileId +from educe.learning.disdep_format import dump_disdep_files +from educe.rst_dt.dep2con import (DummyNuclearityClassifier, + InsideOutAttachmentRanker) + + +def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): + """Generate disdep files from a file dump of attelo predictions. + + Parameters + ---------- + edus_file_glob: str + Regex for `edu_input` file paths. + edges_file: str + Path to the file that contains attelo predictions (edges as + triples). + out_dir: str + Path to the output folder. + """ + # set up heuristic classifiers for nuclearity and rank + nuc_clf = DummyNuclearityClassifier(strategy='unamb_else_most_frequent') + nuc_clf.fit([], []) # dummy fit + rnk_clf = InsideOutAttachmentRanker(strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True) + rnk_clf.fit([], []) # dummy fit + + # load EDUs + doc_edus = dict() + id2doc = dict() + for edu_input_file in glob(edus_file_glob): + doc_name = os.path.basename(edu_input_file).rsplit('.', 4)[0] # FRAGILE + edus = load_edus(edu_input_file) + assert doc_name == edus[0].grouping + # map doc_name to list of EDUs ; populate reverse mapping from + # EDU id to doc_name, so that we can dispatch edges to their + # document + # we keep the list of EDUs sorted as in edu_input, hence we + # assume edu_input follows the linear order of EDUs + doc_edus[doc_name] = edus + for edu in edus: + id2doc[edu.id] = doc_name + # load edges and dispatch them to their doc + edges_pred = load_predictions(edges_file) + # for each doc, list edges + doc_edges = defaultdict(list) + for gov_id, dep_id, lbl in edges_pred: + if lbl != UNRELATED: + doc_name = id2doc[dep_id] + doc_edges[doc_name].append((gov_id, dep_id, lbl)) + + # for each doc, get a full-fledged RstDepTree, nuclearity and ranking + # are currently determined heuristically + doc_dtree = dict() + for doc_name, edus in doc_edus.items(): + # comply with current API for barebones_rst_deptree: + # for each doc, create a dict with one item (doc_name, list of edges) + dep_edges = doc_edges[doc_name] + # create a barebones RST dep tree: head and label only + dtree, edu2sent = barebones_rst_deptree(dep_edges, edus, strict=False) + # set its origin + dtree.origin = FileId(doc_name, None, None, None) + # flesh out with heuristically-determined nuclearity + dtree.nucs = nuc_clf.predict([dtree])[0] + # and heuristically-determined rank (needs edu2sent to prioritize + # intra-sentential attachments over inter-sentential ones) + dtree.sent_idx = edu2sent # DIRTY + dtree.ranks = rnk_clf.predict([dtree])[0] + doc_dtree[doc_name] = dtree + + # write the disdep files + dump_disdep_files(doc_dtree.values(), out_dir) + + +if __name__ == '__main__': + edus_file_glob = os.path.join('TMP', 'latest', 'data', 'TEST', + '*.edu-pairs.sparse.edu_input') + edges_file_glob = os.path.join('TMP', 'latest', 'scratch-current', + 'combined', 'output.*') + out_dir = 'TMP_disdep_chain_pred_ours' + # attelo predictions are currently stored in one big file + edges_files = glob(edges_file_glob) + assert len(edges_files) == 1 + edges_file = edges_files[0] + # paths to the resulting disdep files + out_dir = os.path.join(out_dir, 'TEST') + if not os.path.exists(out_dir): + os.makedirs(out_dir) + # do the 
conversion + attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir) From d5e2a4319ce7ba0169b5eb2088dda528bd815849 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 7 Sep 2016 17:14:40 +0200 Subject: [PATCH 09/74] ENH new script to compute dependency scores on disdep files --- evals/disdep_eval.py | 77 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100755 evals/disdep_eval.py diff --git a/evals/disdep_eval.py b/evals/disdep_eval.py new file mode 100755 index 0000000..d33f439 --- /dev/null +++ b/evals/disdep_eval.py @@ -0,0 +1,77 @@ +"""Evaluation procedure for discourse dependency (disdep) files. + +Computes UAS and flavours of LAS for labels, nuclearity, rank and +their combinations. +""" + +from __future__ import absolute_import, print_function +import codecs +import csv +from glob import glob +import os + + +if __name__ == '__main__': + # TODO turn into argparse params + dir_true = os.path.join('TMP_disdep_chain_true', 'TEST') + dir_pred = os.path.join('TMP_disdep_chain_pred_ours', 'TEST') + # end TODO + files_true = {os.path.basename(f).rsplit('.')[0]: f + for f in glob(os.path.join(dir_true, '*.dis_dep'))} + files_pred = {os.path.basename(f).rsplit('.')[0]: f + for f in glob(os.path.join(dir_pred, '*.dis_dep'))} + assert sorted(files_true.keys()) == sorted(files_pred.keys()) + + cnt_tot = 0 # total deps + cnt_a = 0 # correct heads (attachments) + cnt_l = 0 # correct labels + cnt_n = 0 # correct nuclearity + cnt_r = 0 # correct ranks + cnt_al = 0 # correct labelled attachments + cnt_an = 0 # correct attachment + nuc + cnt_ar = 0 # correct attachment + rank + cnt_aln = 0 # correct attachment + label + nuc + cnt_alnr = 0 # correct attachment + label + nuc + rank + + for doc_name, f_true in files_true.items(): + f_pred = files_pred[doc_name] + with codecs.open(f_true, 'r', encoding='utf-8') as f_true: + with codecs.open(f_pred, 'r', encoding='utf-8') as f_pred: + reader_true = csv.reader(f_true, dialect=csv.excel_tab) + reader_pred = csv.reader(f_pred, dialect=csv.excel_tab) + for line_true, line_pred in zip(reader_true, reader_pred): + # i, txt, head, label, clabel, nuc, rank + assert line_true[0] == line_pred[0] # safety check + ok_a = line_true[2] == line_pred[2] + ok_l = line_true[4] == line_pred[4] # use clabel + ok_n = line_true[5] == line_pred[5] + ok_r = line_true[6] == line_pred[6] + # update running counters + cnt_tot += 1 + if ok_a: + cnt_a += 1 + if ok_l: + cnt_l += 1 + if ok_n: + cnt_n += 1 + if ok_r: + cnt_r += 1 + if ok_a and ok_l: + cnt_al += 1 + if ok_a and ok_n: + cnt_an += 1 + if ok_a and ok_r: + cnt_ar += 1 + if ok_a and ok_l and ok_n: + cnt_aln += 1 + if ok_a and ok_l and ok_n and ok_r: + cnt_alnr += 1 + print('\t'.join(['a', 'l', 'n', 'r', + 'al', 'an', 'ar', + 'aln', + 'alnr'])) + print('\t'.join('{:.4f}'.format(float(cnt_x) / cnt_tot) + for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, + cnt_al, cnt_an, cnt_ar, + cnt_aln, + cnt_alnr])) From d3e61283cb4713d82664485c4d29d2d0c9c9713b Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 8 Sep 2016 11:29:28 +0200 Subject: [PATCH 10/74] ENH dis_dep from various sources, general eval script for dis_dep --- evals/attelo_predictions_to_disdep.py | 3 +- evals/codra.py | 2 +- evals/dis2disdep.py | 103 +++++++++++++++++++++++ evals/{disdep_eval.py => eval_disdep.py} | 22 ++++- requirements.txt | 8 +- 5 files changed, 129 insertions(+), 9 deletions(-) create mode 100755 evals/dis2disdep.py rename evals/{disdep_eval.py => eval_disdep.py} (76%) diff --git 
a/evals/attelo_predictions_to_disdep.py b/evals/attelo_predictions_to_disdep.py index 231a233..f0e7169 100755 --- a/evals/attelo_predictions_to_disdep.py +++ b/evals/attelo_predictions_to_disdep.py @@ -88,13 +88,12 @@ def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): '*.edu-pairs.sparse.edu_input') edges_file_glob = os.path.join('TMP', 'latest', 'scratch-current', 'combined', 'output.*') - out_dir = 'TMP_disdep_chain_pred_ours' # attelo predictions are currently stored in one big file edges_files = glob(edges_file_glob) assert len(edges_files) == 1 edges_file = edges_files[0] # paths to the resulting disdep files - out_dir = os.path.join(out_dir, 'TEST') + out_dir = os.path.join('TMP_disdep', 'chain', 'ours', 'test') if not os.path.exists(out_dir): os.makedirs(out_dir) # do the conversion diff --git a/evals/codra.py b/evals/codra.py index 17dbacb..fb36048 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -2,7 +2,7 @@ """ -from __future__ import print_function +from __future__ import absolute_import, print_function from collections import defaultdict import itertools diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py new file mode 100755 index 0000000..2e4e418 --- /dev/null +++ b/evals/dis2disdep.py @@ -0,0 +1,103 @@ +"""Convert RST trees to their dependency version (.dis to .dis_dep). + +TODO +---- +* [ ] support the output of Ji & Eisenstein's parser ; need to convert + .brackets to .dis_dep (via .dis?) +* [ ] support intra-sentential level document parsing ; required to score + Joty's .sen_dis files + +""" +from __future__ import absolute_import, print_function +import argparse +import os + +from educe.corpus import FileId +from educe.learning.disdep_format import dump_disdep_files +from educe.rst_dt.codra import load_codra_output_files +from educe.rst_dt.corpus import Reader +from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, + TRAIN_FOLDER) + + +# original RST corpus +RST_CORPUS = os.path.join('/home/mmorey/corpora/rst_discourse_treebank/data') +RST_MAIN_TRAIN = os.path.join(RST_CORPUS, TRAIN_FOLDER) +RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) +RST_DOUBLE = os.path.join(RST_CORPUS, DOUBLE_FOLDER) +# output of Joty's parser +OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') +# output of Feng & Hirst's parser +OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/tmp/') +# output of Ji's parser +OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/test_input') + + +def main(): + """Main""" + parser = argparse.ArgumentParser( + description='Convert .dis files to .dis_dep' + ) + parser.add_argument('--nary_enc', default='chain', + choices=['chain', 'tree'], + help="Encoding for n-ary nodes") + parser.add_argument('--author', default='gold', + choices=['gold', 'silver', 'joty', 'feng', 'ji'], + help="Author of the version of the corpus") + parser.add_argument('--split', default='test', + choices=['train', 'test', 'double'], + help="Relevant part of the corpus") + parser.add_argument('--out_root', default='TMP_disdep', + help="Root directory for the output") + args = parser.parse_args() + # precise output path, by default: TMP_disdep/chain/gold/train + out_dir = os.path.join(args.out_root, args.nary_enc, args.author, args.split) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + # read RST trees + nary_enc = args.nary_enc + author = args.author + corpus_split = args.split + + if author == 'gold': + if corpus_split == 'train': + corpus_dir = 
RST_MAIN_TRAIN + elif corpus_split == 'test': + corpus_dir = RST_MAIN_TEST + elif corpus_split == 'double': + raise NotImplementedError("Gold trees for 'double'") + reader = Reader(corpus_dir) + rtrees = reader.slurp() + dtrees = {doc_name: RstDepTree.from_rst_tree(rtree, nary_enc=nary_enc) + for doc_name, rtree in rtrees.items()} + elif author == 'silver': + if corpus_split == 'double': + corpus_dir = RST_DOUBLE + else: + raise ValueError("'silver' annotation is available for the " + "'double' split only") + elif author == 'joty': + if corpus_split != 'test': + raise ValueError("The output of Joty's parser is available for " + "the 'test' split only") + data_pred = load_codra_output_files(OUT_JOTY, level='doc') + doc_names = data_pred['doc_names'] + rtrees = data_pred['rst_ctrees'] + dtrees = {doc_name: RstDepTree.from_rst_tree(rtree, nary_enc=nary_enc) + for doc_name, rtree in zip(doc_names, rtrees)} + # set reference to the document in the RstDepTree (required by + # dump_disdep_files) + for doc_name, dtree in dtrees.items(): + dtree.origin = FileId(doc_name, None, None, None) + elif author == 'feng': + # files_glob = os.path.join(OUT_FENG, '*.txt.dis') # FIXME + raise NotImplementedError("Output of Feng's parser") + elif author == 'ji': + raise NotImplementedError("Output of Ji's parser") + # do dump + dump_disdep_files(dtrees.values(), out_dir) + + +if __name__ == '__main__': + main() diff --git a/evals/disdep_eval.py b/evals/eval_disdep.py similarity index 76% rename from evals/disdep_eval.py rename to evals/eval_disdep.py index d33f439..e74467e 100755 --- a/evals/disdep_eval.py +++ b/evals/eval_disdep.py @@ -5,6 +5,7 @@ """ from __future__ import absolute_import, print_function +import argparse import codecs import csv from glob import glob @@ -12,9 +13,24 @@ if __name__ == '__main__': - # TODO turn into argparse params - dir_true = os.path.join('TMP_disdep_chain_true', 'TEST') - dir_pred = os.path.join('TMP_disdep_chain_pred_ours', 'TEST') + parser = argparse.ArgumentParser( + description="Evaluate dis_dep trees against a given reference") + parser.add_argument('author_pred', + choices=['gold', 'silver', + 'joty', 'feng', 'ji', + 'ours'], + help="Author of the predictions") + parser.add_argument('--author_true', default='gold', + choices=['gold', 'silver', + 'joty', 'feng', 'ji', + 'ours'], + help="Author of the reference") + args = parser.parse_args() + author_true = args.author_true + author_pred = args.author_pred + # TODO add argparse params for nary_enc and split + dir_true = os.path.join('TMP_disdep', 'chain', author_true, 'test') + dir_pred = os.path.join('TMP_disdep', 'chain', author_pred, 'test') # end TODO files_true = {os.path.basename(f).rsplit('.')[0]: f for f in glob(os.path.join(dir_true, '*.dis_dep'))} diff --git a/requirements.txt b/requirements.txt index 7d348b6..4735983 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ --e git+https://github.com/irit-melodi/educe.git#egg=educe --e git+https://github.com/irit-melodi/attelo.git#egg=attelo --e git+https://github.com/nlhepler/pydot.git#egg=pydot +# -e git+https://github.com/irit-melodi/educe.git#egg=educe +-e /home/mmorey/melodi/educe +# -e git+https://github.com/irit-melodi/attelo.git#egg=attelo +-e /home/mmorey/melodi/attelo +# -e git+https://github.com/nlhepler/pydot.git#egg=pydot -e . 
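The .dis_dep files produced by dis2disdep.py and consumed by eval_disdep.py are tab-separated, one row per EDU, with fields (index, text, head, label, coarse label, nuclearity, rank). A minimal reader sketch consistent with the comparison loop above; the DisdepRow and read_disdep names are illustrative only, not part of educe or attelo.

    import codecs
    import csv
    from collections import namedtuple

    DisdepRow = namedtuple(
        'DisdepRow', ['idx', 'txt', 'head', 'label', 'clabel', 'nuc', 'rank'])

    def read_disdep(f_path):
        """Read one .dis_dep file into a list of DisdepRow (one per EDU)."""
        with codecs.open(f_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f, dialect=csv.excel_tab)
            return [DisdepRow(*row) for row in reader]
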
From eb21923963a3536a9ede1551b2087b169b8e0cd8 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 8 Sep 2016 16:00:18 +0200 Subject: [PATCH 11/74] ENH support for the output of feng's parser, evaluate several parsers --- evals/dis2disdep.py | 15 ++++- evals/eval_disdep.py | 135 +++++++++++++++++++++++-------------------- 2 files changed, 86 insertions(+), 64 deletions(-) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index 2e4e418..d1d7966 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -15,6 +15,7 @@ from educe.corpus import FileId from educe.learning.disdep_format import dump_disdep_files from educe.rst_dt.codra import load_codra_output_files +from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, @@ -91,8 +92,18 @@ def main(): for doc_name, dtree in dtrees.items(): dtree.origin = FileId(doc_name, None, None, None) elif author == 'feng': - # files_glob = os.path.join(OUT_FENG, '*.txt.dis') # FIXME - raise NotImplementedError("Output of Feng's parser") + if corpus_split != 'test': + raise ValueError("The output of Feng & Hirst's parser is " + "available for the 'test' split only") + data_pred = load_feng_output_files(OUT_FENG) + doc_names = data_pred['doc_names'] + rtrees = data_pred['rst_ctrees'] + dtrees = {doc_name: RstDepTree.from_rst_tree(rtree, nary_enc=nary_enc) + for doc_name, rtree in zip(doc_names, rtrees)} + # set reference to the document in the RstDepTree (required by + # dump_disdep_files) + for doc_name, dtree in dtrees.items(): + dtree.origin = FileId(doc_name, None, None, None) elif author == 'ji': raise NotImplementedError("Output of Ji's parser") # do dump diff --git a/evals/eval_disdep.py b/evals/eval_disdep.py index e74467e..8310487 100755 --- a/evals/eval_disdep.py +++ b/evals/eval_disdep.py @@ -15,79 +15,90 @@ if __name__ == '__main__': parser = argparse.ArgumentParser( description="Evaluate dis_dep trees against a given reference") - parser.add_argument('author_pred', + parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'ji', 'ours'], - help="Author of the predictions") + help="Author(s) of the predictions") parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'ji', 'ours'], help="Author of the reference") + parser.add_argument('--nary_enc', default='chain', + choices=['tree', 'chain'], + help="Encoding of n-ary nodes") + # TODO add argparse param for split args = parser.parse_args() author_true = args.author_true - author_pred = args.author_pred - # TODO add argparse params for nary_enc and split - dir_true = os.path.join('TMP_disdep', 'chain', author_true, 'test') - dir_pred = os.path.join('TMP_disdep', 'chain', author_pred, 'test') - # end TODO + authors_pred = args.authors_pred + nary_enc = args.nary_enc + # reference + dir_true = os.path.join('TMP_disdep', nary_enc, author_true, 'test') files_true = {os.path.basename(f).rsplit('.')[0]: f for f in glob(os.path.join(dir_true, '*.dis_dep'))} - files_pred = {os.path.basename(f).rsplit('.')[0]: f - for f in glob(os.path.join(dir_pred, '*.dis_dep'))} - assert sorted(files_true.keys()) == sorted(files_pred.keys()) - - cnt_tot = 0 # total deps - cnt_a = 0 # correct heads (attachments) - cnt_l = 0 # correct labels - cnt_n = 0 # correct nuclearity - cnt_r = 0 # correct ranks - cnt_al = 0 # correct labelled attachments - cnt_an = 0 # correct attachment + nuc - cnt_ar = 0 # 
correct attachment + rank - cnt_aln = 0 # correct attachment + label + nuc - cnt_alnr = 0 # correct attachment + label + nuc + rank - - for doc_name, f_true in files_true.items(): - f_pred = files_pred[doc_name] - with codecs.open(f_true, 'r', encoding='utf-8') as f_true: - with codecs.open(f_pred, 'r', encoding='utf-8') as f_pred: - reader_true = csv.reader(f_true, dialect=csv.excel_tab) - reader_pred = csv.reader(f_pred, dialect=csv.excel_tab) - for line_true, line_pred in zip(reader_true, reader_pred): - # i, txt, head, label, clabel, nuc, rank - assert line_true[0] == line_pred[0] # safety check - ok_a = line_true[2] == line_pred[2] - ok_l = line_true[4] == line_pred[4] # use clabel - ok_n = line_true[5] == line_pred[5] - ok_r = line_true[6] == line_pred[6] - # update running counters - cnt_tot += 1 - if ok_a: - cnt_a += 1 - if ok_l: - cnt_l += 1 - if ok_n: - cnt_n += 1 - if ok_r: - cnt_r += 1 - if ok_a and ok_l: - cnt_al += 1 - if ok_a and ok_n: - cnt_an += 1 - if ok_a and ok_r: - cnt_ar += 1 - if ok_a and ok_l and ok_n: - cnt_aln += 1 - if ok_a and ok_l and ok_n and ok_r: - cnt_alnr += 1 - print('\t'.join(['a', 'l', 'n', 'r', + # table header + print('\t'.join(['parser', + 'a', 'l', 'n', 'r', 'al', 'an', 'ar', 'aln', - 'alnr'])) - print('\t'.join('{:.4f}'.format(float(cnt_x) / cnt_tot) - for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, - cnt_al, cnt_an, cnt_ar, - cnt_aln, - cnt_alnr])) + 'alnr', + 'support'])) + + for author_pred in authors_pred: + dir_pred = os.path.join('TMP_disdep', nary_enc, author_pred, 'test') + files_pred = {os.path.basename(f).rsplit('.')[0]: f + for f in glob(os.path.join(dir_pred, '*.dis_dep'))} + assert sorted(files_true.keys()) == sorted(files_pred.keys()) + + cnt_tot = 0 # total deps + cnt_a = 0 # correct heads (attachments) + cnt_l = 0 # correct labels + cnt_n = 0 # correct nuclearity + cnt_r = 0 # correct ranks + cnt_al = 0 # correct labelled attachments + cnt_an = 0 # correct attachment + nuc + cnt_ar = 0 # correct attachment + rank + cnt_aln = 0 # correct attachment + label + nuc + cnt_alnr = 0 # correct attachment + label + nuc + rank + + for doc_name, f_true in files_true.items(): + f_pred = files_pred[doc_name] + with codecs.open(f_true, 'r', encoding='utf-8') as f_true: + with codecs.open(f_pred, 'r', encoding='utf-8') as f_pred: + reader_true = csv.reader(f_true, dialect=csv.excel_tab) + reader_pred = csv.reader(f_pred, dialect=csv.excel_tab) + for line_true, line_pred in zip(reader_true, reader_pred): + # i, txt, head, label, clabel, nuc, rank + assert line_true[0] == line_pred[0] # safety check + ok_a = line_true[2] == line_pred[2] + ok_l = line_true[4] == line_pred[4] # use clabel + ok_n = line_true[5] == line_pred[5] + ok_r = line_true[6] == line_pred[6] + # update running counters + cnt_tot += 1 + if ok_a: + cnt_a += 1 + if ok_l: + cnt_l += 1 + if ok_n: + cnt_n += 1 + if ok_r: + cnt_r += 1 + if ok_a and ok_l: + cnt_al += 1 + if ok_a and ok_n: + cnt_an += 1 + if ok_a and ok_r: + cnt_ar += 1 + if ok_a and ok_l and ok_n: + cnt_aln += 1 + if ok_a and ok_l and ok_n and ok_r: + cnt_alnr += 1 + print('\t'.join([author_pred] + + ['{:.4f}'.format(float(cnt_x) / cnt_tot) + for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, + cnt_al, cnt_an, cnt_ar, + cnt_aln, + cnt_alnr]] + + [str(cnt_tot)])) From c8097a5ada15c68db1e140953dc5ec834d56166e Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 14 Sep 2016 18:22:16 +0200 Subject: [PATCH 12/74] WIP add nary_enc to params in gather and local --- irit_rst_dt/cmd/gather.py | 3 ++- irit_rst_dt/local.py | 5 +++++ 2 files 
changed, 7 insertions(+), 1 deletion(-) diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index 996f570..b986097 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -16,7 +16,7 @@ from ..local import (FEATURE_SET, LABEL_SET, TEST_CORPUS, TRAINING_CORPUS, SAME_UNIT, PTB_DIR, CORENLP_OUT_DIR, LECSIE_DATA_DIR, - EVALUATIONS) + NARY_ENC, EVALUATIONS) from ..util import (current_tmp, latest_tmp) NAME = 'gather' @@ -86,6 +86,7 @@ def extract_features(corpus, output_dir, fix_pseudo_rels, instances, PTB_DIR, # TODO make this optional and exclusive from CoreNLP output_dir, '--feature_set', FEATURE_SET, + '--nary_enc', NARY_ENC, # 2016-09-12 '--instances', instances, ] # NEW 2016-05-19 rewrite pseudo-relations diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index 0cb9cb1..bfe2691 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -147,6 +147,11 @@ Whether to have a special processing for same-unit """ +NARY_ENC = 'tree' # one of {'chain', 'tree'} +""" +Encoding for n-ary nodes in the ctree. +""" + FIXED_FOLD_FILE = None # FIXED_FOLD_FILE = 'folds-TRAINING.json' """ From a094aff82077e8e3955df8d29dfc4d9de936f235 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 14 Sep 2016 18:23:00 +0200 Subject: [PATCH 13/74] WIP cleaner evaluation: toggle for nary_enc, binarize_ref --- evals/codra.py | 16 ++++++++++------ evals/ours.py | 35 +++++++++++++++++++++-------------- evals/showdown.py | 42 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 65 insertions(+), 28 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index fb36048..c1ed324 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -41,8 +41,10 @@ def eval_codra_output(codra_out_dir, edus_file, + nary_enc, nuc_strategy, rank_strategy, prioritize_same_unit=True, + binarize_ref=False, detailed=False): """Load and evaluate the .dis files output by CODRA. 
@@ -77,12 +79,15 @@ def eval_codra_output(codra_out_dir, edus_file, # transform into binary tree with coarse-grained labels coarse_rtree_true = REL_CONV(rtree_true) - bin_rtree_true = _binarize(coarse_rtree_true) - ctree_true[doc_name] = bin_rtree_true + if binarize_ref: + bin_rtree_true = _binarize(coarse_rtree_true) + ct_true = bin_rtree_true + else: + ct_true = coarse_rtree_true + ctree_true[doc_name] = ct_true # transform into dependency tree via SimpleRSTTree - bin_srtree_true = SimpleRSTTree.from_rst_tree(coarse_rtree_true) - dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) dtree_true[doc_name] = dt_true # WIP 2016-06-29 para_idx @@ -131,8 +136,7 @@ def eval_codra_output(codra_out_dir, edus_file, # dependency tree # conversion via SimpleRSTTree to RstDepTree - bin_srtree_pred = SimpleRSTTree.from_rst_tree(coarse_rtree_pred) - dt_pred = RstDepTree.from_simple_rst_tree(bin_srtree_pred) + dt_pred = RstDepTree.from_rst_tree(coarse_rtree_pred, nary_enc='chain') dtree_pred[doc_name] = dt_pred # compare pred and true diff --git a/evals/ours.py b/evals/ours.py index 156c76a..90df9e6 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -15,6 +15,7 @@ from educe.rst_dt.corpus import (Reader as RstReader, RstRelationConverter as RstRelationConverter) from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, + deptree_to_rst_tree, DummyNuclearityClassifier, InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree, RstDtException @@ -79,8 +80,11 @@ def load_attelo_output_file(output_file): def load_deptrees_from_attelo_output(output_file, edus_file, + nary_enc, nuc_strategy, rank_strategy, prioritize_same_unit=True, + order='weak', + binarize_ref=False, detailed=False, skpd_docs=None): """Load an RstDepTree from the output of attelo. 
@@ -115,12 +119,15 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # transform into binary tree with coarse-grained labels coarse_rtree_true = REL_CONV(rtree_true) - bin_rtree_true = _binarize(coarse_rtree_true) - ctree_true[doc_name] = bin_rtree_true + if binarize_ref: + bin_rtree_true = _binarize(coarse_rtree_true) + ct_true = bin_rtree_true + else: + ct_true = coarse_rtree_true + ctree_true[doc_name] = ct_true - # transform into dependency tree via SimpleRSTTree - bin_srtree_true = SimpleRSTTree.from_rst_tree(coarse_rtree_true) - dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) + # transform into dependency tree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) dtree_true[doc_name] = dt_true # 2016-06-28 retrieve paragraph idx of each EDU @@ -206,7 +213,8 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # ranking classifier rank_classifier = InsideOutAttachmentRanker( strategy=rank_strategy, - prioritize_same_unit=prioritize_same_unit) + prioritize_same_unit=prioritize_same_unit, + order=order) rank_classifier.fit(X_train, y_rank_train) # rebuild RstDepTrees @@ -245,14 +253,13 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # create pred ctree try: - bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) - if False: # EXPERIMENTAL - # currently False to run on output that already has - # labels embedding nuclearity - bin_srtree_pred = SimpleRSTTree.incorporate_nuclearity_into_label( - bin_srtree_pred) - bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) - ctree_pred[doc_name] = bin_rtree_pred + if False: + rtree_pred = deptree_to_rst_tree(dt_pred) + ctree_pred[doc_name] = rtree_pred + else: # legacy: via SimpleRSTTree, forces binarization + bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) + bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) + ctree_pred[doc_name] = bin_rtree_pred except RstDtException as rst_e: print(rst_e) skipped_docs.add(doc_name) diff --git a/evals/showdown.py b/evals/showdown.py index 14c5a2f..677c0bf 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -46,6 +46,14 @@ 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') +# 2016-09-14 "tree" transform, predicted syntax +EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/latest', # lbl + 'scratch-current/combined', + 'output.maxent-iheads-global-AD.L-jnt_su-eisner') +# end 2016-09-14 + EISNER_OUT_SYN_PRED_SU = os.path.join( '/home/mmorey/melodi', 'irit-rst-dt/TMP/latest', # lbl @@ -58,7 +66,7 @@ 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') -CODRA_OUT_DIR = '/home/mmorey/melodi/joty/Doc-level' +CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' @@ -70,28 +78,46 @@ print('CODRA (Joty)') eval_codra_output(CODRA_OUT_DIR, EDUS_FILE, + 'chain', nuc_strategy="unamb_else_most_frequent", - rank_strategy='closest-intra-rl-inter-rl', + rank_strategy='sdist-edist-rl', prioritize_same_unit=True, - detailed=True) + binarize_ref=False, + detailed=False) print('=======================') -print('Eisner, predicted syntax') +print('[chain] Eisner, predicted syntax') load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED, EDUS_FILE, + 'chain', nuc_strategy="unamb_else_most_frequent", # nuc_strategy="most_frequent_by_rel", - rank_strategy='closest-intra-rl-inter-rl', + rank_strategy='sdist-edist-rl', prioritize_same_unit=True, - detailed=True) + order='weak', + binarize_ref=False, + detailed=False) +print('======================') + 
+print('[tree] Eisner, predicted syntax + same-unit') +load_deptrees_from_attelo_output(EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + 'tree', + nuc_strategy="unamb_else_most_frequent", + # nuc_strategy="most_frequent_by_rel", + rank_strategy='sdist-edist-rl', + prioritize_same_unit=True, + order='weak', + binarize_ref=False, + detailed=False) print('======================') print('Eisner, predicted syntax + same-unit') load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + 'chain', nuc_strategy="unamb_else_most_frequent", # nuc_strategy="most_frequent_by_rel", - rank_strategy='closest-intra-rl-inter-rl', + rank_strategy='sdist-edist-rl', prioritize_same_unit=True, - detailed=True) + detailed=False) print('======================') print('Eisner, gold syntax') From d38d5df647759953eaf5b424f545dc04313b5b73 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 15 Sep 2016 16:09:48 +0200 Subject: [PATCH 14/74] WIP cleaner eval showdown --- evals/codra.py | 93 ++++---------------------- evals/ours.py | 97 ++++++--------------------- evals/showdown.py | 164 ++++++++++++++++++++++++++++++---------------- 3 files changed, 142 insertions(+), 212 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index c1ed324..7bc5275 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -6,17 +6,11 @@ from collections import defaultdict import itertools -import os import numpy as np -from educe.rst_dt.annotation import SimpleRSTTree, _binarize from educe.rst_dt.codra import load_codra_output_files -from educe.rst_dt.corpus import (Reader as RstReader, - RstRelationConverter as RstRelationConverter) -from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, - DummyNuclearityClassifier, - InsideOutAttachmentRanker) +from educe.rst_dt.dep2con import deptree_to_rst_tree from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.document_plus import align_edus_with_paragraphs # @@ -26,38 +20,14 @@ from attelo.metrics.deptree import compute_uas_las -# RST corpus -CORPUS_DIR = os.path.abspath(os.path.join( - os.path.dirname(os.path.realpath(__file__)), - '..', 'corpus', - 'RSTtrees-WSJ-main-1.0/')) -CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') -CD_TEST = os.path.join(CORPUS_DIR, 'TEST') -# relation converter (fine- to coarse-grained labels) -RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', - 'educe', 'rst_dt', - 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree - - -def eval_codra_output(codra_out_dir, edus_file, - nary_enc, - nuc_strategy, rank_strategy, - prioritize_same_unit=True, - binarize_ref=False, +def eval_codra_output(ctree_true, dtree_true, + codra_out_dir, edus_file, + nuc_clf, rnk_clf, detailed=False): """Load and evaluate the .dis files output by CODRA. This currently runs on the document-level files (.doc_dis). 
""" - # load reference trees - dtree_true = dict() # dependency trees - ctree_true = dict() # constituency trees - # FIXME: find ways to read the right (not necessarily TEST) section - # and only the required documents - rst_reader = RstReader(CD_TEST) - rst_corpus = rst_reader.slurp() - # WIP 2016-06-29 sent_idx att_edus = load_edus(edus_file) edu2sent_idx = defaultdict(dict) @@ -74,22 +44,7 @@ def eval_codra_output(codra_out_dir, edus_file, for doc_name, edu2sent in edu2sent_idx.items()} doc_name2edu2para = dict() - for doc_id, rtree_true in sorted(rst_corpus.items()): - doc_name = doc_id.doc - - # transform into binary tree with coarse-grained labels - coarse_rtree_true = REL_CONV(rtree_true) - if binarize_ref: - bin_rtree_true = _binarize(coarse_rtree_true) - ct_true = bin_rtree_true - else: - ct_true = coarse_rtree_true - ctree_true[doc_name] = ct_true - - # transform into dependency tree via SimpleRSTTree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) - dtree_true[doc_name] = dt_true - + for doc_name, rtree_true in sorted(ctree_true.items()): # WIP 2016-06-29 para_idx doc_edus = rtree_true.leaves() doc_txt = doc_edus[0].context._text @@ -115,7 +70,6 @@ def eval_codra_output(codra_out_dir, edus_file, doc_name2edu2para[doc_name] = None # end retrieve paragraph idx - # load predicted trees data_pred = load_codra_output_files(codra_out_dir) # filenames = data_pred['filenames'] @@ -129,13 +83,11 @@ def eval_codra_output(codra_out_dir, edus_file, for doc_name, rst_ctree in itertools.izip(doc_names_pred, rst_ctrees_pred): # constituency tree # replace fine-grained labels with coarse-grained labels - # 2016-06-27 useless, the files we have already contain the coarse - # labels - coarse_rtree_pred = REL_CONV(rst_ctree) + # no need to replace labels: the files we have already contain + # the coarse labels + coarse_rtree_pred = rst_ctree ctree_pred[doc_name] = coarse_rtree_pred - - # dependency tree - # conversion via SimpleRSTTree to RstDepTree + # convert to weakly-ordered dependency tree dt_pred = RstDepTree.from_rst_tree(coarse_rtree_pred, nary_enc='chain') dtree_pred[doc_name] = dt_pred @@ -167,42 +119,25 @@ def eval_codra_output(codra_out_dir, edus_file, print(parseval_detailed_report(ctree_true, ctree_pred, metric_type='S+R')) - if False: + if True: # WIP 2016-06-29 use our deterministic classifiers for nuc and rank # => estimate degradation on Joty's output => hint at ours - # FIXME declare, fit and predict upstream on the training corpus... 
- # but currently fit is a no-op for both so this horror is in fact safe - X_train = [] - y_nuc_train = [] - y_rank_train = [] - for doc_name, dt in sorted(dtree_true.items()): - X_train.append(dt) - y_nuc_train.append(dt.nucs) - y_rank_train.append(dt.ranks) # nuclearity - nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy) - nuc_classifier.fit(X_train, y_nuc_train) - # ranking classifier - rank_classifier = InsideOutAttachmentRanker( - strategy=rank_strategy, - prioritize_same_unit=prioritize_same_unit) - rank_classifier.fit(X_train, y_rank_train) # rebuild ctrees ctree_pred2 = dict() for doc_name, dt_pred in sorted(dtree_pred.items()): # set nuclearity - dt_pred.nucs = nuc_classifier.predict([dt_pred])[0] + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] # set ranking, needs sent_idx (WIP on para_idx) edu2sent = doc_name2edu2sent[doc_name] dt_pred.sent_idx = edu2sent # 2016-06-28 same for edu2para edu2para = doc_name2edu2para[doc_name] dt_pred.para_idx = edu2para - dt_pred.ranks = rank_classifier.predict([dt_pred])[0] + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] # end NEW - bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) - bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) - ctree_pred2[doc_name] = bin_rtree_pred + rtree_pred = deptree_to_rst_tree(dt_pred) + ctree_pred2[doc_name] = rtree_pred # skipped_docs = set() ctree_pred2 = [ct for doc_name, ct in sorted(ctree_pred2.items()) diff --git a/evals/ours.py b/evals/ours.py index 90df9e6..1633edf 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -5,19 +5,13 @@ from __future__ import print_function from collections import defaultdict -import os import numpy as np from educe.annotation import Span as EduceSpan -from educe.rst_dt.annotation import (EDU as EduceEDU, - SimpleRSTTree, _binarize) -from educe.rst_dt.corpus import (Reader as RstReader, - RstRelationConverter as RstRelationConverter) +from educe.rst_dt.annotation import (EDU as EduceEDU, SimpleRSTTree) from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, - deptree_to_rst_tree, - DummyNuclearityClassifier, - InsideOutAttachmentRanker) + deptree_to_rst_tree) from educe.rst_dt.deptree import RstDepTree, RstDtException from educe.rst_dt.document_plus import align_edus_with_paragraphs # @@ -28,20 +22,6 @@ from attelo.table import UNRELATED # for load_attelo_output_file -# RST corpus -CORPUS_DIR = os.path.abspath(os.path.join( - os.path.dirname(os.path.realpath(__file__)), - '..', 'corpus', - 'RSTtrees-WSJ-main-1.0/')) -CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') -CD_TEST = os.path.join(CORPUS_DIR, 'TEST') -# relation converter (fine- to coarse-grained labels) -RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', - 'educe', 'rst_dt', - 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree - - # move to attelo.datasets.attelo_out_format def load_attelo_output_file(output_file): """Load edges from an attelo output file. @@ -79,24 +59,25 @@ def load_attelo_output_file(output_file): return edges_pred -def load_deptrees_from_attelo_output(output_file, edus_file, - nary_enc, - nuc_strategy, rank_strategy, - prioritize_same_unit=True, - order='weak', - binarize_ref=False, +def load_deptrees_from_attelo_output(ctree_true, dtree_true, + output_file, edus_file, + nuc_clf, rnk_clf, detailed=False, skpd_docs=None): """Load an RstDepTree from the output of attelo. Parameters ---------- + ctree_true: dict(str, RSTTree) + Ground truth RST ctree. + dtree_true: dict(str, RstDepTree) + Ground truth RST (ordered) dtree. 
output_file: string Path to the file that contains attelo's output - nuc_strategy: string - Strategy to predict nuclearity - rank_strategy: string - Strategy to predict attachment ranking + nuc_clf: NuclearityClassifier + Classifier to predict nuclearity + rnk_clf: RankClassifier + Classifier to predict attachment ranking skpd_docs: set(string) Names of documents that should be skipped to compute scores @@ -108,28 +89,7 @@ def load_deptrees_from_attelo_output(output_file, edus_file, doc_name2edu2para = dict() # load reference trees - dtree_true = dict() # dependency trees - ctree_true = dict() # constituency trees - # FIXME: find ways to read the right (not necessarily TEST) section - # and only the required documents - rst_reader = RstReader(CD_TEST) - rst_corpus = rst_reader.slurp() - for doc_id, rtree_true in sorted(rst_corpus.items()): - doc_name = doc_id.doc - - # transform into binary tree with coarse-grained labels - coarse_rtree_true = REL_CONV(rtree_true) - if binarize_ref: - bin_rtree_true = _binarize(coarse_rtree_true) - ct_true = bin_rtree_true - else: - ct_true = coarse_rtree_true - ctree_true[doc_name] = ct_true - - # transform into dependency tree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) - dtree_true[doc_name] = dt_true - + for doc_name, rtree_true in sorted(ctree_true.items()): # 2016-06-28 retrieve paragraph idx of each EDU # FIXME refactor to get in a better way, in a better place # currently, we take EDUs from the RSTTree and paragraphs from @@ -198,25 +158,6 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # re-build predicted trees from predicted edges and educe EDUs skipped_docs = set() # docs skipped because non-projective structures - # classifiers for nuclearity and ranking - # FIXME declare, fit and predict upstream... 
- X_train = [] - y_nuc_train = [] - y_rank_train = [] - for doc_name, dt in sorted(dtree_true.items()): - X_train.append(dt) - y_nuc_train.append(dt.nucs) - y_rank_train.append(dt.ranks) - # nuclearity - nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy) - nuc_classifier.fit(X_train, y_nuc_train) - # ranking classifier - rank_classifier = InsideOutAttachmentRanker( - strategy=rank_strategy, - prioritize_same_unit=prioritize_same_unit, - order=order) - rank_classifier.fit(X_train, y_rank_train) - # rebuild RstDepTrees for doc_name, es_pred in sorted(edges_pred.items()): # get educe EDUs @@ -233,7 +174,7 @@ def load_deptrees_from_attelo_output(output_file, edus_file, dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) # NEW add nuclearity: heuristic baseline if True: - dt_pred.nucs = nuc_classifier.predict([dt_pred])[0] + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] else: # EXPERIMENTAL use gold nuclearity dt_pred.nucs = dtree_true[doc_name].nucs # NEW add rank: some strategies require a mapping from EDU to sentence @@ -247,13 +188,14 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # end EXPERIMENTAL if False: # DEBUG print(doc_name) - dt_pred.ranks = rank_classifier.predict([dt_pred])[0] + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] # end NEW dtree_pred[doc_name] = dt_pred # create pred ctree try: - if False: + if True: # NEW 2016-09-14 + # direct conversion from ordered dtree to ctree rtree_pred = deptree_to_rst_tree(dt_pred) ctree_pred[doc_name] = rtree_pred else: # legacy: via SimpleRSTTree, forces binarization @@ -303,5 +245,6 @@ def load_deptrees_from_attelo_output(output_file, edus_file, if detailed: print(parseval_detailed_report(ctree_true, ctree_pred, metric_type='S+R')) - + # DEBUG + # end DEBUG return skipped_docs diff --git a/evals/showdown.py b/evals/showdown.py index 677c0bf..8f0ccea 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -7,11 +7,12 @@ import os -# from educe.rst_dt.annotation import RSTTree, SimpleRSTTree, _binarize -from educe.rst_dt.corpus import RstRelationConverter # , Reader as RstReader - -# from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree) -# from educe.rst_dt.deptree import (RstDepTree, RstDtException) +from educe.rst_dt.annotation import _binarize +from educe.rst_dt.corpus import (RstRelationConverter, + Reader as RstReader) +from educe.rst_dt.dep2con import (DummyNuclearityClassifier, + InsideOutAttachmentRanker) +from educe.rst_dt.deptree import RstDepTree # # from attelo.metrics.constituency import (LBL_FNS, parseval_detailed_report, # parseval_report) @@ -54,6 +55,7 @@ 'output.maxent-iheads-global-AD.L-jnt_su-eisner') # end 2016-09-14 + EISNER_OUT_SYN_PRED_SU = os.path.join( '/home/mmorey/melodi', 'irit-rst-dt/TMP/latest', # lbl @@ -69,67 +71,117 @@ CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' +# hyperparams +NUC_STRATEGY = 'unamb_else_most_frequent' +RNK_STRATEGY = 'sdist-edist-rl' +RNK_PRIORITY_SU = True +RNK_ORDER = 'weak' + # FIXME: -# * [ ] load gold trees here once and for all, pass them to each evaluation # * [ ] create summary table with one system per row, one metric per column, # keep only the f-score (because for binary trees with manual segmentation # precision = recall = f-score). 
-print('CODRA (Joty)') -eval_codra_output(CODRA_OUT_DIR, EDUS_FILE, - 'chain', - nuc_strategy="unamb_else_most_frequent", - rank_strategy='sdist-edist-rl', - prioritize_same_unit=True, - binarize_ref=False, - detailed=False) -print('=======================') - -print('[chain] Eisner, predicted syntax') -load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED, EDUS_FILE, - 'chain', - nuc_strategy="unamb_else_most_frequent", - # nuc_strategy="most_frequent_by_rel", - rank_strategy='sdist-edist-rl', - prioritize_same_unit=True, - order='weak', - binarize_ref=False, - detailed=False) -print('======================') - -print('[tree] Eisner, predicted syntax + same-unit') -load_deptrees_from_attelo_output(EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, - 'tree', - nuc_strategy="unamb_else_most_frequent", - # nuc_strategy="most_frequent_by_rel", - rank_strategy='sdist-edist-rl', - prioritize_same_unit=True, - order='weak', - binarize_ref=False, - detailed=False) -print('======================') +# 1. load train section of the RST corpus, fit (currently dummy) classifiers +# for nuclearity and rank +reader_train = RstReader(CD_TRAIN) +corpus_train = reader_train.slurp() +# gold RST trees +ctree_true = dict() # ctrees +ctree_bin_true = dict() # ctrees, binarized +dtree_true = dict() # dtrees from the original ctrees ('tree' transform) +dtree_bin_true = dict() # dtrees from the binarized ctrees ('chain' transform) +for doc_id, ct_true in sorted(corpus_train.items()): + doc_name = doc_id.doc + # flavours of ctree + ct_true = REL_CONV(ct_true) # map fine to coarse relations + ctree_true[doc_name] = ct_true + ct_bin_true = _binarize(ct_true) + ctree_bin_true[doc_name] = ct_bin_true + # flavours of dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc='tree') + dt_bin_true = RstDepTree.from_rst_tree(ct_true, nary_enc='chain') + # alt: + # dt_bin_true = RstDepTree.from_rst_tree(ct_bin_true, nary_enc='chain') + dtree_true[doc_name] = dt_true + dtree_bin_true[doc_name] = dt_bin_true +# fit classifiers for nuclearity and rank (DIRTY) +# NB: both are (dummily) fit on weakly ordered dtrees +X_train = [] +y_nuc_train = [] +y_rnk_train = [] +for doc_name, dt in sorted(dtree_true.items()): + X_train.append(dt) + y_nuc_train.append(dt.nucs) + y_rnk_train.append(dt.ranks) +# nuclearity clf +nuc_clf = DummyNuclearityClassifier(strategy=NUC_STRATEGY) +nuc_clf.fit(X_train, y_nuc_train) +# rank clf +rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, + prioritize_same_unit=RNK_PRIORITY_SU, + order=RNK_ORDER) +rnk_clf.fit(X_train, y_rnk_train) + +# load test section of the RST corpus +reader_test = RstReader(CD_TEST) +corpus_test = reader_test.slurp() +# gold RST trees +ctree_true = dict() # ctrees +ctree_bin_true = dict() # ctrees, binarized +dtree_true = dict() # dtrees from the original ctrees ('tree' transform) +dtree_bin_true = dict() # dtrees from the binarized ctrees ('chain' transform) +for doc_id, ct_true in sorted(corpus_test.items()): + doc_name = doc_id.doc + # flavours of ctree + ct_true = REL_CONV(ct_true) # map fine to coarse relations + ctree_true[doc_name] = ct_true + ct_bin_true = _binarize(ct_true) + ctree_bin_true[doc_name] = ct_bin_true + # flavours of dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc='tree') + dt_bin_true = RstDepTree.from_rst_tree(ct_true, nary_enc='chain') + # alt: + # dt_bin_true = RstDepTree.from_rst_tree(ct_bin_true, nary_enc='chain') + dtree_true[doc_name] = dt_true + dtree_bin_true[doc_name] = dt_bin_true + + +if True: + print('CODRA (Joty)') + 
eval_codra_output(ctree_true, dtree_true, + CODRA_OUT_DIR, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('=======================') + +if True: + print('[chain] Eisner, predicted syntax') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + +if True: + print('[tree] Eisner, predicted syntax + same-unit') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') print('Eisner, predicted syntax + same-unit') -load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - 'chain', - nuc_strategy="unamb_else_most_frequent", - # nuc_strategy="most_frequent_by_rel", - rank_strategy='sdist-edist-rl', - prioritize_same_unit=True, +load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, detailed=False) print('======================') print('Eisner, gold syntax') -load_deptrees_from_attelo_output(EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_strategy="unamb_else_most_frequent", - # nuc_strategy="most_frequent_by_rel", - rank_strategy='closest-intra-rl-inter-rl', - prioritize_same_unit=True) +load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_GOLD, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) print('======================') - - -# TODO use nuclearity classifier -# starting with baseline: DummyNuclearityClassifier, that assigns to each -# EDU the most frequent nuclearity of its (incoming) relation in the -# training corpus, i.e. 'S' for 'NS', 'N' for 'NN' From 290fe5c54f87a0d58e63f3a39da4c71585dbc6ad Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 15 Sep 2016 16:26:55 +0200 Subject: [PATCH 15/74] FIX pass rel_conv and nary_enc to codra eval --- evals/codra.py | 15 +++++++++------ evals/showdown.py | 4 +++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index 7bc5275..d83318e 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -22,7 +22,9 @@ def eval_codra_output(ctree_true, dtree_true, codra_out_dir, edus_file, - nuc_clf, rnk_clf, + rel_conv=None, + nary_enc='chain', + nuc_clf=None, rnk_clf=None, detailed=False): """Load and evaluate the .dis files output by CODRA. 
@@ -80,15 +82,16 @@ def eval_codra_output(ctree_true, dtree_true, dtree_pred = dict() # dependency trees ctree_pred = dict() # constituency trees - for doc_name, rst_ctree in itertools.izip(doc_names_pred, rst_ctrees_pred): + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): # constituency tree # replace fine-grained labels with coarse-grained labels # no need to replace labels: the files we have already contain # the coarse labels - coarse_rtree_pred = rst_ctree - ctree_pred[doc_name] = coarse_rtree_pred + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred # convert to weakly-ordered dependency tree - dt_pred = RstDepTree.from_rst_tree(coarse_rtree_pred, nary_enc='chain') + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc='chain') dtree_pred[doc_name] = dt_pred # compare pred and true @@ -119,7 +122,7 @@ def eval_codra_output(ctree_true, dtree_true, print(parseval_detailed_report(ctree_true, ctree_pred, metric_type='S+R')) - if True: + if nuc_clf is not None and rnk_clf is not None: # WIP 2016-06-29 use our deterministic classifiers for nuc and rank # => estimate degradation on Joty's output => hint at ours # nuclearity diff --git a/evals/showdown.py b/evals/showdown.py index 8f0ccea..5cfa81a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -152,7 +152,9 @@ print('CODRA (Joty)') eval_codra_output(ctree_true, dtree_true, CODRA_OUT_DIR, EDUS_FILE, - nuc_clf, rnk_clf, + rel_conv=REL_CONV, + nary_enc='chain', + nuc_clf=nuc_clf, rnk_clf=rnk_clf, detailed=False) print('=======================') From 2fcdfc3041349609dd7a724c51a5a3de52d4ab18 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 16 Sep 2016 11:57:53 +0200 Subject: [PATCH 16/74] WIP more compact display in showdown --- evals/codra.py | 14 +-- evals/showdown.py | 312 ++++++++++++++++++++++++++++++---------------- 2 files changed, 206 insertions(+), 120 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index d83318e..21f1faa 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -94,19 +94,7 @@ def eval_codra_output(ctree_true, dtree_true, dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc='chain') dtree_pred[doc_name] = dt_pred - # compare pred and true - common_doc_names = set(dtree_true.keys()) & set(dtree_pred.keys()) - - # dep scores - dtree_true_list = [dt for doc_name, dt in sorted(dtree_true.items()) - if doc_name in common_doc_names] - dtree_pred_list = [dt for doc_name, dt in sorted(dtree_pred.items()) - if doc_name in common_doc_names] - - score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, - dtree_pred_list) - print('UAS / LAS / LS : {:.4f} / {:.4f} / {:.4f}'.format( - score_uas, score_las, score_ls)) + return ctree_pred, dtree_pred skipped_docs = set() # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where diff --git a/evals/showdown.py b/evals/showdown.py index 5cfa81a..10ed7ff 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -5,6 +5,7 @@ from __future__ import print_function +import argparse import os from educe.rst_dt.annotation import _binarize @@ -14,8 +15,10 @@ InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree # -# from attelo.metrics.constituency import (LBL_FNS, parseval_detailed_report, -# parseval_report) +from attelo.metrics.constituency import (parseval_detailed_report, + parseval_report) +from attelo.metrics.deptree import compute_uas_las + # local to this package from evals.codra import eval_codra_output from evals.ours import load_deptrees_from_attelo_output @@ -70,7 
+73,8 @@ CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' - +# level of detail for parseval +DETAILED = False # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' RNK_STRATEGY = 'sdist-edist-rl' @@ -78,112 +82,206 @@ RNK_ORDER = 'weak' +def setup_dtree_postprocessor(nary_enc): + """Setup the nuclearity and rank classifiers to flesh out dtrees.""" + # load train section of the RST corpus, fit (currently dummy) classifiers + # for nuclearity and rank + reader_train = RstReader(CD_TRAIN) + corpus_train = reader_train.slurp() + # gold RST trees + ctree_true = dict() # ctrees + dtree_true = dict() # dtrees from the original ctrees ('tree' transform) + + for doc_id, ct_true in sorted(corpus_train.items()): + doc_name = doc_id.doc + # flavours of ctree + ct_true = REL_CONV(ct_true) # map fine to coarse relations + ctree_true[doc_name] = ct_true + # flavours of dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) + dtree_true[doc_name] = dt_true + # fit classifiers for nuclearity and rank (DIRTY) + # NB: both are (dummily) fit on weakly ordered dtrees + X_train = [] + y_nuc_train = [] + y_rnk_train = [] + for doc_name, dt in sorted(dtree_true.items()): + X_train.append(dt) + y_nuc_train.append(dt.nucs) + y_rnk_train.append(dt.ranks) + # nuclearity clf + nuc_clf = DummyNuclearityClassifier(strategy=NUC_STRATEGY) + nuc_clf.fit(X_train, y_nuc_train) + # rank clf + rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, + prioritize_same_unit=RNK_PRIORITY_SU, + order=RNK_ORDER) + rnk_clf.fit(X_train, y_rnk_train) + return nuc_clf, rnk_clf + + # FIXME: # * [ ] create summary table with one system per row, one metric per column, # keep only the f-score (because for binary trees with manual segmentation # precision = recall = f-score). +def main(): + """Run the eval""" + parser = argparse.ArgumentParser( + description="Evaluate parsers' output against a given reference") + # predictions + parser.add_argument('authors_pred', nargs='+', + choices=['gold', 'silver', + 'joty', 'feng', 'ji', + 'ours_chain', 'ours_tree'], + help="Author(s) of the predictions") + parser.add_argument('--nary_enc_pred', default='tree', + choices=['tree', 'chain'], + help="Encoding of n-ary nodes for the predictions") + # reference + parser.add_argument('--author_true', default='gold', + choices=['gold', 'silver', + 'joty', 'feng', 'ji', + 'ours_chain', 'ours_tree'], + help="Author of the reference") + # * dtree eval + parser.add_argument('--nary_enc_true', default='tree', + choices=['tree', 'chain'], + help="Encoding of n-ary nodes for the reference") + # * ctree eval + parser.add_argument('--binarize_true', action='store_true', + help="Binarize the reference ctree for the eval") + + # + args = parser.parse_args() + author_true = args.author_true + nary_enc_true = args.nary_enc_true + authors_pred = args.authors_pred + nary_enc_pred = args.nary_enc_pred + binarize_true = args.binarize_true + + # 0. 
setup the postprocessors to flesh out unordered dtrees into ordered + # ones with nuclearity + nuc_clf, rnk_clf = setup_dtree_postprocessor(nary_enc_pred) + + # the eval compares parses for the test section of the RST corpus + reader_test = RstReader(CD_TEST) + corpus_test = reader_test.slurp() + + # reference + # current assumption: author_true is 'gold' + if author_true != 'gold': + raise NotImplementedError('Not yet') + + ctree_true = dict() # ctrees + dtree_true = dict() # dtrees from the original ctrees ('tree' transform) + for doc_id, ct_true in sorted(corpus_test.items()): + doc_name = doc_id.doc + # original reference ctree, with coarse labels + ct_true = REL_CONV(ct_true) # map fine to coarse relations + # corresponding dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) + dtree_true[doc_name] = dt_true + # binarize ctree if necessary + if binarize_true: + ct_true = _binarize(ct_true) + ctree_true[doc_name] = ct_true + + # predictions: [(parser_name, ([doc_names], [ct_pred], [dt_pred]))] + predictions = [] + if 'joty' in authors_pred: + # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees + predictions.append( + ('joty', eval_codra_output(ctree_true, dtree_true, + CODRA_OUT_DIR, EDUS_FILE, + rel_conv=REL_CONV, + nary_enc='chain', + nuc_clf=nuc_clf, rnk_clf=rnk_clf, + detailed=False)) + ) + + if 'ours_chain' in authors_pred: + print('[chain] Eisner, predicted syntax') + # attelo out: unordered dtree ; we pass a nuclearity and rank classifiers + # to get an ordered dtree ; + # need to map to ctree + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + if 'ours_tree' in authors_pred: + print('[tree] Eisner, predicted syntax + same-unit') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + if False: # FIXME repair (or forget) these + print('Eisner, predicted syntax + same-unit') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + print('Eisner, gold syntax') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_GOLD, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + # dependency eval + + # report + # * table format + digits = 4 + parser_names = ['joty'] + width = max(len(x) for x in parser_names) + + headers = ["UAS", "LAS", "LS"] + fmt = '%% %ds' % width # first col: parser name + fmt += ' ' + fmt += ' '.join(['% 9s' for _ in headers]) + fmt += '\n' + + headers = [""] + headers + report = fmt % tuple(headers) + report += '\n' + # end table format and header line + + # * table content + for parser_name, (ctree_pred, dtree_pred) in predictions: + doc_names = sorted(dtree_true.keys()) + dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, + dtree_pred_list) + # append to report + values = [parser_name] + for v in (score_uas, score_las, score_ls): + values += ["{0:0.{1}f}".format(v, digits)] + report += fmt % tuple(values) + # end table content + print(report) + # end report + + # constituency eval + for parser_name, (ctree_pred, dtree_pred) in predictions: + doc_names = 
sorted(ctree_true.keys()) + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + # FIXME + # compute and print PARSEVAL scores + print(parseval_report(ctree_true_list, ctree_pred_list, digits=4)) + # detailed report on S+N+R + if DETAILED: + print(parseval_detailed_report(ctree_true_list, ctree_pred_list, + metric_type='S+R')) + # end FIXME + -# 1. load train section of the RST corpus, fit (currently dummy) classifiers -# for nuclearity and rank -reader_train = RstReader(CD_TRAIN) -corpus_train = reader_train.slurp() -# gold RST trees -ctree_true = dict() # ctrees -ctree_bin_true = dict() # ctrees, binarized -dtree_true = dict() # dtrees from the original ctrees ('tree' transform) -dtree_bin_true = dict() # dtrees from the binarized ctrees ('chain' transform) -for doc_id, ct_true in sorted(corpus_train.items()): - doc_name = doc_id.doc - # flavours of ctree - ct_true = REL_CONV(ct_true) # map fine to coarse relations - ctree_true[doc_name] = ct_true - ct_bin_true = _binarize(ct_true) - ctree_bin_true[doc_name] = ct_bin_true - # flavours of dtree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc='tree') - dt_bin_true = RstDepTree.from_rst_tree(ct_true, nary_enc='chain') - # alt: - # dt_bin_true = RstDepTree.from_rst_tree(ct_bin_true, nary_enc='chain') - dtree_true[doc_name] = dt_true - dtree_bin_true[doc_name] = dt_bin_true -# fit classifiers for nuclearity and rank (DIRTY) -# NB: both are (dummily) fit on weakly ordered dtrees -X_train = [] -y_nuc_train = [] -y_rnk_train = [] -for doc_name, dt in sorted(dtree_true.items()): - X_train.append(dt) - y_nuc_train.append(dt.nucs) - y_rnk_train.append(dt.ranks) -# nuclearity clf -nuc_clf = DummyNuclearityClassifier(strategy=NUC_STRATEGY) -nuc_clf.fit(X_train, y_nuc_train) -# rank clf -rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, - prioritize_same_unit=RNK_PRIORITY_SU, - order=RNK_ORDER) -rnk_clf.fit(X_train, y_rnk_train) - -# load test section of the RST corpus -reader_test = RstReader(CD_TEST) -corpus_test = reader_test.slurp() -# gold RST trees -ctree_true = dict() # ctrees -ctree_bin_true = dict() # ctrees, binarized -dtree_true = dict() # dtrees from the original ctrees ('tree' transform) -dtree_bin_true = dict() # dtrees from the binarized ctrees ('chain' transform) -for doc_id, ct_true in sorted(corpus_test.items()): - doc_name = doc_id.doc - # flavours of ctree - ct_true = REL_CONV(ct_true) # map fine to coarse relations - ctree_true[doc_name] = ct_true - ct_bin_true = _binarize(ct_true) - ctree_bin_true[doc_name] = ct_bin_true - # flavours of dtree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc='tree') - dt_bin_true = RstDepTree.from_rst_tree(ct_true, nary_enc='chain') - # alt: - # dt_bin_true = RstDepTree.from_rst_tree(ct_bin_true, nary_enc='chain') - dtree_true[doc_name] = dt_true - dtree_bin_true[doc_name] = dt_bin_true - - -if True: - print('CODRA (Joty)') - eval_codra_output(ctree_true, dtree_true, - CODRA_OUT_DIR, EDUS_FILE, - rel_conv=REL_CONV, - nary_enc='chain', - nuc_clf=nuc_clf, rnk_clf=rnk_clf, - detailed=False) - print('=======================') - -if True: - print('[chain] Eisner, predicted syntax') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') - -if True: - print('[tree] Eisner, predicted syntax + same-unit') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_TREE_SYN_PRED_SU, 
EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') - -print('Eisner, predicted syntax + same-unit') -load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) -print('======================') - -print('Eisner, gold syntax') -load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) -print('======================') +if __name__ == '__main__': + main() From b2246b84619b619ec393adc54819e7a266ab399f Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 16 Sep 2016 18:17:00 +0200 Subject: [PATCH 17/74] WIP concise showdown: joty, ours --- evals/codra.py | 197 +++++++++++++++++++++++----------------------- evals/ours.py | 187 ++++++++++++++++--------------------------- evals/showdown.py | 72 +++++++++-------- 3 files changed, 204 insertions(+), 252 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index 21f1faa..f3b894e 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -20,120 +20,117 @@ from attelo.metrics.deptree import compute_uas_las -def eval_codra_output(ctree_true, dtree_true, - codra_out_dir, edus_file, - rel_conv=None, - nary_enc='chain', - nuc_clf=None, rnk_clf=None, - detailed=False): - """Load and evaluate the .dis files output by CODRA. +def load_codra_ctrees(codra_out_dir, rel_conv): + """Load the ctrees output by CODRA as .dis files. This currently runs on the document-level files (.doc_dis). - """ - # WIP 2016-06-29 sent_idx - att_edus = load_edus(edus_file) - edu2sent_idx = defaultdict(dict) - for att_edu in att_edus: - doc_name = att_edu.grouping - edu_num = int(att_edu.id.rsplit('_', 1)[1]) - sent_idx = int(att_edu.subgrouping.split('_sent')[1]) - edu2sent_idx[doc_name][edu_num] = sent_idx - # sort EDUs by num - # rebuild educe-style edu2sent ; prepend 0 for the fake root - doc_name2edu2sent = {doc_name: ([0] - + [s_idx for e_num, s_idx - in sorted(edu2sent.items())]) - for doc_name, edu2sent in edu2sent_idx.items()} - doc_name2edu2para = dict() - - for doc_name, rtree_true in sorted(ctree_true.items()): - # WIP 2016-06-29 para_idx - doc_edus = rtree_true.leaves() - doc_txt = doc_edus[0].context._text - # retrieve paragraph idx - doc_paras = doc_edus[0].context.paragraphs - if doc_paras is not None: - edu2para = align_edus_with_paragraphs( - doc_edus, doc_paras, doc_txt) - # yerk: interpolate values in edu2para where missing - edu2para_fix = [] - for edu_idx in edu2para: - if edu_idx is not None: - edu2para_fix.append(edu_idx) - else: - # interpolation strategy: copy the last regular value - # that has been seen - edu2para_fix.append(edu2para_fix[-1]) - edu2para = edu2para_fix - # end yerk: interpolate - edu2para = [0] + list(np.array(edu2para) + 1) - doc_name2edu2para[doc_name] = edu2para - else: - doc_name2edu2para[doc_name] = None - # end retrieve paragraph idx + Parameters + ---------- + codra_out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. 
+ """ # load predicted trees data_pred = load_codra_output_files(codra_out_dir) # filenames = data_pred['filenames'] doc_names_pred = data_pred['doc_names'] rst_ctrees_pred = data_pred['rst_ctrees'] - # gather predictions - dtree_pred = dict() # dependency trees + # build a dict from doc_name to ctree (RSTTree) ctree_pred = dict() # constituency trees - for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): # constituency tree - # replace fine-grained labels with coarse-grained labels - # no need to replace labels: the files we have already contain - # the coarse labels + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not if rel_conv is not None: ct_pred = rel_conv(ct_pred) ctree_pred[doc_name] = ct_pred - # convert to weakly-ordered dependency tree - dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc='chain') + + return ctree_pred + + +def load_codra_dtrees(codra_out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by CODRA. + + Parameters + ---------- + codra_out_dir: str + Path to the base directory containing the output files. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. + """ + # load predicted trees + data_pred = load_codra_output_files(codra_out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to ordered dtree (RstDepTree) + dtree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + # convert to an ordered dependency tree ; + # * 'tree' produces a weakly-ordered dtree strictly equivalent + # to the original ctree, + # * 'chain' produces a strictly-ordered dtree for which strict + # equivalence is not preserved + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) dtree_pred[doc_name] = dt_pred - return ctree_pred, dtree_pred - - skipped_docs = set() - # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where - # needed - ctree_true = [ct for doc_name, ct in sorted(ctree_true.items()) - if doc_name not in skipped_docs] - ctree_pred = [ct for doc_name, ct in sorted(ctree_pred.items()) - if doc_name not in skipped_docs] - # compute and print PARSEVAL scores - print(parseval_report(ctree_true, ctree_pred, digits=4)) - # detailed report on S+N+R - if detailed: - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) - - if nuc_clf is not None and rnk_clf is not None: - # WIP 2016-06-29 use our deterministic classifiers for nuc and rank - # => estimate degradation on Joty's output => hint at ours - # nuclearity - # rebuild ctrees - ctree_pred2 = dict() - for doc_name, dt_pred in sorted(dtree_pred.items()): - # set nuclearity - dt_pred.nucs = nuc_clf.predict([dt_pred])[0] - # set ranking, needs sent_idx (WIP on para_idx) - edu2sent = doc_name2edu2sent[doc_name] - dt_pred.sent_idx = edu2sent - # 2016-06-28 same for edu2para - edu2para = doc_name2edu2para[doc_name] - dt_pred.para_idx = edu2para - dt_pred.ranks = 
rnk_clf.predict([dt_pred])[0] - # end NEW - rtree_pred = deptree_to_rst_tree(dt_pred) - ctree_pred2[doc_name] = rtree_pred - # - skipped_docs = set() - ctree_pred2 = [ct for doc_name, ct in sorted(ctree_pred2.items()) - if doc_name not in skipped_docs] - print(parseval_report(ctree_true, ctree_pred2, digits=4)) - if detailed: - print(parseval_detailed_report(ctree_true, ctree_pred2, - metric_type='S+R')) + return dtree_pred + + +# TODO move this generic util to a more appropriate place. +# This implementation is quite ad-hoc, tailored for RST e.g. to retrieve +# the edu_num, so I would need to generalize this code first. +def get_edu2sent(att_edus): + """Get edu2sent mapping, from a list of attelo EDUs. + + Parameters + ---------- + att_edus: list of attelo EDUs + List of attelo EDUs, as produced by `load_edus`. + + Returns + ------- + doc_name2edu2sent: dict(str, [int]) + For each document, get the sentence index for every EDU. + + Example: + ``` + att_edus = load_edus(edus_file) + doc_name2edu2sent = get_edu2sent(att_edus) + for doc_name, edu2sent in doc_name2edu2sent.items(): + dtree[doc_name].edu2sent = edu2sent + ``` + + """ + edu2sent_idx = defaultdict(dict) + for att_edu in att_edus: + doc_name = att_edu.grouping + edu_num = int(att_edu.id.rsplit('_', 1)[1]) + sent_idx = int(att_edu.subgrouping.split('_sent')[1]) + edu2sent_idx[doc_name][edu_num] = sent_idx + # sort EDUs by num + # rebuild educe-style edu2sent ; prepend 0 for the fake root + doc_name2edu2sent = {doc_name: ([0] + + [s_idx for e_num, s_idx + in sorted(edu2sent.items())]) + for doc_name, edu2sent in edu2sent_idx.items()} + return doc_name2edu2sent diff --git a/evals/ours.py b/evals/ours.py index 1633edf..300b376 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -59,71 +59,22 @@ def load_attelo_output_file(output_file): return edges_pred -def load_deptrees_from_attelo_output(ctree_true, dtree_true, - output_file, edus_file, - nuc_clf, rnk_clf, - detailed=False, - skpd_docs=None): - """Load an RstDepTree from the output of attelo. +def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): + """Load RST dtrees from attelo output files. Parameters ---------- - ctree_true: dict(str, RSTTree) - Ground truth RST ctree. - dtree_true: dict(str, RstDepTree) - Ground truth RST (ordered) dtree. output_file: string Path to the file that contains attelo's output - nuc_clf: NuclearityClassifier - Classifier to predict nuclearity - rnk_clf: RankClassifier - Classifier to predict attachment ranking - skpd_docs: set(string) - Names of documents that should be skipped to compute scores + edus_file: string + Path to the file that describes EDUs. 
Returns ------- - skipped_docs: set(string) - Names of documents that have been skipped to compute scores + TODO """ - doc_name2edu2para = dict() - - # load reference trees - for doc_name, rtree_true in sorted(ctree_true.items()): - # 2016-06-28 retrieve paragraph idx of each EDU - # FIXME refactor to get in a better way, in a better place - # currently, we take EDUs from the RSTTree and paragraphs from - # the RSTContext, so no left padding in either list ; - # the dtree contains the left padding EDU, so we compute the - # edu2paragraph alignment on real units only, shift by one, - # then prepend 0 - doc_edus = rtree_true.leaves() - doc_paras = doc_edus[0].context.paragraphs - doc_txt = doc_edus[0].context._text - if doc_paras is not None: - edu2para = align_edus_with_paragraphs( - doc_edus, doc_paras, doc_txt) - # yerk: interpolate values in edu2para where missing - edu2para_fix = [] - for edu_idx in edu2para: - if edu_idx is not None: - edu2para_fix.append(edu_idx) - else: - # interpolation strategy: copy the last regular value - # that has been seen - edu2para_fix.append(edu2para_fix[-1]) - edu2para = edu2para_fix - # end yerk: interpolate - edu2para = [0] + list(np.array(edu2para) + 1) - doc_name2edu2para[doc_name] = edu2para - else: - doc_name2edu2para[doc_name] = None - # end retrieve paragraph idx - - # USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS - # load predicted trees dtree_pred = dict() # predicted dtrees - ctree_pred = dict() # predicted ctrees + # * setup... # load EDUs as they are known to attelo (sigh) # and predicted edges on these EDUs att_edus = load_edus(edus_file) @@ -155,9 +106,6 @@ def load_deptrees_from_attelo_output(ctree_true, dtree_true, for e in doc_educe_edus]) for doc_name, doc_educe_edus in educe_edus.items()} - # re-build predicted trees from predicted edges and educe EDUs - skipped_docs = set() # docs skipped because non-projective structures - # rebuild RstDepTrees for doc_name, es_pred in sorted(edges_pred.items()): # get educe EDUs @@ -172,36 +120,44 @@ def load_deptrees_from_attelo_output(ctree_true, dtree_true, raise ValueError('Weird root label: {}'.format(lbl)) else: dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) - # NEW add nuclearity: heuristic baseline - if True: - dt_pred.nucs = nuc_clf.predict([dt_pred])[0] - else: # EXPERIMENTAL use gold nuclearity - dt_pred.nucs = dtree_true[doc_name].nucs - # NEW add rank: some strategies require a mapping from EDU to sentence - # EXPERIMENTAL attach array of sentence index for each EDU in tree + # add nuclearity: heuristic baseline + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] + # add rank: heuristic baseline, needs edu2sent edu2sent = doc_name2edu2sent[doc_name] - dt_pred.sent_idx = edu2sent - # 2016-06-28 same for edu2para - edu2para = doc_name2edu2para[doc_name] - dt_pred.para_idx = edu2para - # assert len(edu2sent) == len(edu2para) - # end EXPERIMENTAL - if False: # DEBUG - print(doc_name) + dt_pred.sent_idx = edu2sent # DIRTY dt_pred.ranks = rnk_clf.predict([dt_pred])[0] - # end NEW + # store dtree_pred[doc_name] = dt_pred - # create pred ctree + return dtree_pred + + +def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): + """Load RST ctrees from attelo output files. + + Parameters + ---------- + output_file: string + Path to the file that contains attelo's output + edus_file: string + Path to the file that describes EDUs. 
+ nuc_clf: NuclearityClassifier + Classifier to predict nuclearity + rnk_clf: RankClassifier + Classifier to predict attachment ranking + + Returns + ------- + TODO + """ + # load RST dtrees, with heuristics for nuc and rank + dtree_pred = load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf) + # convert to RST ctrees + ctree_pred = dict() + for doc_name, dt_pred in dtree_pred.items(): try: - if True: # NEW 2016-09-14 - # direct conversion from ordered dtree to ctree - rtree_pred = deptree_to_rst_tree(dt_pred) - ctree_pred[doc_name] = rtree_pred - else: # legacy: via SimpleRSTTree, forces binarization - bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) - bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) - ctree_pred[doc_name] = bin_rtree_pred + rtree_pred = deptree_to_rst_tree(dt_pred) + ctree_pred[doc_name] = rtree_pred except RstDtException as rst_e: print(rst_e) skipped_docs.add(doc_name) @@ -209,42 +165,31 @@ def load_deptrees_from_attelo_output(ctree_true, dtree_true, print('\n'.join('{}: {}'.format(edu.text_span(), edu) for edu in educe_edus[doc_name])) # raise - # end USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS - # compare gold with pred on doc_names - common_doc_names = set(dtree_true.keys()) & set(dtree_pred.keys()) - - # dep scores - dtree_true_list = [dt for doc_name, dt in sorted(dtree_true.items()) - if doc_name in common_doc_names] - dtree_pred_list = [dt for doc_name, dt in sorted(dtree_pred.items()) - if doc_name in common_doc_names] - - score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, - dtree_pred_list) - print('UAS / LAS / LS : {:.4f} / {:.4f} / {:.4f}'.format( - score_uas, score_las, score_ls)) - - # compute and print PARSEVAL scores - if skipped_docs: - print('Skipped {} docs over {}'.format(len(skipped_docs), - len(edges_pred))) - # also skip docs passed as argument - if skpd_docs is not None: - skipped_docs |= skpd_docs - # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where - # needed - ctree_true = [ct for doc_name, ct in sorted(ctree_true.items()) - if doc_name not in skipped_docs] - ctree_pred = [ct for doc_name, ct in sorted(ctree_pred.items()) - if doc_name not in skipped_docs] - - print(parseval_report(ctree_true, ctree_pred, - digits=4)) - # detailed report on S+N+R - if detailed: - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) - # DEBUG - # end DEBUG - return skipped_docs + return ctree_pred + + +def load_deptrees_from_attelo_output(ctree_true, dtree_true, + output_file, edus_file, + nuc_clf, rnk_clf, + detailed=False, + skpd_docs=None): + """Load an RstDepTree from the output of attelo. + + Parameters + ---------- + ctree_true: dict(str, RSTTree) + Ground truth RST ctree. + dtree_true: dict(str, RstDepTree) + Ground truth RST (ordered) dtree. 
+ skpd_docs: set(string) + Names of documents that should be skipped to compute scores + + Returns + ------- + skipped_docs: set(string) + Names of documents that have been skipped to compute scores + """ + # USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS + # load predicted trees + # end USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS diff --git a/evals/showdown.py b/evals/showdown.py index 10ed7ff..f21f294 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -20,8 +20,10 @@ from attelo.metrics.deptree import compute_uas_las # local to this package -from evals.codra import eval_codra_output -from evals.ours import load_deptrees_from_attelo_output +from evals.codra import load_codra_ctrees, load_codra_dtrees +from evals.ours import (load_deptrees_from_attelo_output, + load_attelo_ctrees, + load_attelo_dtrees) # RST corpus @@ -186,37 +188,45 @@ def main(): ct_true = _binarize(ct_true) ctree_true[doc_name] = ct_true - # predictions: [(parser_name, ([doc_names], [ct_pred], [dt_pred]))] - predictions = [] + + c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] + d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] if 'joty' in authors_pred: # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees - predictions.append( - ('joty', eval_codra_output(ctree_true, dtree_true, - CODRA_OUT_DIR, EDUS_FILE, - rel_conv=REL_CONV, - nary_enc='chain', - nuc_clf=nuc_clf, rnk_clf=rnk_clf, - detailed=False)) + c_preds.append( + ('joty', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) ) + d_preds.append( + ('joty', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + # joty-{chain,tree} would be the same except nary_enc='tree' ; + # the nary_enc does not matter because codra outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees if 'ours_chain' in authors_pred: - print('[chain] Eisner, predicted syntax') - # attelo out: unordered dtree ; we pass a nuclearity and rank classifiers - # to get an ordered dtree ; - # need to map to ctree - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') + # Eisner, predicted syntax, chain + c_preds.append( + ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-chain', load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf)) + ) if 'ours_tree' in authors_pred: - print('[tree] Eisner, predicted syntax + same-unit') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') + # Eisner, predicted syntax, tree + same-unit + c_preds.append( + ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) if False: # FIXME repair (or forget) these print('Eisner, predicted syntax + same-unit') @@ -238,8 +248,7 @@ def main(): # report # * table format digits = 4 - parser_names = ['joty'] - width = max(len(x) for x in parser_names) + width = max(len(parser_name) for parser_name, _ in d_preds) headers = ["UAS", "LAS", "LS"] fmt = '%% %ds' % width # first col: parser name @@ -253,14 +262,14 @@ def main(): # end table format and header line # * table content - for parser_name, (ctree_pred, dtree_pred) in 
predictions: + for parser_name, dtree_pred in d_preds: doc_names = sorted(dtree_true.keys()) dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, dtree_pred_list) # append to report - values = [parser_name] + values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] for v in (score_uas, score_las, score_ls): values += ["{0:0.{1}f}".format(v, digits)] report += fmt % tuple(values) @@ -269,12 +278,13 @@ def main(): # end report # constituency eval - for parser_name, (ctree_pred, dtree_pred) in predictions: + for parser_name, ctree_pred in c_preds: doc_names = sorted(ctree_true.keys()) ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] # FIXME # compute and print PARSEVAL scores + print(parser_name) print(parseval_report(ctree_true_list, ctree_pred_list, digits=4)) # detailed report on S+N+R if DETAILED: From 72a2956b32eb6f679dda611cb907a57b2089f0e8 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 16 Sep 2016 19:04:47 +0200 Subject: [PATCH 18/74] WIP tie order with nary_enc --- evals/showdown.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index f21f294..9abd50a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -81,11 +81,12 @@ NUC_STRATEGY = 'unamb_else_most_frequent' RNK_STRATEGY = 'sdist-edist-rl' RNK_PRIORITY_SU = True -RNK_ORDER = 'weak' def setup_dtree_postprocessor(nary_enc): """Setup the nuclearity and rank classifiers to flesh out dtrees.""" + # tie the order with the encoding for n-ary nodes + order = 'weak' if nary_enc == 'tree' else 'strict' # load train section of the RST corpus, fit (currently dummy) classifiers # for nuclearity and rank reader_train = RstReader(CD_TRAIN) @@ -117,7 +118,7 @@ def setup_dtree_postprocessor(nary_enc): # rank clf rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, prioritize_same_unit=RNK_PRIORITY_SU, - order=RNK_ORDER) + order=order) rnk_clf.fit(X_train, y_rnk_train) return nuc_clf, rnk_clf From e6c0a5b0563c20b3ee24e19320c41e82fb05f7e5 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 21 Sep 2016 17:48:22 +0200 Subject: [PATCH 19/74] WIP support output of ji --- evals/attelo_predictions_to_disdep.py | 28 ++++++-- evals/dis2disdep.py | 93 ++++++++++++++++++++++++++- evals/ours.py | 8 +-- evals/showdown.py | 44 ++++++++++--- irit_rst_dt/local.py | 3 +- 5 files changed, 149 insertions(+), 27 deletions(-) diff --git a/evals/attelo_predictions_to_disdep.py b/evals/attelo_predictions_to_disdep.py index f0e7169..2c5a6c3 100755 --- a/evals/attelo_predictions_to_disdep.py +++ b/evals/attelo_predictions_to_disdep.py @@ -16,7 +16,8 @@ InsideOutAttachmentRanker) -def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): +def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir, + nary_enc_pred='tree'): """Generate disdep files from a file dump of attelo predictions. Parameters @@ -28,12 +29,19 @@ def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): triples). out_dir: str Path to the output folder. + nary_enc_pred: one of {'chain', 'tree'} + Encoding for n-ary cnodes in the predicted dtree ; here it + currently triggers the strictness of the order assumed by the + dtree postprocessor: nary_enc_pred='chain' implies order='strict', + nary_enc_pred='tree' implies order='weak'. 
""" + order = 'weak' if nary_enc_pred == 'tree' else 'strict' # set up heuristic classifiers for nuclearity and rank nuc_clf = DummyNuclearityClassifier(strategy='unamb_else_most_frequent') nuc_clf.fit([], []) # dummy fit - rnk_clf = InsideOutAttachmentRanker(strategy='closest-intra-rl-inter-rl', - prioritize_same_unit=True) + rnk_clf = InsideOutAttachmentRanker(strategy='sdist-edist-rl', + prioritize_same_unit=True, + order=order) rnk_clf.fit([], []) # dummy fit # load EDUs @@ -84,17 +92,23 @@ def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): if __name__ == '__main__': + nary_enc_pred = 'tree' edus_file_glob = os.path.join('TMP', 'latest', 'data', 'TEST', '*.edu-pairs.sparse.edu_input') - edges_file_glob = os.path.join('TMP', 'latest', 'scratch-current', - 'combined', 'output.*') + edges_file_glob = os.path.join( + 'TMP', 'latest', 'scratch-current', + 'combined', + # 'output.*' + 'output.maxent-iheads-global-AD.L-jnt-eisner' + ) # attelo predictions are currently stored in one big file edges_files = glob(edges_file_glob) assert len(edges_files) == 1 edges_file = edges_files[0] # paths to the resulting disdep files - out_dir = os.path.join('TMP_disdep', 'chain', 'ours', 'test') + out_dir = os.path.join('TMP_disdep', nary_enc_pred, 'ours', 'test') if not os.path.exists(out_dir): os.makedirs(out_dir) # do the conversion - attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir) + attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir, + nary_enc_pred=nary_enc_pred) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index d1d7966..bb69c97 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -2,22 +2,24 @@ TODO ---- -* [ ] support the output of Ji & Eisenstein's parser ; need to convert - .brackets to .dis_dep (via .dis?) 
* [ ] support intra-sentential level document parsing ; required to score Joty's .sen_dis files """ from __future__ import absolute_import, print_function import argparse +from collections import defaultdict +from glob import glob import os +from educe.annotation import Span from educe.corpus import FileId from educe.learning.disdep_format import dump_disdep_files +from educe.rst_dt.annotation import Node, RSTTree from educe.rst_dt.codra import load_codra_output_files -from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) @@ -104,7 +106,92 @@ def main(): # dump_disdep_files) for doc_name, dtree in dtrees.items(): dtree.origin = FileId(doc_name, None, None, None) + elif author == 'ji': + if corpus_split != 'test': + raise ValueError("The output of Ji & Eisenstein's parser is " + "available for the 'test' split only") + # * load the text of the EDUs + # FIXME get the text of EDUs from the .merge files + corpus_dir = RST_MAIN_TEST + reader_true = Reader(corpus_dir) + ctree_true = reader_true.slurp() + doc_edus = {k.doc: ct_true.leaves() for k, ct_true + in ctree_true.items()} + # * for each doc, load the predicted spans from the .brackets + ctree_pred = dict() + files_pred = os.path.join(OUT_JI, '*.brackets') + for f_pred in sorted(glob(files_pred)): + doc_name = os.path.splitext(os.path.basename(f_pred))[0] + edus = {i: e for i, e in enumerate(doc_edus[doc_name], start=1)} + origin = FileId(doc_name, None, None, None) + # read spans + spans_pred = defaultdict(list) # predicted spans by length + with open(f_pred) as f: + for line in f: + # FIXME use a standard module: ast or pickle? + # drop surrounding brackets + opening bracket of edu span + line = line.strip()[2:-1] + edu_span, nuc_rel = line.split('), ') + edu_span = tuple(int(x) for x in edu_span.split(', ')) + nuc, rel = nuc_rel.split(', ') + edu_span_len = edu_span[1] - edu_span[0] + spans_pred[edu_span_len].append((edu_span, nuc, rel)) + # bottom-up construction of the RST ctree + # left_border -> list of RST ctree fragments, sorted by len + tree_frags = defaultdict(list) + for span_len, spans in sorted(spans_pred.items()): + for edu_span, nuc, rel in spans: + children = [] + edu_beg, edu_end = edu_span + if edu_beg == edu_end: + # leaf node + txt_span = edus[edu_beg].span + else: + # internal node + # * get the children (subtrees) + edu_cur = edu_beg + while edu_cur < edu_end: + kid_nxt = tree_frags[edu_cur][-1] + children.append(kid_nxt) + edu_cur = kid_nxt.label().edu_span[1] + 1 + # compute properties of this node + txt_span = Span(children[0].label().span.char_start, + children[-1].label().span.char_end) + # build node and RSTTree fragment + node = Node(nuc, edu_span, txt_span, rel, + context=None) # TODO context? 
+ tree_frags[edu_beg].append( + RSTTree(node, children, origin=origin)) + # build the top node + edu_nums = sorted(edus.keys()) + edu_span = (edu_nums[0], edu_nums[-1]) + print(doc_name, edu_span) + children = [] + edu_beg, edu_end = edu_span + edu_cur = edu_beg + while edu_cur < edu_end: + print(edu_cur) + kid_nxt = tree_frags[edu_cur][-1] + children.append(kid_nxt) + edu_cur = kid_nxt.label().edu_span[1] + 1 + txt_span = Span(children[0].label().span.char_start, + children[-1].label().span.char_end) + node = Node(nuc, edu_span, txt_span, 'Root', context=None) + tree_frags[edu_beg].append( + RSTTree(node, children, origin=origin)) + # now we should have a spanning ctree + ct_pred = tree_frags[1][-1] + # DEBUG + print(sorted(edus.keys())[0], + sorted(edus.keys())[-1]) + print(ct_pred.label().edu_span) # RESUME HERE + print(sorted(tree_frags.items())) + # end DEBUG + assert ct_pred.label().edu_span == (sorted(edus.keys())[0], + sorted(edus.keys())[-1]) + ctree_pred[doc_name] = ct_pred + raise NotImplementedError("Output of Ji's parser") # do dump dump_disdep_files(dtrees.values(), out_dir) diff --git a/evals/ours.py b/evals/ours.py index 300b376..0dbe1ce 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -16,9 +16,6 @@ from educe.rst_dt.document_plus import align_edus_with_paragraphs # from attelo.io import load_edus -from attelo.metrics.constituency import (parseval_detailed_report, - parseval_report) -from attelo.metrics.deptree import compute_uas_las from attelo.table import UNRELATED # for load_attelo_output_file @@ -160,7 +157,6 @@ def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): ctree_pred[doc_name] = rtree_pred except RstDtException as rst_e: print(rst_e) - skipped_docs.add(doc_name) if False: print('\n'.join('{}: {}'.format(edu.text_span(), edu) for edu in educe_edus[doc_name])) @@ -171,9 +167,7 @@ def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): def load_deptrees_from_attelo_output(ctree_true, dtree_true, output_file, edus_file, - nuc_clf, rnk_clf, - detailed=False, - skpd_docs=None): + nuc_clf, rnk_clf): """Load an RstDepTree from the output of attelo. Parameters diff --git a/evals/showdown.py b/evals/showdown.py index 9abd50a..52c096f 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -53,6 +53,12 @@ 'output.maxent-iheads-global-AD.L-jnt-eisner') # 2016-09-14 "tree" transform, predicted syntax +EISNER_OUT_TREE_SYN_PRED = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/latest', # lbl + 'scratch-current/combined', + 'output.maxent-iheads-global-AD.L-jnt-eisner') + EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( '/home/mmorey/melodi', 'irit-rst-dt/TMP/latest', # lbl @@ -77,6 +83,8 @@ # level of detail for parseval DETAILED = False +SPAN_SEL = None # None, 'leaves', 'non-leaves' +STRINGENT = False # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' RNK_STRATEGY = 'sdist-edist-rl' @@ -135,7 +143,7 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'ji', - 'ours_chain', 'ours_tree'], + 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', choices=['tree', 'chain'], @@ -161,6 +169,9 @@ def main(): authors_pred = args.authors_pred nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true + if binarize_true and nary_enc_true != 'chain': + raise ValueError("--binarize_true is compatible with " + "--nary_enc_true chain only") # 0. 
setup the postprocessors to flesh out unordered dtrees into ordered # ones with nuclearity @@ -181,13 +192,13 @@ def main(): doc_name = doc_id.doc # original reference ctree, with coarse labels ct_true = REL_CONV(ct_true) # map fine to coarse relations - # corresponding dtree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) - dtree_true[doc_name] = dt_true - # binarize ctree if necessary if binarize_true: + # binarize ctree if required ct_true = _binarize(ct_true) ctree_true[doc_name] = ct_true + # corresponding dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) + dtree_true[doc_name] = dt_true c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] @@ -219,15 +230,27 @@ def main(): if 'ours_tree' in authors_pred: # Eisner, predicted syntax, tree + same-unit c_preds.append( - ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, + ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) d_preds.append( - ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) + if 'ours_tree_su' in authors_pred: + # Eisner, predicted syntax, tree + same-unit + c_preds.append( + ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-tree-su', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) if False: # FIXME repair (or forget) these print('Eisner, predicted syntax + same-unit') @@ -286,11 +309,14 @@ def main(): # FIXME # compute and print PARSEVAL scores print(parser_name) - print(parseval_report(ctree_true_list, ctree_pred_list, digits=4)) + print(parseval_report(ctree_true_list, ctree_pred_list, digits=4, + span_sel=SPAN_SEL, + stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: print(parseval_detailed_report(ctree_true_list, ctree_pred_list, - metric_type='S+R')) + metric_type='S+R', + span_sel=SPAN_SEL)) # end FIXME diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index bfe2691..f805832 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -103,7 +103,8 @@ # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt-mst' # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt-eisner' # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt_su-eisner' -TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt_su-eisner' +TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt-eisner' +# TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt_su-eisner' """Evaluation to use for testing. Leave this to None until you think it's OK to look at the test data. 
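
A note on how the pieces introduced in the two patches above fit together: the n-ary encoding and the ranker order are meant to be chosen jointly. The snippet below is a minimal sketch of that pairing, not part of the patch series; it only reuses the classes, strategies and dummy fit calls that already appear in the diffs, and the wrapper name make_postprocessors is hypothetical.

    from educe.rst_dt.dep2con import (DummyNuclearityClassifier,
                                      InsideOutAttachmentRanker)

    def make_postprocessors(nary_enc='tree'):
        # tie the order with the encoding for n-ary nodes:
        # 'tree' pairs with a weak order, 'chain' with a strict one
        order = 'weak' if nary_enc == 'tree' else 'strict'
        # heuristic classifiers for nuclearity and rank
        nuc_clf = DummyNuclearityClassifier(
            strategy='unamb_else_most_frequent')
        nuc_clf.fit([], [])  # dummy fit
        rnk_clf = InsideOutAttachmentRanker(strategy='sdist-edist-rl',
                                            prioritize_same_unit=True,
                                            order=order)
        rnk_clf.fit([], [])  # dummy fit
        return nuc_clf, rnk_clf

Both attelo_predictions_to_disdep.py and showdown.py follow this pattern before fleshing out the predicted attachments into ordered dependency trees with nuclearity.
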
From f185f3af6fe0c1663e22bb2c99421b0ed66b895e Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 22 Sep 2016 15:54:22 +0200 Subject: [PATCH 20/74] ENH added support for DPLP in eval --- evals/dis2disdep.py | 100 ++++------------------------- evals/ji.py | 152 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 88 deletions(-) create mode 100644 evals/ji.py diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index bb69c97..194abfc 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -8,33 +8,38 @@ """ from __future__ import absolute_import, print_function import argparse -from collections import defaultdict -from glob import glob import os -from educe.annotation import Span from educe.corpus import FileId from educe.learning.disdep_format import dump_disdep_files -from educe.rst_dt.annotation import Node, RSTTree from educe.rst_dt.codra import load_codra_output_files -from educe.rst_dt.corpus import Reader +from educe.rst_dt.corpus import Reader, RstRelationConverter from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) +from .ji import load_ji_dtrees + # original RST corpus RST_CORPUS = os.path.join('/home/mmorey/corpora/rst_discourse_treebank/data') RST_MAIN_TRAIN = os.path.join(RST_CORPUS, TRAIN_FOLDER) RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) RST_DOUBLE = os.path.join(RST_CORPUS, DOUBLE_FOLDER) + +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree + # output of Joty's parser OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') # output of Feng & Hirst's parser OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/tmp/') # output of Ji's parser -OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/test_input') +OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') def main(): @@ -111,88 +116,7 @@ def main(): if corpus_split != 'test': raise ValueError("The output of Ji & Eisenstein's parser is " "available for the 'test' split only") - # * load the text of the EDUs - # FIXME get the text of EDUs from the .merge files - corpus_dir = RST_MAIN_TEST - reader_true = Reader(corpus_dir) - ctree_true = reader_true.slurp() - doc_edus = {k.doc: ct_true.leaves() for k, ct_true - in ctree_true.items()} - # * for each doc, load the predicted spans from the .brackets - ctree_pred = dict() - files_pred = os.path.join(OUT_JI, '*.brackets') - for f_pred in sorted(glob(files_pred)): - doc_name = os.path.splitext(os.path.basename(f_pred))[0] - edus = {i: e for i, e in enumerate(doc_edus[doc_name], start=1)} - origin = FileId(doc_name, None, None, None) - # read spans - spans_pred = defaultdict(list) # predicted spans by length - with open(f_pred) as f: - for line in f: - # FIXME use a standard module: ast or pickle? 
- # drop surrounding brackets + opening bracket of edu span - line = line.strip()[2:-1] - edu_span, nuc_rel = line.split('), ') - edu_span = tuple(int(x) for x in edu_span.split(', ')) - nuc, rel = nuc_rel.split(', ') - edu_span_len = edu_span[1] - edu_span[0] - spans_pred[edu_span_len].append((edu_span, nuc, rel)) - # bottom-up construction of the RST ctree - # left_border -> list of RST ctree fragments, sorted by len - tree_frags = defaultdict(list) - for span_len, spans in sorted(spans_pred.items()): - for edu_span, nuc, rel in spans: - children = [] - edu_beg, edu_end = edu_span - if edu_beg == edu_end: - # leaf node - txt_span = edus[edu_beg].span - else: - # internal node - # * get the children (subtrees) - edu_cur = edu_beg - while edu_cur < edu_end: - kid_nxt = tree_frags[edu_cur][-1] - children.append(kid_nxt) - edu_cur = kid_nxt.label().edu_span[1] + 1 - # compute properties of this node - txt_span = Span(children[0].label().span.char_start, - children[-1].label().span.char_end) - # build node and RSTTree fragment - node = Node(nuc, edu_span, txt_span, rel, - context=None) # TODO context? - tree_frags[edu_beg].append( - RSTTree(node, children, origin=origin)) - # build the top node - edu_nums = sorted(edus.keys()) - edu_span = (edu_nums[0], edu_nums[-1]) - print(doc_name, edu_span) - children = [] - edu_beg, edu_end = edu_span - edu_cur = edu_beg - while edu_cur < edu_end: - print(edu_cur) - kid_nxt = tree_frags[edu_cur][-1] - children.append(kid_nxt) - edu_cur = kid_nxt.label().edu_span[1] + 1 - txt_span = Span(children[0].label().span.char_start, - children[-1].label().span.char_end) - node = Node(nuc, edu_span, txt_span, 'Root', context=None) - tree_frags[edu_beg].append( - RSTTree(node, children, origin=origin)) - # now we should have a spanning ctree - ct_pred = tree_frags[1][-1] - # DEBUG - print(sorted(edus.keys())[0], - sorted(edus.keys())[-1]) - print(ct_pred.label().edu_span) # RESUME HERE - print(sorted(tree_frags.items())) - # end DEBUG - assert ct_pred.label().edu_span == (sorted(edus.keys())[0], - sorted(edus.keys())[-1]) - ctree_pred[doc_name] = ct_pred - - raise NotImplementedError("Output of Ji's parser") + dtrees = load_ji_dtrees(OUT_JI, REL_CONV) # do dump dump_disdep_files(dtrees.values(), out_dir) diff --git a/evals/ji.py b/evals/ji.py new file mode 100644 index 0000000..2e5e38f --- /dev/null +++ b/evals/ji.py @@ -0,0 +1,152 @@ +"""Load the output of Ji's DPLP parser. + +""" + +from __future__ import absolute_import, print_function + +from collections import defaultdict +from glob import glob +import os + +from educe.annotation import Span +from educe.corpus import FileId +from educe.rst_dt.annotation import Node, RSTTree +from educe.rst_dt.corpus import Reader +from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.rst_wsj_corpus import TEST_FOLDER + +# original RST corpus +RST_CORPUS = os.path.join('/home/mmorey/corpora/rst_discourse_treebank/data') +RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) + + +def load_ji_ctrees(ji_out_dir, rel_conv): + """Load the ctrees output by DPLP as .brackets files. + + Parameters + ---------- + ji_out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. 
+ """ + # * load the text of the EDUs + # FIXME get the text of EDUs from the .merge files + corpus_dir = RST_MAIN_TEST + reader_true = Reader(corpus_dir) + ctree_true = reader_true.slurp() + doc_edus = {k.doc: ct_true.leaves() for k, ct_true + in ctree_true.items()} + # * for each doc, load the predicted spans from the .brackets + ctree_pred = dict() + files_pred = os.path.join(ji_out_dir, '*.brackets') + for f_pred in sorted(glob(files_pred)): + doc_name = os.path.splitext(os.path.basename(f_pred))[0] + edus = {i: e for i, e in enumerate(doc_edus[doc_name], start=1)} + origin = FileId(doc_name, None, None, None) + # read spans + spans_pred = defaultdict(list) # predicted spans by length + with open(f_pred) as f: + for line in f: + # FIXME use a standard module: ast? pickle? + # * drop surrounding brackets + opening bracket of edu span + line = line.strip()[2:-1] + edu_span, nuc_rel = line.split('), ') + edu_span = tuple(int(x) for x in edu_span.split(', ')) + nuc, rel = nuc_rel.split(', ') + # * remove quotes around nuc and rel + nuc = nuc[1:-1] + rel = rel[1:-1] + # + edu_span_len = edu_span[1] - edu_span[0] + spans_pred[edu_span_len].append((edu_span, nuc, rel)) + # bottom-up construction of the RST ctree + # left_border -> list of RST ctree fragments, sorted by len + tree_frags = defaultdict(list) + for span_len, spans in sorted(spans_pred.items()): + for edu_span, nuc, rel in spans: + children = [] + edu_beg, edu_end = edu_span + if edu_beg == edu_end: + # pre-terminal + txt_span = edus[edu_beg].span + # one child: leaf node: EDU + leaf = edus[edu_beg] + children.append(leaf) + else: + # internal node + # * get the children (subtrees) + edu_cur = edu_beg + while edu_cur <= edu_end: + kid_nxt = tree_frags[edu_cur][-1] + children.append(kid_nxt) + edu_cur = kid_nxt.label().edu_span[1] + 1 + # compute properties of this node + txt_span = Span(children[0].label().span.char_start, + children[-1].label().span.char_end) + # build node and RSTTree fragment + node = Node(nuc, edu_span, txt_span, rel, + context=None) # TODO context? + tree_frags[edu_beg].append( + RSTTree(node, children, origin=origin)) + # build the top node + edu_nums = sorted(edus.keys()) + edu_span = (edu_nums[0], edu_nums[-1]) + children = [] + edu_beg, edu_end = edu_span + edu_cur = edu_beg + while edu_cur <= edu_end: + kid_nxt = tree_frags[edu_cur][-1] + children.append(kid_nxt) + edu_cur = kid_nxt.label().edu_span[1] + 1 + txt_span = Span(children[0].label().span.char_start, + children[-1].label().span.char_end) + node = Node(nuc, edu_span, txt_span, 'Root', context=None) + tree_frags[edu_beg].append( + RSTTree(node, children, origin=origin)) + # now we should have a spanning ctree + ct_pred = tree_frags[1][-1] + assert ct_pred.label().edu_span == (sorted(edus.keys())[0], + sorted(edus.keys())[-1]) + # convert relation labels + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + # store the resulting RSTTree + ctree_pred[doc_name] = ct_pred + + return ctree_pred + + +def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by DPLP. + + Parameters + ---------- + ji_out_dir: str + Path to the base directory containing the output files. + rel_conv: TODO + Relation converter, from fine- to coarse-grained labels. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. 
+ """ + dtree_pred = dict() + + ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv) + for doc_name, ct_pred in ctree_pred.items(): + dtree_pred[doc_name] = RstDepTree.from_rst_tree( + ct_pred, nary_enc=nary_enc) + # set reference to the document in the RstDepTree (required by + # dump_disdep_files) + for doc_name, dt_pred in dtree_pred.items(): + dt_pred.origin = FileId(doc_name, None, None, None) + + return dtree_pred + From 2f1b13e5fb58ef1c6088c8a9deaa1ddfa7db07be Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 22 Sep 2016 16:21:07 +0200 Subject: [PATCH 21/74] FIX eval of dplp --- evals/dis2disdep.py | 2 +- evals/ji.py | 8 ++++++++ evals/showdown.py | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index 194abfc..d3140db 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -19,7 +19,7 @@ from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) -from .ji import load_ji_dtrees +from evals.ji import load_ji_dtrees # original RST corpus diff --git a/evals/ji.py b/evals/ji.py index 2e5e38f..9862abf 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -114,6 +114,14 @@ def load_ji_ctrees(ji_out_dir, rel_conv): # convert relation labels if rel_conv is not None: ct_pred = rel_conv(ct_pred) + # change "same_unit" (in Ji's output) into "same-unit" (in ours) + for pos in ct_pred.treepositions(): + t = ct_pred[pos] + if isinstance(t, RSTTree): + node = t.label() + # replace "same_unit" with "same-unit" + if node.rel == 'same_unit': + node.rel = 'same-unit' # store the resulting RSTTree ctree_pred[doc_name] = ct_pred diff --git a/evals/showdown.py b/evals/showdown.py index 52c096f..4707467 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -21,6 +21,7 @@ # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees +from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, load_attelo_dtrees) @@ -79,7 +80,10 @@ 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') +# output of Joty's parser CODRA CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' +# output of Ji's parser DPLP +JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') # level of detail for parseval DETAILED = False @@ -216,6 +220,20 @@ def main(): # the nary_enc does not matter because codra outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees + if 'ji' in authors_pred: + # DPLP outputs RST ctrees in the form of lists of spans; + # load_ji_dtrees maps them to RST dtrees + c_preds.append( + ('ji', load_ji_ctrees(JI_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('ji', load_ji_dtrees(JI_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + # ji-{chain,tree} would be the same except nary_enc='tree' ; + # the nary_enc does not matter because codra outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees + if 'ours_chain' in authors_pred: # Eisner, predicted syntax, chain c_preds.append( From 8acd9572ed0691316fd1e26603e72ca62372f20d Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 26 Sep 2016 10:12:40 +0200 Subject: [PATCH 22/74] ENH variant of parseval scores, per doc then averaged --- evals/showdown.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/evals/showdown.py b/evals/showdown.py index 4707467..7926cca 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -88,6 +88,9 @@ # level of detail for 
parseval DETAILED = False SPAN_SEL = None # None, 'leaves', 'non-leaves' +# "PER_DOC = True" computes p, r, f as in DPLP: compute scores per doc, +# then average over docs +PER_DOC = False # should be False, except for comparison with the DPLP paper STRINGENT = False # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' @@ -329,6 +332,7 @@ def main(): print(parser_name) print(parseval_report(ctree_true_list, ctree_pred_list, digits=4, span_sel=SPAN_SEL, + per_doc=PER_DOC, stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: From 1d31ed600dbd4b284a7106d1f90a29beedddd682 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 27 Sep 2016 17:42:17 +0200 Subject: [PATCH 23/74] ENH parseval for Feng's parser --- evals/feng.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++ evals/showdown.py | 13 +++++++ 2 files changed, 101 insertions(+) create mode 100644 evals/feng.py diff --git a/evals/feng.py b/evals/feng.py new file mode 100644 index 0000000..802ddbc --- /dev/null +++ b/evals/feng.py @@ -0,0 +1,88 @@ +"""Load the output of the parser from (Feng and Hirst, 2014). + +This is 99% a copy/paste from evals/joty.py . +I need to come up with a better API and refactor accordingly. +""" + +from __future__ import absolute_import, print_function + +import itertools + +from educe.rst_dt.feng import load_feng_output_files +from educe.rst_dt.deptree import RstDepTree + + +def load_feng_ctrees(out_dir, rel_conv): + """Load the ctrees output by Feng's parser as .dis files. + + This currently runs on the document-level files (.doc_dis). + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. + """ + # load predicted trees + data_pred = load_feng_output_files(out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to ctree (RSTTree) + ctree_pred = dict() # constituency trees + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + + return ctree_pred + + +def load_feng_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by Feng's parser. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. 
+ """ + # load predicted trees + data_pred = load_feng_output_files(out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to ordered dtree (RstDepTree) + dtree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + # convert to an ordered dependency tree ; + # * 'tree' produces a weakly-ordered dtree strictly equivalent + # to the original ctree, + # * 'chain' produces a strictly-ordered dtree for which strict + # equivalence is not preserved + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) + dtree_pred[doc_name] = dt_pred + + return dtree_pred diff --git a/evals/showdown.py b/evals/showdown.py index 7926cca..ad15d9d 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -21,6 +21,7 @@ # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees +from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, @@ -84,6 +85,8 @@ CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' # output of Ji's parser DPLP JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') +# Feng's parser +FENG_OUT_DIR = '/home/mmorey/melodi/rst/feng_hirst/tmp' # level of detail for parseval DETAILED = False @@ -210,6 +213,16 @@ def main(): c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] + + if 'feng' in authors_pred: + c_preds.append( + ('feng', load_feng_ctrees(FENG_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('feng', load_feng_dtrees(FENG_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + if 'joty' in authors_pred: # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees c_preds.append( From ff278d06ec995207a6d1474665130539a57df863 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 30 Sep 2016 19:46:21 +0200 Subject: [PATCH 24/74] WIP more evals, notably on spans from SimpleRSTTree --- evals/showdown.py | 57 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index ad15d9d..352d303 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -8,7 +8,7 @@ import argparse import os -from educe.rst_dt.annotation import _binarize +from educe.rst_dt.annotation import _binarize, SimpleRSTTree from educe.rst_dt.corpus import (RstRelationConverter, Reader as RstReader) from educe.rst_dt.dep2con import (DummyNuclearityClassifier, @@ -17,7 +17,7 @@ # from attelo.metrics.constituency import (parseval_detailed_report, parseval_report) -from attelo.metrics.deptree import compute_uas_las +from attelo.metrics.deptree import compute_uas_las, compute_uas_las_undirected # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees @@ -90,7 +90,7 @@ # level of detail for parseval DETAILED = False -SPAN_SEL = None # None, 'leaves', 'non-leaves' +SPAN_SEL = 'non-leaves' # None, 'leaves', 'non-leaves' # "PER_DOC = True" computes p, r, f as in DPLP: compute scores per doc, # then average over docs PER_DOC = False # 
should be False, except for comparison with the DPLP paper @@ -171,7 +171,8 @@ def main(): # * ctree eval parser.add_argument('--binarize_true', action='store_true', help="Binarize the reference ctree for the eval") - + parser.add_argument('--simple_rsttree', action='store_true', + help="Binarize ctree and move relations up") # args = parser.parse_args() author_true = args.author_true @@ -179,6 +180,7 @@ def main(): authors_pred = args.authors_pred nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true + simple_rsttree = args.simple_rsttree if binarize_true and nary_enc_true != 'chain': raise ValueError("--binarize_true is compatible with " "--nary_enc_true chain only") @@ -308,7 +310,7 @@ def main(): digits = 4 width = max(len(parser_name) for parser_name, _ in d_preds) - headers = ["UAS", "LAS", "LS"] + headers = ["UAS", "LAS", "LS", "UUAS", "ULAS"] fmt = '%% %ds' % width # first col: parser name fmt += ' ' fmt += ' '.join(['% 9s' for _ in headers]) @@ -324,11 +326,28 @@ def main(): doc_names = sorted(dtree_true.keys()) dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + # WIP print per doc eval + for doc_name, dt_true, dt_pred in zip( + doc_names, dtree_true_list, dtree_pred_list): + with open(parser_name + '/' + doc_name + '.d_eval', mode='w') as f: + print(', '.join('{:.4f}'.format(x) + for x in compute_uas_las( + [dt_true], [dt_pred])), + file=f) + # WIP scores for undirected edges + print(', '.join('{:.4f}'.format(x) + for x in compute_uas_las_undirected( + [dt_true], [dt_pred])), + file=f) + + # end WIP print score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, dtree_pred_list) + score_uuas, score_ulas = compute_uas_las_undirected(dtree_true_list, + dtree_pred_list) # append to report values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] - for v in (score_uas, score_las, score_ls): + for v in (score_uas, score_las, score_ls, score_uuas, score_ulas): values += ["{0:0.{1}f}".format(v, digits)] report += fmt % tuple(values) # end table content @@ -340,6 +359,32 @@ def main(): doc_names = sorted(ctree_true.keys()) ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + if simple_rsttree: + ctree_true_list = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_true_list] + ctree_pred_list = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_pred_list] + # WIP print SimpleRSTTrees + if not os.path.exists('gold'): + os.makedirs('gold') + for doc_name, ct in zip(doc_names, ctree_true_list): + with open('gold/' + ct.origin.doc, mode='w') as f: + print(ct, file=f) + if not os.path.exists(parser_name): + os.makedirs(parser_name) + for doc_name, ct in zip(doc_names, ctree_pred_list): + with open(parser_name + '/' + doc_name, mode='w') as f: + print(ct, file=f) + # WIP eval each tree in turn + for doc_name, ct_true, ct_pred in zip( + doc_names, ctree_true_list, ctree_pred_list): + with open(parser_name + '/' + doc_name + '.c_eval', mode='w') as f: + print(parseval_report([ct_true], [ct_pred], digits=4, + span_sel=SPAN_SEL, + per_doc=PER_DOC, + stringent=STRINGENT), + file=f) + # end WIP # FIXME # compute and print PARSEVAL scores print(parser_name) From e3a34c30e06466ab7cc21f312b2e72ff8500bc06 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 3 Oct 2016 11:26:36 +0200 Subject: [PATCH 25/74] FIX minor bugs in showdown --- evals/showdown.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) 
diff --git a/evals/showdown.py b/evals/showdown.py index 352d303..e265c13 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -57,13 +57,13 @@ # 2016-09-14 "tree" transform, predicted syntax EISNER_OUT_TREE_SYN_PRED = os.path.join( '/home/mmorey/melodi', - 'irit-rst-dt/TMP/latest', # lbl + 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( '/home/mmorey/melodi', - 'irit-rst-dt/TMP/latest', # lbl + 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt_su-eisner') # end 2016-09-14 @@ -327,6 +327,8 @@ def main(): dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] # WIP print per doc eval + if not os.path.exists(parser_name): + os.makedirs(parser_name) for doc_name, dt_true, dt_pred in zip( doc_names, dtree_true_list, dtree_pred_list): with open(parser_name + '/' + doc_name + '.d_eval', mode='w') as f: From 140e93b7eca558bb03a9e05e594e030896822cb8 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 5 Oct 2016 10:50:02 +0200 Subject: [PATCH 26/74] ENH add metric LAS+O in eval_disdep --- evals/eval_disdep.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/evals/eval_disdep.py b/evals/eval_disdep.py index 8310487..8cbd6f6 100755 --- a/evals/eval_disdep.py +++ b/evals/eval_disdep.py @@ -41,7 +41,7 @@ print('\t'.join(['parser', 'a', 'l', 'n', 'r', 'al', 'an', 'ar', - 'aln', + 'aln', 'alr', 'alnr', 'support'])) @@ -60,6 +60,7 @@ cnt_an = 0 # correct attachment + nuc cnt_ar = 0 # correct attachment + rank cnt_aln = 0 # correct attachment + label + nuc + cnt_alr = 0 # correct attachment + label + rank cnt_alnr = 0 # correct attachment + label + nuc + rank for doc_name, f_true in files_true.items(): @@ -93,12 +94,14 @@ cnt_ar += 1 if ok_a and ok_l and ok_n: cnt_aln += 1 + if ok_a and ok_l and ok_r: + cnt_alr += 1 if ok_a and ok_l and ok_n and ok_r: cnt_alnr += 1 print('\t'.join([author_pred] + ['{:.4f}'.format(float(cnt_x) / cnt_tot) for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, cnt_al, cnt_an, cnt_ar, - cnt_aln, + cnt_aln, cnt_alr, cnt_alnr]] + [str(cnt_tot)])) From 5ebc6e2a56ee9f96af26f38f4d91962509007328 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Oct 2016 14:40:02 +0200 Subject: [PATCH 27/74] ENH gcrf output --- evals/gcrf_tree_format.py | 208 ++++++++++++++++++++++++++++++++++++++ evals/showdown.py | 39 +++++-- 2 files changed, 236 insertions(+), 11 deletions(-) create mode 100644 evals/gcrf_tree_format.py diff --git a/evals/gcrf_tree_format.py b/evals/gcrf_tree_format.py new file mode 100644 index 0000000..4c7e379 --- /dev/null +++ b/evals/gcrf_tree_format.py @@ -0,0 +1,208 @@ +"""Module to load .tree files, output by Feng's gCRF parser. + +The .tree files contain binary constituency trees as bracketed strings. +They differ from the .dis files in that the relation label and +nuclearity are written on the top node instead of the daughter nodes, +plus edu spans are not explicitly written at each node. 
+""" + +from __future__ import absolute_import, print_function +import codecs +from glob import glob +import os +import re + +from nltk.tree import Tree + +from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree, Span +from educe.rst_dt.deptree import RstDepTree + + +TXT_RE = r"(?P.+)_!(?P.+)!_(?P.+)" +TXT_PATTERN = re.compile(TXT_RE, flags=re.DOTALL) + + +def reduce_preterminal(terminals, txt_offset, edu_offset): + """Create a pre-terminal from a list of terminals. + + Parameters + ---------- + terminals: list of str + List of terminals + + Returns + ------- + sct: SimpleRSTTree + Pre-terminal. + """ + edu_num = edu_offset + edu_txt = ' '.join(terminals) + assert edu_txt.startswith('_!') and edu_txt.endswith('!_') + edu_txt = edu_txt[2:-2] # shave off _! and !_ + edu_txt_span = Span(txt_offset, + txt_offset + len(edu_txt)) + edu = EDU(edu_num, edu_txt_span, edu_txt, + context=None, + origin=None) + # "pre-terminal" + pre_node = Node('leaf', (edu_num, edu_num), edu_txt_span, + 'leaf', context=None) + sct = SimpleRSTTree(pre_node, [edu]) + return sct + + +def nltk_to_simple(node, txt_offset=0, edu_offset=1): + """Convert an NLTK Tree to a SimpleRSTTree. + + Parameters + ---------- + node: Tree + Current tree node. + txt_offset: int, defaults to 0 + Current text offset. + edu_offset: int, defaults to 1 + Current EDU id offset. + + Returns + ------- + sct: SimpleRSTTree + Corresponding SimpleRSTTree. + """ + cur_txt_offset = txt_offset + cur_edu_offset = edu_offset + + # first, recurse: convert kids + new_kids = [] + for kid in node: + if isinstance(kid, Tree): + # convert gCRF .tree subtree to SimpleRSTTree + new_kid = nltk_to_simple(kid, txt_offset=cur_txt_offset, + edu_offset=cur_edu_offset) + # update current offsets + cur_txt_offset = new_kid.label().span.char_end + 1 + cur_edu_offset = new_kid.label().edu_span[1] + 1 + new_kids.append(new_kid) + else: + # kid is a terminal + # first, restore parentheses in the text + kid = kid.replace('-LRB-', '(').replace('-RRB-', ')') + # + if not new_kids or isinstance(new_kids[-1], SimpleRSTTree): + new_kids.append([]) + new_kids[-1].append(kid) + if kid.endswith('!_'): + new_kid = reduce_preterminal( + new_kids[-1], cur_txt_offset, cur_edu_offset) + new_kids[-1] = new_kid + # update current offsets + # * txt_offset: + 1 for whitespace or newline + cur_txt_offset = new_kid.label().span.char_end + 1 + # * edu_offset: + 1 for next EDU + cur_edu_offset = new_kid.label().edu_span[1] + 1 + # check that all have been converted + assert all(isinstance(x, SimpleRSTTree) for x in new_kids) + + # we can now compute the label ; the edu_span depends on the + # recursive calls + lbl = node.label() + rel, nuc = lbl.split('[', 1) # nuc = "N][S]" + nuc = nuc[0] + nuc[3] + edu_span = (new_kids[0].label().edu_span[0], + new_kids[-1].label().edu_span[1]) + txt_span = Span(new_kids[0].label().span.char_start, + new_kids[-1].label().span.char_end) + new_lbl = Node(nuc, edu_span, txt_span, rel) + return SimpleRSTTree(new_lbl, new_kids) + + +def _load_gcrf_tree_file(f): + """Do load""" + # replace parentheses in text to avoid confusion with parentheses + # denoting the bracketed tree structure + lines = [] + for line in f: + # replace non-breaking spaces... damn python 2 + if u"\u00a0" in line: + line = line.replace(u"\u00a0", u" ") + # + m = TXT_PATTERN.match(line) + if m is not None: + new_line = (m.group('prefix') + + '_!' 
+ + (m.group('text') + .replace('(', '-LRB-') + .replace(')', '-RRB-')) + + '!_' + + m.group('suffix')) + line = new_line + lines.append(line) + ct_str = ''.join(lines) + ct = Tree.fromstring(ct_str) + sct = nltk_to_simple(ct) + return sct + + +def load_gcrf_tree_file(fname): + """Load a gCRF tree file. + + Parameters + ---------- + fname: str + Path to the file to be loaded. + + Returns + ------- + ct: SimpleRSTTree + Binary constituency tree with relation label and nuclearity + moved one up. + """ + with codecs.open(fname, encoding='utf-8') as f: + ct = _load_gcrf_tree_file(f) + return ct + + +def load_gcrf_ctrees(out_dir, rel_conv): + """Load the ctrees output by gCRF as .tree files. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. + """ + ctree_pred = dict() + for f_tree in glob(os.path.join(out_dir, '*.tree')): + doc_name = os.path.splitext(os.path.basename(f_tree))[0] + sct_pred = load_gcrf_tree_file(f_tree) + ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_gcrf_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by gCRF. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. + """ + ctree_pred = load_gcrf_ctrees(out_dir, rel_conv) + dtree_pred = dict() + for doc_name, ct_pred in ctree_pred.items(): + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) + dtree_pred[doc_name] = dt_pred + return dtree_pred diff --git a/evals/showdown.py b/evals/showdown.py index e265c13..3c0e01c 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -6,6 +6,7 @@ from __future__ import print_function import argparse +import codecs import os from educe.rst_dt.annotation import _binarize, SimpleRSTTree @@ -22,6 +23,7 @@ # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees from evals.feng import load_feng_ctrees, load_feng_dtrees +from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, @@ -85,12 +87,14 @@ CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' # output of Ji's parser DPLP JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') -# Feng's parser -FENG_OUT_DIR = '/home/mmorey/melodi/rst/feng_hirst/tmp' +# Feng's parsers +FENG_DIR = '/home/mmorey/melodi/rst/feng_hirst/' +FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') +FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') # level of detail for parseval DETAILED = False -SPAN_SEL = 'non-leaves' # None, 'leaves', 'non-leaves' +SPAN_SEL = None # None, 'leaves', 'non-leaves' # "PER_DOC = True" computes p, r, f as in DPLP: compute scores per doc, # then average over docs PER_DOC = False # should be False, except for comparison with the DPLP paper @@ -152,7 +156,7 @@ def main(): # predictions parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', - 'joty', 'feng', 'ji', + 'joty', 'feng', 'feng2', 'ji', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the 
predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -161,7 +165,7 @@ def main(): # reference parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', - 'joty', 'feng', 'ji', + 'joty', 'feng', 'feng2', 'ji', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -218,10 +222,19 @@ def main(): if 'feng' in authors_pred: c_preds.append( - ('feng', load_feng_ctrees(FENG_OUT_DIR, REL_CONV)) + ('feng', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) ) d_preds.append( - ('feng', load_feng_dtrees(FENG_OUT_DIR, REL_CONV, + ('feng', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + + if 'feng2' in authors_pred: + c_preds.append( + ('gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, nary_enc='chain')) ) @@ -331,7 +344,8 @@ def main(): os.makedirs(parser_name) for doc_name, dt_true, dt_pred in zip( doc_names, dtree_true_list, dtree_pred_list): - with open(parser_name + '/' + doc_name + '.d_eval', mode='w') as f: + with codecs.open(parser_name + '/' + doc_name + '.d_eval', + mode='w', encoding='utf-8') as f: print(', '.join('{:.4f}'.format(x) for x in compute_uas_las( [dt_true], [dt_pred])), @@ -370,17 +384,20 @@ def main(): if not os.path.exists('gold'): os.makedirs('gold') for doc_name, ct in zip(doc_names, ctree_true_list): - with open('gold/' + ct.origin.doc, mode='w') as f: + with codecs.open('gold/' + ct.origin.doc, mode='w', + encoding='utf-8') as f: print(ct, file=f) if not os.path.exists(parser_name): os.makedirs(parser_name) for doc_name, ct in zip(doc_names, ctree_pred_list): - with open(parser_name + '/' + doc_name, mode='w') as f: + with codecs.open(parser_name + '/' + doc_name, mode='w', + encoding='utf-8') as f: print(ct, file=f) # WIP eval each tree in turn for doc_name, ct_true, ct_pred in zip( doc_names, ctree_true_list, ctree_pred_list): - with open(parser_name + '/' + doc_name + '.c_eval', mode='w') as f: + with codecs.open(parser_name + '/' + doc_name + '.c_eval', + mode='w', encoding='utf-8') as f: print(parseval_report([ct_true], [ct_pred], digits=4, span_sel=SPAN_SEL, per_doc=PER_DOC, From 0aa666a82bba6f900e3bb2e2c459f83c96d8202f Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Oct 2016 17:56:09 +0200 Subject: [PATCH 28/74] WIP repro: gCRF --- repro/gcrf/crf_classifier.py | 87 +++++++++ repro/gcrf/gold_segmenter.py | 112 +++++++++++ repro/gcrf/parse.py | 354 +++++++++++++++++++++++++++++++++++ repro/gcrf/preprocesser.py | 228 ++++++++++++++++++++++ 4 files changed, 781 insertions(+) create mode 100644 repro/gcrf/crf_classifier.py create mode 100644 repro/gcrf/gold_segmenter.py create mode 100644 repro/gcrf/parse.py create mode 100644 repro/gcrf/preprocesser.py diff --git a/repro/gcrf/crf_classifier.py b/repro/gcrf/crf_classifier.py new file mode 100644 index 0000000..58ee1ff --- /dev/null +++ b/repro/gcrf/crf_classifier.py @@ -0,0 +1,87 @@ +import os.path +import subprocess + +import paths + + +class CRFClassifier: + def __init__(self, name, model_type, model_path, model_file, verbose): + self.verbose = verbose + self.name = name + self.type = model_type + self.model_fname = model_file + self.model_path = model_path + + model_fpath = os.path.join(self.model_path, self.model_fname) + if not os.path.exists(model_fpath): + print ('The model path %s for CRF classifier %s does not exist.' 
+ % model_fpath) + raise OSError('Could not create classifier subprocess') + + self.classifier_cmd = [ + '%s/crfsuite-stdin' % paths.CRFSUITE_PATH, + 'tag', '-pi', + '-m', '%s' % model_fpath + ] +# print self.classifier_cmd + self.classifier = subprocess.Popen(self.classifier_cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + if self.classifier.poll(): + raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline()) + #self.cnt = 0 + + def classify(self, vectors): +# print '\n'.join(vectors) + "\n\n" + vectors_str = '\n'.join(vectors) + "\n\n" + + lines_out, lines_err = self.classifier.communicate(vectors_str) + + lines = [] + for line in lines_out.split('\n'): + if not line.strip(): + break + lines.append(line) + + # HACKY replace the subprocess closed by communicate() + self.classifier = subprocess.Popen(self.classifier_cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + if self.classifier.poll(): + raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline()) + # end HACKY + + if self.classifier.poll(): + raise OSError('crf_classifier subprocess died') + + predictions = [] + for line in lines[1:]: + line = line.strip() +# print line + if line != '': + fields = line.split(':') +# print fields + label = fields[0] + prob = float(fields[1]) + predictions.append((label, prob)) + + seq_prob = float(lines[0].split('\t')[1]) + + return seq_prob, predictions + + def poll(self): + """ + Checks that the classifier processes are still alive + """ + if self.classifier is None: + return True + return self.classifier.poll() is not None + + def unload(self): + if self.classifier is not None and not self.poll(): + self.classifier.stdin.write('\n') + print 'Successfully unloaded %s' % self.name diff --git a/repro/gcrf/gold_segmenter.py b/repro/gcrf/gold_segmenter.py new file mode 100644 index 0000000..b963367 --- /dev/null +++ b/repro/gcrf/gold_segmenter.py @@ -0,0 +1,112 @@ +"""Pseudo-segmenter for manual (gold) EDU segmentation. + +""" + +from __future__ import print_function +import os + +import utils.utils + + +class GoldSegmenter(object): + """Gold segmenter""" + + def __init__(self, root, _name='gold_segmenter', verbose=False): + self.root = root # root dir for gold .edu files + self.name = _name + self.verbose = verbose + + def segment(self, doc, filename): + """Segment a document. + + Parameters + ---------- + doc: Document + Internal representation of a document + filename: str + Name of the document + """ + # load true segmentation + doc_predictions = [] + fname_doc = os.path.basename(filename) + fname_edus = os.path.join(self.root, fname_doc + '.edus') + with open(fname_edus) as f_edus: + fedus_sentences = f_edus.readlines() + doc_predictions = [] + for sent in fedus_sentences: + toks = sent.strip().split(' ') + predictions = [] + for tok in toks[:-1]: + if tok == 'EDU_BREAK': + if predictions: + # "not predictions" should not happen, but + # apparently it does, e.g. wsj_1376: + # "EDU_BREAK It provides..." 
+ predictions[-1] = 1 + else: + predictions.append(0) + # set a marginal proba of 1.0 for each prediction + doc_predictions.append([(x, 1.0) for x in predictions]) + + # c/c + doc.edu_word_segmentation = [] + doc.cuts = [] + doc.edus = [] + # end c/c + + for sentence, predictions in zip(doc.sentences, doc_predictions): + self.segment_sentence(sentence, predictions) + + # c/c + doc.start_edu = 0 + doc.end_edu = len(doc.edus) + # end c/c + + def segment_sentence(self, sentence, predictions): + """Segment a sentence. + """ + # c/c from crf_segmenter + if len(sentence.tokens) == 1: + edus = [[sentence.tokens[0].word, sentence.raw_text[-3 : ]]] + + sentence.doc.cuts.append((len(sentence.doc.edus), len(sentence.doc.edus) + len(edus))) + sentence.start_edu = len(sentence.doc.edus) + sentence.end_edu = len(sentence.doc.edus) + len(edus) + sentence.doc.edu_word_segmentation.append([(0, 1)]) + sentence.doc.edus.extend(edus) + return + # end c/c + + # another c/c + edus = [] + edu_word_segmentations = [] + start = 0 + for i in range(len(predictions)): + pred = int(predictions[i][0]) + if pred == 1: +# print i, pred + edu_word_segmentations.append((start, i + 1)) + start = i + 1 + + edu_word_segmentations.append((start, len(sentence.tokens))) + + for (start_word, end_word) in edu_word_segmentations: + edu = [] + for j in range(start_word, end_word): + edu.extend(utils.utils.unescape_penn_special_word(sentence.tokens[j].word).split(' ')) + + if end_word == len(sentence.tokens): +# print sentence.raw_text + edu.append(sentence.raw_text[-3 : ]) + edus.append(edu) + + sentence.doc.cuts.append((len(sentence.doc.edus), len(sentence.doc.edus) + len(edus))) + sentence.start_edu = len(sentence.doc.edus) + sentence.end_edu = len(sentence.doc.edus) + len(edus) + sentence.doc.edu_word_segmentation.append(edu_word_segmentations) + sentence.doc.edus.extend(edus) + # end another c/c + + def unload(self): + """Unload ; a no-op here""" + pass diff --git a/repro/gcrf/parse.py b/repro/gcrf/parse.py new file mode 100644 index 0000000..419acf7 --- /dev/null +++ b/repro/gcrf/parse.py @@ -0,0 +1,354 @@ +''' +Created on 2014-01-17 + +@author: Vanessa Wei Feng +''' + +from segmenters.crf_segmenter import CRFSegmenter +from segmenters.gold_segmenter import GoldSegmenter # MM +from treebuilder.build_tree_CRF import CRFTreeBuilder + +from optparse import OptionParser + +import paths +import os.path +import sys +from document.doc import Document +import time +import traceback +from datetime import datetime + +from logs.log_writer import LogWriter +from prep.preprocesser import Preprocesser + +import utils.serialize + +class DiscourseParser(): + def __init__(self, options, output_dir = None, + log_writer = None): + self.verbose = options.verbose + self.skip_parsing = options.skip_parsing + self.global_features = options.global_features + self.save_preprocessed_doc = options.save_preprocessed_doc + + self.output_dir = os.path.join(paths.OUTPUT_PATH, output_dir if output_dir is not None else '') + if not os.path.exists(self.output_dir): + print 'Output directory %s not exists, creating it now.' % self.output_dir + os.makedirs(self.output_dir) + + self.log_writer = LogWriter(log_writer) + + self.feature_sets = 'gCRF' + + initStart = time.time() + + self.preprocesser = None + try: + self.preprocesser = Preprocesser() + except Exception, e: + print "*** Loading Preprocessing module failed..." 
+ print traceback.print_exc() + + raise e + # MM replace CRF segmenter with a fake one that loads segmentation + # from a file + load_prepared_seg = True + if load_prepared_seg: + self.segmenter = GoldSegmenter('../texts/results/test_batch_gold_seg') + else: + try: + self.segmenter = CRFSegmenter(_name = self.feature_sets, verbose = self.verbose, global_features = self.global_features) + except Exception, e: + print "*** Loading Segmentation module failed..." + print traceback.print_exc() + + raise e + + try: + if not self.skip_parsing: + self.treebuilder = CRFTreeBuilder(_name = self.feature_sets, verbose = self.verbose) + else: + self.treebuilder = None + except Exception, e: + print "*** Loading Tree-building module failed..." + print traceback.print_exc() + raise e + + + initEnd = time.time() + print 'Finished initialization in %.2f seconds.' % (initEnd - initStart) + print + + + def unload(self): + if self.preprocesser is not None: + self.preprocesser.unload() + + if not self.segmenter is None: + self.segmenter.unload() + + if not self.treebuilder is None: + self.treebuilder.unload() + + + def parse(self, filename): + if not os.path.exists(filename): + print '%s does not exist.' % filename + return + + self.log_writer.write('***** Parsing %s...' % filename) + + try: + core_filename = os.path.split(filename)[1] + serialized_doc_filename = os.path.join(self.output_dir, core_filename + '.doc.ser') + doc = None + if os.path.exists(serialized_doc_filename): + doc = utils.serialize.loadData(core_filename, self.output_dir, '.doc.ser') + + if doc is None or not doc.preprocessed: + preprocessStart = time.time() + doc = Document() + doc.preprocess(filename, self.preprocesser) + + preprocessEnd = time.time() + + print 'Finished preprocessing in %.2f seconds.' % (preprocessEnd - preprocessStart) + self.log_writer.write('Finished preprocessing in %.2f seconds.' % (preprocessEnd - preprocessStart)) + + if self.save_preprocessed_doc: + print 'Saved preprocessed document data to %s.' % serialized_doc_filename + utils.serialize.saveData(core_filename, doc, self.output_dir, '.doc.ser') + + else: + print 'Loaded saved serialized document data.' + + print + except Exception, e: + print "*** Preprocessing failed ***" + print traceback.print_exc() + + raise e + + try: + if not doc.segmented: + segStart = time.time() + + self.segmenter.segment(doc, filename) # MM added filename for GoldSegmenter + + if self.verbose: + print 'edus' + for e in doc.edus: + print e + print + print 'cuts' + for cut in doc.cuts: + print cut + print + print 'edu_word_segmentation' + + segEnd = time.time() + print 'Finished segmentation in %.2f seconds.' % (segEnd - segStart) + print 'Segmented into %d EDUs.' % len(doc.edus) + + + self.log_writer.write('Finished segmentation in %.2f seconds. Segmented into %d EDUs.' % ((segEnd - segStart), len(doc.edus))) + if self.save_preprocessed_doc: + print 'Saved segmented document data to %s.' % serialized_doc_filename + utils.serialize.saveData(core_filename, doc, self.output_dir, '.doc.ser') + else: + print 'Already segmented into %d EDUs.' 
% len(doc.edus) + + print + + if options.verbose: + for e in doc.edus: + print e + + + except Exception, e: + print "*** Segmentation failed ***" + print traceback.print_exc() + + raise e + + + try: + ''' Step 2: build text-level discourse tree ''' + if self.skip_parsing: + outfname = os.path.join(self.output_dir, core_filename + ".edus") + print 'Output EDU segmentation result to %s' % outfname + f_o = open(outfname, "w") + for sentence in doc.sentences: + sent_id = sentence.sent_id + edu_segmentation = doc.edu_word_segmentation[sent_id] + i = 0 + sent_out = [] + for (j, token) in enumerate(sentence.tokens): + sent_out.append(token.word) + if j < len(sentence.tokens) - 1 and j == edu_segmentation[i][1] - 1: + sent_out.append('EDU_BREAK') + i += 1 + f_o.write(' '.join(sent_out) + '\n') + + f_o.flush() + f_o.close() + else: + treeBuildStart = time.time() + # + outfname = os.path.join(self.output_dir, core_filename + ".tree") + + pt = self.treebuilder.build_tree(doc) + + print 'Finished tree building.' + + if pt is None: + print "No tree could be built..." + + if not self.treebuilder is None: + self.treebuilder.unload() + + return -1 + + # Unescape the parse tree + if pt: + doc.discourse_tree = pt + treeBuildEnd = time.time() + + # print out + print 'Finished tree building in %.2f seconds.' % (treeBuildEnd - treeBuildStart) + self.log_writer.write('Finished tree building in %.2f seconds.' % (treeBuildEnd - treeBuildStart)) + + for i in range(len(doc.edus)): + pt.__setitem__(pt.leaf_treeposition(i), '_!%s!_' % ' '.join(doc.edus[i])) + + out = pt.pprint() + print 'Output tree building result to %s.' % outfname + f_o = open(outfname, "w") + f_o.write(out) + f_o.close() + + + if self.save_preprocessed_doc: + print 'Saved fully processed document data to %s.' % serialized_doc_filename + utils.serialize.saveData(core_filename, doc, self.output_dir, '.doc.ser') + + print + except Exception, e: + print traceback.print_exc() + + raise e + + print '===================================================' + #return dists#, probs + +def main(options, args): + parser = None + try: + if options.output_dir: + output_dir = args[0] + start_arg = 1 + else: + output_dir = None + start_arg = 0 + + log_writer = None + if options.logging: + log_fname = os.path.join(paths.LOGS_PATH, 'log_%s.txt' % (output_dir if output_dir else datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))) + log_writer = open(log_fname, 'w') + + + if options.filelist: + file_fname = args[start_arg] + if not os.path.exists(file_fname) or not os.path.isfile(file_fname): + print 'The specified file list %s is not a file or does not exist' % file_fname + return + + parser = DiscourseParser(options = options, + output_dir = output_dir, + log_writer = log_writer) + + files = [] + skips = 0 + if options.filelist: + file_fname = args[start_arg] + for line in open(file_fname).readlines(): + fname = line.strip() + + if os.path.exists(fname): + if os.path.exists(os.path.join(parser.output_dir, os.path.split(fname)[1] + '.tree')): + skips += 1 + else: + files.append(fname) + else: + skips += 1 +# print 'Skip %s since it does not exist.' 
% fname + else: + fname = args[start_arg] +# print os.path.join(paths.tmp_folder, os.path.split(fname)[1] + '.xml') + if os.path.exists(fname): + if os.path.exists(os.path.join(parser.output_dir, os.path.split(fname)[1] + '.tree')): + skips += 1 + else: + files.append(fname) + else: + skips += 1 + + print 'Processing %d documents, skipping %d' % (len(files), skips) + + for (i, filename) in enumerate(files): + print 'Parsing %s, progress: %.2f (%d out of %d)' % (filename, i * 100.0 / len(files), i, len(files)) + + try: + parser.parse(filename) + + parser.log_writer.write('===================================================') + except Exception, e: + print 'Some error occurred, skipping the file' + raise e + + parser.unload() + + except Exception, e: + print traceback.print_exc() + if not parser is None: + parser.unload() + + + +v = '1.0' +if __name__ == '__main__': + usage = "Usage: %prog [options] input_file/dir" + + optParser = OptionParser(usage=usage, version="%prog " + v) + optParser.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="verbose mode") + optParser.add_option("-s", "--skip_parsing", + action="store_true", dest="skip_parsing", default=False, + help="Skip parsing, i.e., conduct segmentation only.") + optParser.add_option("-D", "--filelist", + action="store_true", dest="filelist", default=False, + help="parse all files specified in the filelist file, one file per line.") + optParser.add_option("-t", "--output_dir", + action="store_true", dest="output_dir", default=False, + help="Specify a directory for output files.") + optParser.add_option("-g", "--global_features", + action="store_true", dest="global_features", default=False, + help="Perform a second pass of EDU segmentation using global features.") + optParser.add_option("-l", "--logging", + action="store_true", dest="logging", default=False, + help="Perform logging while parsing.") + optParser.add_option("-e", "--save", + action="store_true", dest="save_preprocessed_doc", default=False, + help="Save preprocessed document into serialized file for future use.") + + + + (options, args) = optParser.parse_args() + if len(args) == 0: + optParser.print_help() + sys.exit(1) + + + main(options, args) + diff --git a/repro/gcrf/preprocesser.py b/repro/gcrf/preprocesser.py new file mode 100644 index 0000000..ed90fb4 --- /dev/null +++ b/repro/gcrf/preprocesser.py @@ -0,0 +1,228 @@ +''' +Created on 2014-01-18 + +@author: Wei +''' +import subprocess +import paths +from document.sentence import Sentence +from document.token import Token +from trees.lexicalized_tree import LexicalizedTree +import prep_utils +import os.path +from syntax_parser import SyntaxParser +from document.dependency import Dependency +import re + +class Preprocesser: + def __init__(self): + self.syntax_parser = None + + try: + self.syntax_parser = SyntaxParser() + except Exception, e: + raise e + + self.max_sentence_len = 100 + + def heuristic_sentence_splitting(self, raw_sent): + if len(raw_sent) == 0: + return [] + + if len(raw_sent.split()) <= self.max_sentence_len: + return [raw_sent] + + i = len(raw_sent) / 2 + j = i + k = i + 1 + boundaries = [';', ':', '!', '?'] + + results = [] + while j > 0 and k < len(raw_sent) - 1: + if raw_sent[j] in boundaries: + l_sent = raw_sent[ : j + 1] + r_sent = raw_sent[j + 1 : ].strip() + + if len(l_sent.split()) > 1 and len(r_sent.split()) > 1: + results.extend(self.heuristic_sentence_splitting(l_sent)) + results.extend(self.heuristic_sentence_splitting(r_sent)) + return results + else: + 
j -= 1 + k += 1 + elif raw_sent[k] in boundaries: + l_sent = raw_sent[ : k + 1] + r_sent = raw_sent[k + 1 : ].strip() + + if len(l_sent.split()) > 1 and len(r_sent.split()) > 1: + results.extend(self.heuristic_sentence_splitting(l_sent)) + results.extend(self.heuristic_sentence_splitting(r_sent)) + return results + else: + j -= 1 + k += 1 + else: + j -= 1 + k += 1 + + if len(results) == 0: + return [raw_sent] + + + def parse_single_sentence(self, raw_text): + return self.syntax_parser.parse_sentence(raw_text) + + + def process_single_sentence(self, doc, raw_text, end_of_para): + sentence = Sentence(len(doc.sentences), raw_text + ('' if not end_of_para else '

'), doc) + parse_tree_str, deps_str = self.parse_single_sentence(raw_text) + + parse = LexicalizedTree.parse(parse_tree_str, leaf_pattern = '(?<=\\s)[^\)\(]+') + sentence.set_unlexicalized_tree(parse) + + for (token_id, te) in enumerate(parse.leaves()): + word = te + token = Token(word, token_id + 1, sentence) + sentence.add_token(token) + + heads = self.get_heads(sentence, deps_str.split('\n')) + sentence.heads = heads + sentence.set_lexicalized_tree(prep_utils.create_lexicalized_tree(parse, heads)) + + doc.add_sentence(sentence) + + + def get_heads(self, sentence, dep_elems): + heads = [] + for token in sentence.tokens: + heads.append([token.word, token.get_PoS_tag(), 0]) + + for dep_e in dep_elems: + m = re.match('(.+?)\((.+?)-(\d+?), (.+?)-(\d+?)\)', dep_e) + if m: + relation = m.group(1) + gov_id = int(m.group(3)) + dep_id = int(m.group(5)) + + heads[dep_id - 1][2] = gov_id + sentence.add_dependency(Dependency(gov_id, dep_id, relation)) + + + return heads + + + def sentence_splitting(self, raw_filename, doc): + doc.sentences = [] + + cmd = 'perl %s/boundary.pl -d %s/HONORIFICS -i %s' % (paths.SSPLITTER_PATH, paths.SSPLITTER_PATH, os.path.abspath(raw_filename)) + + p = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) + output, errdata = p.communicate() + + if len(errdata) == 0: + raw_paras = output.strip().split('\n\n') + seg_sents = [] + for para_idx, raw_string in enumerate(raw_paras): + raw_sentences = raw_string.split('\n') + # MM + if (os.path.basename(raw_filename) == 'wsj_0655.out' + and para_idx == 8): + # the segmenter wrongly splits on "[{Mr.] [Ortega's}]" + # => repair by merging sentences + raw_sentences = ([raw_sentences[0] + ' ' + + raw_sentences[1]] + + raw_sentences[2:]) + elif (os.path.basename(raw_filename) == 'wsj_1169.out' + and para_idx == 0): + # "[Murata Mfg.] [Co.]" + raw_sentences = ([raw_sentences[0] + ' ' + + raw_sentences[1]] + + raw_sentences[2:]) + elif (os.path.basename(raw_filename) == 'wsj_1169.out' + and para_idx == 2): + # [G.m.b.] [H.] + raw_sentences = ([raw_sentences[0] + ' ' + + raw_sentences[1]] + + raw_sentences[2:]) + elif (os.path.basename(raw_filename) == 'wsj_1331.out' + and para_idx == 9): + # [all over again.] ['"] + raw_sentences = (raw_sentences[:1] + + [raw_sentences[1] + ' ' + + raw_sentences[2]]) + elif (os.path.basename(raw_filename) == 'wsj_1376.out' + and para_idx == 5): + # [society.] [. . .] + raw_sentences = (raw_sentences[:1] + + [raw_sentences[1] + ' ' + + raw_sentences[2]] + + raw_sentences[3:]) + elif (os.path.basename(raw_filename) == 'wsj_1376.out' + and para_idx == 6): + # [` Hello.] ['] (twice) + # move the trailing "'" up from the next raw sentence, + # and drop the whitespace after it + raw_sentences[3] = raw_sentences[3] + raw_sentences[4][0] + raw_sentences[4] = raw_sentences[4][2:] + # same for the next sentence + raw_sentences[4] = raw_sentences[4] + raw_sentences[5][0] + raw_sentences[5] = raw_sentences[5][2:] + elif (os.path.basename(raw_filename) == 'wsj_1376.out' + and para_idx == 21): + raw_sentences[0] = (raw_sentences[0] + ' ' + + raw_sentences[1] + ' ' + + raw_sentences[2]) + raw_sentences = raw_sentences[:1] + raw_sentences[3:] + elif (os.path.basename(raw_filename) == 'wsj_1380.out' + and para_idx == 6): + # [... Boston Inc. .] ['s First ...] 
+ raw_sentences[0] = (raw_sentences[0] + ' ' + + raw_sentences[1]) + raw_sentences = raw_sentences[:1] + elif (os.path.basename(raw_filename) == 'wsj_2385.out' + and para_idx in [4, 5, 12]): + # double dash is equivalent here to ":", hence same + # sentence, ex: [... Co. .][-- ...] + raw_sentences[0] = (raw_sentences[0] + ' ' + + raw_sentences[1]) + raw_sentences = raw_sentences[:1] + elif (os.path.basename(raw_filename) == 'wsj_2386.out' + and para_idx == 2): + raw_sentences[0] = (raw_sentences[0] + ' ' + + raw_sentences[1]) + raw_sentences = raw_sentences[:1] + raw_sentences[2:] + elif False: + print para_idx + print raw_sentences + # end MM + for (i, raw_sent) in enumerate(raw_sentences): + if len(raw_sent.split()) > self.max_sentence_len: + chunked_raw_sents = self.heuristic_sentence_splitting(raw_sent) + if len(chunked_raw_sents) == 1: + continue + + for (j, sent) in enumerate(chunked_raw_sents): + seg_sents.append((sent, i == len(raw_sentences) - 1 and j == len(chunked_raw_sents))) + else: + seg_sents.append((raw_sent, i == len(raw_sentences) - 1)) + # MM + if False and os.path.basename(raw_filename) == 'wsj_2386.out': + raise ValueError('gni') + # end MM + else: + raise NameError("*** Sentence splitter crashed, with trace %s..." % errdata) + + + for (i, (raw_text, end_of_para)) in enumerate(seg_sents): + if i % 10 == 0: + print 'Processing sentence %d out of %d' % (i, len(seg_sents)) + + self.process_single_sentence(doc, raw_text, end_of_para) + + def preprocess(self, raw_filename, doc): + self.sentence_splitting(raw_filename, doc) + + + def unload(self): + if self.syntax_parser: + self.syntax_parser.unload() From 6cb02c737a1f0baa90bbd1f37f6c9962d4cb9333 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Oct 2016 17:57:51 +0200 Subject: [PATCH 29/74] ENH disdep eval for gCRF, replace globals with options --- evals/dis2disdep.py | 16 ++++++++++++++-- evals/eval_disdep.py | 4 ++-- evals/showdown.py | 31 ++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index d3140db..bcca38e 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -19,6 +19,7 @@ from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) +from evals.gcrf_tree_format import load_gcrf_dtrees from evals.ji import load_ji_dtrees @@ -37,7 +38,9 @@ # output of Joty's parser OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') # output of Feng & Hirst's parser -OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/tmp/') +OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/phil/tmp/') +# output of Feng & Hirst's parser +OUT_FENG2 = os.path.join('/home/mmorey/melodi/rst/feng_hirst/gCRF_dist/texts/results/test_batch_gold_seg') # output of Ji's parser OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') @@ -51,7 +54,8 @@ def main(): choices=['chain', 'tree'], help="Encoding for n-ary nodes") parser.add_argument('--author', default='gold', - choices=['gold', 'silver', 'joty', 'feng', 'ji'], + choices=['gold', 'silver', + 'joty', 'feng', 'feng2', 'ji'], help="Author of the version of the corpus") parser.add_argument('--split', default='test', choices=['train', 'test', 'double'], @@ -112,6 +116,14 @@ def main(): for doc_name, dtree in dtrees.items(): dtree.origin = FileId(doc_name, None, None, None) + elif author == 'feng2': + if corpus_split != 'test': + raise ValueError("The output of Feng & Hirst's parser is " + "available for the 'test' split only") + 
dtrees = load_gcrf_dtrees(OUT_FENG2, REL_CONV) + for doc_name, dtree in dtrees.items(): + dtree.origin = FileId(doc_name, None, None, None) + elif author == 'ji': if corpus_split != 'test': raise ValueError("The output of Ji & Eisenstein's parser is " diff --git a/evals/eval_disdep.py b/evals/eval_disdep.py index 8cbd6f6..81be1b8 100755 --- a/evals/eval_disdep.py +++ b/evals/eval_disdep.py @@ -17,12 +17,12 @@ description="Evaluate dis_dep trees against a given reference") parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', - 'joty', 'feng', 'ji', + 'joty', 'feng', 'feng2', 'ji', 'ours'], help="Author(s) of the predictions") parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', - 'joty', 'feng', 'ji', + 'joty', 'feng', 'feng2', 'ji', 'ours'], help="Author of the reference") parser.add_argument('--nary_enc', default='chain', diff --git a/evals/showdown.py b/evals/showdown.py index 3c0e01c..0f128f6 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -94,10 +94,6 @@ # level of detail for parseval DETAILED = False -SPAN_SEL = None # None, 'leaves', 'non-leaves' -# "PER_DOC = True" computes p, r, f as in DPLP: compute scores per doc, -# then average over docs -PER_DOC = False # should be False, except for comparison with the DPLP paper STRINGENT = False # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' @@ -177,6 +173,11 @@ def main(): help="Binarize the reference ctree for the eval") parser.add_argument('--simple_rsttree', action='store_true', help="Binarize ctree and move relations up") + parser.add_argument('--span_sel', default='none', + choices=['none', 'leaves', 'non-leaves'], + help="Binarize ctree and move relations up") + parser.add_argument('--per_doc', action='store_true', + help="Doc-averaged scores (cf. 
Ji's eval)") # args = parser.parse_args() author_true = args.author_true @@ -185,6 +186,18 @@ def main(): nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree + span_sel = args.span_sel + if span_sel == 'none': + span_sel = None + if simple_rsttree: + # the point of evaluating on simple rst trees is to get leaves + # out of the way + span_sel = 'non-leaves' + # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc + # then average over docs + # it should be False, except for comparison with the DPLP paper + per_doc = args.per_doc + # if binarize_true and nary_enc_true != 'chain': raise ValueError("--binarize_true is compatible with " "--nary_enc_true chain only") @@ -399,8 +412,8 @@ def main(): with codecs.open(parser_name + '/' + doc_name + '.c_eval', mode='w', encoding='utf-8') as f: print(parseval_report([ct_true], [ct_pred], digits=4, - span_sel=SPAN_SEL, - per_doc=PER_DOC, + span_sel=span_sel, + per_doc=per_doc, stringent=STRINGENT), file=f) # end WIP @@ -408,14 +421,14 @@ def main(): # compute and print PARSEVAL scores print(parser_name) print(parseval_report(ctree_true_list, ctree_pred_list, digits=4, - span_sel=SPAN_SEL, - per_doc=PER_DOC, + span_sel=span_sel, + per_doc=per_doc, stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: print(parseval_detailed_report(ctree_true_list, ctree_pred_list, metric_type='S+R', - span_sel=SPAN_SEL)) + span_sel=span_sel)) # end FIXME From c721d8518768a8eb29f0719f227a41758154b59b Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 13 Oct 2016 12:51:25 +0200 Subject: [PATCH 30/74] FIX add option to load EDUs in gcrf parse.py --- repro/gcrf/parse.py | 90 +++++++++++++++++++------------------- repro/gcrf/preprocesser.py | 12 +++++ 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/repro/gcrf/parse.py b/repro/gcrf/parse.py index 419acf7..a0ca48f 100644 --- a/repro/gcrf/parse.py +++ b/repro/gcrf/parse.py @@ -4,34 +4,33 @@ @author: Vanessa Wei Feng ''' -from segmenters.crf_segmenter import CRFSegmenter -from segmenters.gold_segmenter import GoldSegmenter # MM -from treebuilder.build_tree_CRF import CRFTreeBuilder - -from optparse import OptionParser - -import paths import os.path import sys -from document.doc import Document import time import traceback from datetime import datetime +from optparse import OptionParser +import paths +import utils.serialize +from document.doc import Document from logs.log_writer import LogWriter from prep.preprocesser import Preprocesser +from segmenters.crf_segmenter import CRFSegmenter +from segmenters.gold_segmenter import GoldSegmenter # MM +from treebuilder.build_tree_CRF import CRFTreeBuilder -import utils.serialize class DiscourseParser(): - def __init__(self, options, output_dir = None, - log_writer = None): + def __init__(self, options, output_dir=None, log_writer=None): self.verbose = options.verbose self.skip_parsing = options.skip_parsing self.global_features = options.global_features self.save_preprocessed_doc = options.save_preprocessed_doc - self.output_dir = os.path.join(paths.OUTPUT_PATH, output_dir if output_dir is not None else '') + self.output_dir = os.path.join( + paths.OUTPUT_PATH, + output_dir if output_dir is not None else '') if not os.path.exists(self.output_dir): print 'Output directory %s not exists, creating it now.' 
% self.output_dir os.makedirs(self.output_dir) @@ -50,35 +49,37 @@ def __init__(self, options, output_dir = None, print traceback.print_exc() raise e - # MM replace CRF segmenter with a fake one that loads segmentation - # from a file - load_prepared_seg = True - if load_prepared_seg: - self.segmenter = GoldSegmenter('../texts/results/test_batch_gold_seg') + + # MM enable to load segmentation from .edus files + self.load_edus = options.load_edus + if self.load_edus: + # fake EDU segmenter that loads segmentation from files in a + # folder + self.segmenter = GoldSegmenter(self.load_edus) else: try: - self.segmenter = CRFSegmenter(_name = self.feature_sets, verbose = self.verbose, global_features = self.global_features) + self.segmenter = CRFSegmenter( + _name=self.feature_sets, verbose=self.verbose, + global_features=self.global_features) except Exception, e: print "*** Loading Segmentation module failed..." print traceback.print_exc() - raise e - + try: if not self.skip_parsing: - self.treebuilder = CRFTreeBuilder(_name = self.feature_sets, verbose = self.verbose) + self.treebuilder = CRFTreeBuilder( + _name=self.feature_sets, verbose=self.verbose) else: self.treebuilder = None except Exception, e: print "*** Loading Tree-building module failed..." print traceback.print_exc() raise e - - + initEnd = time.time() print 'Finished initialization in %.2f seconds.' % (initEnd - initStart) print - def unload(self): if self.preprocesser is not None: @@ -89,8 +90,7 @@ def unload(self): if not self.treebuilder is None: self.treebuilder.unload() - - + def parse(self, filename): if not os.path.exists(filename): print '%s does not exist.' % filename @@ -125,15 +125,17 @@ def parse(self, filename): print except Exception, e: print "*** Preprocessing failed ***" - print traceback.print_exc() - + print traceback.print_exc() raise e try: if not doc.segmented: segStart = time.time() - - self.segmenter.segment(doc, filename) # MM added filename for GoldSegmenter + if self.load_edus: + # MM GoldSegmenter needs a filename + self.segmenter.segment(doc, filename) + else: + self.segmenter.segment(doc) if self.verbose: print 'edus' @@ -149,8 +151,7 @@ def parse(self, filename): segEnd = time.time() print 'Finished segmentation in %.2f seconds.' % (segEnd - segStart) print 'Segmented into %d EDUs.' % len(doc.edus) - - + self.log_writer.write('Finished segmentation in %.2f seconds. Segmented into %d EDUs.' % ((segEnd - segStart), len(doc.edus))) if self.save_preprocessed_doc: print 'Saved segmented document data to %s.' 
% serialized_doc_filename @@ -163,15 +164,12 @@ def parse(self, filename): if options.verbose: for e in doc.edus: print e - - + except Exception, e: print "*** Segmentation failed ***" - print traceback.print_exc() - + print traceback.print_exc() raise e - - + try: ''' Step 2: build text-level discourse tree ''' if self.skip_parsing: @@ -256,7 +254,6 @@ def main(options, args): log_fname = os.path.join(paths.LOGS_PATH, 'log_%s.txt' % (output_dir if output_dir else datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))) log_writer = open(log_fname, 'w') - if options.filelist: file_fname = args[start_arg] if not os.path.exists(file_fname) or not os.path.isfile(file_fname): @@ -314,7 +311,6 @@ def main(options, args): parser.unload() - v = '1.0' if __name__ == '__main__': usage = "Usage: %prog [options] input_file/dir" @@ -339,16 +335,20 @@ def main(options, args): action="store_true", dest="logging", default=False, help="Perform logging while parsing.") optParser.add_option("-e", "--save", - action="store_true", dest="save_preprocessed_doc", default=False, + action="store_true", dest="save_preprocessed_doc", + default=False, help="Save preprocessed document into serialized file for future use.") - - + # MM add option to load segmentation from the .edus files that result + # from calling this parser with the --skip_parsing option + optParser.add_option('-r', '--load_edus', + dest='load_edus', default=False, + help="Read segmentation from .edus files in folder") + # end MM (options, args) = optParser.parse_args() if len(args) == 0: optParser.print_help() sys.exit(1) - - + main(options, args) diff --git a/repro/gcrf/preprocesser.py b/repro/gcrf/preprocesser.py index ed90fb4..0d5be7b 100644 --- a/repro/gcrf/preprocesser.py +++ b/repro/gcrf/preprocesser.py @@ -127,6 +127,8 @@ def sentence_splitting(self, raw_filename, doc): # MM if (os.path.basename(raw_filename) == 'wsj_0655.out' and para_idx == 8): + # this error is in the original text *and* is redone + # by the segmenter: # the segmenter wrongly splits on "[{Mr.] [Ortega's}]" # => repair by merging sentences raw_sentences = ([raw_sentences[0] + ' ' @@ -134,24 +136,30 @@ def sentence_splitting(self, raw_filename, doc): + raw_sentences[2:]) elif (os.path.basename(raw_filename) == 'wsj_1169.out' and para_idx == 0): + # this error is in the original text *and* is redone + # by the segmenter: # "[Murata Mfg.] [Co.]" raw_sentences = ([raw_sentences[0] + ' ' + raw_sentences[1]] + raw_sentences[2:]) elif (os.path.basename(raw_filename) == 'wsj_1169.out' and para_idx == 2): + # this error is in the original text *and* is redone + # by the segmenter: # [G.m.b.] [H.] raw_sentences = ([raw_sentences[0] + ' ' + raw_sentences[1]] + raw_sentences[2:]) elif (os.path.basename(raw_filename) == 'wsj_1331.out' and para_idx == 9): + # text is correct, only the segmenter makes an error: # [all over again.] ['"] raw_sentences = (raw_sentences[:1] + [raw_sentences[1] + ' ' + raw_sentences[2]]) elif (os.path.basename(raw_filename) == 'wsj_1376.out' and para_idx == 5): + # text is correct, only the segmenter makes an error: # [society.] [. . .] 
raw_sentences = (raw_sentences[:1] + [raw_sentences[1] + ' ' @@ -169,18 +177,21 @@ def sentence_splitting(self, raw_filename, doc): raw_sentences[5] = raw_sentences[5][2:] elif (os.path.basename(raw_filename) == 'wsj_1376.out' and para_idx == 21): + # error by the segmenter raw_sentences[0] = (raw_sentences[0] + ' ' + raw_sentences[1] + ' ' + raw_sentences[2]) raw_sentences = raw_sentences[:1] + raw_sentences[3:] elif (os.path.basename(raw_filename) == 'wsj_1380.out' and para_idx == 6): + # error by the segmenter # [... Boston Inc. .] ['s First ...] raw_sentences[0] = (raw_sentences[0] + ' ' + raw_sentences[1]) raw_sentences = raw_sentences[:1] elif (os.path.basename(raw_filename) == 'wsj_2385.out' and para_idx in [4, 5, 12]): + # error by the segmenter # double dash is equivalent here to ":", hence same # sentence, ex: [... Co. .][-- ...] raw_sentences[0] = (raw_sentences[0] + ' ' @@ -188,6 +199,7 @@ def sentence_splitting(self, raw_filename, doc): raw_sentences = raw_sentences[:1] elif (os.path.basename(raw_filename) == 'wsj_2386.out' and para_idx == 2): + # error by the segmenter raw_sentences[0] = (raw_sentences[0] + ' ' + raw_sentences[1]) raw_sentences = raw_sentences[:1] + raw_sentences[2:] From f931c4468d4c5e2fcf889feb6c3f1f3f0bd14b8f Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 13 Oct 2016 12:54:56 +0200 Subject: [PATCH 31/74] ENH add missing utils: conda env file, script to reinject gold EDU seg in .edus files --- repro/gcrf/environment.yml | 4 + repro/gcrf/gen_gold_edus.py | 179 ++++++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 repro/gcrf/environment.yml create mode 100644 repro/gcrf/gen_gold_edus.py diff --git a/repro/gcrf/environment.yml b/repro/gcrf/environment.yml new file mode 100644 index 0000000..bb816bb --- /dev/null +++ b/repro/gcrf/environment.yml @@ -0,0 +1,4 @@ +name: gcrf +dependencies: + - python=2.7 + - nltk=2.0.4 diff --git a/repro/gcrf/gen_gold_edus.py b/repro/gcrf/gen_gold_edus.py new file mode 100644 index 0000000..4eea6a4 --- /dev/null +++ b/repro/gcrf/gen_gold_edus.py @@ -0,0 +1,179 @@ +"""Generate .edus files for Feng's gCRF parser, with gold EDUs. + +""" + +from __future__ import absolute_import, print_function + +import argparse +from difflib import SequenceMatcher +from glob import glob +import os + +import numpy as np + +TXT_MAP = [ + (' .', '.'), + (' ,', ','), + (' %', '%'), + (' :', ':'), + ('-LRB-', '('), + ('-RRB-', ')'), + # non-breaking space + # FIXME switch to unicode where this is a unique char: u"\u00A0" + ('\xc2\xa0', ' '), + ("do n't", "don't"), + ('...', '. . .'), +] + + +def dump_gcrf_edus_gold(f_gold, f_pred, f_dest): + """Reinject gold segmentation into .edus files output by gCRF. + + Parameters + ---------- + f_gold: str + Path to the gold .edus file + f_pred: str + Path to the predicted .edus file + f_dest: str + Path to the output + """ + txt_gold = f_gold.read() + i_gold = 0 # pointer in txt_gold + + skip_toks = 0 # nb of tokens from _pred that have already been consumed + + for line in f_pred: + tokens_pred = line.split(' ') + # the newline character (marking the end of sentence) is appended + # to the last token + assert tokens_pred[-1][-1] == '\n' + # + for i, tok in enumerate(tokens_pred): + if skip_toks: + # skip tokens from _pred that have already been consumed + skip_toks -= 1 + continue + + while txt_gold[i_gold] == ' ': + # skip whitespaces in gold + i_gold += 1 + + if (tok[0] == '.' and tokens_pred[i - 1][-1] == '.' 
+ and txt_gold[i_gold] != '.'): + # preprocessing adds an extra full stop when the last + # token ends with one (e.g. for abbreviations: + # "Inc." => "Inc. .") + if len(tok) > 1: + # skip extra stop, resume normal matching procedure + tok = tok[1:] + else: + # token is exactly '.' => skip it + continue + + if tok == 'EDU_BREAK': + # predicted EDU break inside sentence + if txt_gold[i_gold] == '\n': + # also in gold => correctly predicted => leave it + print(tok, end=' ', file=f_dest) + i_gold += 1 + continue + else: + # not in gold => erroneously predicted => delete it + # (this is a silent operation) + continue + elif tok == '\n' and txt_gold[i_gold] == '\n': + # happens when the token before the newline was a copy of + # the punctuation added by preprocessing, removed above ; + # ex: "... Inc." => "... Inc. ." + print(tok, end='', file=f_dest) + i_gold += 1 + continue + + if txt_gold[i_gold:i_gold + 5] == '\n ': + # gold EDU break inside sentence, missing from predicted + print('EDU_BREAK', end=' ', file=f_dest) # FIXME to f_dest + i_gold += 5 + + # match token + # whitespaces inside tokens are non-breaking spaces: + # \xc2\xa0 in ascii, but we should really be processing + # them as unicode symbols... + tok_txt_gold = (tok + .replace('\xc2\xa0', ' ') + .replace('-LRB-', '(') + .replace('-RRB-', ')') + .replace('-LCB-', '{') + .replace('-RCB-', '}') + .replace('``', '"') + .replace("''", '"') + .replace('...', '. . .') + ) + if i < len(tokens_pred) - 1: + # all tokens except for the last of the sentence + if (txt_gold[i_gold:i_gold + len(tok_txt_gold)] + == tok_txt_gold): + # it is a match indeed + i_gold += len(tok_txt_gold) + # print token followed by a whitespace + print(tok, end=' ', file=f_dest) # FIXME to f_dest + continue + else: + print() + print('wow') + print(tokens_pred[i:]) + print(repr(txt_gold[i_gold:i_gold + len(tok_txt_gold)]), + repr(tok)) + raise ValueError('gni') + else: + # last token of the sentence + if (txt_gold[i_gold:i_gold + len(tok_txt_gold) + 1] + == tok_txt_gold[:-1] + ' ' + tok[-1]): + # gold has an extra whitespace before the newline + i_gold += len(tok_txt_gold) + 1 + # token but no following whitespace + print(tok, end='', file=f_dest) + elif (txt_gold[i_gold:i_gold + 7] == '. . . .' + and tok == '...\n'): + # pre-processing replaces '[. . .] [.]' with '...' 
; + # let's assume it's normal + i_gold += 7 + print(tok, end='', file=f_dest) + else: + print() + print('i-2', tokens_pred[i - 2]) + print('i-1', tokens_pred[i - 1]) + print('i', tokens_pred[i]) + print(repr(txt_gold[i_gold:i_gold + len(tok_txt_gold)]), + repr(tok)) + raise ValueError('pouet') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Generate .edus files with gold segmentation') + parser.add_argument('dir_gold', metavar='DIR', + help='folder with the gold files (.edus)') + parser.add_argument('dir_pred', metavar='DIR', + help='folder with the predicted files (.edus)') + parser.add_argument('dir_dest', metavar='DIR', + help='output folder') + + args = parser.parse_args() + + # setup output dir + if not os.path.exists(args.dir_dest): + os.makedirs(args.dir_dest) + + files_edus_gold = sorted(glob(os.path.join(args.dir_gold, '*.edus'))) + files_edus_pred = sorted(glob(os.path.join(args.dir_pred, '*.edus'))) + for file_gold, file_pred in zip(files_edus_gold, files_edus_pred): + print(file_gold) + assert os.path.basename(file_gold) == os.path.basename(file_pred) + file_dest = os.path.join(args.dir_dest, + os.path.basename(file_pred)) + + with open(file_gold) as f_gold: + with open(file_pred) as f_pred: + with open(file_dest, mode='w') as f_dest: + dump_gcrf_edus_gold(f_gold, f_pred, f_dest) From d3befbd2d4518a31d8ced724540713958ed35625 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 13 Oct 2016 17:34:36 +0200 Subject: [PATCH 32/74] ENH load gold EDU seg in dplp --- repro/dplp/buildedu.py | 186 ++++++++++++++++++++++++++++++++++++++++ repro/dplp/rstparser.py | 32 +++++++ 2 files changed, 218 insertions(+) create mode 100644 repro/dplp/buildedu.py create mode 100644 repro/dplp/rstparser.py diff --git a/repro/dplp/buildedu.py b/repro/dplp/buildedu.py new file mode 100644 index 0000000..e7517d7 --- /dev/null +++ b/repro/dplp/buildedu.py @@ -0,0 +1,186 @@ +## buildedu.py +## Author: Yangfeng Ji +## Date: 05-03-2015 +## Time-stamp: + +from os import listdir +from os.path import join, basename +from model.classifier import Classifier +from model.docreader import DocReader +from model.sample import SampleGenerator +from cPickle import load +import gzip + + +# MM +from glob import glob +import os + +DOC_EDUS = {os.path.splitext(os.path.basename(f))[0]: f + for f in glob(os.path.join( + '/home/mmorey/melodi/rst/ji_eisenstein', + 'DPLP/data/edus/*/*.edus'))} + + +def load_gold_edus(conll_file): + """Load gold EDUs for injection into a conll file. + + Parameters + ---------- + conll_file: str + Path to the conll file. + + Returns + ------- + edu_idc: list? of int + Index of the EDU for each token. 
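+
+    Notes
+    -----
+    In the current implementation the returned list actually holds one 0/1
+    flag per token: 1 if the token is the last token of its EDU, 0 otherwise
+    (see the construction of `result` below).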
+ """ + result = [] # 1 if token is the last of its EDU, 0 otherwise + + doc_name = os.path.splitext(os.path.basename(conll_file))[0] + # find corresponding file with gold EDUs + fname_edus = DOC_EDUS[doc_name] + edus = [] + with open(fname_edus) as f_edus: + for line in f_edus: + line = line.strip() + if not line: + continue + # non-empty line + edus.append(line) + # open conll file and align tokens + edu_idx = 0 + edu_txt = edus[edu_idx] # remaining text of current EDU + with open(conll_file) as f_conll: + for line in f_conll: + line = line.strip() + if not line: + continue + fields = line.split('\t') + wform_conll = fields[2] # word form + # try to read the same amount of characters off the current EDU + wform_edus = edu_txt[:len(wform_conll)] + try: + assert wform_edus == wform_conll + except AssertionError: + if len(wform_edus) < len(wform_conll): + # EDU boundary happens in the middle of a token: + # possible causes: error in the text of the original doc + # (missing whitespace, wrong version of quotes...), or + # a plain error of the segmenter + assert wform_conll.startswith(wform_edus) + # set the EDU boundary at the current token + result.append(1) + # remaining text + rem_txt = wform_conll[len(wform_edus):].strip() + # read the first characters off the next EDU + edu_idx += 1 + if edu_idx == len(edus): + edu_txt = '' + else: + edu_txt = edus[edu_idx] + # read the first characters off the beginning of the + # next EDU, assert that they match + assert edu_txt[:len(rem_txt)] == rem_txt + edu_txt = edu_txt[len(rem_txt):].lstrip() + else: + # we don't know how to handle this (yet) + print(wform_conll, wform_edus) + raise + else: + # print(fields + [edu_idx + 1]) + # update the state of edu_txt for the next iteration + edu_txt = edu_txt[len(wform_conll):].lstrip() + if not edu_txt: + # when the current EDU is exhausted, pass to the next + result.append(1) + edu_idx += 1 + if edu_idx == len(edus): + # normally, the text should be exhausted on both sides + # (.conll and .edus) at the same time ; + # if the .conll has extra text, the following should + # make the assertion above break at the next iteration + # of the loop + edu_txt = '' + else: + edu_txt = edus[edu_idx] + else: + result.append(0) + return result +# end MM + +def main(fmodel, fvocab, rpath, wpath): + clf = Classifier() + dr = DocReader() + clf.loadmodel(fmodel) + flist = [join(rpath,fname) for fname in listdir(rpath) if fname.endswith('conll')] + vocab = load(gzip.open(fvocab)) + for (fidx, fname) in enumerate(flist): + print "Processing file: {}".format(fname) + doc = dr.read(fname, withboundary=False) + # predict segmentation + if False: + sg = SampleGenerator(vocab) + sg.build(doc) + M, _ = sg.getmat() + predlabels = clf.predict(M) + else: + predlabels = load_gold_edus(fname) # RESUME HERE + doc = postprocess(doc, predlabels) + writedoc(doc, fname, wpath) + + +def postprocess(doc, predlabels): + """ Assign predlabels into doc + """ + tokendict = doc.tokendict + for gidx in tokendict.iterkeys(): + if predlabels[gidx] == 1: + tokendict[gidx].boundary = True + else: + tokendict[gidx].boundary = False + if tokendict[gidx].send: + tokendict[gidx].boundary = True + return doc + + +# def writedoc(doc, fname, wpath): +# """ Write doc into a file with the CoNLL-like format +# """ +# tokendict = doc.tokendict +# N = len(tokendict) +# fname = basename(fname) + '.edu' +# fname = join(wpath, fname) +# eduidx = 0 +# with open(fname, 'w') as fout: +# for gidx in range(N): +# fout.write(str(eduidx) + '\n') +# if 
tokendict[gidx].boundary: +# eduidx += 1 +# if tokendict[gidx].send: +# fout.write('\n') +# print 'Write segmentation: {}'.format(fname) + + +def writedoc(doc, fname, wpath): + """ Write file + """ + tokendict = doc.tokendict + N = len(tokendict) + fname = basename(fname).replace(".conll", ".merge") + fname = join(wpath, fname) + eduidx = 1 + with open(fname, 'w') as fout: + for gidx in range(N): + tok = tokendict[gidx] + line = str(tok.sidx) + "\t" + str(tok.tidx) + "\t" + line += tok.word + "\t" + tok.lemma + "\t" + line += tok.pos + "\t" + tok.deplabel + "\t" + line += str(tok.hidx) + "\t" + tok.ner + "\t" + line += tok.partialparse + "\t" + str(eduidx) + "\n" + fout.write(line) + # Boundary + if tok.boundary: + eduidx += 1 + if tok.send: + fout.write("\n") diff --git a/repro/dplp/rstparser.py b/repro/dplp/rstparser.py new file mode 100644 index 0000000..73b553b --- /dev/null +++ b/repro/dplp/rstparser.py @@ -0,0 +1,32 @@ +## main.py +## Author: Yangfeng Ji +## Date: 09-25-2015 +## Time-stamp: + +from code.evalparser import evalparser +from cPickle import load +import gzip, sys + +def main(path, draw=True): + with gzip.open("resources/bc3200.pickle.gz") as fin: + print 'Load Brown clusters for creating features ...' + bcvocab = load(fin) + evalparser(path=path, report=True, draw=draw, + bcvocab=bcvocab, + withdp=False) + + +if __name__ == '__main__': + if len(sys.argv) == 2: + path = sys.argv[1] + print 'Read files from: {}'.format(path) + main(path) + elif len(sys.argv) == 3: + path = sys.argv[1] + draw = eval(sys.argv[2]) + print 'Read files from {}'.format(path) + main(path, draw) + else: + print "Usage: python rstparser.py file_path [draw_rst_tree]" + print "\tfile_path - path to the segmented file" + From 484f7d58d5e6abbc044502c5bc9a6b384055bfa9 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 17 Oct 2016 17:05:55 +0200 Subject: [PATCH 33/74] ENH add support for output of hayashi et al's parsers, dep/li outputs --- evals/dis2disdep.py | 23 +++++++-- evals/eval_disdep.py | 35 +++++++++----- evals/hayashi_deps.py | 107 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 16 deletions(-) create mode 100644 evals/hayashi_deps.py diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index bcca38e..a41b53f 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -20,6 +20,7 @@ TRAIN_FOLDER) from evals.gcrf_tree_format import load_gcrf_dtrees +from evals.hayashi_deps import load_hayashi_dtrees from evals.ji import load_ji_dtrees @@ -33,8 +34,9 @@ RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', 'educe', 'rst_dt', 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree - +REL_CONV_BASE = RstRelationConverter(RELMAP_FILE) +REL_CONV = REL_CONV_BASE.convert_tree +REL_CONV_DTREE = REL_CONV_BASE.convert_dtree # output of Joty's parser OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') # output of Feng & Hirst's parser @@ -43,6 +45,9 @@ OUT_FENG2 = os.path.join('/home/mmorey/melodi/rst/feng_hirst/gCRF_dist/texts/results/test_batch_gold_seg') # output of Ji's parser OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') +# output of Hayashi et al.'s parsers +OUT_HAYASHI_HILDA = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/dep/li/') +OUT_HAYASHI_MST = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/cons/trans_li/') def main(): @@ -55,7 +60,8 @@ def main(): help="Encoding for n-ary nodes") parser.add_argument('--author', default='gold', choices=['gold', 
'silver', - 'joty', 'feng', 'feng2', 'ji'], + 'joty', 'feng', 'feng2', 'ji', + 'hayashi_hilda', 'hayashi_mst'], help="Author of the version of the corpus") parser.add_argument('--split', default='test', choices=['train', 'test', 'double'], @@ -129,6 +135,17 @@ def main(): raise ValueError("The output of Ji & Eisenstein's parser is " "available for the 'test' split only") dtrees = load_ji_dtrees(OUT_JI, REL_CONV) + elif author == 'hayashi_mst': + if corpus_split != 'test': + raise ValueError("The output of Hayashi et al.'s parser is " + "available for the 'test' split only") + dtrees = load_hayashi_dtrees(OUT_HAYASHI_MST, REL_CONV_DTREE) + elif author == 'hayashi_hilda': + if corpus_split != 'test': + raise ValueError("The output of Hayashi et al.'s parser is " + "available for the 'test' split only") + dtrees = load_hayashi_dtrees(OUT_HAYASHI_HILDA, REL_CONV_DTREE) + # do dump dump_disdep_files(dtrees.values(), out_dir) diff --git a/evals/eval_disdep.py b/evals/eval_disdep.py index 81be1b8..7f84965 100755 --- a/evals/eval_disdep.py +++ b/evals/eval_disdep.py @@ -18,11 +18,13 @@ parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', + 'hayashi_hilda', 'hayashi_mst', 'ours'], help="Author(s) of the predictions") parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', + 'hayashi_hilda', 'hayashi_mst', 'ours'], help="Author of the reference") parser.add_argument('--nary_enc', default='chain', @@ -38,12 +40,16 @@ files_true = {os.path.basename(f).rsplit('.')[0]: f for f in glob(os.path.join(dir_true, '*.dis_dep'))} # table header - print('\t'.join(['parser', - 'a', 'l', 'n', 'r', - 'al', 'an', 'ar', - 'aln', 'alr', - 'alnr', - 'support'])) + len_author_str = max(len(x) for x in authors_pred) + print('\t'.join([ + '{parser_name: <{width}}'.format( + parser_name='parser', width=len_author_str), + 'a', 'l', 'n', 'r', + 'al', 'an', 'ar', + 'aln', 'alr', + 'alnr', + 'support' + ])) for author_pred in authors_pred: dir_pred = os.path.join('TMP_disdep', nary_enc, author_pred, 'test') @@ -98,10 +104,13 @@ cnt_alr += 1 if ok_a and ok_l and ok_n and ok_r: cnt_alnr += 1 - print('\t'.join([author_pred] - + ['{:.4f}'.format(float(cnt_x) / cnt_tot) - for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, - cnt_al, cnt_an, cnt_ar, - cnt_aln, cnt_alr, - cnt_alnr]] - + [str(cnt_tot)])) + print('\t'.join( + ['{parser_name: <{width}}'.format( + parser_name=author_pred, width=len_author_str)] + + ['{:.4f}'.format(float(cnt_x) / cnt_tot) + for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, + cnt_al, cnt_an, cnt_ar, + cnt_aln, cnt_alr, + cnt_alnr]] + + [str(cnt_tot)] + )) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py new file mode 100644 index 0000000..f613013 --- /dev/null +++ b/evals/hayashi_deps.py @@ -0,0 +1,107 @@ +"""Load dependencies output by Hayashi et al.'s parsers. + +This module enables to process files in auto_parse/{dep/li,cons/trans_li}. 
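+
+Each non-empty line of a dependency (.dis) file is expected to hold three
+whitespace-separated fields: dependent EDU index, governor EDU index and
+relation label (see `_load_hayashi_dep_file` below).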
+""" + +from __future__ import absolute_import, print_function + +import os +from glob import glob + +from educe.rst_dt.corpus import Reader +from educe.rst_dt.deptree import RstDepTree + + +# load true ctrees, from the TEST section of the RST-DT, to get gold EDUs +RST_DT_DIR = '/home/mmorey/corpora/rst-dt/rst_discourse_treebank/data' +RST_TEST_DIR = os.path.join(RST_DT_DIR, 'RSTtrees-WSJ-main-1.0/TEST') +if not os.path.exists(RST_TEST_DIR): + raise ValueError('Unable to find RST test files at ', RST_TEST_DIR) +RST_TEST_READER = Reader(RST_TEST_DIR) +RST_TEST_CTREES_TRUE = {k.doc: v for k, v in RST_TEST_READER.slurp().items()} + + +def _load_hayashi_dep_file(f, edus): + """Do load. + + Parameters + ---------- + f: File + dep file, open + edus: list of EDU + True EDUs in this document. + + Returns + ------- + dt: RstDepTree + Predicted dtree + """ + dt = RstDepTree(edus=edus, origin=None, nary_enc='tree') # FIXME origin + for line in f: + line = line.strip() + if not line: + continue + dep_idx, gov_idx, lbl = line.split() + dep_idx = int(dep_idx) + gov_idx = int(gov_idx) + dt.add_dependency(gov_idx, dep_idx, label=lbl) + return dt + + +def load_hayashi_dep_file(fname, edus): + """Load a file. + + Parameters + ---------- + fname: str + Path to the file + + Returns + ------- + dt: RstDepTree + Dependency tree corresponding to the content of this file. + """ + with open(fname) as f: + return _load_hayashi_dep_file(f, edus) + + +def load_hayashi_dep_files(out_dir): + """Load dep files output by one of Hayashi et al.'s parser. + + Parameters + ---------- + out_dir: str + Path to the folder containing the .dis files. + """ + dtrees = dict() + for fname in glob(os.path.join(out_dir, '*.dis')): + doc_name = os.path.splitext(os.path.basename(fname))[0] + edus = RST_TEST_CTREES_TRUE[doc_name].leaves() + dtrees[doc_name] = load_hayashi_dep_file(fname, edus) + return dtrees + + +def load_hayashi_dtrees(out_dir, rel_conv): + """Load the dtrees output by one of Hayashi et al.'s parser. + + Parameters + ---------- + out_dir: str + Path to the folder containing .dis files. + rel_conv: RstRelationConverter + Converter for relation labels (fine- to coarse-grained, plus + normalization). + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. 
+ """ + dtree_pred = dict() + + dtrees = load_hayashi_dep_files(out_dir) + for doc_name, dt_pred in dtrees.items(): + if rel_conv is not None: + dt_pred = rel_conv(dt_pred) + dtree_pred[doc_name] = dt_pred + return dtree_pred From d4f9418286e3c19fbe82c91faa840f2cd77ebf21 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 17 Oct 2016 18:47:56 +0200 Subject: [PATCH 34/74] FIX paths to hayashi outputs --- evals/dis2disdep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index a41b53f..fd552fa 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -46,8 +46,8 @@ # output of Ji's parser OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') # output of Hayashi et al.'s parsers -OUT_HAYASHI_HILDA = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/dep/li/') -OUT_HAYASHI_MST = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/cons/trans_li/') +OUT_HAYASHI_MST = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/dep/li/') +OUT_HAYASHI_HILDA = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/cons/trans_li/') def main(): From a4a4202f0fb50131ee6d2b2f23ac84fce349ba75 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 18 Oct 2016 16:14:40 +0200 Subject: [PATCH 35/74] ENH added conda environment.yml, fix local path to corenlp out --- environment.yml | 7 +++++++ irit_rst_dt/local.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..a1140ca --- /dev/null +++ b/environment.yml @@ -0,0 +1,7 @@ +name: irit-rst-dt +dependencies: + - python=2.7 + - nltk + - scikit-learn + - pip: + - "--editable=git+https://github.com/nlhepler/pydot.git#egg=pydot" diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index f805832..ab5c087 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -122,7 +122,7 @@ # CORENLP_OUT_DIR = None # CORENLP_OUT_DIR = '/projets/melodi/corpus/rst-dt-corenlp-2015-01-29' -CORENLP_OUT_DIR = '/home/mmorey/corpora/rst-dt-corenlp-2015-01-29' +CORENLP_OUT_DIR = '/home/mmorey/corpora/rst-dt/rst-dt-corenlp-2015-01-29' """ Where to read parses from CoreNLP from """ From a4b88275bb8897d34a02b036b02f502500330010 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 20 Oct 2016 17:20:05 +0200 Subject: [PATCH 36/74] ENH rst parseval metrics, now in educe --- evals/codra.py | 2 -- evals/li2014.py | 15 ++++++++++----- evals/showdown.py | 42 +++++++++++++++++++----------------------- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index f3b894e..eb9c6f6 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -15,8 +15,6 @@ from educe.rst_dt.document_plus import align_edus_with_paragraphs # from attelo.io import load_edus -from attelo.metrics.constituency import (parseval_detailed_report, - parseval_report) from attelo.metrics.deptree import compute_uas_las diff --git a/evals/li2014.py b/evals/li2014.py index d8c02a5..1135efc 100644 --- a/evals/li2014.py +++ b/evals/li2014.py @@ -3,6 +3,11 @@ This is a reimplementation of this evaluation procedure. """ +from educe.rst_dt.metrics.rst_parseval import (rst_parseval_report, + rst_parseval_detailed_report) + + + # FIXME legacy code brutally dumped here, broken def twisted_eval_li2014(data_true, data_pred): """Run Parseval on transformed gold trees, as in (Li et al., 2014). 
@@ -86,12 +91,12 @@ def eval_distortion_gold(corpus, nuc_strategy, rank_strategy, chn_bin_srtree_ref) gold_twis[doc_name] = chn_bin_rtree_ref - print(parseval_report(gold_orig, gold_twis, - metric_types=[x[0] for x in LBL_FNS], - digits=4)) + print(rst_parseval_report(gold_orig, gold_twis, + metric_types=[x[0] for x in LBL_FNS], + digits=4)) # detailed report on S+N+R - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) + print(rst_parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) def comparative_distortion_on_gold(): diff --git a/evals/showdown.py b/evals/showdown.py index 0f128f6..6d578d1 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -15,9 +15,9 @@ from educe.rst_dt.dep2con import (DummyNuclearityClassifier, InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.metrics.rst_parseval import (rst_parseval_detailed_report, + rst_parseval_report) # -from attelo.metrics.constituency import (parseval_detailed_report, - parseval_report) from attelo.metrics.deptree import compute_uas_las, compute_uas_las_undirected # local to this package @@ -173,9 +173,6 @@ def main(): help="Binarize the reference ctree for the eval") parser.add_argument('--simple_rsttree', action='store_true', help="Binarize ctree and move relations up") - parser.add_argument('--span_sel', default='none', - choices=['none', 'leaves', 'non-leaves'], - help="Binarize ctree and move relations up") parser.add_argument('--per_doc', action='store_true', help="Doc-averaged scores (cf. Ji's eval)") # @@ -186,13 +183,7 @@ def main(): nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree - span_sel = args.span_sel - if span_sel == 'none': - span_sel = None - if simple_rsttree: - # the point of evaluating on simple rst trees is to get leaves - # out of the way - span_sel = 'non-leaves' + # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc # then average over docs # it should be False, except for comparison with the DPLP paper @@ -393,6 +384,10 @@ def main(): for x in ctree_true_list] ctree_pred_list = [SimpleRSTTree.from_rst_tree(x) for x in ctree_pred_list] + ctree_type = 'SimpleRST' + else: + ctree_type = 'RST' + # WIP print SimpleRSTTrees if not os.path.exists('gold'): os.makedirs('gold') @@ -411,24 +406,25 @@ def main(): doc_names, ctree_true_list, ctree_pred_list): with codecs.open(parser_name + '/' + doc_name + '.c_eval', mode='w', encoding='utf-8') as f: - print(parseval_report([ct_true], [ct_pred], digits=4, - span_sel=span_sel, - per_doc=per_doc, - stringent=STRINGENT), + print(rst_parseval_report([ct_true], [ct_pred], + ctree_type=ctree_type, + digits=4, + per_doc=per_doc, + stringent=STRINGENT), file=f) # end WIP # FIXME # compute and print PARSEVAL scores print(parser_name) - print(parseval_report(ctree_true_list, ctree_pred_list, digits=4, - span_sel=span_sel, - per_doc=per_doc, - stringent=STRINGENT)) + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type=ctree_type, digits=4, + per_doc=per_doc, + stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: - print(parseval_detailed_report(ctree_true_list, ctree_pred_list, - metric_type='S+R', - span_sel=span_sel)) + print(rst_parseval_detailed_report( + ctree_true_list, ctree_pred_list, ctree_type=ctree_type, + metric_type='S+R')) # end FIXME From b6104f20ce88f52e600e95deb5d9d2fa832c5a5f Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 25 Nov 2016 17:55:51 +0100 Subject: [PATCH 37/74] 
WIP add support for output of li2016 --- evals/li_qi.py | 132 +++++++++++++++++++++++++++++ evals/showdown.py | 205 +++++++++++++++++++++++++--------------------- 2 files changed, 242 insertions(+), 95 deletions(-) create mode 100644 evals/li_qi.py diff --git a/evals/li_qi.py b/evals/li_qi.py new file mode 100644 index 0000000..abf1929 --- /dev/null +++ b/evals/li_qi.py @@ -0,0 +1,132 @@ +"""Load the output of the parser from (Li et al. 2016). + +This is 99% a copy/paste from our own evals/joty.py. +I really, really need to come up with a better API and refactor accordingly. +""" + +from __future__ import absolute_import, print_function + +import codecs +import glob +import itertools +import os + +from educe.rst_dt.parse import parse_rst_dt_tree +from educe.rst_dt.deptree import RstDepTree + + +def load_li_qi_output_files(root_dir): + """Load ctrees output by Li Qi's parser on the TEST section of the RST-DT. + + Parameters + ---------- + root_dir: string + Path to the main folder containing the parser's output + + Returns + ------- + data: dict + Dictionary that should be akin to a sklearn Bunch, with + interesting keys 'filenames', 'doc_names' and 'rst_ctrees'. + + Notes + ----- + To ensure compatibility with the rest of the code base, doc_names + are automatically added the ".out" extension. This would not work + for fileX documents, but they are absent from the TEST section of + the RST-WSJ treebank. + """ + # map output filename to doc filename: + # here, remove prefix "parsed_" + # ex of filename: parsed_wsj_0602.out + out_filenames = sorted(glob.glob(os.path.join(root_dir, 'parsed_*'))) + doc_names = [os.path.basename(out_fn).split('_', 1)[1] + for out_fn in out_filenames] + # load the RST trees + rst_ctrees = [] + for out_fn in out_filenames: + with codecs.open(out_fn, 'r', 'utf-8') as f: + # TODO(?) add support for and use RSTContext + rst_ctree = parse_rst_dt_tree(f.read(), None) + rst_ctrees.append(rst_ctree) + + data = dict(filenames=out_filenames, + doc_names=doc_names, + rst_ctrees=rst_ctrees) + return data + + +def load_li_qi_ctrees(out_dir, rel_conv): + """Load the ctrees output by Li Qi's parser as .dis files. + + This currently runs on the document-level files (.doc_dis). + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. + """ + # load predicted trees + data_pred = load_li_qi_output_files(out_dir) + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + # map doc_name to ctree (RSTTree) + ctree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # ctree + # replace fine-grained labels with coarse-grained labels : + # the files we have already contain the coarse labels, except their + # initial letter is capitalized, except for same-unit and span, + # whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + + return ctree_pred + + +def load_li_qi_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by Li Qi's parser. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. 
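+
+    Examples
+    --------
+    Minimal sketch, with an illustrative result folder::
+
+        dtrees = load_li_qi_dtrees('li_qi/result', REL_CONV, nary_enc='chain')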
+ """ + # load predicted trees + data_pred = load_li_qi_output_files(out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to ordered dtree (RstDepTree) + dtree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + # convert to an ordered dependency tree ; + # * 'tree' produces a weakly-ordered dtree strictly equivalent + # to the original ctree, + # * 'chain' produces a strictly-ordered dtree for which strict + # equivalence is not preserved + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) + dtree_pred[doc_name] = dt_pred + + return dtree_pred + diff --git a/evals/showdown.py b/evals/showdown.py index 6d578d1..c50f336 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -25,6 +25,7 @@ from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees from evals.ji import load_ji_ctrees, load_ji_dtrees +from evals.li_qi import load_li_qi_ctrees, load_li_qi_dtrees from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, load_attelo_dtrees) @@ -91,6 +92,8 @@ FENG_DIR = '/home/mmorey/melodi/rst/feng_hirst/' FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') +# Li Qi's parser +LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/li_qi/result' # level of detail for parseval DETAILED = False @@ -153,6 +156,7 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', + 'li_qi', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -162,6 +166,7 @@ def main(): parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', + 'li_qi', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -224,101 +229,111 @@ def main(): c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] - if 'feng' in authors_pred: - c_preds.append( - ('feng', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) - ) - d_preds.append( - ('feng', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, - nary_enc='chain')) - ) - - if 'feng2' in authors_pred: - c_preds.append( - ('gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) - ) - d_preds.append( - ('gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, - nary_enc='chain')) - ) - - if 'joty' in authors_pred: - # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees - c_preds.append( - ('joty', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) - ) - d_preds.append( - ('joty', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, - nary_enc='chain')) - ) - # joty-{chain,tree} would be the same except nary_enc='tree' ; - # the nary_enc does not matter because codra outputs binary ctrees, - # hence both encodings result in (the same) strictly ordered dtrees - - if 'ji' in authors_pred: - # DPLP outputs RST ctrees in the form of lists of spans; - # load_ji_dtrees maps them to RST dtrees - c_preds.append( - ('ji', 
load_ji_ctrees(JI_OUT_DIR, REL_CONV)) - ) - d_preds.append( - ('ji', load_ji_dtrees(JI_OUT_DIR, REL_CONV, - nary_enc='chain')) - ) - # ji-{chain,tree} would be the same except nary_enc='tree' ; - # the nary_enc does not matter because codra outputs binary ctrees, - # hence both encodings result in (the same) strictly ordered dtrees - - if 'ours_chain' in authors_pred: - # Eisner, predicted syntax, chain - c_preds.append( - ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf)) - ) - d_preds.append( - ('ours-chain', load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf)) - ) - - if 'ours_tree' in authors_pred: - # Eisner, predicted syntax, tree + same-unit - c_preds.append( - ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, - nuc_clf, rnk_clf)) - ) - d_preds.append( - ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, - nuc_clf, rnk_clf)) - ) - if 'ours_tree_su' in authors_pred: - # Eisner, predicted syntax, tree + same-unit - c_preds.append( - ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, - nuc_clf, rnk_clf)) - ) - d_preds.append( - ('ours-tree-su', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, - nuc_clf, rnk_clf)) - ) - - if False: # FIXME repair (or forget) these - print('Eisner, predicted syntax + same-unit') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') - - print('Eisner, gold syntax') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') + for author_pred in authors_pred: + if author_pred == 'li_qi': + c_preds.append( + ('li_qi', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('li_qi', load_li_qi_dtrees(LI_QI_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + + if author_pred == 'feng': + c_preds.append( + ('gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('gSVM', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + + if author_pred == 'feng2': + c_preds.append( + ('gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + + if author_pred == 'joty': + # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees + c_preds.append( + ('TSP 1-1', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('TSP 1-1', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + # joty-{chain,tree} would be the same except nary_enc='tree' ; + # the nary_enc does not matter because codra outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees + + if author_pred == 'ji': + # DPLP outputs RST ctrees in the form of lists of spans; + # load_ji_dtrees maps them to RST dtrees + c_preds.append( + ('DPLP', load_ji_ctrees(JI_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('DPLP', load_ji_dtrees(JI_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + # ji-{chain,tree} would be the same except nary_enc='tree' ; + # the nary_enc does not matter because codra outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees + + if author_pred == 'ours_chain': + # Eisner, predicted syntax, chain + c_preds.append( + ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf)) + ) + 
d_preds.append( + ('ours-chain', load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf)) + ) + + if author_pred == 'ours_tree': + # Eisner, predicted syntax, tree + same-unit + c_preds.append( + ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + if author_pred == 'ours_tree_su': + # Eisner, predicted syntax, tree + same-unit + c_preds.append( + ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-tree-su', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + + if False: # FIXME repair (or forget) these + print('Eisner, predicted syntax + same-unit') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + print('Eisner, gold syntax') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_GOLD, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') # dependency eval From 3d63a625f8d514374b89fd9e0a966743b6cd687b Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 6 Dec 2016 18:33:01 +0100 Subject: [PATCH 38/74] ENH add eval for Hayashi's HILDA + Li2014 dep parser output files --- evals/hayashi_cons.py | 154 +++++++++++++++++++++++++++ evals/ji.py | 2 +- evals/li_sujian.py | 239 ++++++++++++++++++++++++++++++++++++++++++ evals/ours.py | 12 +-- evals/showdown.py | 92 +++++++++++----- 5 files changed, 467 insertions(+), 32 deletions(-) create mode 100644 evals/hayashi_cons.py create mode 100644 evals/li_sujian.py diff --git a/evals/hayashi_cons.py b/evals/hayashi_cons.py new file mode 100644 index 0000000..6f76512 --- /dev/null +++ b/evals/hayashi_cons.py @@ -0,0 +1,154 @@ +"""Load RST c-trees output by Hayashi et al.'s reimplementation of HILDA. 
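+
+Internal nodes of the bracketed trees are labelled "nuc:rel" and leaves
+look like "leaf1_1_1" (EDU, sentence and paragraph indices); the trees
+are read with an NLTK reader, then rebuilt as educe RSTTree objects.
+
+Minimal usage sketch (output folder and relation converter assumed to be
+set up as in evals.showdown)::
+
+    ctree_pred = load_hayashi_hilda_ctrees(HAYASHI_HILDA_OUT_DIR, REL_CONV)
+    dtree_pred = load_hayashi_hilda_dtrees(HAYASHI_HILDA_OUT_DIR, REL_CONV,
+                                           nary_enc='chain')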
+ +""" + +from __future__ import absolute_import, print_function + +from collections import namedtuple +import codecs +import glob +import itertools +import os + +from nltk import Tree + +from educe.annotation import Span +from educe.rst_dt.annotation import EDU, Node, RSTTree +from educe.rst_dt.deptree import RstDepTree + + +node_struct = namedtuple('node_struct', ['nuc', 'rel', 'span']) + +def read_node(s): + """Helper applied when reading a node""" + nuc, rel = s.split(':') if s != 'Root' else (s, '---') + res = node_struct(nuc=nuc, rel=rel, span=(0, 0)) + return res + + +leaf_struct = namedtuple('leaf_struct', ['edu_id', 'sent_id', 'para_id']) + +def read_leaf(s): + """Helper applied when reading a leaf""" + edu_id, sent_id, para_id = s[4:].split('_') # ex: leaf1_1_1 + res = leaf_struct(edu_id=edu_id, sent_id=sent_id, + para_id=para_id) + return res + +def propagate_spans(t): + """Propagate spans bottom-up in our custom NLTK tree.""" + dft_span = Span(0, 0) # default text span + dft_text = '' + + lbl = t.label() + if all(isinstance(kid, Tree) for kid in t): + new_kids = [propagate_spans(kid) for kid in t] + edu_start = new_kids[0].label().edu_span[0] + edu_end = new_kids[-1].label().edu_span[1] + else: + # pre-terminal + assert len(t) == 1 + kid = t[0] + new_kid = EDU(int(kid.edu_id), dft_span, dft_text) + new_kids = [new_kid] + edu_start = new_kid.num + edu_end = new_kid.num + new_lbl = Node(lbl.nuc, (edu_start, edu_end), dft_span, lbl.rel) + new_tree = RSTTree(new_lbl, new_kids) + return new_tree + + +def load_hayashi_con_files(root_dir): + """Load the ctrees output by Hayashi et al.'s reimplementation of HILDA. + + The RST ctrees are supposedly document-level RST trees, with classes of + relations. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + data: dict + Dictionary that should be akin to a sklearn Bunch, with + interesting keys 'filenames', 'doc_names' and 'rst_ctrees'. + """ + # map output filename to doc filename + # ex of filename: wsj_0602.out.dis + out_filenames = sorted(glob.glob(os.path.join(root_dir, '*.dis'))) + doc_names = [os.path.basename(out_fn).rsplit('.', 1)[0] + for out_fn in out_filenames] + # load the RST trees + rst_ctrees = [] + for out_fn in out_filenames: + with codecs.open(out_fn, 'r', 'utf-8') as f: + tree_str = f.read() + tree_raw = Tree.fromstring(tree_str, read_node=read_node, + read_leaf=read_leaf) + # TODO(?) add support for and use RSTContext + rst_ctree = propagate_spans(tree_raw) + rst_ctrees.append(rst_ctree) + + data = dict(filenames=out_filenames, + doc_names=doc_names, + rst_ctrees=rst_ctrees) + return data + + +def load_hayashi_hilda_ctrees(out_dir, rel_conv): + """Load the ctrees output by Hayashi et al.'s HILDA. + + Parameters + ---------- + out_dir: str + Path to the folder containing .dis files. + rel_conv: RstRelationConverter + Converter for relation labels (fine- to coarse-grained, plus + normalization). + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. 
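+
+    Notes
+    -----
+    When `rel_conv` is not None, it is applied to each loaded ctree so
+    that its relation labels match the coarse-grained classes used in
+    the rest of the evaluation.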
+ """ + # load predicted ctrees + data_pred = load_hayashi_con_files(out_dir) + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to RST ctree + ctree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_hayashi_hilda_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Load the dtrees for the ctrees output by Hayashi et al.'s HILDA. + + Parameters + ---------- + out_dir: str + Path to the folder containing .dis files. + rel_conv: RstRelationConverter + Converter for relation labels (fine- to coarse-grained, plus + normalization). + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. + """ + # load predicted ctrees + ctree_pred = load_hayashi_hilda_ctrees(out_dir, rel_conv) + # convert to dtrees + dtree_pred = dict() + for doc_name, ct_pred in ctree_pred.items(): + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) + dtree_pred[doc_name] = dt_pred + + return dtree_pred diff --git a/evals/ji.py b/evals/ji.py index 9862abf..3198a3f 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -16,7 +16,7 @@ from educe.rst_dt.rst_wsj_corpus import TEST_FOLDER # original RST corpus -RST_CORPUS = os.path.join('/home/mmorey/corpora/rst_discourse_treebank/data') +RST_CORPUS = os.path.join('/home/mmorey/corpora/rst-dt/rst_discourse_treebank/data') RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) diff --git a/evals/li_sujian.py b/evals/li_sujian.py new file mode 100644 index 0000000..1f0d89b --- /dev/null +++ b/evals/li_sujian.py @@ -0,0 +1,239 @@ +"""TODO + +""" + +from __future__ import absolute_import, print_function +from collections import Counter +from glob import glob +import os + +# educe +from educe.learning.edu_input_format import load_edu_input_file +from educe.rst_dt.corpus import (RstRelationConverter, + Reader as RstReader) +from educe.rst_dt.dep2con import deptree_to_rst_tree +from educe.rst_dt.deptree import NUC_S, RstDepTree, RstDtException +from educe.rst_dt.metrics.rst_parseval import rst_parseval_report +# attelo +from attelo.metrics.deptree import compute_uas_las as att_compute_uas_las +# local imports +from evals.showdown import EDUS_FILE, setup_dtree_postprocessor + + +# RST corpus +CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.0/') +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree +# pattern for the .edu_input files of the docs from the test set +EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" + +# output of Li et al.'s parser +SAVE_DIR = "/home/mmorey/melodi/rst/li_sujian/TextLevelDiscourseParser/mybackup/mstparser-code-116-trunk/mstparser/save" +COARSE_FILES = [ + "136.0detailedOutVersion2.txt", + "151.0detailedOut.txt", + "164.0detailedOut.txt", + "177.0detailedOut.txt", + "335.0detailedOut.txt", + "37.0detailedOut.txt", + "424.0detailedOut.txt", + "448.0detailedOut.txt", + "455.0detailedOutVersion2.txt", + "513.0detailedOutVersion2.txt", + "529.0detailedOut.txt", + "615.0detailedOutVersion2.txt", + "712.0detailedOut.txt", + "917.0detailedOut.txt", +] +FINE_FILES = [ + "190.0detailedOut.txt", + 
"473.0detailedOutVersion2.txt", + "561.0detailedOut.txt", + "723.0detailedOut.txt", + "747.0detailedOutVersion2.txt", + "825.0detailedOut.txt", + "947.0detailedOut.txt", + "965.0detailedOutVersion2.txt", +] +# different format for predicted labels and description of EDU +COARSE_FEAT_FILES = [ + "441.0detailedOut.txt", +] + + +def load_output_file(out_file): + """Load an output file from Li et al.'s dep parser. + """ + doc_names = [] + heads_true = [] + labels_true = [] + heads_pred = [] + labels_pred = [] + with open(out_file) as f: + for line in f: + if line.startswith(".\\testdata"): + # file + doc_name = line.strip().split("\\")[2][:12] # drop .edus or else + # print(doc_name) + doc_names.append(doc_name) + heads_true.append([-1]) # initial pad for fake root + labels_true.append(['']) + heads_pred.append([-1]) + labels_pred.append(['']) + else: + edu_idx, hd_true, hd_pred, lbl_true, lbl_pred, edu_str = line.strip().split(' ', 5) + if lbl_pred == '': + # not sure whether this should be enabled + lbl_pred = 'Elaboration' + heads_true[-1].append(int(hd_true)) + labels_true[-1].append(lbl_true) + heads_pred[-1].append(int(hd_pred)) + labels_pred[-1].append(lbl_pred) + res = { + 'doc_names': doc_names, + 'heads_true': heads_true, + 'labels_true': labels_true, + 'heads_pred': heads_pred, + 'labels_pred': labels_pred, + } + return res + + +if __name__ == "__main__": + # load dep trees from corpus + reader_test = RstReader(CD_TEST) + corpus_test = reader_test.slurp() + + # setup conversion from c- to d-tree and back, and eval type + nary_enc = 'tree' + eval_li = True + + if eval_li: + order = 'strict' + nuc_strategy = 'constant' + nuc_constant = NUC_S + rnk_strategy = 'lllrrr' + rnk_prioritize_same_unit = False + TWIST_GOLD = True + ADD_TRIVIAL_SPANS = True + else: # comparable setup to what we use for our own parsers + order = 'weak' + nuc_strategy = "unamb_else_most_frequent" + nuc_constant = None + rnk_strategy = "sdist-edist-rl" + rnk_prioritize_same_unit = True + TWIST_GOLD = False + ADD_TRIVIAL_SPANS = False + + nuc_clf, rnk_clf = setup_dtree_postprocessor( + nary_enc=nary_enc, order=order, nuc_strategy=nuc_strategy, + nuc_constant=nuc_constant, rnk_strategy=rnk_strategy, + rnk_prioritize_same_unit=rnk_prioritize_same_unit) + + ctree_true = dict() + dtree_true = dict() + labelset_true = Counter() + for doc_id, ct_true in sorted(corpus_test.items()): + doc_name = doc_id.doc + ct_true = REL_CONV(ct_true) # map fine to coarse rels + ctree_true[doc_name] = ct_true + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) + # dirty hack: lowercase ROOT + dt_true.labels = [x.lower() if x == 'ROOT' else x + for x in dt_true.labels] + + dtree_true[doc_name] = dt_true + labelset_true.update(dt_true.labels[1:]) + + # load parser output + for fname in COARSE_FILES: + dtree_pred = dict() + labelset_pred = Counter() + # + f_cur = os.path.join(SAVE_DIR, fname) + dep_bunch = load_output_file(f_cur) + doc_names = dep_bunch['doc_names'] + # load and process _pred + for doc_name, heads_pred, labels_pred in zip( + dep_bunch['doc_names'], dep_bunch['heads_pred'], + dep_bunch['labels_pred']): + # create dtree _pred + edus_data = load_edu_input_file(EDUS_FILE_PAT.format(doc_name), + edu_type='rst-dt') + edus = edus_data['edus'] + edu2sent = edus_data['edu2sent'] + dt_pred = RstDepTree(edus) + # add predicted edges + for dep_idx, (gov_idx, lbl) in enumerate(zip( + heads_pred[1:], labels_pred[1:]), start=1): + if lbl == '': + lbl = 'Elaboration' + # print(lbl) + lbl = lbl.lower() + labelset_pred[lbl] += 1 
+ dt_pred.add_dependency(gov_idx, dep_idx, lbl) + dt_pred.sent_idx = [0] + edu2sent # 0 for fake root + dirty + dtree_pred[doc_name] = dt_pred + # end WIP + expected_labelset = ['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'root', 'same-unit', 'summary', 'temporal', 'textual', 'topic-change', 'topic-comment'] + assert sorted(labelset_pred.keys()) == expected_labelset + # wsj_1189 has a weird "span" label in a multinuclear rel at [7--9] + # see footnote in Hayashi et al's SIGDIAL 2016 paper + assert sorted(labelset_true.keys()) == sorted( + expected_labelset + ['span']) + + # compute UAS and LAS on the _true values from the corpus and + # _pred Educe RstDepTrees re-built from their output files + dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + att_score_uas, att_score_las = att_compute_uas_las( + dtree_true_list, dtree_pred_list, include_ls=False, + include_las_n_o_no=False) + print("{}\tUAS={:.4f}\tLAS={:.4f} (attelo)".format( + fname, att_score_uas, att_score_las)) + + # build predicted c-trees using our heuristics for nuc and rank + ctree_pred = dict() + for doc_name, dt_pred in dtree_pred.items(): + # 1. enrich d-tree with nuc and order + # a. order: the procedure that generates spans produces a + # left-heavy branching: ((A B) C), which should be our + # "lllrrr" heuristic + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] + # b. nuclearity: heuristic baseline + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] + # 2. build _pred c-tree + try: + ct_pred = deptree_to_rst_tree(dt_pred) + ctree_pred[doc_name] = ct_pred + except RstDtException as rst_e: + print(rst_e) + raise + # 3. predict nuc and order in _true d-tree, replace the _true + # c-tree with a twisted one, like in their eval + if TWIST_GOLD: + dt_true = dtree_true[doc_name] + dt_true.sent_idx = [0] + edu2sent + dt_true.ranks = rnk_clf.predict([dt_true])[0] + dt_true.nucs = nuc_clf.predict([dt_true])[0] + ct_true = ctree_true[doc_name] + try: + ct_true = deptree_to_rst_tree(dt_true) + except RstDtException as rst_e: + print(rst_e) + raise + ctree_true[doc_name] = ct_true + + # compute RST-Parseval of these c-trees + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type='RST', digits=4, + per_doc=False, + add_trivial_spans=ADD_TRIVIAL_SPANS, + stringent=False)) diff --git a/evals/ours.py b/evals/ours.py index 0dbe1ce..f9d48bf 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -72,15 +72,12 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): """ dtree_pred = dict() # predicted dtrees # * setup... 
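+    # (attelo identifies EDUs by global ids; they are mapped back to
+    # educe EDUs and grouped by doc_name before the predicted edges are
+    # read into one RstDepTree per document)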
- # load EDUs as they are known to attelo (sigh) - # and predicted edges on these EDUs - att_edus = load_edus(edus_file) - edges_pred = load_attelo_output_file(output_file) - # rebuild educe EDUs from their attelo description - # and group them by doc_name + # load EDUs as they are known to attelo (sigh): rebuild educe EDUs + # from their attelo description and group them by doc_name educe_edus = defaultdict(list) edu2sent_idx = defaultdict(dict) gid2num = dict() + att_edus = load_edus(edus_file) for att_edu in att_edus: # doc name doc_name = att_edu.grouping @@ -103,7 +100,8 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): for e in doc_educe_edus]) for doc_name, doc_educe_edus in educe_edus.items()} - # rebuild RstDepTrees + # load predicted edges, on these EDUs, into RstDepTrees + edges_pred = load_attelo_output_file(output_file) for doc_name, es_pred in sorted(edges_pred.items()): # get educe EDUs doc_educe_edus = educe_edus[doc_name] diff --git a/evals/showdown.py b/evals/showdown.py index c50f336..9384f44 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -18,12 +18,16 @@ from educe.rst_dt.metrics.rst_parseval import (rst_parseval_detailed_report, rst_parseval_report) # -from attelo.metrics.deptree import compute_uas_las, compute_uas_las_undirected +from attelo.metrics.deptree import (compute_uas_las, + compute_uas_las_undirected) # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees +from evals.hayashi_cons import (load_hayashi_hilda_ctrees, + load_hayashi_hilda_dtrees) +from evals.hayashi_deps import load_hayashi_dtrees from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.li_qi import load_li_qi_ctrees, load_li_qi_dtrees from evals.ours import (load_deptrees_from_attelo_output, @@ -94,20 +98,31 @@ FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') # Li Qi's parser LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/li_qi/result' +# Hayashi's HILDA +HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL' +HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') # level of detail for parseval DETAILED = False +EVAL_LI_DEP = True STRINGENT = False +# additional dependency metrics +INCLUDE_LS = False +UNDIRECTED_DEPS = False +EVAL_NUC_RANK = True # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' +NUC_CONSTANT = None # only useful for NUC_STRATEGY='constant' RNK_STRATEGY = 'sdist-edist-rl' RNK_PRIORITY_SU = True -def setup_dtree_postprocessor(nary_enc): +def setup_dtree_postprocessor(nary_enc='chain', order='strict', + nuc_strategy=NUC_STRATEGY, + nuc_constant=NUC_CONSTANT, + rnk_strategy=RNK_STRATEGY, + rnk_prioritize_same_unit=RNK_PRIORITY_SU): """Setup the nuclearity and rank classifiers to flesh out dtrees.""" - # tie the order with the encoding for n-ary nodes - order = 'weak' if nary_enc == 'tree' else 'strict' # load train section of the RST corpus, fit (currently dummy) classifiers # for nuclearity and rank reader_train = RstReader(CD_TRAIN) @@ -134,12 +149,13 @@ def setup_dtree_postprocessor(nary_enc): y_nuc_train.append(dt.nucs) y_rnk_train.append(dt.ranks) # nuclearity clf - nuc_clf = DummyNuclearityClassifier(strategy=NUC_STRATEGY) + nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, + constant=nuc_constant) nuc_clf.fit(X_train, y_nuc_train) # rank clf - rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, - 
prioritize_same_unit=RNK_PRIORITY_SU, - order=order) + rnk_clf = InsideOutAttachmentRanker( + strategy=rnk_strategy, prioritize_same_unit=rnk_prioritize_same_unit, + order=order) rnk_clf.fit(X_train, y_rnk_train) return nuc_clf, rnk_clf @@ -156,7 +172,7 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', - 'li_qi', + 'li_qi', 'hayashi_hilda', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -166,7 +182,7 @@ def main(): parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', - 'li_qi', + 'li_qi', 'hayashi_hilda', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -200,7 +216,10 @@ def main(): # 0. setup the postprocessors to flesh out unordered dtrees into ordered # ones with nuclearity - nuc_clf, rnk_clf = setup_dtree_postprocessor(nary_enc_pred) + # * tie the order with the encoding for n-ary nodes + order = 'weak' if nary_enc_pred == 'tree' else 'strict' + nuc_clf, rnk_clf = setup_dtree_postprocessor(nary_enc=nary_enc_pred, + order=order) # the eval compares parses for the test section of the RST corpus reader_test = RstReader(CD_TEST) @@ -230,6 +249,16 @@ def main(): d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] for author_pred in authors_pred: + if author_pred == 'hayashi_hilda': + c_preds.append( + ('hayashi_hilda', load_hayashi_hilda_ctrees( + HAYASHI_HILDA_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('hayashi_hilda', load_hayashi_hilda_dtrees( + HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) + ) + if author_pred == 'li_qi': c_preds.append( ('li_qi', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) @@ -342,7 +371,13 @@ def main(): digits = 4 width = max(len(parser_name) for parser_name, _ in d_preds) - headers = ["UAS", "LAS", "LS", "UUAS", "ULAS"] + headers = ["UAS", "LAS"] + if INCLUDE_LS: + headers += ["LS"] + if EVAL_NUC_RANK: + headers += ["LAS+N", "LAS+O", "LAS+N+O"] + if UNDIRECTED_DEPS: + headers += ["UUAS", "ULAS"] fmt = '%% %ds' % width # first col: parser name fmt += ' ' fmt += ' '.join(['% 9s' for _ in headers]) @@ -367,22 +402,29 @@ def main(): mode='w', encoding='utf-8') as f: print(', '.join('{:.4f}'.format(x) for x in compute_uas_las( - [dt_true], [dt_pred])), + [dt_true], [dt_pred], + include_ls=INCLUDE_LS, + include_las_n_o_no=EVAL_NUC_RANK)), file=f) - # WIP scores for undirected edges - print(', '.join('{:.4f}'.format(x) - for x in compute_uas_las_undirected( - [dt_true], [dt_pred])), - file=f) - + if UNDIRECTED_DEPS: + # scores for undirected edges + print(', '.join('{:.4f}'.format(x) + for x in compute_uas_las_undirected( + [dt_true], [dt_pred])), + file=f) # end WIP print - score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, - dtree_pred_list) - score_uuas, score_ulas = compute_uas_las_undirected(dtree_true_list, - dtree_pred_list) + + all_scores = [] + all_scores += list(compute_uas_las( + dtree_true_list, dtree_pred_list, include_ls=INCLUDE_LS, + include_las_n_o_no=EVAL_NUC_RANK)) + if UNDIRECTED_DEPS: + score_uuas, score_ulas = compute_uas_las_undirected( + dtree_true_list, dtree_pred_list) + all_scores += [score_uuas, score_ulas] # append to report values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] - for v in (score_uas, score_las, score_ls, score_uuas, score_ulas): + for v in all_scores: values += ["{0:0.{1}f}".format(v, digits)] report += fmt % tuple(values) # end 
table content @@ -425,6 +467,7 @@ def main(): ctree_type=ctree_type, digits=4, per_doc=per_doc, + add_trivial_spans=EVAL_LI_DEP, stringent=STRINGENT), file=f) # end WIP @@ -434,6 +477,7 @@ def main(): print(rst_parseval_report(ctree_true_list, ctree_pred_list, ctree_type=ctree_type, digits=4, per_doc=per_doc, + add_trivial_spans=EVAL_LI_DEP, stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: From b4bb2a10b6d18db8093dcb9422b5de3e6c43bf79 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 7 Dec 2016 14:35:50 +0100 Subject: [PATCH 39/74] MAINT enable to switch between coarse- and fine-grained rels in eval li dep --- evals/li_sujian.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/evals/li_sujian.py b/evals/li_sujian.py index 1f0d89b..b9f603b 100644 --- a/evals/li_sujian.py +++ b/evals/li_sujian.py @@ -109,16 +109,26 @@ def load_output_file(out_file): reader_test = RstReader(CD_TEST) corpus_test = reader_test.slurp() + # choice of predictions: granularity of relations + RST_RELS = 'coarse' + if RST_RELS == 'coarse': + PRED_FILES = COARSE_FILES + else: + PRED_FILES = FINE_FILES + # eval procedure: the one in the parser of Li et al. vs standard one + EVAL_LI = True + # setup conversion from c- to d-tree and back, and eval type nary_enc = 'tree' - eval_li = True - if eval_li: + if EVAL_LI: + # reconstruction of the c-tree order = 'strict' nuc_strategy = 'constant' nuc_constant = NUC_S rnk_strategy = 'lllrrr' rnk_prioritize_same_unit = False + # eval TWIST_GOLD = True ADD_TRIVIAL_SPANS = True else: # comparable setup to what we use for our own parsers @@ -140,7 +150,9 @@ def load_output_file(out_file): labelset_true = Counter() for doc_id, ct_true in sorted(corpus_test.items()): doc_name = doc_id.doc - ct_true = REL_CONV(ct_true) # map fine to coarse rels + if RST_RELS == 'coarse': + # map fine to coarse rels + ct_true = REL_CONV(ct_true) ctree_true[doc_name] = ct_true dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) # dirty hack: lowercase ROOT @@ -151,7 +163,7 @@ def load_output_file(out_file): labelset_true.update(dt_true.labels[1:]) # load parser output - for fname in COARSE_FILES: + for fname in PRED_FILES: dtree_pred = dict() labelset_pred = Counter() # @@ -180,12 +192,14 @@ def load_output_file(out_file): dt_pred.sent_idx = [0] + edu2sent # 0 for fake root + dirty dtree_pred[doc_name] = dt_pred # end WIP - expected_labelset = ['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'root', 'same-unit', 'summary', 'temporal', 'textual', 'topic-change', 'topic-comment'] - assert sorted(labelset_pred.keys()) == expected_labelset - # wsj_1189 has a weird "span" label in a multinuclear rel at [7--9] - # see footnote in Hayashi et al's SIGDIAL 2016 paper - assert sorted(labelset_true.keys()) == sorted( - expected_labelset + ['span']) + + if RST_RELS == 'coarse': + expected_labelset = ['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'root', 'same-unit', 'summary', 'temporal', 'textual', 'topic-change', 'topic-comment'] + assert sorted(labelset_pred.keys()) == expected_labelset + # wsj_1189 has a weird "span" label in a multinuclear rel at [7--9] + # see footnote in Hayashi et al's SIGDIAL 2016 paper + assert sorted(labelset_true.keys()) == sorted( + expected_labelset + ['span']) # compute UAS and LAS on 
the _true values from the corpus and # _pred Educe RstDepTrees re-built from their output files From 1856197490c4088204ead9bc6921ef9d1caed23a Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 9 Dec 2016 16:01:45 +0100 Subject: [PATCH 40/74] ENH+FIX hayashi_mst, li dep --- evals/hayashi_deps.py | 73 ++++++++++++++++++++++++++++++++++++++++--- evals/li_sujian.py | 35 +++++++++++---------- evals/showdown.py | 69 ++++++++++++++++++++++++++++------------ 3 files changed, 137 insertions(+), 40 deletions(-) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index f613013..00a776b 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -8,8 +8,10 @@ import os from glob import glob +from educe.learning.edu_input_format import load_edu_input_file from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.dep2con import deptree_to_rst_tree # load true ctrees, from the TEST section of the RST-DT, to get gold EDUs @@ -81,17 +83,28 @@ def load_hayashi_dep_files(out_dir): return dtrees -def load_hayashi_dtrees(out_dir, rel_conv): - """Load the dtrees output by one of Hayashi et al.'s parser. +def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, + rnk_clf): + """Load the dtrees output by one of Hayashi et al.'s dep parsers. Parameters ---------- - out_dir: str + out_dir : str Path to the folder containing .dis files. - rel_conv: RstRelationConverter + + rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). + edus_file_pat : str + Pattern for the .edu_input files. + + nuc_clf : NuclearityClassifier + Nuclearity classifier + + rnk_clf : RankClassifier + Rank classifier + Returns ------- dtree_pred: dict(str, RstDepTree) @@ -103,5 +116,57 @@ def load_hayashi_dtrees(out_dir, rel_conv): for doc_name, dt_pred in dtrees.items(): if rel_conv is not None: dt_pred = rel_conv(dt_pred) + # WIP add nuclearity and rank + edus_data = load_edu_input_file(edus_file_pat.format(doc_name), + edu_type='rst-dt') + edu2sent = edus_data['edu2sent'] + dt_pred.sent_idx = [0] + edu2sent # 0 for fake root ; DIRTY + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] + # end WIP dtree_pred[doc_name] = dt_pred + return dtree_pred + + +def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, + rnk_clf): + """Load the dtrees output by one of Hayashi et al.'s dep parsers. + + Parameters + ---------- + out_dir : str + Path to the folder containing .dis files. + + rel_conv : RstRelationConverter + Converter for relation labels (fine- to coarse-grained, plus + normalization). + + edus_file_pat : str + Pattern for the .edu_input files. + + nuc_clf : NuclearityClassifier + Nuclearity classifier + + rnk_clf : RankClassifier + Rank classifier + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. 
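+
+    Notes
+    -----
+    The ctrees are not read directly from the output files: the dtrees
+    are loaded first, fleshed out with nuclearity and rank predicted by
+    `nuc_clf` and `rnk_clf`, then each dtree is converted to a ctree
+    with `deptree_to_rst_tree`.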
+ """ + ctree_pred = dict() + + dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, + nuc_clf, rnk_clf) + for doc_name, dt_pred in dtree_pred.items(): + try: + ct_pred = deptree_to_rst_tree(dt_pred) + except RstDtException: + print(doc_name) + raise + else: + ctree_pred[doc_name] = ct_pred + + return ctree_pred diff --git a/evals/li_sujian.py b/evals/li_sujian.py index b9f603b..6f80db4 100644 --- a/evals/li_sujian.py +++ b/evals/li_sujian.py @@ -17,7 +17,7 @@ # attelo from attelo.metrics.deptree import compute_uas_las as att_compute_uas_las # local imports -from evals.showdown import EDUS_FILE, setup_dtree_postprocessor +from evals.showdown import EDUS_FILE_PAT, setup_dtree_postprocessor # RST corpus @@ -29,8 +29,7 @@ 'educe', 'rst_dt', 'rst_112to18.txt') REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree -# pattern for the .edu_input files of the docs from the test set -EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" + # output of Li et al.'s parser SAVE_DIR = "/home/mmorey/melodi/rst/li_sujian/TextLevelDiscourseParser/mybackup/mstparser-code-116-trunk/mstparser/save" @@ -65,6 +64,9 @@ "441.0detailedOut.txt", ] +# default file(s) to include ; I picked a coarse-grained one with good scores +DEFAULT_FILES = ["712.0detailedOut.txt"] + def load_output_file(out_file): """Load an output file from Li et al.'s dep parser. @@ -112,14 +114,14 @@ def load_output_file(out_file): # choice of predictions: granularity of relations RST_RELS = 'coarse' if RST_RELS == 'coarse': - PRED_FILES = COARSE_FILES + PRED_FILES = DEFAULT_FILES # COARSE_FILES else: PRED_FILES = FINE_FILES # eval procedure: the one in the parser of Li et al. vs standard one - EVAL_LI = True + EVAL_LI = False # setup conversion from c- to d-tree and back, and eval type - nary_enc = 'tree' + nary_enc = 'chain' if EVAL_LI: # reconstruction of the c-tree @@ -201,16 +203,6 @@ def load_output_file(out_file): assert sorted(labelset_true.keys()) == sorted( expected_labelset + ['span']) - # compute UAS and LAS on the _true values from the corpus and - # _pred Educe RstDepTrees re-built from their output files - dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] - dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - att_score_uas, att_score_las = att_compute_uas_las( - dtree_true_list, dtree_pred_list, include_ls=False, - include_las_n_o_no=False) - print("{}\tUAS={:.4f}\tLAS={:.4f} (attelo)".format( - fname, att_score_uas, att_score_las)) - # build predicted c-trees using our heuristics for nuc and rank ctree_pred = dict() for doc_name, dt_pred in dtree_pred.items(): @@ -242,6 +234,17 @@ def load_output_file(out_file): print(rst_e) raise ctree_true[doc_name] = ct_true + + # compute UAS and LAS on the _true values from the corpus and + # _pred Educe RstDepTrees re-built from their output files + dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no = att_compute_uas_las( + dtree_true_list, dtree_pred_list, include_ls=False, + include_las_n_o_no=True) + print(("{}\tUAS={:.4f}\tLAS={:.4f}\tLAS+N={:.4f}\tLAS+O={:.4f}\t" + "LAS+N+O={:.4f}").format( + fname, sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no)) # compute RST-Parseval of these c-trees ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] diff --git a/evals/showdown.py b/evals/showdown.py index 9384f44..11cc0b9 100644 --- a/evals/showdown.py +++ 
b/evals/showdown.py @@ -27,7 +27,8 @@ from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees from evals.hayashi_cons import (load_hayashi_hilda_ctrees, load_hayashi_hilda_dtrees) -from evals.hayashi_deps import load_hayashi_dtrees +from evals.hayashi_deps import (load_hayashi_dep_dtrees, + load_hayashi_dep_ctrees) from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.li_qi import load_li_qi_ctrees, load_li_qi_dtrees from evals.ours import (load_deptrees_from_attelo_output, @@ -43,7 +44,9 @@ RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', 'educe', 'rst_dt', 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree +REL_CONV_BASE = RstRelationConverter(RELMAP_FILE) +REL_CONV = REL_CONV_BASE.convert_tree +REL_CONV_DTREE = REL_CONV_BASE.convert_dtree # @@ -51,9 +54,14 @@ # # * syntax: pred vs gold +# old-style .edu_input: whole test set EDUS_FILE = os.path.join('/home/mmorey/melodi', 'irit-rst-dt/TMP/syn_gold_coarse', 'TEST.relations.sparse.edu_input') + +# new style .edu_input: one file per doc in test set +EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" + # outputs of parsers EISNER_OUT_SYN_PRED = os.path.join( '/home/mmorey/melodi', @@ -101,10 +109,10 @@ # Hayashi's HILDA HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL' HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') +HAYASHI_MST_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/dep/li') # level of detail for parseval DETAILED = False -EVAL_LI_DEP = True STRINGENT = False # additional dependency metrics INCLUDE_LS = False @@ -172,7 +180,7 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', - 'li_qi', 'hayashi_hilda', + 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -182,7 +190,7 @@ def main(): parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', - 'li_qi', 'hayashi_hilda', + 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -194,8 +202,13 @@ def main(): help="Binarize the reference ctree for the eval") parser.add_argument('--simple_rsttree', action='store_true', help="Binarize ctree and move relations up") + # * non-standard evals parser.add_argument('--per_doc', action='store_true', help="Doc-averaged scores (cf. Ji's eval)") + parser.add_argument('--eval_li_dep', action='store_true', + help=("Evaluate as in the dep parser of Li et al. 
" + "2014: all relations are NS, spiders map to " + "left-heavy branching, three trivial spans ")) # args = parser.parse_args() author_true = args.author_true @@ -209,6 +222,11 @@ def main(): # then average over docs # it should be False, except for comparison with the DPLP paper per_doc = args.per_doc + # "eval_li_dep = True" replaces the original nuclearity and order with + # heuristically determined values for _pred but also _true, and adds + # three trivial spans + eval_li_dep = args.eval_li_dep + # if binarize_true and nary_enc_true != 'chain': raise ValueError("--binarize_true is compatible with " @@ -259,6 +277,18 @@ def main(): HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) ) + if author_pred == 'hayashi_mst': + c_preds.append( + ('hayashi_mst', load_hayashi_dep_ctrees( + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('hayashi_mst', load_hayashi_dep_dtrees( + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf)) + ) + if author_pred == 'li_qi': c_preds.append( ('li_qi', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) @@ -303,11 +333,12 @@ def main(): # DPLP outputs RST ctrees in the form of lists of spans; # load_ji_dtrees maps them to RST dtrees c_preds.append( - ('DPLP', load_ji_ctrees(JI_OUT_DIR, REL_CONV)) + ('DPLP', load_ji_ctrees( + JI_OUT_DIR, REL_CONV)) ) d_preds.append( - ('DPLP', load_ji_dtrees(JI_OUT_DIR, REL_CONV, - nary_enc='chain')) + ('DPLP', load_ji_dtrees( + JI_OUT_DIR, REL_CONV, nary_enc='chain')) ) # ji-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because codra outputs binary ctrees, @@ -316,25 +347,23 @@ def main(): if author_pred == 'ours_chain': # Eisner, predicted syntax, chain c_preds.append( - ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-chain', load_attelo_ctrees( + EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) d_preds.append( - ('ours-chain', load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-chain', load_attelo_dtrees( + EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) if author_pred == 'ours_tree': # Eisner, predicted syntax, tree + same-unit c_preds.append( - ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-tree', load_attelo_ctrees( + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) d_preds.append( - ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-tree', load_attelo_dtrees( + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) if author_pred == 'ours_tree_su': # Eisner, predicted syntax, tree + same-unit @@ -467,7 +496,7 @@ def main(): ctree_type=ctree_type, digits=4, per_doc=per_doc, - add_trivial_spans=EVAL_LI_DEP, + add_trivial_spans=eval_li_dep, stringent=STRINGENT), file=f) # end WIP @@ -477,7 +506,7 @@ def main(): print(rst_parseval_report(ctree_true_list, ctree_pred_list, ctree_type=ctree_type, digits=4, per_doc=per_doc, - add_trivial_spans=EVAL_LI_DEP, + add_trivial_spans=eval_li_dep, stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: From d2bd5c6f61b189e844a85f2ed408849b45d57012 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sat, 10 Dec 2016 17:45:52 +0100 Subject: [PATCH 41/74] ENH read output of braud's parsers --- evals/braud_coling.py | 149 ++++++++++++++++++++++++++++++++++++++++++ evals/braud_eacl.py | 120 ++++++++++++++++++++++++++++++++++ evals/showdown.py | 47 ++++++++++++- 3 files changed, 315 
insertions(+), 1 deletion(-) create mode 100644 evals/braud_coling.py create mode 100644 evals/braud_eacl.py diff --git a/evals/braud_coling.py b/evals/braud_coling.py new file mode 100644 index 0000000..36fad6a --- /dev/null +++ b/evals/braud_coling.py @@ -0,0 +1,149 @@ +"""Read the output of Braud et al.'s COLING parser. + +""" + +from __future__ import absolute_import, print_function + +import codecs +from glob import glob +import os + +from nltk import Tree + +from educe.annotation import Span +from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree +from educe.rst_dt.deptree import RstDepTree + + +# map *.mrg.pred files to the original doc names +MRG_TO_RST = { + '12.mrg.pred': 'wsj_0644.out', # 4 + '4.mrg.pred': 'wsj_1129.out', # 5 + '26.mrg.pred': 'wsj_1197.out', # 6 + '24.mrg.pred': 'wsj_1113.out', # 8 + '14.mrg.pred': 'wsj_0684.out', # 10 + '32.mrg.pred': 'wsj_1354.out', # 11 + '18.mrg.pred': 'wsj_1183.out', # 12 + '29.mrg.pred': 'wsj_1346.out', # 15 + '28.mrg.pred': 'wsj_1169.out', # 17 + '37.mrg.pred': 'wsj_0667.out', # 17 + '19.mrg.pred': 'wsj_0607.out', # 19 + '7.mrg.pred': 'wsj_0654.out', # 19 + '16.mrg.pred': 'wsj_1325.out', # 21 + '25.mrg.pred': 'wsj_2375.out', # 22 + '31.mrg.pred': 'wsj_1380.out', # 23 + '1.mrg.pred': 'wsj_0623.out', # 25 + '15.mrg.pred': 'wsj_2373.out', # 31 + '30.mrg.pred': 'wsj_2336.out', # 31 + '3.mrg.pred': 'wsj_1365.out', # 39 + '34.mrg.pred': 'wsj_1148.out', # 43 + '11.mrg.pred': 'wsj_1306.out', # 47 + '10.mrg.pred': 'wsj_2354.out', # 52 + '35.mrg.pred': 'wsj_1126.out', # 55 + '0.mrg.pred': 'wsj_2385.out', # 60 + '2.mrg.pred': 'wsj_0632.out', # 62 + '20.mrg.pred': 'wsj_0602.out', # 69 + '27.mrg.pred': 'wsj_0627.out', # 69 + '13.mrg.pred': 'wsj_1189.out', # 91 + '6.mrg.pred': 'wsj_0616.out', # 92 + '36.mrg.pred': 'wsj_1307.out', # 98 + '33.mrg.pred': 'wsj_1142.out', # 106 + '9.mrg.pred': 'wsj_0655.out', # 110 + '21.mrg.pred': 'wsj_2386.out', # 127 + '23.mrg.pred': 'wsj_0689.out', # 132 + '8.mrg.pred': 'wsj_1387.out', # 134 + '17.mrg.pred': 'wsj_1331.out', # 158 + '22.mrg.pred': 'wsj_1376.out', # 202 + '5.mrg.pred': 'wsj_1146.out', # 304 +} + + +def tree_to_simple_rsttree(tree): + """Build a SimpleRSTTree from a NLTK Tree""" + origin = None # or is it? 
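+    # leaves of the NLTK tree carry the EDU number; internal nodes carry
+    # a label of the form '<nuc>-<rel>', split on the first hyphen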
+ if not tree: + # no kid: EDU (+pre-terminal) + num = int(tree.label()) + span = Span(num, num) # FIXME + text = '' # FIXME + edu = EDU(num, span, text, context=None, origin=origin) + # pre-terminal + edu_span = (num, num) + nuc = "leaf" + rel = "leaf" + node = Node(nuc, edu_span, span, rel, context=None) + return SimpleRSTTree(node, [edu], origin=origin) + + # internal node + new_kids = [tree_to_simple_rsttree(kid) for kid in tree] + # node + nuc, rel = tree.label().split('-', 1) + edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().edu_span[0]) + edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().edu_span[1]) + edu_span = (edu_beg, edu_end) + char_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().span.char_start) + char_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().span.char_end) + span = Span(char_beg, char_end) + new_node = Node(nuc, edu_span, span, rel, context=None) + new_tree = SimpleRSTTree(new_node, new_kids, origin=origin) + return new_tree + + +def _load_braud_coling_file(f): + """Do load file""" + tree = Tree.fromstring(f.read().strip()) + simple_ctree = tree_to_simple_rsttree(tree) + return simple_ctree + + +def load_braud_coling_file(fpath): + """Load a file.""" + with codecs.open(fpath, 'rb', 'utf-8') as f: + return _load_braud_coling_file(f) + + +def load_braud_coling_ctrees(out_dir, rel_conv): + """Load the ctrees output by Braud et al.'s parser + + Parameters + ---------- + out_dir : str + Path to the output directory. + + rel_conv : TODO + Relation converter + + Returns + ------- + ctree_pred : dict(str, RSTTree) + RST c-tree for each document. + """ + ctree_pred = dict() + for fpath in sorted(glob(os.path.join(out_dir, '*.mrg.pred'))): + fname = os.path.basename(fpath) + doc_name = MRG_TO_RST.get(fname, fname) + sct_pred = load_braud_coling_file(fpath) + # convert to regular RSTTree + ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) + # convert relation labels + ct_pred = rel_conv(ct_pred) + # TODO check ct_true: assert that mrg.gold == .out.dis + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Do load dtrees""" + dtree_pred = dict() + ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv) + for doc_name, ct_pred in ctree_pred.items(): + dt_pred = RstDepTree.from_rst_tree(ct_pred) + # print(dt_pred.labels) # DEBUG + # raise ValueError('debug me') + dtree_pred[doc_name] = dt_pred + # TODO load ctrees, convert + return dtree_pred diff --git a/evals/braud_eacl.py b/evals/braud_eacl.py new file mode 100644 index 0000000..89aea24 --- /dev/null +++ b/evals/braud_eacl.py @@ -0,0 +1,120 @@ +"""Read the output of Braud et al.'s EACL parsers. + +""" + +from __future__ import absolute_import, print_function + +import codecs +from glob import glob +import os + +from nltk import Tree + +from educe.annotation import Span +from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree +from educe.rst_dt.deptree import RstDepTree + + +def tree_to_simple_rsttree(tree, edu_num=1): + """Build a SimpleRSTTree from a NLTK Tree. 
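+
+    EDUs are numbered consecutively, left to right, starting from
+    `edu_num`; internal node labels encode nuclearity and relation in a
+    single string (e.g. 'NNTextualorganization').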
+ + Parameters + ---------- + edu_num : int, defaults to 1 + Number of the next EDU + """ + origin = None + + if tree.label() == 'EDU': + # EDU (+pre-terminal) + num = edu_num + span = Span(num, num) + # 'EDU ' + text = tree[0] + edu = EDU(num, span, text, context=None, origin=origin) + # pre-terminal + edu_span = (num, num) + nuc = "leaf" + rel = "leaf" + node = Node(nuc, edu_span, span, rel, context=None) + return SimpleRSTTree(node, [edu], origin=origin) + + new_kids = [] + for kid in tree: + new_kid = tree_to_simple_rsttree(kid, edu_num=edu_num) + edu_num = new_kid.label().edu_span[1] + 1 + new_kids.append(new_kid) + + # ROOT + if tree.label() == 'ROOT': + assert len(new_kids) == 1 + return new_kids[0] + + # internal node + # label: 'NNTextualorganization' + nuc = tree.label()[:2] + rel = tree.label()[2:] + # same as in braud_coling + edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().edu_span[0]) + edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().edu_span[1]) + edu_span = (edu_beg, edu_end) + char_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().span.char_start) + char_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().span.char_end) + span = Span(char_beg, char_end) + new_node = Node(nuc, edu_span, span, rel, context=None) + new_tree = SimpleRSTTree(new_node, new_kids, origin=origin) + return new_tree + + +def _load_braud_eacl_file(f): + """Do load SimpleRSTTrees from f""" + sctrees = [] + for line in f: + tree = Tree.fromstring(line.strip()) + sctree = tree_to_simple_rsttree(tree) + sctrees.append(sctree) + return sctrees + +def load_braud_eacl_file(fpath): + """Load SimpleRSTTrees from a file""" + with codecs.open(fpath, 'rb', 'utf-8') as f: + return _load_braud_eacl_file(f) + +def load_braud_eacl_ctrees(fpath, rel_conv, doc_names): + """Load the ctrees output by Braud et al.'s parser + + Parameters + ---------- + fpath : str + Path to the output file. + + rel_conv : TODO + Relation converter. + + Returns + ------- + ctree_pred : dict(str, RSTTree) + RST c-tree for each document. 
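+
+    Notes
+    -----
+    The output file contains one predicted tree per line; `doc_names`
+    must list the document names in the same order, so that each tree
+    can be mapped back to its document.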
+ """ + ctree_pred = dict() + sctree_pred = load_braud_eacl_file(fpath) + for doc_name, sct_pred in zip(doc_names, sctree_pred): + ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain'): + """Do load dtrees""" + dtree_pred = dict() + ctree_pred = load_braud_eacl_ctrees(fpath, rel_conv, doc_names) + for doc_name, ct_pred in ctree_pred.items(): + dt_pred = RstDepTree.from_rst_tree(ct_pred) + dtree_pred[doc_name] = dt_pred + # TODO load ctrees, convert + return dtree_pred diff --git a/evals/showdown.py b/evals/showdown.py index 11cc0b9..ab0df40 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -22,6 +22,10 @@ compute_uas_las_undirected) # local to this package +from evals.braud_coling import (load_braud_coling_ctrees, + load_braud_coling_dtrees) +from evals.braud_eacl import (load_braud_eacl_ctrees, + load_braud_eacl_dtrees) from evals.codra import load_codra_ctrees, load_codra_dtrees from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees @@ -110,6 +114,10 @@ HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL' HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') HAYASHI_MST_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/dep/li') +# Braud +BRAUD_COLING_OUT_DIR = '/home/mmorey/melodi/rst/braud/coling16/pred_trees' +BRAUD_EACL_MONO = '/home/mmorey/melodi/rst/braud/eacl16/best-en-mono/test_it8_beam16' +BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/braud/eacl16/best-en-cross+dev/test_it10_beam32' # level of detail for parseval DETAILED = False @@ -181,6 +189,8 @@ def main(): choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', 'li_qi', 'hayashi_hilda', 'hayashi_mst', + 'braud_coling', 'braud_eacl_mono', + 'braud_eacl_cross_dev', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -191,6 +201,8 @@ def main(): choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', 'li_qi', 'hayashi_hilda', 'hayashi_mst', + 'braud_coling', 'braud_eacl_mono', + 'braud_eacl_cross_dev', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -261,12 +273,45 @@ def main(): # corresponding dtree dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) dtree_true[doc_name] = dt_true - + # sorted doc_names, because braud_eacl put all predictions in one file + sorted_doc_names = sorted(dtree_true.keys()) c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] for author_pred in authors_pred: + if author_pred == 'braud_coling': + c_preds.append( + ('braud_coling', load_braud_coling_ctrees( + BRAUD_COLING_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('braud_coling', load_braud_coling_dtrees( + BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain')) + ) + + if author_pred == 'braud_eacl_mono': + c_preds.append( + ('braud_eacl_mono', load_braud_eacl_ctrees( + BRAUD_EACL_MONO, REL_CONV, sorted_doc_names)) + ) + d_preds.append( + ('braud_eacl_mono', load_braud_eacl_dtrees( + BRAUD_EACL_MONO, REL_CONV, sorted_doc_names, + nary_enc='chain')) + ) + + if author_pred == 'braud_eacl_cross_dev': + c_preds.append( + ('braud_eacl_cross_dev', load_braud_eacl_ctrees( + BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names)) + ) + d_preds.append( + 
('braud_eacl_cross_dev', load_braud_eacl_dtrees( + BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, + nary_enc='chain')) + ) + if author_pred == 'hayashi_hilda': c_preds.append( ('hayashi_hilda', load_hayashi_hilda_ctrees( From 9d80e94bcd2b13a04e4f8f37e59ae050a02f5ae7 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sat, 10 Dec 2016 18:23:34 +0100 Subject: [PATCH 42/74] WIP map braud output to same labelset --- evals/braud_coling.py | 10 +++++++++- evals/braud_eacl.py | 16 +++++++++++++++- evals/showdown.py | 6 ++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/evals/braud_coling.py b/evals/braud_coling.py index 36fad6a..25f9887 100644 --- a/evals/braud_coling.py +++ b/evals/braud_coling.py @@ -6,6 +6,7 @@ import codecs from glob import glob +import itertools import os from nltk import Tree @@ -78,6 +79,10 @@ def tree_to_simple_rsttree(tree): new_kids = [tree_to_simple_rsttree(kid) for kid in tree] # node nuc, rel = tree.label().split('-', 1) + # map to our coarse rel names + if rel == 'Textual-organization': + rel = 'Textual' + # end map edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) else new_kids[0].label().edu_span[0]) edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) @@ -145,5 +150,8 @@ def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain'): # print(dt_pred.labels) # DEBUG # raise ValueError('debug me') dtree_pred[doc_name] = dt_pred - # TODO load ctrees, convert + # DEBUG + all_labels = set(itertools.chain.from_iterable(dt_pred.labels for dt_pred in dtree_pred.values())) + print(out_dir, sorted(all_labels)) + # end DEBUG return dtree_pred diff --git a/evals/braud_eacl.py b/evals/braud_eacl.py index 89aea24..eacab66 100644 --- a/evals/braud_eacl.py +++ b/evals/braud_eacl.py @@ -5,6 +5,7 @@ from __future__ import absolute_import, print_function import codecs +import itertools from glob import glob import os @@ -54,6 +55,16 @@ def tree_to_simple_rsttree(tree, edu_num=1): # label: 'NNTextualorganization' nuc = tree.label()[:2] rel = tree.label()[2:] + # map to our coarse rel names + rel_map = { + 'MannerMeans': 'manner-means', + 'Sameunit': 'same-unit', + 'TopicChange': 'topic-change', + 'TopicComment': 'topic-comment', + } + rel = rel_map.get(rel, rel) + # end map + # same as in braud_coling edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) else new_kids[0].label().edu_span[0]) @@ -116,5 +127,8 @@ def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain'): for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) dtree_pred[doc_name] = dt_pred - # TODO load ctrees, convert + # DEBUG + all_labels = set(itertools.chain.from_iterable(dt_pred.labels for dt_pred in dtree_pred.values())) + print(fpath, sorted(all_labels)) + # end DEBUG return dtree_pred diff --git a/evals/showdown.py b/evals/showdown.py index ab0df40..bfac168 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -462,6 +462,12 @@ def main(): report += '\n' # end table format and header line + # DEBUG + import itertools + all_labels = set(itertools.chain.from_iterable(dt_true.labels for dt_true in dtree_true.values())) + print("TRUE", sorted(all_labels)) + # end DEBUG + # * table content for parser_name, dtree_pred in d_preds: doc_names = sorted(dtree_true.keys()) From e6c1e9c74ec64de3bf0592d82a9ec133cee32065 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 12 Dec 2016 17:30:41 +0100 Subject: [PATCH 43/74] ENH add check to eval: labelset_pred is a subset of labelset_true --- evals/braud_coling.py | 6 ---- 
evals/braud_eacl.py | 6 ++-- evals/feng.py | 26 ++++++++------- evals/gcrf_tree_format.py | 10 ++++++ evals/hayashi_deps.py | 5 +++ evals/ji.py | 11 ++++++- evals/showdown.py | 66 +++++++++++++-------------------------- 7 files changed, 63 insertions(+), 67 deletions(-) diff --git a/evals/braud_coling.py b/evals/braud_coling.py index 25f9887..625cb19 100644 --- a/evals/braud_coling.py +++ b/evals/braud_coling.py @@ -147,11 +147,5 @@ def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain'): ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv) for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) - # print(dt_pred.labels) # DEBUG - # raise ValueError('debug me') dtree_pred[doc_name] = dt_pred - # DEBUG - all_labels = set(itertools.chain.from_iterable(dt_pred.labels for dt_pred in dtree_pred.values())) - print(out_dir, sorted(all_labels)) - # end DEBUG return dtree_pred diff --git a/evals/braud_eacl.py b/evals/braud_eacl.py index eacab66..082efa5 100644 --- a/evals/braud_eacl.py +++ b/evals/braud_eacl.py @@ -90,11 +90,13 @@ def _load_braud_eacl_file(f): sctrees.append(sctree) return sctrees + def load_braud_eacl_file(fpath): """Load SimpleRSTTrees from a file""" with codecs.open(fpath, 'rb', 'utf-8') as f: return _load_braud_eacl_file(f) + def load_braud_eacl_ctrees(fpath, rel_conv, doc_names): """Load the ctrees output by Braud et al.'s parser @@ -127,8 +129,4 @@ def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain'): for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) dtree_pred[doc_name] = dt_pred - # DEBUG - all_labels = set(itertools.chain.from_iterable(dt_pred.labels for dt_pred in dtree_pred.values())) - print(fpath, sorted(all_labels)) - # end DEBUG return dtree_pred diff --git a/evals/feng.py b/evals/feng.py index 802ddbc..fd65acf 100644 --- a/evals/feng.py +++ b/evals/feng.py @@ -8,6 +8,8 @@ import itertools +from nltk import Tree + from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.deptree import RstDepTree @@ -42,6 +44,15 @@ def load_feng_ctrees(out_dir, rel_conv): # initial letter is capitalized whereas ours are not if rel_conv is not None: ct_pred = rel_conv(ct_pred) + # "normalize" names of classes of RST relations: + # "textual-organization" => "textual" + for pos in ct_pred.treepositions(): + t = ct_pred[pos] + if isinstance(t, Tree): + node = t.label() + if node.rel == 'textual-organization': + node.rel = 'textual' + # end normalize ctree_pred[doc_name] = ct_pred return ctree_pred @@ -62,21 +73,12 @@ def load_feng_dtrees(out_dir, rel_conv, nary_enc='chain'): dtree_pred: dict(str, RstDepTree) RST dtree for each document. 
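+
+    Notes
+    -----
+    The dtrees are derived from the ctrees returned by
+    `load_feng_ctrees`, using `nary_enc` to encode n-ary nodes.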
""" - # load predicted trees - data_pred = load_feng_output_files(out_dir) - # filenames = data_pred['filenames'] - doc_names_pred = data_pred['doc_names'] - rst_ctrees_pred = data_pred['rst_ctrees'] + # load predicted c-trees + ctree_pred = load_feng_ctrees(out_dir, rel_conv) # build a dict from doc_name to ordered dtree (RstDepTree) dtree_pred = dict() - for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): - # constituency tree - # replace fine-grained labels with coarse-grained labels ; - # the files we have already contain the coarse labels, except their - # initial letter is capitalized whereas ours are not - if rel_conv is not None: - ct_pred = rel_conv(ct_pred) + for doc_name, ct_pred in ctree_pred.items(): # convert to an ordered dependency tree ; # * 'tree' produces a weakly-ordered dtree strictly equivalent # to the original ctree, diff --git a/evals/gcrf_tree_format.py b/evals/gcrf_tree_format.py index 4c7e379..1b4fd0b 100644 --- a/evals/gcrf_tree_format.py +++ b/evals/gcrf_tree_format.py @@ -181,7 +181,17 @@ def load_gcrf_ctrees(out_dir, rel_conv): ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) if rel_conv is not None: ct_pred = rel_conv(ct_pred) + # "normalize" names of classes of RST relations: + # "textual-organization" => "textual" + for pos in ct_pred.treepositions(): + t = ct_pred[pos] + if isinstance(t, Tree): + node = t.label() + if node.rel == 'textual-organization': + node.rel = 'textual' + # end normalize ctree_pred[doc_name] = ct_pred + return ctree_pred diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index 00a776b..442e688 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -116,6 +116,11 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, for doc_name, dt_pred in dtrees.items(): if rel_conv is not None: dt_pred = rel_conv(dt_pred) + # normalize names of classes of RST relations: + # "root" is "ROOT" in my coarse labelset (TODO: make it consistent) + dt_pred.labels = ['ROOT' if x == 'root' else x + for x in dt_pred.labels] + # end normalize # WIP add nuclearity and rank edus_data = load_edu_input_file(edus_file_pat.format(doc_name), edu_type='rst-dt') diff --git a/evals/ji.py b/evals/ji.py index 3198a3f..e5c6a9c 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -114,7 +114,9 @@ def load_ji_ctrees(ji_out_dir, rel_conv): # convert relation labels if rel_conv is not None: ct_pred = rel_conv(ct_pred) - # change "same_unit" (in Ji's output) into "same-unit" (in ours) + # normalize names of classes of RST relations: + # "same_unit" => "same-unit" + # "topic" => "topic-change" or "topic-comment"? 
for pos in ct_pred.treepositions(): t = ct_pred[pos] if isinstance(t, RSTTree): @@ -122,6 +124,13 @@ def load_ji_ctrees(ji_out_dir, rel_conv): # replace "same_unit" with "same-unit" if node.rel == 'same_unit': node.rel = 'same-unit' + elif node.rel == 'topic': + # either "topic-comment" or "topic-change" ; + # I expect the parser to find "topic-comment" to + # be easier but apparently it has no consequence + # on the current output I reproduced + node.rel = 'topic-comment' + # end normalize # store the resulting RSTTree ctree_pred[doc_name] = ct_pred diff --git a/evals/showdown.py b/evals/showdown.py index bfac168..29a0af7 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -7,6 +7,7 @@ import argparse import codecs +import itertools import os from educe.rst_dt.annotation import _binarize, SimpleRSTTree @@ -41,7 +42,7 @@ # RST corpus -CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.0/') +CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.01/') CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') CD_TEST = os.path.join(CORPUS_DIR, 'TEST') # relation converter (fine- to coarse-grained labels) @@ -462,38 +463,27 @@ def main(): report += '\n' # end table format and header line - # DEBUG - import itertools - all_labels = set(itertools.chain.from_iterable(dt_true.labels for dt_true in dtree_true.values())) - print("TRUE", sorted(all_labels)) - # end DEBUG - # * table content + # _true + doc_names = sorted(dtree_true.keys()) + dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + labelset_true = set(itertools.chain.from_iterable( + x.labels for x in dtree_true_list)) + labelset_true.add("span") # RST-DT v.1.0 has an error in wsj_1189 7-9 + # _pred for parser_name, dtree_pred in d_preds: - doc_names = sorted(dtree_true.keys()) - dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - # WIP print per doc eval - if not os.path.exists(parser_name): - os.makedirs(parser_name) - for doc_name, dt_true, dt_pred in zip( - doc_names, dtree_true_list, dtree_pred_list): - with codecs.open(parser_name + '/' + doc_name + '.d_eval', - mode='w', encoding='utf-8') as f: - print(', '.join('{:.4f}'.format(x) - for x in compute_uas_las( - [dt_true], [dt_pred], - include_ls=INCLUDE_LS, - include_las_n_o_no=EVAL_NUC_RANK)), - file=f) - if UNDIRECTED_DEPS: - # scores for undirected edges - print(', '.join('{:.4f}'.format(x) - for x in compute_uas_las_undirected( - [dt_true], [dt_pred])), - file=f) - # end WIP print - + # check that labelset_pred is a subset of labelset_true + labelset_pred = set(itertools.chain.from_iterable( + x.labels for x in dtree_pred_list)) + try: + assert labelset_pred.issubset(labelset_true) + except AssertionError: + print(parser_name) + print('T - P', labelset_true - labelset_pred) + print('P - T', labelset_pred - labelset_true) + raise + # end check all_scores = [] all_scores += list(compute_uas_las( dtree_true_list, dtree_pred_list, include_ls=INCLUDE_LS, @@ -516,6 +506,7 @@ def main(): doc_names = sorted(ctree_true.keys()) ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + if simple_rsttree: ctree_true_list = [SimpleRSTTree.from_rst_tree(x) for x in ctree_true_list] @@ -538,20 +529,7 @@ def main(): with codecs.open(parser_name + '/' + doc_name, mode='w', encoding='utf-8') as f: print(ct, file=f) - # WIP eval each tree in turn - for doc_name, ct_true, ct_pred in zip( - doc_names, ctree_true_list, 
ctree_pred_list): - with codecs.open(parser_name + '/' + doc_name + '.c_eval', - mode='w', encoding='utf-8') as f: - print(rst_parseval_report([ct_true], [ct_pred], - ctree_type=ctree_type, - digits=4, - per_doc=per_doc, - add_trivial_spans=eval_li_dep, - stringent=STRINGENT), - file=f) - # end WIP - # FIXME + # compute and print PARSEVAL scores print(parser_name) print(rst_parseval_report(ctree_true_list, ctree_pred_list, From fde2149ca17b05170992520ffff6d9b2360bf752 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 13 Dec 2016 15:28:33 +0100 Subject: [PATCH 44/74] ENH updated with actual output from Ji and Surdeanu --- evals/ji.py | 10 ++- evals/showdown.py | 20 ++++- evals/surdeanu.py | 211 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+), 4 deletions(-) create mode 100644 evals/surdeanu.py diff --git a/evals/ji.py b/evals/ji.py index e5c6a9c..6f01512 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -122,14 +122,20 @@ def load_ji_ctrees(ji_out_dir, rel_conv): if isinstance(t, RSTTree): node = t.label() # replace "same_unit" with "same-unit" - if node.rel == 'same_unit': + if node.rel == 'same_unit': # DPLP v. 1 node.rel = 'same-unit' - elif node.rel == 'topic': + elif node.rel == 'topic': # DPLP v. 1 # either "topic-comment" or "topic-change" ; # I expect the parser to find "topic-comment" to # be easier but apparently it has no consequence # on the current output I reproduced node.rel = 'topic-comment' + elif node.rel == 'sameunit': # Ji's output + node.rel = 'same-unit' + elif node.rel == 'topicchange': # Ji's output + node.rel = 'topic-change' + elif node.rel == 'topiccomment': # Ji's output + node.rel = 'topic-comment' # end normalize # store the resulting RSTTree ctree_pred[doc_name] = ct_pred diff --git a/evals/showdown.py b/evals/showdown.py index 29a0af7..eb8ad09 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -39,7 +39,7 @@ from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, load_attelo_dtrees) - +from evals.surdeanu import load_surdeanu_ctrees, load_surdeanu_dtrees # RST corpus CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.01/') @@ -104,7 +104,8 @@ # output of Joty's parser CODRA CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' # output of Ji's parser DPLP -JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') +# JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein', 'DPLP/data/docs/test/') +JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein', 'official_output/outputs/') # Feng's parsers FENG_DIR = '/home/mmorey/melodi/rst/feng_hirst/' FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') @@ -119,6 +120,9 @@ BRAUD_COLING_OUT_DIR = '/home/mmorey/melodi/rst/braud/coling16/pred_trees' BRAUD_EACL_MONO = '/home/mmorey/melodi/rst/braud/eacl16/best-en-mono/test_it8_beam16' BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/braud/eacl16/best-en-cross+dev/test_it10_beam32' +# Surdeanu +SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/surdeanu/output/log' + # level of detail for parseval DETAILED = False @@ -192,6 +196,7 @@ def main(): 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'braud_coling', 'braud_eacl_mono', 'braud_eacl_cross_dev', + 'surdeanu', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -204,6 +209,7 @@ def main(): 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'braud_coling', 'braud_eacl_mono', 'braud_eacl_cross_dev', + 'surdeanu', 'ours_chain', 
'ours_tree'], help="Author of the reference") # * dtree eval @@ -390,6 +396,16 @@ def main(): # the nary_enc does not matter because codra outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees + if author_pred == 'surdeanu': + c_preds.append( + ('surdeanu', load_surdeanu_ctrees( + SURDEANU_LOG_FILE, REL_CONV)) + ) + d_preds.append( + ('surdeanu', load_surdeanu_dtrees( + SURDEANU_LOG_FILE, REL_CONV, nary_enc='chain')) + ) + if author_pred == 'ours_chain': # Eisner, predicted syntax, chain c_preds.append( diff --git a/evals/surdeanu.py b/evals/surdeanu.py new file mode 100644 index 0000000..31111e8 --- /dev/null +++ b/evals/surdeanu.py @@ -0,0 +1,211 @@ +"""Load RST trees output by Surdeanu et al.'s parser. + +This format differs from the verbose output of the parser: PM added +brackets so they are easier to read. +""" + +from __future__ import absolute_import, print_function +import codecs +import re + +from nltk import Tree + +from educe.annotation import Span +from educe.corpus import FileId +from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree +from educe.rst_dt.deptree import RstDepTree + + +# timestamped line +TS_LINE = r"\d\d:\d\d:\d\d.\d\d\d \[run-main-0\].*" +TS_RE = re.compile(TS_LINE) + + +def tree_to_simple_rsttree(tree, edu_num=1): + """Build a SimpleRSTTree from an NLTK Tree (formatted a la Surdeanu). + + Parameters + ---------- + tree : nltk.Tree + Tree + + edu_num : int, defaults to 1 + Number of the next EDU + + Returns + ------- + sct : SimpleRSTTree + The corresponding SimpleRSTTree. + """ + origin = None + + if tree.label() == 'TEXT': + # EDU (+pre-terminal) + num = edu_num + span = Span(num, num) + # 'TEXT ' + text = '__'.join(tree) + edu = EDU(num, span, text, context=None, origin=origin) + # pre-terminal + edu_span = (num, num) + nuc = "leaf" + rel = "leaf" + node = Node(nuc, edu_span, span, rel, context=None) + return SimpleRSTTree(node, [edu], origin=origin) + + new_kids = [] + for kid in tree: + new_kid = tree_to_simple_rsttree(kid, edu_num=edu_num) + edu_num = new_kid.label().edu_span[1] + 1 + new_kids.append(new_kid) + + # internal node + # (modified) label: 'elaboration:NS' or 'joint' (no explicit nuc: NN) + if tree.label()[-3] == ':': + rel = tree.label()[:-3] + nuc = tree.label()[-2:] + else: + rel = tree.label() + nuc = 'NN' + # map to our coarse rel names + # TODO? 
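As an aside, the nuclearity convention handled just above ('rel:NS' or 'rel:SN' for mononuclear relations, a bare relation name for multinuclear ones) can be checked in isolation; the label strings below are toy examples:

for label in ('elaboration:NS', 'attribution:SN', 'joint'):
    if label[-3] == ':':
        rel, nuc = label[:-3], label[-2:]
    else:
        rel, nuc = label, 'NN'
    print('{} -> rel={}, nuc={}'.format(label, rel, nuc))
# elaboration:NS -> rel=elaboration, nuc=NS
# attribution:SN -> rel=attribution, nuc=SN
# joint -> rel=joint, nuc=NN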
+ # end map + # same as in braud_coling and braud_eacl + edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().edu_span[0]) + edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().edu_span[1]) + edu_span = (edu_beg, edu_end) + char_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().span.char_start) + char_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().span.char_end) + span = Span(char_beg, char_end) + new_node = Node(nuc, edu_span, span, rel, context=None) + new_tree = SimpleRSTTree(new_node, new_kids, origin=origin) + return new_tree + + +def _load_surdeanu_ctrees(log_file, rel_conv): + """Do load""" + doc_names = [] + nltk_ctrees = [] + ctree_pred = dict() # result + + ctree_cur = [] # lines for the current c-tree + state_cur = 0 # current state (finite state machine for dummies) + for line in log_file: + # DIRTY replace non-breaking spaces output by CoreNLP, as in + # educe.rst_dt.learning.doc_vectorizer + if isinstance(line, unicode): + line2 = line.replace(u'\xa0', u' ') + line = line2.encode('utf-8') + # end replace + + if state_cur == 0: + line = line.strip() + # skip initial lines until "Documents" + if line == "Documents": + state_cur = 1 + elif state_cur == 1: + line = line.strip() + # read list of document names + if line == "end Documents": + state_cur = 2 + else: + assert line.endswith('.dis') + doc_name = line[:-4] + doc_names.append(doc_name) + elif state_cur == 2: + # skip intermediate lines + if line.strip() == "System tree:": + state_cur = 3 + elif state_cur == 3: + if line.strip() == "System tree:": + if ctree_cur: + # parse the previous predicted c-tree ("System tree") + nltk_ct_pred = Tree.fromstring(''.join(ctree_cur)) + nltk_ctrees.append(nltk_ct_pred) + # reset accumulator + ctree_cur = [] + elif TS_RE.match(line): + # stop reading trees + state_cur = 4 + if ctree_cur: + # parse last predicted tree + nltk_ct_pred = Tree.fromstring(''.join(ctree_cur)) + nltk_ctrees.append(nltk_ct_pred) + ctree_cur = [] # reset (bc who wants side effects?) + else: + # accumulate lines for the next predicted c-tree + # we immediately replace " (LeftToRight)" with ":NS", + # " (RightToLeft)" with ":SN", otherwise it should be ":NN" + line = line.replace(" (LeftToRight)", ":NS").replace(" (RightToLeft)", ":SN").replace("TEXT:", "TEXT ") + ctree_cur.append(line) + elif state_cur == 4: + # just read on + continue + + # we got two predicted ctrees for each doc, with gold then predicted EDUs + # filter to keep only ctrees with gold EDUs, i.e. at even indices + nltk_ctrees = nltk_ctrees[::2] + # for each doc, create an RSTTree from the NLTK tree + for doc_name, nltk_ct_pred in zip(doc_names, nltk_ctrees): + # the c-tree read corresponds to a SimpleRstTree + sct_pred = tree_to_simple_rsttree(nltk_ct_pred) + ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_surdeanu_ctrees(log_file, rel_conv): + """Load c-trees output by Surdeanu's parser. + + Parameters + ---------- + log_file : str + Path to the log file with the document names followed by the + reference and predicted c-trees. + + rel_conv : RstRelationConverter + Converter to map fine-grained relation labels to classes. + + Returns + ------- + ctree_pred : dict(str, RSTTree) + Predicted c-tree for each doc. 
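The replacements applied while accumulating lines can be illustrated on a single made-up tree string; the real log format of Surdeanu et al.'s parser is assumed here to match what _load_surdeanu_ctrees expects:

from nltk import Tree

raw = "(elaboration (LeftToRight) (TEXT: He came back.) (TEXT: He was tired.))"
line = (raw.replace(" (LeftToRight)", ":NS")
           .replace(" (RightToLeft)", ":SN")
           .replace("TEXT:", "TEXT "))
tree = Tree.fromstring(line)
print(tree.label())  # elaboration:NS
print(len(tree))     # 2 (one subtree per EDU)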
+ """ + with codecs.open(log_file, mode='rb', encoding='utf-8') as f: + return _load_surdeanu_ctrees(f, rel_conv) + + +def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain'): + """Get the dtrees for the ctrees output by Surdeanu's parser. + + Parameters + ---------- + log_file: str + Path to the log file with the output. + rel_conv: TODO + Relation converter, from fine- to coarse-grained labels. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. + """ + dtree_pred = dict() + + ctree_pred = load_surdeanu_ctrees(log_file, rel_conv) + for doc_name, ct_pred in ctree_pred.items(): + dtree_pred[doc_name] = RstDepTree.from_rst_tree( + ct_pred, nary_enc=nary_enc) + # set reference to the document in the RstDepTree (required by + # dump_disdep_files) + for doc_name, dt_pred in dtree_pred.items(): + dt_pred.origin = FileId(doc_name, None, None, None) + + return dtree_pred From 7552821dc5b1d3904bba221cb66109edf47fe354 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 13 Dec 2016 20:22:28 +0100 Subject: [PATCH 45/74] DOC minor typo --- evals/hayashi_deps.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index 442e688..cb812f5 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -136,7 +136,8 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, rnk_clf): - """Load the dtrees output by one of Hayashi et al.'s dep parsers. + """Load the ctrees for the dtrees output by one of Hayashi et al.'s + dep parsers. Parameters ---------- From 2db94f73d5726e40e4e6b668eef5d2942ed9943c Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 13 Dec 2016 20:47:10 +0100 Subject: [PATCH 46/74] ENH incorporate eval for li_sujian --- evals/li_sujian.py | 320 ++++++++++++++++++++++++--------------------- evals/showdown.py | 20 ++- 2 files changed, 190 insertions(+), 150 deletions(-) diff --git a/evals/li_sujian.py b/evals/li_sujian.py index 6f80db4..ee235bb 100644 --- a/evals/li_sujian.py +++ b/evals/li_sujian.py @@ -3,32 +3,15 @@ """ from __future__ import absolute_import, print_function -from collections import Counter -from glob import glob import os # educe from educe.learning.edu_input_format import load_edu_input_file -from educe.rst_dt.corpus import (RstRelationConverter, - Reader as RstReader) from educe.rst_dt.dep2con import deptree_to_rst_tree from educe.rst_dt.deptree import NUC_S, RstDepTree, RstDtException from educe.rst_dt.metrics.rst_parseval import rst_parseval_report # attelo from attelo.metrics.deptree import compute_uas_las as att_compute_uas_las -# local imports -from evals.showdown import EDUS_FILE_PAT, setup_dtree_postprocessor - - -# RST corpus -CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.0/') -CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') -CD_TEST = os.path.join(CORPUS_DIR, 'TEST') -# relation converter (fine- to coarse-grained labels) -RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', - 'educe', 'rst_dt', - 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree # output of Li et al.'s parser @@ -64,8 +47,8 @@ "441.0detailedOut.txt", ] -# default file(s) to include ; I picked a coarse-grained one with good scores -DEFAULT_FILES = ["712.0detailedOut.txt"] +# default file to include ; I picked a coarse-grained one with good scores +DEFAULT_FILE = os.path.join(SAVE_DIR, 
"712.0detailedOut.txt") def load_output_file(out_file): @@ -106,41 +89,137 @@ def load_output_file(out_file): return res -if __name__ == "__main__": - # load dep trees from corpus - reader_test = RstReader(CD_TEST) - corpus_test = reader_test.slurp() +def load_li_sujian_dep_dtrees(out_file, rel_conv_dtree, edus_file_pat, + nuc_clf, rnk_clf): + """Load the dtrees output by Li Sujian et al.'s dep parser. + + Parameters + ---------- + out_file : str + Path to the file containing all the predictions. + + rel_conv_dtree : RstRelationConverter + Converter to map relation labels to (normalized) coarse-grained + classes. + + edus_file_pat : str + Pattern for the .edu_input files. + + nuc_clf : NuclearityClassifier + Nuclearity classifier + + rnk_clf : RankClassifier + Rank classifier + + Returns + ------- + dtree_pred : dict(str, RstDepTree) + RST dtree for each doc. + """ + dtree_pred = dict() + + dep_bunch = load_output_file(out_file) + # load and process _pred + for doc_name, heads_pred, labels_pred in zip( + dep_bunch['doc_names'], dep_bunch['heads_pred'], + dep_bunch['labels_pred']): + # create dtree _pred + edus_data = load_edu_input_file(edus_file_pat.format(doc_name), + edu_type='rst-dt') + edus = edus_data['edus'] + edu2sent = edus_data['edu2sent'] + dt_pred = RstDepTree(edus) + # add predicted edges + for dep_idx, (gov_idx, lbl) in enumerate(zip( + heads_pred[1:], labels_pred[1:]), start=1): + if lbl == '': + lbl = 'Elaboration' + lbl = lbl.lower() + dt_pred.add_dependency(gov_idx, dep_idx, lbl) + # map to relation classes + dt_pred = rel_conv_dtree(dt_pred) + dt_pred.labels = ['ROOT' if x == 'root' else x + for x in dt_pred.labels] + # attach edu2sent, for later use by rnk_clf + dt_pred.sent_idx = [0] + edu2sent # 0 for fake root + dirty + dtree_pred[doc_name] = dt_pred + # end WIP + + for doc_name in sorted(dtree_pred.keys()): + dt_pred = dtree_pred[doc_name] + # enrich d-tree with nuc and order + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] + dtree_pred[doc_name] = dt_pred + + return dtree_pred + + +def load_li_sujian_dep_ctrees(out_file, rel_conv_dtree, edus_file_pat, + nuc_clf, rnk_clf): + """Load the ctrees for the dtrees output by Li Sujian et al.'s parser. + + Parameters + ---------- + out_file : str + Path to the file containing all the predictions. + + rel_conv_dtree : RstRelationConverter + Converter to map relation labels to (normalized) coarse-grained + classes. + + edus_file_pat : str + Pattern for the .edu_input files. - # choice of predictions: granularity of relations - RST_RELS = 'coarse' - if RST_RELS == 'coarse': - PRED_FILES = DEFAULT_FILES # COARSE_FILES - else: - PRED_FILES = FINE_FILES - # eval procedure: the one in the parser of Li et al. vs standard one - EVAL_LI = False + nuc_clf : NuclearityClassifier + Nuclearity classifier + rnk_clf : RankClassifier + Rank classifier + + Returns + ------- + ctree_pred : dict(str, RSTTree) + RST ctree for each doc. + """ + ctree_pred = dict() + + dtree_pred = load_li_sujian_dep_dtrees( + out_file, rel_conv_dtree, edus_file_pat, nuc_clf, rnk_clf) + for doc_name, dt_pred in sorted(dtree_pred.items()): + ct_pred = deptree_to_rst_tree(dt_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def twisted_eval(out_file, rel_conv_dtree, setup_dtree_postprocessor, + ctree_true, dtree_true, edus_file_pat): + """Perform a twisted eval. + + Parameters + ---------- + setup_dtree_postprocessor : function + Function that sets up nuc_clf and rnk_clf. 
+ + ctree_true : dict(str, RSTTree) + Gold ctrees + + dtree_true : dict(str, DepRstTree) + Gold dtrees + + out_file : str + Path to the output file. + """ # setup conversion from c- to d-tree and back, and eval type nary_enc = 'chain' - - if EVAL_LI: - # reconstruction of the c-tree - order = 'strict' - nuc_strategy = 'constant' - nuc_constant = NUC_S - rnk_strategy = 'lllrrr' - rnk_prioritize_same_unit = False - # eval - TWIST_GOLD = True - ADD_TRIVIAL_SPANS = True - else: # comparable setup to what we use for our own parsers - order = 'weak' - nuc_strategy = "unamb_else_most_frequent" - nuc_constant = None - rnk_strategy = "sdist-edist-rl" - rnk_prioritize_same_unit = True - TWIST_GOLD = False - ADD_TRIVIAL_SPANS = False + # reconstruction of the c-tree + order = 'strict' + nuc_strategy = 'constant' + nuc_constant = NUC_S + rnk_strategy = 'lllrrr' + rnk_prioritize_same_unit = False + # eval + add_trivial_spans = True nuc_clf, rnk_clf = setup_dtree_postprocessor( nary_enc=nary_enc, order=order, nuc_strategy=nuc_strategy, @@ -149,108 +228,51 @@ def load_output_file(out_file): ctree_true = dict() dtree_true = dict() - labelset_true = Counter() - for doc_id, ct_true in sorted(corpus_test.items()): - doc_name = doc_id.doc - if RST_RELS == 'coarse': - # map fine to coarse rels - ct_true = REL_CONV(ct_true) - ctree_true[doc_name] = ct_true - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) + for doc_name, dt_true in sorted(dtree_true.items()): # dirty hack: lowercase ROOT dt_true.labels = [x.lower() if x == 'ROOT' else x for x in dt_true.labels] - dtree_true[doc_name] = dt_true - labelset_true.update(dt_true.labels[1:]) - # load parser output - for fname in PRED_FILES: - dtree_pred = dict() - labelset_pred = Counter() - # - f_cur = os.path.join(SAVE_DIR, fname) - dep_bunch = load_output_file(f_cur) - doc_names = dep_bunch['doc_names'] - # load and process _pred - for doc_name, heads_pred, labels_pred in zip( - dep_bunch['doc_names'], dep_bunch['heads_pred'], - dep_bunch['labels_pred']): - # create dtree _pred - edus_data = load_edu_input_file(EDUS_FILE_PAT.format(doc_name), - edu_type='rst-dt') - edus = edus_data['edus'] - edu2sent = edus_data['edu2sent'] - dt_pred = RstDepTree(edus) - # add predicted edges - for dep_idx, (gov_idx, lbl) in enumerate(zip( - heads_pred[1:], labels_pred[1:]), start=1): - if lbl == '': - lbl = 'Elaboration' - # print(lbl) - lbl = lbl.lower() - labelset_pred[lbl] += 1 - dt_pred.add_dependency(gov_idx, dep_idx, lbl) - dt_pred.sent_idx = [0] + edu2sent # 0 for fake root + dirty - dtree_pred[doc_name] = dt_pred - # end WIP + dtree_pred = load_li_sujian_dep_dtrees( + out_file, rel_conv_dtree, edus_file_pat, nuc_clf, rnk_clf) + ctree_pred = load_li_sujian_dep_ctrees( + out_file, rel_conv_dtree, edus_file_pat, nuc_clf, rnk_clf) + + # use our heuristics to replace the true nuc and order in + # dt_true with a predicted one, replace ct_true with its + # twisted version + for doc_name, dt_true in dtree_true.items(): + dt_pred = dtree_pred[doc_name] + # twiste dt_true + dt_true.sent_idx = dt_pred.sent_idx + dt_true.ranks = rnk_clf.predict([dt_true])[0] + dt_true.nucs = nuc_clf.predict([dt_true])[0] + # re-gen ct_true + try: + ct_true = deptree_to_rst_tree(dt_true) + except RstDtException as rst_e: + print(rst_e) + raise + ctree_true[doc_name] = ct_true + + # compute UAS and LAS on the _true values from the corpus and + # _pred Educe RstDepTrees re-built from their output files + doc_names = sorted(dtree_true.keys()) + dtree_true_list = [dtree_true[doc_name] for 
doc_name in doc_names] + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no = att_compute_uas_las( + dtree_true_list, dtree_pred_list, include_ls=False, + include_las_n_o_no=True) + print(("{}\tUAS={:.4f}\tLAS={:.4f}\tLAS+N={:.4f}\tLAS+O={:.4f}\t" + "LAS+N+O={:.4f}").format( + out_file, sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no)) - if RST_RELS == 'coarse': - expected_labelset = ['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'root', 'same-unit', 'summary', 'temporal', 'textual', 'topic-change', 'topic-comment'] - assert sorted(labelset_pred.keys()) == expected_labelset - # wsj_1189 has a weird "span" label in a multinuclear rel at [7--9] - # see footnote in Hayashi et al's SIGDIAL 2016 paper - assert sorted(labelset_true.keys()) == sorted( - expected_labelset + ['span']) - - # build predicted c-trees using our heuristics for nuc and rank - ctree_pred = dict() - for doc_name, dt_pred in dtree_pred.items(): - # 1. enrich d-tree with nuc and order - # a. order: the procedure that generates spans produces a - # left-heavy branching: ((A B) C), which should be our - # "lllrrr" heuristic - dt_pred.ranks = rnk_clf.predict([dt_pred])[0] - # b. nuclearity: heuristic baseline - dt_pred.nucs = nuc_clf.predict([dt_pred])[0] - # 2. build _pred c-tree - try: - ct_pred = deptree_to_rst_tree(dt_pred) - ctree_pred[doc_name] = ct_pred - except RstDtException as rst_e: - print(rst_e) - raise - # 3. predict nuc and order in _true d-tree, replace the _true - # c-tree with a twisted one, like in their eval - if TWIST_GOLD: - dt_true = dtree_true[doc_name] - dt_true.sent_idx = [0] + edu2sent - dt_true.ranks = rnk_clf.predict([dt_true])[0] - dt_true.nucs = nuc_clf.predict([dt_true])[0] - ct_true = ctree_true[doc_name] - try: - ct_true = deptree_to_rst_tree(dt_true) - except RstDtException as rst_e: - print(rst_e) - raise - ctree_true[doc_name] = ct_true - - # compute UAS and LAS on the _true values from the corpus and - # _pred Educe RstDepTrees re-built from their output files - dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] - dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no = att_compute_uas_las( - dtree_true_list, dtree_pred_list, include_ls=False, - include_las_n_o_no=True) - print(("{}\tUAS={:.4f}\tLAS={:.4f}\tLAS+N={:.4f}\tLAS+O={:.4f}\t" - "LAS+N+O={:.4f}").format( - fname, sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no)) - - # compute RST-Parseval of these c-trees - ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] - ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] - print(rst_parseval_report(ctree_true_list, ctree_pred_list, - ctree_type='RST', digits=4, - per_doc=False, - add_trivial_spans=ADD_TRIVIAL_SPANS, - stringent=False)) + # compute RST-Parseval of these c-trees + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type='RST', digits=4, + per_doc=False, + add_trivial_spans=add_trivial_spans, + stringent=False)) diff --git a/evals/showdown.py b/evals/showdown.py index eb8ad09..fe8b80c 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -36,6 +36,9 @@ load_hayashi_dep_ctrees) from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.li_qi 
import load_li_qi_ctrees, load_li_qi_dtrees +from evals.li_sujian import (DEFAULT_FILE as LI_SUJIAN_OUT_FILE, + load_li_sujian_dep_ctrees, + load_li_sujian_dep_dtrees) from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, load_attelo_dtrees) @@ -122,7 +125,8 @@ BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/braud/eacl16/best-en-cross+dev/test_it10_beam32' # Surdeanu SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/surdeanu/output/log' - +# Li Sujian dep parser +# imported, see above # level of detail for parseval DETAILED = False @@ -197,6 +201,7 @@ def main(): 'braud_coling', 'braud_eacl_mono', 'braud_eacl_cross_dev', 'surdeanu', + 'li_sujian', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -210,6 +215,7 @@ def main(): 'braud_coling', 'braud_eacl_mono', 'braud_eacl_cross_dev', 'surdeanu', + 'li_sujian', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -350,6 +356,18 @@ def main(): nary_enc='chain')) ) + if author_pred == 'li_sujian': + c_preds.append( + ('li_sujian', load_li_sujian_dep_ctrees( + LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('li_sujian', load_li_sujian_dep_dtrees( + LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf)) + ) + if author_pred == 'feng': c_preds.append( ('gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) From 2a12e87d1aeb48770366b7297c815e7a94a18992 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 3 Jan 2017 15:37:34 +0100 Subject: [PATCH 47/74] MAINT move loaders for RST parsers from educe to ./evals --- evals/codra.py | 79 +++++++++++++++++++++++++++++++++++++-------- evals/dis2disdep.py | 48 +++++++++++++++++++-------- evals/feng.py | 55 +++++++++++++++++++++++++++++-- 3 files changed, 154 insertions(+), 28 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index eb9c6f6..a586389 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -1,21 +1,73 @@ -"""Use the same evaluation procedure Evaluate the output of CODRA +"""This module enables to load the output of Joty's discourse parser CODRA. """ from __future__ import absolute_import, print_function +import codecs from collections import defaultdict +import glob import itertools +import os -import numpy as np - -from educe.rst_dt.codra import load_codra_output_files -from educe.rst_dt.dep2con import deptree_to_rst_tree from educe.rst_dt.deptree import RstDepTree -from educe.rst_dt.document_plus import align_edus_with_paragraphs -# -from attelo.io import load_edus -from attelo.metrics.deptree import compute_uas_las +from educe.rst_dt.parse import parse_rst_dt_tree + + +def load_codra_output_files(container_path, level='doc'): + """Load ctrees output by CODRA on the TEST section of RST-WSJ. + + Parameters + ---------- + container_path: string + Path to the main folder containing CODRA's output + + level: {'doc', 'sent'}, optional (default='doc') + Level of decoding: document-level or sentence-level + + Returns + ------- + data: dict + Dictionary that should be akin to a sklearn Bunch, with + interesting keys 'filenames', 'doc_names' and 'rst_ctrees'. + + Notes + ----- + To ensure compatibility with the rest of the code base, doc_names + are automatically added the ".out" extension. This would not work + for fileX documents, but they are absent from the TEST section of + the RST-WSJ treebank. 
+ """ + if level == 'doc': + file_ext = '.doc_dis' + elif level == 'sent': + file_ext = '.sen_dis' + else: + raise ValueError("level {} not in ['doc', 'sent']".format(level)) + + # find all files with the right extension + pathname = os.path.join(container_path, '*{}'.format(file_ext)) + # filenames are sorted by name to avoid having to realign data + # loaded with different functions + filenames = sorted(glob.glob(pathname)) # glob.glob() returns a list + + # find corresponding doc names + doc_names = [os.path.splitext(os.path.basename(filename))[0] + '.out' + for filename in filenames] + + # load the RST trees + rst_ctrees = [] + for filename in filenames: + with codecs.open(filename, 'r', 'utf-8') as f: + # TODO (?) add support for and use RSTContext + rst_ctree = parse_rst_dt_tree(f.read(), None) + rst_ctrees.append(rst_ctree) + + data = dict(filenames=filenames, + doc_names=doc_names, + rst_ctrees=rst_ctrees) + + return data def load_codra_ctrees(codra_out_dir, rel_conv): @@ -127,8 +179,9 @@ def get_edu2sent(att_edus): edu2sent_idx[doc_name][edu_num] = sent_idx # sort EDUs by num # rebuild educe-style edu2sent ; prepend 0 for the fake root - doc_name2edu2sent = {doc_name: ([0] - + [s_idx for e_num, s_idx - in sorted(edu2sent.items())]) - for doc_name, edu2sent in edu2sent_idx.items()} + doc_name2edu2sent = { + doc_name: ([0] + + [s_idx for e_num, s_idx in sorted(edu2sent.items())]) + for doc_name, edu2sent in edu2sent_idx.items() + } return doc_name2edu2sent diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index fd552fa..5825cfc 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -12,16 +12,19 @@ from educe.corpus import FileId from educe.learning.disdep_format import dump_disdep_files -from educe.rst_dt.codra import load_codra_output_files from educe.rst_dt.corpus import Reader, RstRelationConverter from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) +from evals.codra import load_codra_output_files from evals.gcrf_tree_format import load_gcrf_dtrees -from evals.hayashi_deps import load_hayashi_dtrees +from evals.hayashi_cons import load_hayashi_hilda_dtrees +from evals.hayashi_deps import load_hayashi_dep_dtrees from evals.ji import load_ji_dtrees +from evals.showdown import (setup_dtree_postprocessor, NUC_STRATEGY, + NUC_CONSTANT, RNK_STRATEGY, RNK_PRIORITY_SU) # original RST corpus @@ -30,6 +33,11 @@ RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) RST_DOUBLE = os.path.join(RST_CORPUS, DOUBLE_FOLDER) +# get edu2sent, set up rnk_clf and nuc_clf to predict rank and order for +# the output of Hayashi's MST parser +# * new style .edu_input: one file per doc in test set +EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" + # relation converter (fine- to coarse-grained labels) RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', 'educe', 'rst_dt', @@ -39,15 +47,18 @@ REL_CONV_DTREE = REL_CONV_BASE.convert_dtree # output of Joty's parser OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') -# output of Feng & Hirst's parser -OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/phil/tmp/') -# output of Feng & Hirst's parser -OUT_FENG2 = os.path.join('/home/mmorey/melodi/rst/feng_hirst/gCRF_dist/texts/results/test_batch_gold_seg') +# output of Feng & Hirst's parsers +FENG_BASEDIR = '/home/mmorey/melodi/rst/feng_hirst' +OUT_FENG = os.path.join(FENG_BASEDIR, 'phil/tmp/') +OUT_FENG2 = 
os.path.join(FENG_BASEDIR, + 'gCRF_dist/texts/results/test_batch_gold_seg') # output of Ji's parser -OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') +JI_BASEDIR = '/home/mmorey/melodi/rst/ji_eisenstein' +OUT_JI = os.path.join(JI_BASEDIR, 'DPLP/data/docs/test/') # output of Hayashi et al.'s parsers -OUT_HAYASHI_MST = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/dep/li/') -OUT_HAYASHI_HILDA = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/cons/trans_li/') +HAYASHI_BASEDIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL/' +OUT_HAYASHI_MST = os.path.join(HAYASHI_BASEDIR, 'auto_parse/dep/li/') +OUT_HAYASHI_HILDA = os.path.join(HAYASHI_BASEDIR, 'auto_parse/cons/trans_li/') def main(): @@ -70,7 +81,8 @@ def main(): help="Root directory for the output") args = parser.parse_args() # precise output path, by default: TMP_disdep/chain/gold/train - out_dir = os.path.join(args.out_root, args.nary_enc, args.author, args.split) + out_dir = os.path.join(args.out_root, args.nary_enc, args.author, + args.split) if not os.path.exists(out_dir): os.makedirs(out_dir) # read RST trees @@ -139,13 +151,23 @@ def main(): if corpus_split != 'test': raise ValueError("The output of Hayashi et al.'s parser is " "available for the 'test' split only") - dtrees = load_hayashi_dtrees(OUT_HAYASHI_MST, REL_CONV_DTREE) + # setup nuc_clf, rnk_clf + nuc_clf, rnk_clf = setup_dtree_postprocessor( + nary_enc='tree', order='weak', + nuc_strategy=NUC_STRATEGY, + nuc_constant=NUC_CONSTANT, + rnk_strategy=RNK_STRATEGY, + rnk_prioritize_same_unit=RNK_PRIORITY_SU) + # end setup + dtrees = load_hayashi_dep_dtrees( + OUT_HAYASHI_MST, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf) elif author == 'hayashi_hilda': if corpus_split != 'test': raise ValueError("The output of Hayashi et al.'s parser is " "available for the 'test' split only") - dtrees = load_hayashi_dtrees(OUT_HAYASHI_HILDA, REL_CONV_DTREE) - + dtrees = load_hayashi_hilda_dtrees(OUT_HAYASHI_HILDA, REL_CONV) + # do dump dump_disdep_files(dtrees.values(), out_dir) diff --git a/evals/feng.py b/evals/feng.py index fd65acf..a9c60f0 100644 --- a/evals/feng.py +++ b/evals/feng.py @@ -1,4 +1,4 @@ -"""Load the output of the parser from (Feng and Hirst, 2014). +"""Load the output of the RST parser from (Feng and Hirst, 2014). This is 99% a copy/paste from evals/joty.py . I need to come up with a better API and refactor accordingly. @@ -6,12 +6,63 @@ from __future__ import absolute_import, print_function +import codecs +import glob import itertools +import os from nltk import Tree -from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.parse import parse_rst_dt_tree + + +def load_feng_output_files(root_dir): + """Load ctrees output by Feng & Hirst's parser on the TEST section of + RST-WSJ. + + Parameters + ---------- + root_dir: string + Path to the main folder containing the parser's output + + Returns + ------- + data: dict + Dictionary that should be akin to a sklearn Bunch, with + interesting keys 'filenames', 'doc_names' and 'rst_ctrees'. + + Notes + ----- + To ensure compatibility with the rest of the code base, doc_names + are automatically added the ".out" extension. This would not work + for fileX documents, but they are absent from the TEST section of + the RST-WSJ treebank. 
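Note the difference with the CODRA loader above: CODRA's files end in '.doc_dis' (a single extension, so os.path.splitext suffices), while Feng & Hirst's files end in '.txt.dis', hence the rsplit on the last two dots. A small check on hypothetical file names:

import os.path

codra_fn = '/path/to/wsj_1365.doc_dis'
feng_fn = '/path/to/wsj_1365.txt.dis'
print(os.path.splitext(os.path.basename(codra_fn))[0] + '.out')  # wsj_1365.out
print(os.path.basename(feng_fn).rsplit('.', 2)[0] + '.out')      # wsj_1365.out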
+ """ + # find all files with the right extension + file_ext = '.txt.dis' + pathname = os.path.join(root_dir, '*{}'.format(file_ext)) + # filenames are sorted by name to avoid having to realign data + # loaded with different functions + filenames = sorted(glob.glob(pathname)) # glob.glob() returns a list + + # find corresponding doc names + doc_names = [os.path.basename(filename).rsplit('.', 2)[0] + '.out' + for filename in filenames] + + # load the RST trees + rst_ctrees = [] + for filename in filenames: + with codecs.open(filename, 'r', 'utf-8') as f: + # TODO (?) add support for and use RSTContext + rst_ctree = parse_rst_dt_tree(f.read(), None) + rst_ctrees.append(rst_ctree) + + data = dict(filenames=filenames, + doc_names=doc_names, + rst_ctrees=rst_ctrees) + + return data def load_feng_ctrees(out_dir, rel_conv): From 5750e89dc6eea7f7dd3e6e09a069c58c335cf3a0 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 3 Feb 2017 20:42:22 +0100 Subject: [PATCH 48/74] WIP disable frag pairs --- irit_rst_dt/cmd/gather.py | 46 ++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index b986097..df37074 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -188,28 +188,30 @@ def main(args): vocab_path=vocab_path, label_path=label_path) - # frag pairs: supplementary pairs from/to each fragmented EDU to - # the other fragmented EDUs and the EDUs that don't belong to any - # fragmented EDU - instances = 'frag-pairs' - same_unit_types = set(('true' if isinstance(x, AttachOracle) - else 'pred') - for clf in same_unit_clfs) - for same_unit_type in sorted(same_unit_types): - # we use the vocabulary and labelset from "edu-pairs" ; this is the - # simplest solution currently and it seems correct, but maybe we - # could extend "edu-pairs" with these pairs when we learn the - # vocabulary? - if not args.skip_training: - extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, - instances, frag_edus=same_unit_type, - vocab_path=vocab_path, - label_path=label_path) - if TEST_CORPUS is not None: - extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, - instances, frag_edus=same_unit_type, - vocab_path=vocab_path, - label_path=label_path) + # WIP 2017-02-03 disable frag-pairs + if False: + # frag pairs: supplementary pairs from/to each fragmented EDU to + # the other fragmented EDUs and the EDUs that don't belong to any + # fragmented EDU + instances = 'frag-pairs' + same_unit_types = set(('true' if isinstance(x, AttachOracle) + else 'pred') + for clf in same_unit_clfs) + for same_unit_type in sorted(same_unit_types): + # we use the vocabulary and labelset from "edu-pairs" ; + # this is the simplest solution currently and it seems + # correct, but maybe we could extend "edu-pairs" with these + # pairs when we learn the vocabulary? 
+ if not args.skip_training: + extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, + instances, frag_edus=same_unit_type, + vocab_path=vocab_path, + label_path=label_path) + if TEST_CORPUS is not None: + extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, + instances, frag_edus=same_unit_type, + vocab_path=vocab_path, + label_path=label_path) # end frag pairs with open(os.path.join(tdir_data, "versions-gather.txt"), "w") as stream: From 5784da8b2b9dcb0abf541bb6920973f3ec9be2ae Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 7 Feb 2017 10:38:42 +0100 Subject: [PATCH 49/74] FIX paths for CDU-related files are now optional --- irit_rst_dt/harness.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index 3242a31..a555411 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -108,22 +108,28 @@ def create_folds(self, mpack): # paths # ------------------------------------------------------ - def mpack_paths(self, test_data, stripped=False): + def mpack_paths(self, test_data, stripped=False, with_cdus=False): """ Parameters ---------- - test_data: boolean + test_data : boolean If true, the returned paths point to self.testset else to self.dataset. - stripped: boolean + + stripped : boolean, defaults to False TODO + with_cdus : boolean, defaults to False + If True, generate CDUs (eg. for fragmented EDUs), pairings + on them and the corresponding feature vectors. + Returns ------- - paths: dict of file paths - Path to: edu_input, pairings, features, vocab, labels, - cdu_input, cdu_pairings, cdu_features, corpus (to access - gold structures, WIP). + paths : dict of (glob patterns of) file paths + Path to: edu_input, pairings, features, vocab, labels. + Also contains 'corpus' (to access gold structures, WIP for + RST-DT) ; if `with_cdus` is True, also cdu_input, + cdu_pairings, cdu_features. 
""" base = 'relations.edu-pairs' ext = base + '.sparse' From de586487b55b80ae3de20a4d469bd0e914086ac2 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 7 Feb 2017 10:44:00 +0100 Subject: [PATCH 50/74] FIX paths for CDU-related files are now (really) optional --- irit_rst_dt/harness.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index a555411..03801e4 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -138,9 +138,10 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): vocab_path = fp.join(self.eval_dir, "%s.%s.vocab" % (dset, ext)) labels_path = fp.join(self.eval_dir, "%s.%s.labels" % (dset, base)) core_path = fp.join(self.eval_dir, dset, "*.%s" % ext) - # 2016-07-28 pairs on fragmented EDUs - frag_ext = 'relations.frag-pairs.sparse' - frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) + if with_cdus: + # 2016-07-28 pairs on fragmented EDUs + frag_ext = 'relations.frag-pairs.sparse' + frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if test_data else TRAINING_CORPUS) @@ -153,10 +154,13 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): 'vocab': vocab_path, 'labels': labels_path, # fragmented EDUs - 'cdu_input': frag_path + '.cdu_input', - 'cdu_pairings': frag_path + '.cdu_pairings', - 'cdu_features': ((frag_path + '.stripped') if stripped - else frag_path), + 'cdu_input': (frag_path + '.cdu_input' if with_cdus + else None) + 'cdu_pairings': (frag_path + '.cdu_pairings' if with_cdus + else None) + 'cdu_features': (((frag_path + '.stripped') if stripped + else frag_path) if with_cdus + else None), # corpus for gold RST trees 'corpus': corpus_path, } From ad2dd783157f648d1ec5bb25d3deeeb65c757bf5 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 7 Feb 2017 10:48:54 +0100 Subject: [PATCH 51/74] FIX paths for CDU-related files are now (really) optional --- irit_rst_dt/harness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index 03801e4..b843168 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -155,9 +155,9 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): 'labels': labels_path, # fragmented EDUs 'cdu_input': (frag_path + '.cdu_input' if with_cdus - else None) + else None), 'cdu_pairings': (frag_path + '.cdu_pairings' if with_cdus - else None) + else None), 'cdu_features': (((frag_path + '.stripped') if stripped else frag_path) if with_cdus else None), From 056a7b151167b2e58226de14ce3d6dcae88f4272 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 13 Feb 2017 10:03:28 +0100 Subject: [PATCH 52/74] FIX disable cdu paths --- irit_rst_dt/harness.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index b843168..40f89ec 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -138,10 +138,6 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): vocab_path = fp.join(self.eval_dir, "%s.%s.vocab" % (dset, ext)) labels_path = fp.join(self.eval_dir, "%s.%s.labels" % (dset, base)) core_path = fp.join(self.eval_dir, dset, "*.%s" % ext) - if with_cdus: - # 2016-07-28 pairs on fragmented EDUs - frag_ext = 'relations.frag-pairs.sparse' - frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if 
test_data else TRAINING_CORPUS) @@ -153,17 +149,23 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): else core_path), 'vocab': vocab_path, 'labels': labels_path, - # fragmented EDUs - 'cdu_input': (frag_path + '.cdu_input' if with_cdus - else None), - 'cdu_pairings': (frag_path + '.cdu_pairings' if with_cdus - else None), - 'cdu_features': (((frag_path + '.stripped') if stripped - else frag_path) if with_cdus - else None), # corpus for gold RST trees 'corpus': corpus_path, } + if with_cdus: + # 2016-07-28 fragmented EDUs + frag_ext = 'relations.frag-pairs.sparse' + frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) + res.update([ + ('cdu_input', (frag_path + '.cdu_input' if with_cdus + else None)), + ('cdu_pairings', (frag_path + '.cdu_pairings' if with_cdus + else None)), + ('cdu_features', (((frag_path + '.stripped') if stripped + else frag_path) if with_cdus + else None)), + ]) + return res def model_paths(self, rconf, fold, parser): From 86a73e9ee636fcd142dfc7c38046ac8e224158bb Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 13 Feb 2017 10:52:22 +0100 Subject: [PATCH 53/74] FIX add graphviz to environment.yml --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index a1140ca..417ecaa 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,7 @@ name: irit-rst-dt dependencies: - python=2.7 + - graphviz=2.38.0 - nltk - scikit-learn - pip: From 4165175fc29bad6d307154d2badb05a6da0d7d9a Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 11 Apr 2017 10:29:44 +0200 Subject: [PATCH 54/74] FIX evals paths --- evals/li_sujian.py | 2 +- evals/showdown.py | 36 +++++++++++++++++++----------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/evals/li_sujian.py b/evals/li_sujian.py index ee235bb..d84ae9f 100644 --- a/evals/li_sujian.py +++ b/evals/li_sujian.py @@ -15,7 +15,7 @@ # output of Li et al.'s parser -SAVE_DIR = "/home/mmorey/melodi/rst/li_sujian/TextLevelDiscourseParser/mybackup/mstparser-code-116-trunk/mstparser/save" +SAVE_DIR = "/home/mmorey/melodi/rst/replication/li_sujian/TextLevelDiscourseParser/mybackup/mstparser-code-116-trunk/mstparser/save" COARSE_FILES = [ "136.0detailedOutVersion2.txt", "151.0detailedOut.txt", diff --git a/evals/showdown.py b/evals/showdown.py index fe8b80c..497ea29 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -63,29 +63,31 @@ # * syntax: pred vs gold # old-style .edu_input: whole test set -EDUS_FILE = os.path.join('/home/mmorey/melodi', +EDUS_FILE = os.path.join('/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/syn_gold_coarse', 'TEST.relations.sparse.edu_input') # new style .edu_input: one file per doc in test set -EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" +# was: TMP/latest/data..., replaced latest with 2016-09-30T1701 but +# might be wrong (or it might have no consequence here) +EDUS_FILE_PAT = "TMP/2016-09-30T1701/data/TEST/{}.relations.edu-pairs.sparse.edu_input" # outputs of parsers EISNER_OUT_SYN_PRED = os.path.join( - '/home/mmorey/melodi', + '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/syn_pred_coarse', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') # 2016-09-14 "tree" transform, predicted syntax EISNER_OUT_TREE_SYN_PRED = os.path.join( - '/home/mmorey/melodi', + '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( - '/home/mmorey/melodi', 
+ '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt_su-eisner') @@ -93,38 +95,38 @@ EISNER_OUT_SYN_PRED_SU = os.path.join( - '/home/mmorey/melodi', + '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/latest', # lbl 'scratch-current/combined', 'output.maxent-AD.L-jnt_su-eisner') EISNER_OUT_SYN_GOLD = os.path.join( - '/home/mmorey/melodi', + '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/syn_gold_coarse', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') # output of Joty's parser CODRA -CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' +CODRA_OUT_DIR = '/home/mmorey/melodi/rst/replication/joty/Doc-level' # output of Ji's parser DPLP -# JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein', 'DPLP/data/docs/test/') -JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein', 'official_output/outputs/') +# JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/replication/ji_eisenstein', 'DPLP/data/docs/test/') +JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/replication/ji_eisenstein', 'official_output/outputs/') # Feng's parsers -FENG_DIR = '/home/mmorey/melodi/rst/feng_hirst/' +FENG_DIR = '/home/mmorey/melodi/rst/replication/feng_hirst/' FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') # Li Qi's parser -LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/li_qi/result' +LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/replication/li_qi/result' # Hayashi's HILDA -HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL' +HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/replication/hayashi/SIGDIAL' HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') HAYASHI_MST_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/dep/li') # Braud -BRAUD_COLING_OUT_DIR = '/home/mmorey/melodi/rst/braud/coling16/pred_trees' -BRAUD_EACL_MONO = '/home/mmorey/melodi/rst/braud/eacl16/best-en-mono/test_it8_beam16' -BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/braud/eacl16/best-en-cross+dev/test_it10_beam32' +BRAUD_COLING_OUT_DIR = '/home/mmorey/melodi/rst/replication/braud/coling16/pred_trees' +BRAUD_EACL_MONO = '/home/mmorey/melodi/rst/replication/braud/eacl16/best-en-mono/test_it8_beam16' +BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/replication/braud/eacl16/best-en-cross+dev/test_it10_beam32' # Surdeanu -SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/surdeanu/output/log' +SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/replication/surdeanu/output/log' # Li Sujian dep parser # imported, see above From b95e184eb3e2d299b9e117ed3da22c6b09aa9fbc Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 11 Apr 2017 10:55:37 +0200 Subject: [PATCH 55/74] ENH showdown: param digits --- evals/showdown.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 497ea29..6c93b33 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -236,6 +236,9 @@ def main(): help=("Evaluate as in the dep parser of Li et al. 
" "2014: all relations are NS, spiders map to " "left-heavy branching, three trivial spans ")) + # * display options + parser.add_argument('--digits', type=int, default=3, + help='Precision (number of digits) of scores') # args = parser.parse_args() author_true = args.author_true @@ -244,6 +247,8 @@ def main(): nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree + # display + digits = args.digits # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc # then average over docs @@ -479,7 +484,6 @@ def main(): # report # * table format - digits = 4 width = max(len(parser_name) for parser_name, _ in d_preds) headers = ["UAS", "LAS"] @@ -569,15 +573,15 @@ def main(): # compute and print PARSEVAL scores print(parser_name) print(rst_parseval_report(ctree_true_list, ctree_pred_list, - ctree_type=ctree_type, digits=4, + ctree_type=ctree_type, digits=digits, per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) - # detailed report on S+N+R + # detailed report on R if DETAILED: print(rst_parseval_detailed_report( ctree_true_list, ctree_pred_list, ctree_type=ctree_type, - metric_type='S+R')) + metric_type='R')) # end FIXME From 1ecb3d33d80abd261fe4bd4c22d3da85c4559a32 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 11 Apr 2017 17:26:47 +0200 Subject: [PATCH 56/74] ENH param detailed, compact report --- evals/showdown.py | 115 ++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 44 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 6c93b33..708b2a9 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -17,6 +17,7 @@ InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.metrics.rst_parseval import (rst_parseval_detailed_report, + rst_parseval_compact_report, rst_parseval_report) # from attelo.metrics.deptree import (compute_uas_las, @@ -131,7 +132,6 @@ # imported, see above # level of detail for parseval -DETAILED = False STRINGENT = False # additional dependency metrics INCLUDE_LS = False @@ -239,6 +239,8 @@ def main(): # * display options parser.add_argument('--digits', type=int, default=3, help='Precision (number of digits) of scores') + parser.add_argument('--detailed', type=int, default=0, + help='Level of detail for evaluations') # args = parser.parse_args() author_true = args.author_true @@ -249,6 +251,8 @@ def main(): simple_rsttree = args.simple_rsttree # display digits = args.digits + # level of detail for evals + detailed = args.detailed # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc # then average over docs @@ -470,14 +474,14 @@ def main(): load_deptrees_from_attelo_output(ctree_true, dtree_true, EISNER_OUT_SYN_PRED_SU, EDUS_FILE, nuc_clf, rnk_clf, - detailed=False) + detailed=(detailed >= 3)) print('======================') print('Eisner, gold syntax') load_deptrees_from_attelo_output(ctree_true, dtree_true, EISNER_OUT_SYN_GOLD, EDUS_FILE, nuc_clf, rnk_clf, - detailed=False) + detailed=(detailed >= 3)) print('======================') # dependency eval @@ -542,47 +546,70 @@ def main(): # end report # constituency eval - for parser_name, ctree_pred in c_preds: - doc_names = sorted(ctree_true.keys()) - ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] - ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] - - if simple_rsttree: - ctree_true_list = [SimpleRSTTree.from_rst_tree(x) - for x in ctree_true_list] - ctree_pred_list = [SimpleRSTTree.from_rst_tree(x) - for x in 
ctree_pred_list] - ctree_type = 'SimpleRST' - else: - ctree_type = 'RST' - - # WIP print SimpleRSTTrees - if not os.path.exists('gold'): - os.makedirs('gold') - for doc_name, ct in zip(doc_names, ctree_true_list): - with codecs.open('gold/' + ct.origin.doc, mode='w', - encoding='utf-8') as f: - print(ct, file=f) - if not os.path.exists(parser_name): - os.makedirs(parser_name) - for doc_name, ct in zip(doc_names, ctree_pred_list): - with codecs.open(parser_name + '/' + doc_name, mode='w', - encoding='utf-8') as f: - print(ct, file=f) - - # compute and print PARSEVAL scores - print(parser_name) - print(rst_parseval_report(ctree_true_list, ctree_pred_list, - ctree_type=ctree_type, digits=digits, - per_doc=per_doc, - add_trivial_spans=eval_li_dep, - stringent=STRINGENT)) - # detailed report on R - if DETAILED: - print(rst_parseval_detailed_report( - ctree_true_list, ctree_pred_list, ctree_type=ctree_type, - metric_type='R')) - # end FIXME + ctree_type = 'SimpleRST' if simple_rsttree else 'RST' + + doc_names = sorted(ctree_true.keys()) + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + if simple_rsttree: + ctree_true_list = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_true_list] + # WIP print SimpleRSTTrees + if not os.path.exists('gold'): + os.makedirs('gold') + for doc_name, ct in zip(doc_names, ctree_true_list): + with codecs.open('gold/' + ct.origin.doc, mode='w', + encoding='utf-8') as f: + print(ct, file=f) + + # sort the predictions of each parser, so they match the order of + # documents and reference trees in _true + ctree_preds = [(parser_name, + [ctree_pred[doc_name] for doc_name in doc_names]) + for parser_name, ctree_pred in c_preds] + if simple_rsttree: + ctree_preds = [(parser_name, + [SimpleRSTTree.from_rst_tree(x) + for x in ctree_pred_list]) + for parser_name, ctree_pred_list in ctree_preds] + # generate report + if detailed == 0: + # compact report, f1-scores only + print(rst_parseval_compact_report(ctree_true_list, ctree_preds, + ctree_type=ctree_type, + metric_types=['S', 'N', 'R', 'F'], + digits=digits, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) + else: + # standard reports: 1 table per parser, 1 line per metric, + # cols = [p, r, f1, support_true, support_pred] + for parser_name, ctree_pred_list in ctree_preds: + # WIP print SimpleRSTTrees + if not os.path.exists(parser_name): + os.makedirs(parser_name) + for doc_name, ct in zip(doc_names, ctree_pred_list): + with codecs.open(parser_name + '/' + doc_name, mode='w', + encoding='utf-8') as f: + print(ct, file=f) + + # compute and print PARSEVAL scores + print(parser_name) + # metric_types=None includes the variants with head: + # S+H, N+H, R+H, F+H + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type=ctree_type, + metric_types=None, + digits=digits, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) + # detailed report on R + if detailed >= 2: + print(rst_parseval_detailed_report( + ctree_true_list, ctree_pred_list, ctree_type=ctree_type, + metric_type='R')) + # end FIXME if __name__ == '__main__': From 5b0291f0d86abf4d21f4ed62695d27b080c4ab9f Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Apr 2017 11:53:42 +0200 Subject: [PATCH 57/74] MAINT+ENH pylint showdown, parseval double --- evals/showdown.py | 90 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 10 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 708b2a9..b3a4d3e 100644 --- a/evals/showdown.py +++ 
b/evals/showdown.py @@ -49,6 +49,7 @@ CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.01/') CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +DOUBLE_DIR = os.path.join('corpus', 'RSTtrees-WSJ-double-1.0') # relation converter (fine- to coarse-grained labels) RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', 'educe', 'rst_dt', @@ -299,11 +300,12 @@ def main(): dtree_true[doc_name] = dt_true # sorted doc_names, because braud_eacl put all predictions in one file sorted_doc_names = sorted(dtree_true.keys()) - + c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] for author_pred in authors_pred: + # braud coling 2016 if author_pred == 'braud_coling': c_preds.append( ('braud_coling', load_braud_coling_ctrees( @@ -312,8 +314,8 @@ def main(): d_preds.append( ('braud_coling', load_braud_coling_dtrees( BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain')) - ) - + ) + # braud eacl 2017 - mono if author_pred == 'braud_eacl_mono': c_preds.append( ('braud_eacl_mono', load_braud_eacl_ctrees( @@ -323,8 +325,8 @@ def main(): ('braud_eacl_mono', load_braud_eacl_dtrees( BRAUD_EACL_MONO, REL_CONV, sorted_doc_names, nary_enc='chain')) - ) - + ) + # braud eacl 2017 - cross+dev if author_pred == 'braud_eacl_cross_dev': c_preds.append( ('braud_eacl_cross_dev', load_braud_eacl_ctrees( @@ -334,7 +336,7 @@ def main(): ('braud_eacl_cross_dev', load_braud_eacl_dtrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, nary_enc='chain')) - ) + ) if author_pred == 'hayashi_hilda': c_preds.append( @@ -473,15 +475,13 @@ def main(): print('Eisner, predicted syntax + same-unit') load_deptrees_from_attelo_output(ctree_true, dtree_true, EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=(detailed >= 3)) + nuc_clf, rnk_clf) print('======================') print('Eisner, gold syntax') load_deptrees_from_attelo_output(ctree_true, dtree_true, EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=(detailed >= 3)) + nuc_clf, rnk_clf) print('======================') # dependency eval @@ -611,6 +611,76 @@ def main(): metric_type='R')) # end FIXME + # 2017-04-11 compute agreement between human annotators, on DOUBLE + if 'silver' in authors_pred: + # read the annotation we'll consider as "silver" + reader_dbl = RstReader(DOUBLE_DIR) + corpus_dbl_pred = {k.doc: v for k, v in reader_dbl.slurp().items()} + docs_dbl = sorted(k for k in corpus_dbl_pred.keys()) + # collect the "true" annotation for the docs in double, from train + # and test + # (test has already been read at the beginning of this script) + corpus_test_dbl = {k.doc: v for k, v in corpus_test.items() + if k.doc in docs_dbl} + # read the docs from train that are in double + reader_train = RstReader(CD_TRAIN) + corpus_train = reader_train.slurp() + corpus_train_dbl = {k.doc: v for k, v in corpus_train.items() + if k.doc in docs_dbl} + # assemble the "true" version of the double subset + corpus_dbl_true = dict(corpus_test_dbl.items() + + corpus_train_dbl.items()) + assert (sorted(corpus_dbl_true.keys()) == + sorted(corpus_dbl_pred.keys())) + # extra check? 
+ for doc_name in docs_dbl: + leaf_spans_true = [x.text_span() for x + in corpus_dbl_true[doc_name].leaves()] + leaf_spans_pred = [x.text_span() for x + in corpus_dbl_pred[doc_name].leaves()] + if (leaf_spans_true != leaf_spans_pred): + print(doc_name, 'EEEE') + print('true - pred', + set(leaf_spans_true) - set(leaf_spans_pred)) + print('pred - true', + set(leaf_spans_pred) - set(leaf_spans_true)) + else: + print(doc_name, 'ok') + # end extra check + + # 48 docs in train, + # 5 docs in test: ['wsj_0627.out', 'wsj_0684.out', 'wsj_1129.out', + # 'wsj_1365.out', 'wsj_1387.out'] + # create parallel lists of ctrees for _true and _pred, mapped to + # coarse rels and binarized + # _pred: + ctree_dbl_pred = [corpus_dbl_pred[doc_name] for doc_name in docs_dbl] + ctree_dbl_pred = [REL_CONV(x) for x in ctree_dbl_pred] + if binarize_true: # maybe not? + ctree_dbl_pred = [_binarize(x) for x in ctree_dbl_pred] + if simple_rsttree: + ctree_dbl_pred = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_dbl_pred] + # _true: + ctree_dbl_true = [corpus_dbl_true[doc_name] for doc_name in docs_dbl] + ctree_dbl_true = [REL_CONV(x) for x in ctree_dbl_true] + if binarize_true: + ctree_dbl_true = [_binarize(x) for x in ctree_dbl_true] + if simple_rsttree: + ctree_dbl_true = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_dbl_true] + # generate report + ctree_dbl_preds = [('silver', ctree_dbl_pred)] + print(rst_parseval_compact_report(ctree_dbl_true, ctree_dbl_preds, + ctree_type=ctree_type, + span_type='chars', + metric_types=['S', 'N', 'R', 'F'], + digits=digits, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) + # end 2017-04-11 agreement between human annotators + if __name__ == '__main__': main() From c97f68aa64f5228db0aa55a7b4acb403b5c5ddd7 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Apr 2017 16:10:17 +0200 Subject: [PATCH 58/74] FIX disable print of differing spans between RST double and main --- evals/showdown.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index b3a4d3e..2757dd8 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -633,19 +633,20 @@ def main(): assert (sorted(corpus_dbl_true.keys()) == sorted(corpus_dbl_pred.keys())) # extra check? 
- for doc_name in docs_dbl: - leaf_spans_true = [x.text_span() for x - in corpus_dbl_true[doc_name].leaves()] - leaf_spans_pred = [x.text_span() for x - in corpus_dbl_pred[doc_name].leaves()] - if (leaf_spans_true != leaf_spans_pred): - print(doc_name, 'EEEE') - print('true - pred', - set(leaf_spans_true) - set(leaf_spans_pred)) - print('pred - true', - set(leaf_spans_pred) - set(leaf_spans_true)) - else: - print(doc_name, 'ok') + if False: + for doc_name in docs_dbl: + leaf_spans_true = [x.text_span() for x + in corpus_dbl_true[doc_name].leaves()] + leaf_spans_pred = [x.text_span() for x + in corpus_dbl_pred[doc_name].leaves()] + if (leaf_spans_true != leaf_spans_pred): + print(doc_name, 'EEEE') + print('true - pred', + set(leaf_spans_true) - set(leaf_spans_pred)) + print('pred - true', + set(leaf_spans_pred) - set(leaf_spans_true)) + else: + print(doc_name, 'ok') # end extra check # 48 docs in train, From adf07c1d70a14654c0a5277dc0f8ef48c1614ba9 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 17 May 2017 11:40:53 +0200 Subject: [PATCH 59/74] ENH evals/showdown: changes in options and table display --- evals/showdown.py | 80 ++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 2757dd8..e4ad93a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -212,7 +212,8 @@ def main(): help="Encoding of n-ary nodes for the predictions") # reference parser.add_argument('--author_true', default='gold', - choices=['gold', 'silver', + choices=['each', # NEW generate sim matrix + 'gold', 'silver', 'joty', 'feng', 'feng2', 'ji', 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'braud_coling', 'braud_eacl_mono', @@ -221,13 +222,13 @@ def main(): 'li_sujian', 'ours_chain', 'ours_tree'], help="Author of the reference") - # * dtree eval - parser.add_argument('--nary_enc_true', default='tree', - choices=['tree', 'chain'], - help="Encoding of n-ary nodes for the reference") - # * ctree eval - parser.add_argument('--binarize_true', action='store_true', - help="Binarize the reference ctree for the eval") + # * ctree/dtree eval: the value of binarize_true determines the values + # of nary_enc_true and order_true (the latter is yet unused) + parser.add_argument('--binarize_true', default='none', + choices=['none', 'right', 'right_mixed', 'left'], + help=("Binarization method for the reference ctree" + "in the eval ; defaults to 'none' for no " + "binarization")) parser.add_argument('--simple_rsttree', action='store_true', help="Binarize ctree and move relations up") # * non-standard evals @@ -240,18 +241,23 @@ def main(): # * display options parser.add_argument('--digits', type=int, default=3, help='Precision (number of digits) of scores') + parser.add_argument('--percent', action='store_true', + help='Scores are displayed as percentages (ex: 57.9)') parser.add_argument('--detailed', type=int, default=0, help='Level of detail for evaluations') # args = parser.parse_args() author_true = args.author_true - nary_enc_true = args.nary_enc_true authors_pred = args.authors_pred nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree # display digits = args.digits + percent = args.percent + if percent: + if digits < 3: + raise ValueError('--percent requires --digits >= 3') # level of detail for evals detailed = args.detailed @@ -264,10 +270,15 @@ def main(): # three trivial spans eval_li_dep = args.eval_li_dep - # - if binarize_true and nary_enc_true != 'chain': - raise 
ValueError("--binarize_true is compatible with " - "--nary_enc_true chain only") + if binarize_true in ('right', 'right_mixed'): + nary_enc_true = 'chain' + order_true = 'strict' + elif binarize_true == 'left': + nary_enc_true = 'tree' + order_true = 'strict' + else: # 'none' for no binarization of the reference tree + nary_enc_true = 'tree' + order_true = 'weak' # 0. setup the postprocessors to flesh out unordered dtrees into ordered # ones with nuclearity @@ -291,9 +302,9 @@ def main(): doc_name = doc_id.doc # original reference ctree, with coarse labels ct_true = REL_CONV(ct_true) # map fine to coarse relations - if binarize_true: + if binarize_true != "none": # binarize ctree if required - ct_true = _binarize(ct_true) + ct_true = _binarize(ct_true, branching=binarize_true) ctree_true[doc_name] = ct_true # corresponding dtree dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) @@ -485,16 +496,18 @@ def main(): print('======================') # dependency eval - + dep_metrics = ["U"] + if EVAL_NUC_RANK: + dep_metrics += ['O', 'N'] + dep_metrics += ["R"] + if INCLUDE_LS: + dep_metrics += ["tag_R"] + if EVAL_NUC_RANK: + dep_metrics += ["R+N", "R+O", "F"] # report # * table format width = max(len(parser_name) for parser_name, _ in d_preds) - - headers = ["UAS", "LAS"] - if INCLUDE_LS: - headers += ["LS"] - if EVAL_NUC_RANK: - headers += ["LAS+N", "LAS+O", "LAS+N+O"] + headers = dep_metrics if UNDIRECTED_DEPS: headers += ["UUAS", "ULAS"] fmt = '%% %ds' % width # first col: parser name @@ -505,6 +518,8 @@ def main(): headers = [""] + headers report = fmt % tuple(headers) report += '\n' + # display percentages + dep_digits = digits - 2 if percent else digits # end table format and header line # * table content @@ -530,8 +545,8 @@ def main(): # end check all_scores = [] all_scores += list(compute_uas_las( - dtree_true_list, dtree_pred_list, include_ls=INCLUDE_LS, - include_las_n_o_no=EVAL_NUC_RANK)) + dtree_true_list, dtree_pred_list, metrics=dep_metrics, + doc_names=doc_names)) if UNDIRECTED_DEPS: score_uuas, score_ulas = compute_uas_las_undirected( dtree_true_list, dtree_pred_list) @@ -539,7 +554,9 @@ def main(): # append to report values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] for v in all_scores: - values += ["{0:0.{1}f}".format(v, digits)] + if percent: + v = v * 100.0 + values += ["{0:0.{1}f}".format(v, dep_digits)] report += fmt % tuple(values) # end table content print(report) @@ -578,6 +595,7 @@ def main(): ctree_type=ctree_type, metric_types=['S', 'N', 'R', 'F'], digits=digits, + percent=percent, per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) @@ -601,6 +619,7 @@ def main(): ctree_type=ctree_type, metric_types=None, digits=digits, + percent=percent, per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) @@ -657,16 +676,18 @@ def main(): # _pred: ctree_dbl_pred = [corpus_dbl_pred[doc_name] for doc_name in docs_dbl] ctree_dbl_pred = [REL_CONV(x) for x in ctree_dbl_pred] - if binarize_true: # maybe not? - ctree_dbl_pred = [_binarize(x) for x in ctree_dbl_pred] + if binarize_true != 'none': # maybe not? 
+ ctree_dbl_pred = [_binarize(x, branching=binarize_true) + for x in ctree_dbl_pred] if simple_rsttree: ctree_dbl_pred = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_pred] # _true: ctree_dbl_true = [corpus_dbl_true[doc_name] for doc_name in docs_dbl] ctree_dbl_true = [REL_CONV(x) for x in ctree_dbl_true] - if binarize_true: - ctree_dbl_true = [_binarize(x) for x in ctree_dbl_true] + if binarize_true != 'none': + ctree_dbl_true = [_binarize(x, branching=binarize_true) + for x in ctree_dbl_true] if simple_rsttree: ctree_dbl_true = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_true] @@ -677,6 +698,7 @@ def main(): span_type='chars', metric_types=['S', 'N', 'R', 'F'], digits=digits, + percent=percent, per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) From 2faff312d24fc63b5f28db72096098606a9e6dd9 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 17 May 2017 15:23:35 +0200 Subject: [PATCH 60/74] ENH showdown: use any author as _true --- evals/hayashi_deps.py | 2 +- evals/ours.py | 2 + evals/showdown.py | 155 ++++++++++++++++++++++++------------------ 3 files changed, 92 insertions(+), 67 deletions(-) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index cb812f5..cbde909 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -38,7 +38,7 @@ def _load_hayashi_dep_file(f, edus): dt: RstDepTree Predicted dtree """ - dt = RstDepTree(edus=edus, origin=None, nary_enc='tree') # FIXME origin + dt = RstDepTree(edus=edus, origin=None, nary_enc='chain') # FIXME origin for line in f: line = line.strip() if not line: diff --git a/evals/ours.py b/evals/ours.py index f9d48bf..938a53c 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -10,6 +10,7 @@ from educe.annotation import Span as EduceSpan from educe.rst_dt.annotation import (EDU as EduceEDU, SimpleRSTTree) +from educe.rst_dt.corpus import mk_key from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, deptree_to_rst_tree) from educe.rst_dt.deptree import RstDepTree, RstDtException @@ -115,6 +116,7 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): raise ValueError('Weird root label: {}'.format(lbl)) else: dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) + dt_pred.origin = mk_key(doc_name) # add nuclearity: heuristic baseline dt_pred.nucs = nuc_clf.predict([dt_pred])[0] # add rank: heuristic baseline, needs edu2sent diff --git a/evals/showdown.py b/evals/showdown.py index e4ad93a..daef76a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -143,6 +143,18 @@ NUC_CONSTANT = None # only useful for NUC_STRATEGY='constant' RNK_STRATEGY = 'sdist-edist-rl' RNK_PRIORITY_SU = True +# known 'authors' +AUTHORS = [ + 'gold', # RST-main + 'silver', # RST-double + 'JCN15_1S1S', 'FH14_gSVM', 'FH14_gCRF', 'JE14', + 'LLC16', 'HHN16_hilda', 'HHN16_mst', + 'BPS16', 'BCS17_mono', + 'BCS17_cross_dev', + 'SHV15_D', + 'li_sujian', + 'ours-chain', 'ours-tree', 'ours-tree-su' +] def setup_dtree_postprocessor(nary_enc='chain', order='strict', @@ -198,29 +210,14 @@ def main(): description="Evaluate parsers' output against a given reference") # predictions parser.add_argument('authors_pred', nargs='+', - choices=['gold', 'silver', - 'joty', 'feng', 'feng2', 'ji', - 'li_qi', 'hayashi_hilda', 'hayashi_mst', - 'braud_coling', 'braud_eacl_mono', - 'braud_eacl_cross_dev', - 'surdeanu', - 'li_sujian', - 'ours_chain', 'ours_tree', 'ours_tree_su'], + choices=AUTHORS, help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', choices=['tree', 'chain'], help="Encoding 
of n-ary nodes for the predictions") # reference parser.add_argument('--author_true', default='gold', - choices=['each', # NEW generate sim matrix - 'gold', 'silver', - 'joty', 'feng', 'feng2', 'ji', - 'li_qi', 'hayashi_hilda', 'hayashi_mst', - 'braud_coling', 'braud_eacl_mono', - 'braud_eacl_cross_dev', - 'surdeanu', - 'li_sujian', - 'ours_chain', 'ours_tree'], + choices=AUTHORS + ['each'], # NEW generate sim matrix help="Author of the reference") # * ctree/dtree eval: the value of binarize_true determines the values # of nary_enc_true and order_true (the latter is yet unused) @@ -291,11 +288,7 @@ def main(): reader_test = RstReader(CD_TEST) corpus_test = reader_test.slurp() - # reference - # current assumption: author_true is 'gold' - if author_true != 'gold': - raise NotImplementedError('Not yet') - + # reference: author_true can be any of the authors_pred (defaults to gold) ctree_true = dict() # ctrees dtree_true = dict() # dtrees from the original ctrees ('tree' transform) for doc_id, ct_true in sorted(corpus_test.items()): @@ -317,66 +310,66 @@ def main(): for author_pred in authors_pred: # braud coling 2016 - if author_pred == 'braud_coling': + if author_pred == 'BPS16': c_preds.append( - ('braud_coling', load_braud_coling_ctrees( + ('BPS16', load_braud_coling_ctrees( BRAUD_COLING_OUT_DIR, REL_CONV)) ) d_preds.append( - ('braud_coling', load_braud_coling_dtrees( + ('BPS16', load_braud_coling_dtrees( BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain')) ) # braud eacl 2017 - mono - if author_pred == 'braud_eacl_mono': + if author_pred == 'BCS17_mono': c_preds.append( - ('braud_eacl_mono', load_braud_eacl_ctrees( + ('BCS17_mono', load_braud_eacl_ctrees( BRAUD_EACL_MONO, REL_CONV, sorted_doc_names)) ) d_preds.append( - ('braud_eacl_mono', load_braud_eacl_dtrees( + ('BCS17_mono', load_braud_eacl_dtrees( BRAUD_EACL_MONO, REL_CONV, sorted_doc_names, nary_enc='chain')) ) # braud eacl 2017 - cross+dev - if author_pred == 'braud_eacl_cross_dev': + if author_pred == 'BCS17_cross_dev': c_preds.append( - ('braud_eacl_cross_dev', load_braud_eacl_ctrees( + ('BCS17_cross_dev', load_braud_eacl_ctrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names)) ) d_preds.append( - ('braud_eacl_cross_dev', load_braud_eacl_dtrees( + ('BCS17_cross_dev', load_braud_eacl_dtrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, nary_enc='chain')) ) - if author_pred == 'hayashi_hilda': + if author_pred == 'HHN16_hilda': c_preds.append( - ('hayashi_hilda', load_hayashi_hilda_ctrees( + ('HHN16_hilda', load_hayashi_hilda_ctrees( HAYASHI_HILDA_OUT_DIR, REL_CONV)) ) d_preds.append( - ('hayashi_hilda', load_hayashi_hilda_dtrees( + ('HHN16_hilda', load_hayashi_hilda_dtrees( HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) ) - if author_pred == 'hayashi_mst': + if author_pred == 'HHN16_mst': c_preds.append( - ('hayashi_mst', load_hayashi_dep_ctrees( + ('HHN16_mst', load_hayashi_dep_ctrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, nuc_clf, rnk_clf)) ) d_preds.append( - ('hayashi_mst', load_hayashi_dep_dtrees( + ('HHN16_mst', load_hayashi_dep_dtrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, nuc_clf, rnk_clf)) ) - if author_pred == 'li_qi': + if author_pred == 'LLC16': c_preds.append( - ('li_qi', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) + ('LLC16', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) ) d_preds.append( - ('li_qi', load_li_qi_dtrees(LI_QI_OUT_DIR, REL_CONV, + ('LLC16', load_li_qi_dtrees(LI_QI_OUT_DIR, REL_CONV, nary_enc='chain')) ) @@ -392,63 +385,63 @@ def main(): nuc_clf, rnk_clf)) ) - if 
author_pred == 'feng': + if author_pred == 'FH14_gSVM': c_preds.append( - ('gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) + ('FH14_gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) ) d_preds.append( - ('gSVM', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, - nary_enc='chain')) + ('FH14_gSVM', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, + nary_enc='chain')) ) - if author_pred == 'feng2': + if author_pred == 'FH14_gCRF': c_preds.append( - ('gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) + ('FH14_gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) ) d_preds.append( - ('gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, - nary_enc='chain')) + ('FH14_gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, + nary_enc='chain')) ) - if author_pred == 'joty': + if author_pred == 'JCN15_1S1S': # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees c_preds.append( - ('TSP 1-1', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) + ('JCN15_1S1S', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) ) d_preds.append( - ('TSP 1-1', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, - nary_enc='chain')) + ('JCN15_1S1S', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, + nary_enc='chain')) ) # joty-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because codra outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees - if author_pred == 'ji': + if author_pred == 'JE14': # DPLP outputs RST ctrees in the form of lists of spans; # load_ji_dtrees maps them to RST dtrees c_preds.append( - ('DPLP', load_ji_ctrees( + ('JE14', load_ji_ctrees( JI_OUT_DIR, REL_CONV)) ) d_preds.append( - ('DPLP', load_ji_dtrees( + ('JE14', load_ji_dtrees( JI_OUT_DIR, REL_CONV, nary_enc='chain')) ) # ji-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because codra outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees - if author_pred == 'surdeanu': + if author_pred == 'SHV15_D': c_preds.append( - ('surdeanu', load_surdeanu_ctrees( + ('SHV15_D', load_surdeanu_ctrees( SURDEANU_LOG_FILE, REL_CONV)) ) d_preds.append( - ('surdeanu', load_surdeanu_dtrees( + ('SHV15_D', load_surdeanu_dtrees( SURDEANU_LOG_FILE, REL_CONV, nary_enc='chain')) ) - if author_pred == 'ours_chain': + if author_pred == 'ours-chain': # Eisner, predicted syntax, chain c_preds.append( ('ours-chain', load_attelo_ctrees( @@ -459,7 +452,7 @@ def main(): EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) - if author_pred == 'ours_tree': + if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit c_preds.append( ('ours-tree', load_attelo_ctrees( @@ -469,7 +462,7 @@ def main(): ('ours-tree', load_attelo_dtrees( EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) - if author_pred == 'ours_tree_su': + if author_pred == 'ours-tree-su': # Eisner, predicted syntax, tree + same-unit c_preds.append( ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, @@ -481,6 +474,14 @@ def main(): EDUS_FILE, nuc_clf, rnk_clf)) ) + # 2017-05-17 enable "gold" as parser, should give perfect scores + if author_pred == 'gold': + c_preds.append( + ('gold', ctree_true) + ) + d_preds.append( + ('gold', dtree_true) + ) if False: # FIXME repair (or forget) these print('Eisner, predicted syntax + same-unit') @@ -525,10 +526,18 @@ def main(): # * table content # _true doc_names = sorted(dtree_true.keys()) - dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] labelset_true = set(itertools.chain.from_iterable( - x.labels for x in 
dtree_true_list)) + x.labels for x in dtree_true.values())) labelset_true.add("span") # RST-DT v.1.0 has an error in wsj_1189 7-9 + # 2017-05-17 any author can be used as reference + # FIXME + # dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + dtree_true_list = [] + for parser_name, dtree_pred in d_preds: + if parser_name == author_true: + dtree_true_list = [dtree_pred[doc_name] for doc_name in doc_names] + break + # end FIXME # _pred for parser_name, dtree_pred in d_preds: dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] @@ -539,8 +548,9 @@ def main(): assert labelset_pred.issubset(labelset_true) except AssertionError: print(parser_name) - print('T - P', labelset_true - labelset_pred) - print('P - T', labelset_pred - labelset_true) + print('T & P', sorted(labelset_true.intersection(labelset_pred))) + print('T - P', sorted(labelset_true - labelset_pred)) + print('P - T', sorted(labelset_pred - labelset_true)) raise # end check all_scores = [] @@ -566,7 +576,15 @@ def main(): ctree_type = 'SimpleRST' if simple_rsttree else 'RST' doc_names = sorted(ctree_true.keys()) - ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + # ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + # FIXME + ctree_true_list = [] + for parser_name, ctree_pred in c_preds: + if parser_name == author_true: + ctree_true_list = [ctree_pred[doc_name] for doc_name in doc_names] + break + # end FIXME + if simple_rsttree: ctree_true_list = [SimpleRSTTree.from_rst_tree(x) for x in ctree_true_list] @@ -632,6 +650,11 @@ def main(): # 2017-04-11 compute agreement between human annotators, on DOUBLE if 'silver' in authors_pred: + # 'silver' can be meaningfully compared to 'gold' only (too few + # documents otherwise) + if author_true != 'gold': + raise NotImplementedError('Not yet') + # read the annotation we'll consider as "silver" reader_dbl = RstReader(DOUBLE_DIR) corpus_dbl_pred = {k.doc: v for k, v in reader_dbl.slurp().items()} From b4ef3ec14e5253b7bc241ec81d2911653aafc937 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 17 May 2017 16:22:02 +0200 Subject: [PATCH 61/74] ENH showdown: compact reports use author_true --- evals/showdown.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index daef76a..4b62709 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -609,7 +609,7 @@ def main(): # generate report if detailed == 0: # compact report, f1-scores only - print(rst_parseval_compact_report(ctree_true_list, ctree_preds, + print(rst_parseval_compact_report(author_true, ctree_preds, ctree_type=ctree_type, metric_types=['S', 'N', 'R', 'F'], digits=digits, @@ -715,8 +715,9 @@ def main(): ctree_dbl_true = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_true] # generate report - ctree_dbl_preds = [('silver', ctree_dbl_pred)] - print(rst_parseval_compact_report(ctree_dbl_true, ctree_dbl_preds, + ctree_dbl_preds = [('silver', ctree_dbl_pred), + ('gold', ctree_dbl_true)] + print(rst_parseval_compact_report(author_true, ctree_dbl_preds, ctree_type=ctree_type, span_type='chars', metric_types=['S', 'N', 'R', 'F'], From 1b9487cf460348c2afb692c1202dfc9ce68ba19f Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 18 May 2017 12:07:01 +0200 Subject: [PATCH 62/74] ENH showdown: similarity matrix --- evals/showdown.py | 205 ++++++++++++++++++++++++++-------------------- 1 file changed, 115 insertions(+), 90 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 4b62709..167f6a1 100644 --- 
a/evals/showdown.py +++ b/evals/showdown.py @@ -18,7 +18,8 @@ from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.metrics.rst_parseval import (rst_parseval_detailed_report, rst_parseval_compact_report, - rst_parseval_report) + rst_parseval_report, + rst_parseval_similarity) # from attelo.metrics.deptree import (compute_uas_las, compute_uas_las_undirected) @@ -532,69 +533,65 @@ def main(): # 2017-05-17 any author can be used as reference # FIXME # dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] - dtree_true_list = [] - for parser_name, dtree_pred in d_preds: - if parser_name == author_true: - dtree_true_list = [dtree_pred[doc_name] for doc_name in doc_names] - break - # end FIXME - # _pred - for parser_name, dtree_pred in d_preds: - dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - # check that labelset_pred is a subset of labelset_true - labelset_pred = set(itertools.chain.from_iterable( - x.labels for x in dtree_pred_list)) - try: - assert labelset_pred.issubset(labelset_true) - except AssertionError: - print(parser_name) - print('T & P', sorted(labelset_true.intersection(labelset_pred))) - print('T - P', sorted(labelset_true - labelset_pred)) - print('P - T', sorted(labelset_pred - labelset_true)) - raise - # end check - all_scores = [] - all_scores += list(compute_uas_las( - dtree_true_list, dtree_pred_list, metrics=dep_metrics, - doc_names=doc_names)) - if UNDIRECTED_DEPS: - score_uuas, score_ulas = compute_uas_las_undirected( - dtree_true_list, dtree_pred_list) - all_scores += [score_uuas, score_ulas] - # append to report - values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] - for v in all_scores: - if percent: - v = v * 100.0 - values += ["{0:0.{1}f}".format(v, dep_digits)] - report += fmt % tuple(values) - # end table content - print(report) - # end report + parsers_true = [author_true] if author_true != 'each' else authors_pred + for parser_true in parsers_true: + dtree_true_list = [] + for parser_name, dtree_pred in d_preds: + if parser_name == parser_true: + dtree_true_list = [dtree_pred[doc_name] for doc_name in doc_names] + break + # end FIXME + # _pred + for parser_name, dtree_pred in d_preds: + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + # check that labelset_pred is a subset of labelset_true + labelset_pred = set(itertools.chain.from_iterable( + x.labels for x in dtree_pred_list)) + try: + assert labelset_pred.issubset(labelset_true) + except AssertionError: + print(parser_name) + print('T & P', sorted(labelset_true.intersection(labelset_pred))) + print('T - P', sorted(labelset_true - labelset_pred)) + print('P - T', sorted(labelset_pred - labelset_true)) + raise + # end check + all_scores = [] + all_scores += list(compute_uas_las( + dtree_true_list, dtree_pred_list, metrics=dep_metrics, + doc_names=doc_names)) + if UNDIRECTED_DEPS: + score_uuas, score_ulas = compute_uas_las_undirected( + dtree_true_list, dtree_pred_list) + all_scores += [score_uuas, score_ulas] + # append to report + values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] + for v in all_scores: + if percent: + v = v * 100.0 + values += ["{0:0.{1}f}".format(v, dep_digits)] + report += fmt % tuple(values) + # end table content + print(report) + # end report # constituency eval ctree_type = 'SimpleRST' if simple_rsttree else 'RST' doc_names = sorted(ctree_true.keys()) - # ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] - # FIXME - ctree_true_list = [] - for parser_name, ctree_pred in c_preds: 
- if parser_name == author_true: - ctree_true_list = [ctree_pred[doc_name] for doc_name in doc_names] - break - # end FIXME - if simple_rsttree: - ctree_true_list = [SimpleRSTTree.from_rst_tree(x) - for x in ctree_true_list] - # WIP print SimpleRSTTrees - if not os.path.exists('gold'): - os.makedirs('gold') - for doc_name, ct in zip(doc_names, ctree_true_list): - with codecs.open('gold/' + ct.origin.doc, mode='w', - encoding='utf-8') as f: - print(ct, file=f) + if False: # back when 'gold' was the only possible ref + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + if simple_rsttree: + ctree_true_list = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_true_list] + # WIP print SimpleRSTTrees + if not os.path.exists('gold'): + os.makedirs('gold') + for doc_name, ct in zip(doc_names, ctree_true_list): + with codecs.open('gold/' + ct.origin.doc, mode='w', + encoding='utf-8') as f: + print(ct, file=f) # sort the predictions of each parser, so they match the order of # documents and reference trees in _true @@ -606,48 +603,76 @@ def main(): [SimpleRSTTree.from_rst_tree(x) for x in ctree_pred_list]) for parser_name, ctree_pred_list in ctree_preds] + + # 2017-05-17 allow any parser to be ref # generate report if detailed == 0: - # compact report, f1-scores only - print(rst_parseval_compact_report(author_true, ctree_preds, + # 2017-05-17 WIP similarity matrix: author_true='each': restrict + # to the S metric only, so as to display a sim. matrix + if author_true == 'each': + metric_type = 'S' + print(rst_parseval_similarity(ctree_preds, ctree_type=ctree_type, - metric_types=['S', 'N', 'R', 'F'], + metric_type=metric_type, digits=digits, percent=percent, + print_support=False, per_doc=per_doc, add_trivial_spans=eval_li_dep, - stringent=STRINGENT)) + stringent=STRINGENT, + out_format='latex')) + else: + metric_types = ['S', 'N', 'R', 'F'] + # compact report, f1-scores only + print(rst_parseval_compact_report(author_true, ctree_preds, + ctree_type=ctree_type, + metric_types=metric_types, + digits=digits, + percent=percent, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) else: - # standard reports: 1 table per parser, 1 line per metric, - # cols = [p, r, f1, support_true, support_pred] - for parser_name, ctree_pred_list in ctree_preds: - # WIP print SimpleRSTTrees - if not os.path.exists(parser_name): - os.makedirs(parser_name) - for doc_name, ct in zip(doc_names, ctree_pred_list): - with codecs.open(parser_name + '/' + doc_name, mode='w', - encoding='utf-8') as f: - print(ct, file=f) - - # compute and print PARSEVAL scores - print(parser_name) - # metric_types=None includes the variants with head: - # S+H, N+H, R+H, F+H - print(rst_parseval_report(ctree_true_list, ctree_pred_list, - ctree_type=ctree_type, - metric_types=None, - digits=digits, - percent=percent, - per_doc=per_doc, - add_trivial_spans=eval_li_dep, - stringent=STRINGENT)) - # detailed report on R - if detailed >= 2: - print(rst_parseval_detailed_report( - ctree_true_list, ctree_pred_list, ctree_type=ctree_type, - metric_type='R')) + parsers_true = [author_true] if author_true != 'each' else authors_pred + for parser_true in parsers_true: + # standard reports: 1 table per parser, 1 line per metric, + # cols = [p, r, f1, support_true, support_pred] + # FIXME + ctree_true_list = [] + for parser_name, ctree_pred in c_preds: + if parser_name == parser_true: + ctree_true_list = [ctree_pred[doc_name] for doc_name in doc_names] + break # end FIXME + for parser_name, ctree_pred_list in ctree_preds: + 
# WIP print SimpleRSTTrees + if not os.path.exists(parser_name): + os.makedirs(parser_name) + for doc_name, ct in zip(doc_names, ctree_pred_list): + with codecs.open(parser_name + '/' + doc_name, mode='w', + encoding='utf-8') as f: + print(ct, file=f) + + # compute and print PARSEVAL scores + print(parser_name) + # metric_types=None includes the variants with head: + # S+H, N+H, R+H, F+H + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type=ctree_type, + metric_types=None, + digits=digits, + percent=percent, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) + # detailed report on R + if detailed >= 2: + print(rst_parseval_detailed_report( + ctree_true_list, ctree_pred_list, ctree_type=ctree_type, + metric_type='R')) + # end FIXME + # 2017-04-11 compute agreement between human annotators, on DOUBLE if 'silver' in authors_pred: # 'silver' can be meaningfully compared to 'gold' only (too few From ca02b4993c5de86fc698c9c001da61c382801678 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 18 May 2017 16:41:21 +0200 Subject: [PATCH 63/74] ENH showdown: dep_compact_report --- evals/showdown.py | 95 ++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 64 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 167f6a1..2aaba37 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -22,7 +22,7 @@ rst_parseval_similarity) # from attelo.metrics.deptree import (compute_uas_las, - compute_uas_las_undirected) + dep_compact_report) # local to this package from evals.braud_coling import (load_braud_coling_ctrees, @@ -137,7 +137,6 @@ STRINGENT = False # additional dependency metrics INCLUDE_LS = False -UNDIRECTED_DEPS = False EVAL_NUC_RANK = True # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' @@ -506,74 +505,21 @@ def main(): dep_metrics += ["tag_R"] if EVAL_NUC_RANK: dep_metrics += ["R+N", "R+O", "F"] - # report - # * table format - width = max(len(parser_name) for parser_name, _ in d_preds) - headers = dep_metrics - if UNDIRECTED_DEPS: - headers += ["UUAS", "ULAS"] - fmt = '%% %ds' % width # first col: parser name - fmt += ' ' - fmt += ' '.join(['% 9s' for _ in headers]) - fmt += '\n' - - headers = [""] + headers - report = fmt % tuple(headers) - report += '\n' - # display percentages - dep_digits = digits - 2 if percent else digits - # end table format and header line - - # * table content + # _true doc_names = sorted(dtree_true.keys()) labelset_true = set(itertools.chain.from_iterable( x.labels for x in dtree_true.values())) labelset_true.add("span") # RST-DT v.1.0 has an error in wsj_1189 7-9 # 2017-05-17 any author can be used as reference - # FIXME - # dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] - parsers_true = [author_true] if author_true != 'each' else authors_pred - for parser_true in parsers_true: - dtree_true_list = [] - for parser_name, dtree_pred in d_preds: - if parser_name == parser_true: - dtree_true_list = [dtree_pred[doc_name] for doc_name in doc_names] - break - # end FIXME - # _pred - for parser_name, dtree_pred in d_preds: - dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - # check that labelset_pred is a subset of labelset_true - labelset_pred = set(itertools.chain.from_iterable( - x.labels for x in dtree_pred_list)) - try: - assert labelset_pred.issubset(labelset_true) - except AssertionError: - print(parser_name) - print('T & P', sorted(labelset_true.intersection(labelset_pred))) - print('T - P', sorted(labelset_true - labelset_pred)) - 
print('P - T', sorted(labelset_pred - labelset_true)) - raise - # end check - all_scores = [] - all_scores += list(compute_uas_las( - dtree_true_list, dtree_pred_list, metrics=dep_metrics, - doc_names=doc_names)) - if UNDIRECTED_DEPS: - score_uuas, score_ulas = compute_uas_las_undirected( - dtree_true_list, dtree_pred_list) - all_scores += [score_uuas, score_ulas] - # append to report - values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] - for v in all_scores: - if percent: - v = v * 100.0 - values += ["{0:0.{1}f}".format(v, dep_digits)] - report += fmt % tuple(values) - # end table content - print(report) - # end report + if author_true != 'each': + parser_true = author_true + print(dep_compact_report(parser_true, d_preds, dep_metrics, + doc_names, labelset_true, + digits=digits, + percent=percent)) + else: + raise ValueError("Sim matrix on dependencies not implemented yet") # constituency eval ctree_type = 'SimpleRST' if simple_rsttree else 'RST' @@ -722,11 +668,17 @@ def main(): # create parallel lists of ctrees for _true and _pred, mapped to # coarse rels and binarized # _pred: + # * ctree ctree_dbl_pred = [corpus_dbl_pred[doc_name] for doc_name in docs_dbl] ctree_dbl_pred = [REL_CONV(x) for x in ctree_dbl_pred] if binarize_true != 'none': # maybe not? ctree_dbl_pred = [_binarize(x, branching=binarize_true) for x in ctree_dbl_pred] + # * dtree (as dict from doc_name to dtree !?) + dtree_dbl_pred = {doc_name: RstDepTree.from_rst_tree( + ct, nary_enc=nary_enc_true) + for doc_name, ct in zip(docs_dbl, ctree_dbl_pred)} + # * simple_rsttree (?) if simple_rsttree: ctree_dbl_pred = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_pred] @@ -736,10 +688,15 @@ def main(): if binarize_true != 'none': ctree_dbl_true = [_binarize(x, branching=binarize_true) for x in ctree_dbl_true] + # * dtree (as dict from doc_name to dtree !?) 
+ dtree_dbl_true = {doc_name: RstDepTree.from_rst_tree( + ct, nary_enc=nary_enc_true) + for doc_name, ct in zip(docs_dbl, ctree_dbl_true)} if simple_rsttree: ctree_dbl_true = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_true] # generate report + # * ctree eval ctree_dbl_preds = [('silver', ctree_dbl_pred), ('gold', ctree_dbl_true)] print(rst_parseval_compact_report(author_true, ctree_dbl_preds, @@ -751,6 +708,16 @@ def main(): per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) + # * dtree eval + if False: + # TODO cope with differences in segmentation + dtree_dbl_preds = [('silver', dtree_dbl_pred), + ('gold', dtree_dbl_true)] + print(dep_compact_report(author_true, dtree_dbl_preds, + dep_metrics, docs_dbl, + labelset_true, + digits=digits, + percent=percent)) # end 2017-04-11 agreement between human annotators From 3731b653246ed7a43bd0fe19865f1de158b43edf Mon Sep 17 00:00:00 2001 From: moreymat Date: Sat, 20 May 2017 11:12:08 +0200 Subject: [PATCH 64/74] FIX showdown: skip dep sim matrix --- evals/showdown.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/showdown.py b/evals/showdown.py index 2aaba37..e8591f9 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -519,7 +519,8 @@ def main(): digits=digits, percent=percent)) else: - raise ValueError("Sim matrix on dependencies not implemented yet") + pass + # raise ValueError("Sim matrix on dependencies not implemented yet") # constituency eval ctree_type = 'SimpleRST' if simple_rsttree else 'RST' From a056968694e6960c49d0561eaa21e2fa3500ddf3 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sun, 21 May 2017 13:33:38 +0200 Subject: [PATCH 65/74] ENH showdown: pairwise dep similarity --- evals/showdown.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index e8591f9..6c540f9 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -22,7 +22,8 @@ rst_parseval_similarity) # from attelo.metrics.deptree import (compute_uas_las, - dep_compact_report) + dep_compact_report, + dep_similarity) # local to this package from evals.braud_coling import (load_braud_coling_ctrees, @@ -150,7 +151,7 @@ 'JCN15_1S1S', 'FH14_gSVM', 'FH14_gCRF', 'JE14', 'LLC16', 'HHN16_hilda', 'HHN16_mst', 'BPS16', 'BCS17_mono', - 'BCS17_cross_dev', + 'BCS17_cross', 'SHV15_D', 'li_sujian', 'ours-chain', 'ours-tree', 'ours-tree-su' @@ -331,13 +332,13 @@ def main(): nary_enc='chain')) ) # braud eacl 2017 - cross+dev - if author_pred == 'BCS17_cross_dev': + if author_pred == 'BCS17_cross': c_preds.append( - ('BCS17_cross_dev', load_braud_eacl_ctrees( + ('BCS17_cross', load_braud_eacl_ctrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names)) ) d_preds.append( - ('BCS17_cross_dev', load_braud_eacl_dtrees( + ('BCS17_cross', load_braud_eacl_dtrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, nary_enc='chain')) ) @@ -519,7 +520,9 @@ def main(): digits=digits, percent=percent)) else: - pass + print(dep_similarity(d_preds, doc_names, labelset_true, + dep_metric='U', digits=digits, percent=percent, + out_format='latex')) # raise ValueError("Sim matrix on dependencies not implemented yet") # constituency eval From 349d8e46b47624f903d50bc4826eaaf74a4ac6b0 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 5 Jun 2017 15:48:12 +0200 Subject: [PATCH 66/74] ENH evals.showdown: new parser WLW17 --- evals/ji.py | 6 ++++++ evals/showdown.py | 20 +++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/evals/ji.py b/evals/ji.py index 
6f01512..08fbd8b 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -136,6 +136,12 @@ def load_ji_ctrees(ji_out_dir, rel_conv): node.rel = 'topic-change' elif node.rel == 'topiccomment': # Ji's output node.rel = 'topic-comment' + elif node.rel == 'textual-organization': # WLW17 output + # we use 'textual' as the coarse label ; + # JE14 outputs textualorganization which is the + # fine label in our taxonomy, hence is mapped to + # textual beforehand + node.rel = 'textual' # end normalize # store the resulting RSTTree ctree_pred[doc_name] = ct_pred diff --git a/evals/showdown.py b/evals/showdown.py index 6c540f9..9e910a1 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -133,6 +133,8 @@ SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/replication/surdeanu/output/log' # Li Sujian dep parser # imported, see above +# Wang, Li and Wang at ACL 2017 +WLW17_OUT_DIR = '/home/mmorey/melodi/rst/replication/wang/rst-dt/RSTtrees-WSJ-main-1.0/TEST' # level of detail for parseval STRINGENT = False @@ -153,6 +155,7 @@ 'BPS16', 'BCS17_mono', 'BCS17_cross', 'SHV15_D', + 'WLW17', # Wang, Li and Wang, ACL17 'li_sujian', 'ours-chain', 'ours-tree', 'ours-tree-su' ] @@ -429,7 +432,22 @@ def main(): JI_OUT_DIR, REL_CONV, nary_enc='chain')) ) # ji-{chain,tree} would be the same except nary_enc='tree' ; - # the nary_enc does not matter because codra outputs binary ctrees, + # the nary_enc does not matter because DPLP outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees + + if author_pred == 'WLW17': + # WLW17 outputs RST ctrees in the form of lists of spans, just + # like JE14 ; + # load_ji_dtrees maps them to RST dtrees + c_preds.append( + ('WLW17', load_ji_ctrees( + WLW17_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('WLW17', load_ji_dtrees( + WLW17_OUT_DIR, REL_CONV, nary_enc='chain')) + ) + # the nary_enc does not matter because WLW17 outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees if author_pred == 'SHV15_D': From 3bfca65b814614d0ea096d6e84e175d0ae218109 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 13 Jun 2017 15:44:06 +0200 Subject: [PATCH 67/74] FIX irit_rst_dt.harness: pick upstream/master to resolve conflict --- irit_rst_dt/harness.py | 55 ------------------------------------------ 1 file changed, 55 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index 63ac448..726ef05 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -107,31 +107,6 @@ def create_folds(self, mpack): # ------------------------------------------------------ # paths # ------------------------------------------------------ - -<<<<<<< HEAD - def mpack_paths(self, test_data, stripped=False, with_cdus=False): - """ - Parameters - ---------- - test_data : boolean - If true, the returned paths point to self.testset else to - self.dataset. - - stripped : boolean, defaults to False - TODO - - with_cdus : boolean, defaults to False - If True, generate CDUs (eg. for fragmented EDUs), pairings - on them and the corresponding feature vectors. - - Returns - ------- - paths : dict of (glob patterns of) file paths - Path to: edu_input, pairings, features, vocab, labels. - Also contains 'corpus' (to access gold structures, WIP for - RST-DT) ; if `with_cdus` is True, also cdu_input, - cdu_pairings, cdu_features. -======= def mpack_paths(self, test_data, stripped=False): """Return a dict of paths needed to read a datapack. 
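(Reading aid, not part of the patch.) The conflict in mpack_paths is resolved by keeping the upstream/master version, which returns a flat dict of paths keyed by role. A minimal, self-contained sketch of that contract follows; the literal values are illustrative placeholders, not the exact file names the harness produces:

    # keys exposed by the resolved mpack_paths; the values below are
    # placeholders for illustration only
    paths = {
        'edu_input': 'TEST.relations.edu-pairs.sparse.edu_input',
        'pairings': 'TEST.relations.edu-pairs.sparse.pairings',
        'features': 'TEST.relations.edu-pairs.sparse',
        'vocab': 'TEST.relations.edu-pairs.sparse.vocab',
        'corpus': 'corpus/RSTtrees-WSJ-main-1.01/TEST',
    }
    for role in sorted(paths):
        print('{} -> {}'.format(role, paths[role]))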
@@ -150,7 +125,6 @@ def mpack_paths(self, test_data, stripped=False): Paths to files that enable to read a datapack. Useful keys are 'edu_input', 'pairings', 'features', 'vocab', 'corpus' (WIP, used to access gold structures). ->>>>>>> upstream/master """ base = 'relations.edu-pairs' ext = base + '.sparse' @@ -162,34 +136,6 @@ def mpack_paths(self, test_data, stripped=False): # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if test_data else TRAINING_CORPUS) -<<<<<<< HEAD - # end gold RST trees - res = { - 'edu_input': core_path + '.edu_input', - 'pairings': core_path + '.pairings', - 'features': ((core_path + '.stripped') if stripped - else core_path), - 'vocab': vocab_path, - 'labels': labels_path, - # corpus for gold RST trees - 'corpus': corpus_path, - } - if with_cdus: - # 2016-07-28 fragmented EDUs - frag_ext = 'relations.frag-pairs.sparse' - frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) - res.update([ - ('cdu_input', (frag_path + '.cdu_input' if with_cdus - else None)), - ('cdu_pairings', (frag_path + '.cdu_pairings' if with_cdus - else None)), - ('cdu_features', (((frag_path + '.stripped') if stripped - else frag_path) if with_cdus - else None)), - ]) - - return res -======= # end WIP return { 'edu_input': core_path + '.edu_input', @@ -198,7 +144,6 @@ def mpack_paths(self, test_data, stripped=False): 'vocab': core_path + '.vocab', 'corpus': corpus_path } ->>>>>>> upstream/master def model_paths(self, rconf, fold, parser): """Paths to the learner(s) model(s). From 80345140bd591b6dcfe68c5116cfa01de8a88147 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 29 Nov 2017 17:41:45 +0100 Subject: [PATCH 68/74] ENH d-metrics: add N+O, rm R+O ; c-metrics: add +H --- evals/showdown.py | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 9e910a1..e26ccaa 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -29,7 +29,7 @@ from evals.braud_coling import (load_braud_coling_ctrees, load_braud_coling_dtrees) from evals.braud_eacl import (load_braud_eacl_ctrees, - load_braud_eacl_dtrees) + load_braud_eacl_dtrees) from evals.codra import load_codra_ctrees, load_codra_dtrees from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees @@ -67,7 +67,8 @@ # * syntax: pred vs gold # old-style .edu_input: whole test set -EDUS_FILE = os.path.join('/home/mmorey/melodi/rst', +EDUS_FILE = os.path.join('/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/syn_gold_coarse', 'TEST.relations.sparse.edu_input') @@ -78,20 +79,23 @@ # outputs of parsers EISNER_OUT_SYN_PRED = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/syn_pred_coarse', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') # 2016-09-14 "tree" transform, predicted syntax EISNER_OUT_TREE_SYN_PRED = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt_su-eisner') @@ -99,28 +103,37 @@ EISNER_OUT_SYN_PRED_SU = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/latest', # lbl 
'scratch-current/combined', 'output.maxent-AD.L-jnt_su-eisner') EISNER_OUT_SYN_GOLD = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/syn_gold_coarse', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') # output of Joty's parser CODRA -CODRA_OUT_DIR = '/home/mmorey/melodi/rst/replication/joty/Doc-level' +CODRA_OUT_DIR = os.path.join( + '/home/mmorey', + 'melodi/rst/replication/joty/Doc-level' +) # output of Ji's parser DPLP # JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/replication/ji_eisenstein', 'DPLP/data/docs/test/') -JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/replication/ji_eisenstein', 'official_output/outputs/') +JI_OUT_DIR = os.path.join('/home/mmorey', + 'melodi/rst/replication/ji_eisenstein', + 'official_output/outputs/') # Feng's parsers -FENG_DIR = '/home/mmorey/melodi/rst/replication/feng_hirst/' +FENG_DIR = os.path.join('/home/mmorey', + 'melodi/rst/replication/feng_hirst/') FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') # Li Qi's parser -LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/replication/li_qi/result' +LI_QI_OUT_DIR = os.path.join('/home/mmorey', + 'melodi/rst/replication/li_qi/result') # Hayashi's HILDA HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/replication/hayashi/SIGDIAL' HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') @@ -134,7 +147,9 @@ # Li Sujian dep parser # imported, see above # Wang, Li and Wang at ACL 2017 -WLW17_OUT_DIR = '/home/mmorey/melodi/rst/replication/wang/rst-dt/RSTtrees-WSJ-main-1.0/TEST' +WLW17_OUT_DIR = os.path.join( + '/home/mmorey', + 'melodi/rst/replication/wang/rst-dt/RSTtrees-WSJ-main-1.0/TEST') # level of detail for parseval STRINGENT = False @@ -523,7 +538,7 @@ def main(): if INCLUDE_LS: dep_metrics += ["tag_R"] if EVAL_NUC_RANK: - dep_metrics += ["R+N", "R+O", "F"] + dep_metrics += ["N+O", "R+N", "F"] # 2017-11-29 disable "R+O" # _true doc_names = sorted(dtree_true.keys()) @@ -590,7 +605,10 @@ def main(): stringent=STRINGENT, out_format='latex')) else: - metric_types = ['S', 'N', 'R', 'F'] + metric_types = [ + 'S', 'N', 'R', 'F', + 'S+H', 'N+H', 'R+H', 'F+H', + ] # compact report, f1-scores only print(rst_parseval_compact_report(author_true, ctree_preds, ctree_type=ctree_type, From 1043db3e719e0e5a6f44050b827051455e737178 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 1 Dec 2017 17:37:59 +0100 Subject: [PATCH 69/74] FIX showdown: out_fmt, span metrics +H+K+HH --- evals/showdown.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index e26ccaa..994a9f0 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -166,7 +166,7 @@ 'gold', # RST-main 'silver', # RST-double 'JCN15_1S1S', 'FH14_gSVM', 'FH14_gCRF', 'JE14', - 'LLC16', 'HHN16_hilda', 'HHN16_mst', + 'LLC16', 'HHN16_HILDA', 'HHN16_MST', 'BPS16', 'BCS17_mono', 'BCS17_cross', 'SHV15_D', @@ -261,6 +261,9 @@ def main(): help='Scores are displayed as percentages (ex: 57.9)') parser.add_argument('--detailed', type=int, default=0, help='Level of detail for evaluations') + parser.add_argument('--out_fmt', default='text', + choices=['text', 'latex'], + help='Output format') # args = parser.parse_args() author_true = args.author_true @@ -276,6 +279,7 @@ def main(): raise ValueError('--percent requires --digits >= 3') # level of detail for evals detailed = args.detailed + out_fmt = 
args.out_fmt # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc # then average over docs @@ -361,24 +365,24 @@ def main(): nary_enc='chain')) ) - if author_pred == 'HHN16_hilda': + if author_pred == 'HHN16_HILDA': c_preds.append( - ('HHN16_hilda', load_hayashi_hilda_ctrees( + ('HHN16_HILDA', load_hayashi_hilda_ctrees( HAYASHI_HILDA_OUT_DIR, REL_CONV)) ) d_preds.append( - ('HHN16_hilda', load_hayashi_hilda_dtrees( + ('HHN16_HILDA', load_hayashi_hilda_dtrees( HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) ) - if author_pred == 'HHN16_mst': + if author_pred == 'HHN16_MST': c_preds.append( - ('HHN16_mst', load_hayashi_dep_ctrees( + ('HHN16_MST', load_hayashi_dep_ctrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, nuc_clf, rnk_clf)) ) d_preds.append( - ('HHN16_mst', load_hayashi_dep_dtrees( + ('HHN16_MST', load_hayashi_dep_dtrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, nuc_clf, rnk_clf)) ) @@ -533,12 +537,12 @@ def main(): # dependency eval dep_metrics = ["U"] if EVAL_NUC_RANK: - dep_metrics += ['O', 'N'] + dep_metrics += ['O', 'N', 'O+N'] dep_metrics += ["R"] if INCLUDE_LS: dep_metrics += ["tag_R"] if EVAL_NUC_RANK: - dep_metrics += ["N+O", "R+N", "F"] # 2017-11-29 disable "R+O" + dep_metrics += ["R+N", "F"] # 2017-11-29 disable "R+O" # _true doc_names = sorted(dtree_true.keys()) @@ -551,11 +555,12 @@ def main(): print(dep_compact_report(parser_true, d_preds, dep_metrics, doc_names, labelset_true, digits=digits, - percent=percent)) + percent=percent, + out_format=out_fmt)) else: print(dep_similarity(d_preds, doc_names, labelset_true, dep_metric='U', digits=digits, percent=percent, - out_format='latex')) + out_format=out_fmt)) # raise ValueError("Sim matrix on dependencies not implemented yet") # constituency eval @@ -603,11 +608,15 @@ def main(): per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT, - out_format='latex')) + out_format=out_fmt)) else: metric_types = [ 'S', 'N', 'R', 'F', - 'S+H', 'N+H', 'R+H', 'F+H', + # 'S+H', 'N+H', 'R+H', 'F+H', + # 'S+K', 'N+K', 'R+K', 'F+K', + # 'S+HH', 'N+HH', 'R+HH', 'F+HH', + # 'S+K+HH', 'N+K+HH', 'R+K+HH', 'F+K+HH', + 'S+H+K+HH', 'N+H+K+HH', 'R+H+K+HH', 'F+H+K+HH', ] # compact report, f1-scores only print(rst_parseval_compact_report(author_true, ctree_preds, @@ -615,9 +624,11 @@ def main(): metric_types=metric_types, digits=digits, percent=percent, + print_support=False, per_doc=per_doc, add_trivial_spans=eval_li_dep, - stringent=STRINGENT)) + stringent=STRINGENT, + out_format=out_fmt)) else: parsers_true = [author_true] if author_true != 'each' else authors_pred for parser_true in parsers_true: From ff4177d66fccab94e025bb76f12778f5d3105ed5 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 12 Dec 2017 23:20:44 +0100 Subject: [PATCH 70/74] WIP nuc_clf --- evals/prepare_nuc_dataset.py | 167 ++++++++++++++++++++++++++++++++++ evals/showdown.py | 38 +++++++- evals/train_nuc_classifier.py | 147 ++++++++++++++++++++++++++++++ 3 files changed, 347 insertions(+), 5 deletions(-) create mode 100644 evals/prepare_nuc_dataset.py create mode 100644 evals/train_nuc_classifier.py diff --git a/evals/prepare_nuc_dataset.py b/evals/prepare_nuc_dataset.py new file mode 100644 index 0000000..97e1c6e --- /dev/null +++ b/evals/prepare_nuc_dataset.py @@ -0,0 +1,167 @@ +"""This utility script outputs a dataset of the nuclearity of RST edges. 
+ +Given the path to the RST-DT corpus and a dataset of candidate RST +dependencies labelled with their gold coarse (class) RST relation (or +none if they are unrelated), produce a similar dataset for the task +of nuclearity prediction. + +As of 2017-12-08, we filter out the instances for unrelated pairs of EDUs +and left-oriented dependencies, only keeping right-oriented dependencies +(except for "ROOT"). +The resulting dataset describes a binary classification problem. +""" + +from __future__ import absolute_import, print_function + +import argparse +import codecs +import itertools +import os + +from educe.rst_dt.annotation import NUC_N, NUC_S +from educe.rst_dt.corpus import RstRelationConverter, RELMAP_112_18_FILE +from educe.rst_dt.dep_corpus import read_corpus +from educe.rst_dt.deptree import RstDepTree + + +def main(corpus, dataset, out_dir, nary_enc): + """Do prepare the nuclearity dataset. + + Parameters + ---------- + corpus : str + Path to the RST-DT "main" corpus. + dataset : str + Path to the existing dataset labelled with coarse relations. + out_dir : str + Path to the output folder. + """ + # (re-)create a d-corpus from the RST-DT c-corpus + corpus_subset = os.path.basename(dataset).split('.')[0] + if corpus_subset not in ('TRAINING', 'TEST'): + raise ValueError("dataset must be a filepath that starts with" + "one of {'TRAINING', 'TEST'}") + if corpus_subset == 'TRAINING': + section = 'train' + else: # 'TEST' + section = 'test' + rst_ccorpus = read_corpus(corpus, section=section) + rel_conv = RstRelationConverter(RELMAP_112_18_FILE).convert_dtree + rst_dcorpus = dict() # FileId.doc -> RstDepTree + for doc_key, rst_ctree in rst_ccorpus[section].items(): + rst_dtree = RstDepTree.from_rst_tree(rst_ctree, nary_enc=nary_enc) + rst_dtree_coarse = rel_conv(rst_dtree) + rst_dcorpus[doc_key.doc] = rst_dtree_coarse + # for each candidate dependency in the dataset, read the nuclearity + # from the RST d-corpus + # Nota: we stream through the dataset to avoid loading it entirely in + # memory ; we don't need to open the vocabulary file (.vocab), nor the + # description of the EDUs (.edu_input) + pairings = dataset + '.pairings' + # edu_desc = dataset + '.edu_input' + new_dataset = os.path.join(out_dir, os.path.basename(dataset)) + new_pairs = os.path.join(out_dir, os.path.basename(pairings)) + if ((os.path.abspath(new_dataset) == os.path.abspath(dataset) or + os.path.abspath(new_pairs) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset, mode='wb', encoding='utf-8') as data_out: + with codecs.open(new_pairs, mode='wb', encoding='utf-8') as pairs_out: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file, as an + # ordered list mapped to {1, 2} + print(header_prefix + ' '.join((NUC_N, NUC_S)), + file=data_out) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form 
"docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + if tgt_idx < src_idx: + # skip left dependencies: by construction, + # their nuclearity can only be Satellite + # (SN edges) + continue + # print(doc_name, src_id, tgt_id, src_idx, tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + if dtree.nucs[tgt_idx] == NUC_N: + nuc_idx = 1 + elif dtree.nucs[tgt_idx] == NUC_S: + nuc_idx = 2 + else: + raise ValueError("weird nuclearity {}".format( + dtree.nucs[tgt_idx])) + print(str(nuc_idx) + ' ' + feat_vector, + file=data_out) + print(pair.strip(), file=pairs_out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Prepare a nuclearity dataset.' + ) + parser.add_argument('--corpus', + help='Path to the RST-DT "main" corpus', + default=os.path.join( + os.path.expanduser('~'), + 'corpora/rst-dt/rst_discourse_treebank/data', + 'RSTtrees-WSJ-main-1.01' + )) + parser.add_argument('--dataset', + help='Base file of the dataset', + default=os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse', + 'TRAINING.relations.sparse' + )) + parser.add_argument('--out_dir', + help='Output folder', + default=os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + )) + parser.add_argument('--nary_enc', + help='Encoding for n-ary nodes', + choices=['chain', 'tree'], + default='chain') + args = parser.parse_args() + main(args.corpus, args.dataset, args.out_dir, args.nary_enc) diff --git a/evals/showdown.py b/evals/showdown.py index 994a9f0..9193612 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -10,6 +10,8 @@ import itertools import os +from sklearn.datasets import load_svmlight_files + from educe.rst_dt.annotation import _binarize, SimpleRSTTree from educe.rst_dt.corpus import (RstRelationConverter, Reader as RstReader) @@ -46,6 +48,9 @@ load_attelo_ctrees, load_attelo_dtrees) from evals.surdeanu import load_surdeanu_ctrees, load_surdeanu_dtrees +# 2017-12-12 nuc_clf WIP +from evals.train_nuc_classifier import RightBinaryNuclearityClassifier +# end WIP nuc_clf # RST corpus CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.01/') @@ -204,13 +209,36 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', y_nuc_train = [] y_rnk_train = [] for doc_name, dt in sorted(dtree_true.items()): + # print(dt.__dict__) + # raise ValueError('wip wip nuc_clf') X_train.append(dt) y_nuc_train.append(dt.nucs) y_rnk_train.append(dt.ranks) # nuclearity clf - nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, - constant=nuc_constant) - nuc_clf.fit(X_train, y_nuc_train) + if False: + nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, + constant=nuc_constant) + 
nuc_clf.fit(X_train, y_nuc_train) + else: + # 2017-12-12 WIP nuc_clf + # shiny new nuc_clf ; still very hacky + # import the nuclearity TRAIN and TEST sets generated from + # the svmlight feature vectors (ahem) + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + ) + dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_nuc_train, y_nuc_train, X_nuc_test, y_nuc_test = load_svmlight_files( + (dset_train, dset_test), + n_features=46731, + zero_based=False + ) + nuc_clf = RightBinaryNuclearityClassifier() + nuc_clf = nuc_clf.fit(X_nuc_train, y_nuc_train) + # end WIP nuc_clf # rank clf rnk_clf = InsideOutAttachmentRanker( strategy=rnk_strategy, prioritize_same_unit=rnk_prioritize_same_unit, @@ -612,11 +640,11 @@ def main(): else: metric_types = [ 'S', 'N', 'R', 'F', - # 'S+H', 'N+H', 'R+H', 'F+H', + 'S+H', 'N+H', 'R+H', 'F+H', # 'S+K', 'N+K', 'R+K', 'F+K', # 'S+HH', 'N+HH', 'R+HH', 'F+HH', # 'S+K+HH', 'N+K+HH', 'R+K+HH', 'F+K+HH', - 'S+H+K+HH', 'N+H+K+HH', 'R+H+K+HH', 'F+H+K+HH', + # 'S+H+K+HH', 'N+H+K+HH', 'R+H+K+HH', 'F+H+K+HH', ] # compact report, f1-scores only print(rst_parseval_compact_report(author_true, ctree_preds, diff --git a/evals/train_nuc_classifier.py b/evals/train_nuc_classifier.py new file mode 100644 index 0000000..bb25adf --- /dev/null +++ b/evals/train_nuc_classifier.py @@ -0,0 +1,147 @@ +"""This utility script trains a classifier for nuclearity of RST edges. + +Given the path to a nuclearity dataset, it trains a classifier and +evaluates it. +""" + + +from __future__ import absolute_import, print_function + +import argparse +import codecs +from collections import defaultdict +import itertools +import os + +from sklearn.datasets import load_svmlight_file, load_svmlight_files +from sklearn.model_selection import cross_val_score +from sklearn.linear_model.logistic import LogisticRegression +# from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import LabelEncoder +import matplotlib.pyplot as plt + +from educe.rst_dt.annotation import NUC_N, NUC_S + + +if False: + # import the nuclearity TRAIN and TEST sets + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + ) + dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') + + X_train, y_train, X_test, y_test = load_svmlight_files( + (dset_train, dset_test), + zero_based=False + ) + nuc_clf = LogisticRegression(penalty='l1', n_jobs=2) + # train nuclearity classifier, cross-validate performance on train + scores = cross_val_score(nuc_clf, X_train, y_train, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) + # fit a + nuc_clf = nuc_clf.fit(X_train, y_train) + print(nuc_clf.score(X_test, y_test)) + + +# 2017-12-06 non-dummy nuc_clf +# DIRTY load the feature vector for all candidate edges in the TEST +# set +feat_vecs = dict() +dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse' +) +dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') +# we use the original svmlight files whose label is the relation +# class (which we actually don't need here) +# FIXME read n_features from .vocab +X_test, y_lbl_test = load_svmlight_file(dset_test, n_features=46731, + zero_based=False) +# build mapping from doc_name, 
src_idx, tgt_idx to line number +# in X_test +pairs = dset_test + '.pairings' +pair_map = defaultdict(lambda: defaultdict(dict)) +with codecs.open(pairs, mode='rb', encoding='utf-8') as f_pairs: + for i, line in enumerate(f_pairs): + src_id, tgt_id = line.strip().split('\t') + src_idx = (0 if src_id == 'ROOT' + else int(src_id.rsplit('_', 1)[1])) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + # print(line) + # print(doc_name, src_idx, tgt_idx) + pair_map[doc_name][src_idx][tgt_idx] = i +# end DIRTY + + +class RightBinaryNuclearityClassifier(object): + """Predict the nuclearity of right-oriented dependencies (binary). + + The nuclearity of ordinary, right-oriented dependencies can be + either `NUC_S` or `NUC_N` (NS or NN relations). + Right-oriented dependencies from the fake root have nuclearity + `NUC_R` by convention ; Left-oriented dependencies have nuclearity + `NUC_S`. + + Parameters + ---------- + bin_clf : sklearn classifier + Binary classifier for right dependencies: NN vs NS. + """ + + def __init__(self, bin_clf=LogisticRegression(penalty='l1', n_jobs=2)): + """Init""" + self.bin_clf = bin_clf + + def fit(self, X, y): + """Fit""" + self.bin_clf = self.bin_clf.fit(X, y) + if True: # verbose + scores = cross_val_score(self.bin_clf, X, y, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + return self + + def predict(self, X): + """Predict nuclearity of edges in RstDepTrees X from the TEST set. + """ + y = [] + for dtree in X: + doc_name = dtree.origin.doc + yi = [] + for i, head in enumerate(dtree.heads): + if i == 0: + # fake root !? maybe we shouldn't write anything + # here ; + # FIXME check how to be consistent throughout educe and + # eval code + yi.append(NUC_N) + elif i < head: + # left edge: SN + yi.append(NUC_S) + elif head == 0: + # FIXME NUC_R for edges from the root? + yi.append(NUC_N) + else: + # right edge: NN or NS? + line_idx = pair_map[doc_name][head][i] + # X_test[line_idx,:] is a matrix with 1 row + Xi = X_test[line_idx,:] + try: + y_pred = self.bin_clf.predict(Xi) + except ValueError: + print(Xi) + raise + if y_pred == 1: + yi.append(NUC_N) + elif y_pred == 2: + yi.append(NUC_S) + else: + raise ValueError("Weird prediction: {}".format( + y_pred)) + y.append(yi) + return y From 822af14a0ee51d49906d7f304d3e62d0c3112050 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 14 Dec 2017 16:46:05 +0100 Subject: [PATCH 71/74] FIX load either c- or d-trees once, pass them to the other loader --- evals/braud_coling.py | 15 ++++- evals/braud_eacl.py | 15 ++++- evals/codra.py | 22 ++++--- evals/gcrf_tree_format.py | 8 ++- evals/hayashi_cons.py | 11 +++- evals/hayashi_deps.py | 21 +++---- evals/ji.py | 9 ++- evals/li_qi.py | 21 ++++--- evals/ours.py | 14 +++-- evals/showdown.py | 111 +++++++++++++++++++++------------- evals/surdeanu.py | 10 ++- evals/train_nuc_classifier.py | 9 +-- 12 files changed, 169 insertions(+), 97 deletions(-) diff --git a/evals/braud_coling.py b/evals/braud_coling.py index 625cb19..4856aac 100644 --- a/evals/braud_coling.py +++ b/evals/braud_coling.py @@ -141,10 +141,19 @@ def load_braud_coling_ctrees(out_dir, rel_conv): return ctree_pred -def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain'): - """Do load dtrees""" +def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain', + ctree_pred=None): + """Do load dtrees. + + Parameters + ---------- + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. 
If c-trees are provided this + way, `out_dir` is ignored. + """ dtree_pred = dict() - ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv) + if ctree_pred is None: + ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv) for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) dtree_pred[doc_name] = dt_pred diff --git a/evals/braud_eacl.py b/evals/braud_eacl.py index 082efa5..e865a8e 100644 --- a/evals/braud_eacl.py +++ b/evals/braud_eacl.py @@ -122,10 +122,19 @@ def load_braud_eacl_ctrees(fpath, rel_conv, doc_names): return ctree_pred -def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain'): - """Do load dtrees""" +def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain', + ctree_pred=None): + """Do load dtrees + + Parameters + ---------- + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. + """ dtree_pred = dict() - ctree_pred = load_braud_eacl_ctrees(fpath, rel_conv, doc_names) + if ctree_pred is None: + ctree_pred = load_braud_eacl_ctrees(fpath, rel_conv, doc_names) for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) dtree_pred[doc_name] = dt_pred diff --git a/evals/codra.py b/evals/codra.py index a586389..11b5aea 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -105,7 +105,8 @@ def load_codra_ctrees(codra_out_dir, rel_conv): return ctree_pred -def load_codra_dtrees(codra_out_dir, rel_conv, nary_enc='chain'): +def load_codra_dtrees(codra_out_dir, rel_conv, nary_enc='chain', + ctree_pred=None): """Get the dtrees that correspond to the ctrees output by CODRA. Parameters @@ -114,21 +115,26 @@ def load_codra_dtrees(codra_out_dir, rel_conv, nary_enc='chain'): Path to the base directory containing the output files. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- dtree_pred: dict(str, RstDepTree) RST dtree for each document. """ - # load predicted trees - data_pred = load_codra_output_files(codra_out_dir) - # filenames = data_pred['filenames'] - doc_names_pred = data_pred['doc_names'] - rst_ctrees_pred = data_pred['rst_ctrees'] - + if ctree_pred is None: + # load predicted trees + data_pred = load_codra_output_files(codra_out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + ctree_pred = {doc_name: ct_pred for doc_name, ct_pred + in itertools.izip(doc_names_pred, rst_ctrees_pred)} # build a dict from doc_name to ordered dtree (RstDepTree) dtree_pred = dict() - for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + for doc_name, ct_pred in ctree_pred.items(): # constituency tree # replace fine-grained labels with coarse-grained labels ; # the files we have already contain the coarse labels, except their diff --git a/evals/gcrf_tree_format.py b/evals/gcrf_tree_format.py index 1b4fd0b..ba8fe27 100644 --- a/evals/gcrf_tree_format.py +++ b/evals/gcrf_tree_format.py @@ -195,7 +195,7 @@ def load_gcrf_ctrees(out_dir, rel_conv): return ctree_pred -def load_gcrf_dtrees(out_dir, rel_conv, nary_enc='chain'): +def load_gcrf_dtrees(out_dir, rel_conv, nary_enc='chain', ctree_pred=None): """Get the dtrees that correspond to the ctrees output by gCRF. 
Parameters @@ -204,13 +204,17 @@ def load_gcrf_dtrees(out_dir, rel_conv, nary_enc='chain'): Path to the base directory containing the output files. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- dtree_pred: dict(str, RstDepTree) RST dtree for each document. """ - ctree_pred = load_gcrf_ctrees(out_dir, rel_conv) + if ctree_pred is None: + ctree_pred = load_gcrf_ctrees(out_dir, rel_conv) dtree_pred = dict() for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) diff --git a/evals/hayashi_cons.py b/evals/hayashi_cons.py index 6f76512..7bdb9a7 100644 --- a/evals/hayashi_cons.py +++ b/evals/hayashi_cons.py @@ -127,7 +127,8 @@ def load_hayashi_hilda_ctrees(out_dir, rel_conv): return ctree_pred -def load_hayashi_hilda_dtrees(out_dir, rel_conv, nary_enc='chain'): +def load_hayashi_hilda_dtrees(out_dir, rel_conv, nary_enc='chain', + ctree_pred=None): """Load the dtrees for the ctrees output by Hayashi et al.'s HILDA. Parameters @@ -137,14 +138,18 @@ def load_hayashi_hilda_dtrees(out_dir, rel_conv, nary_enc='chain'): rel_conv: RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- dtree_pred: dict(str, RstDepTree) RST dtree for each document. """ - # load predicted ctrees - ctree_pred = load_hayashi_hilda_ctrees(out_dir, rel_conv) + if ctree_pred is None: + # load predicted ctrees + ctree_pred = load_hayashi_hilda_ctrees(out_dir, rel_conv) # convert to dtrees dtree_pred = dict() for doc_name, ct_pred in ctree_pred.items(): diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index cbde909..c5fd6b3 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -10,7 +10,7 @@ from educe.learning.edu_input_format import load_edu_input_file from educe.rst_dt.corpus import Reader -from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.deptree import RstDepTree, RstDtException from educe.rst_dt.dep2con import deptree_to_rst_tree @@ -91,17 +91,13 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, ---------- out_dir : str Path to the folder containing .dis files. - rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). - edus_file_pat : str Pattern for the .edu_input files. - nuc_clf : NuclearityClassifier Nuclearity classifier - rnk_clf : RankClassifier Rank classifier @@ -135,7 +131,7 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, - rnk_clf): + rnk_clf, dtree_pred=None): """Load the ctrees for the dtrees output by one of Hayashi et al.'s dep parsers. @@ -143,19 +139,18 @@ def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, ---------- out_dir : str Path to the folder containing .dis files. - rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). - edus_file_pat : str Pattern for the .edu_input files. - nuc_clf : NuclearityClassifier Nuclearity classifier - rnk_clf : RankClassifier Rank classifier + dtree_pred : dict(str, RstDepTree), optional + RST d-trees, indexed by doc_name. If d-trees are provided this + way, `out_dir` is ignored. 
Returns ------- @@ -163,9 +158,9 @@ def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, RST ctree for each document. """ ctree_pred = dict() - - dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, - nuc_clf, rnk_clf) + if dtree_pred is None: + dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, + nuc_clf, rnk_clf) for doc_name, dt_pred in dtree_pred.items(): try: ct_pred = deptree_to_rst_tree(dt_pred) diff --git a/evals/ji.py b/evals/ji.py index 08fbd8b..c6ab6e8 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -149,7 +149,7 @@ def load_ji_ctrees(ji_out_dir, rel_conv): return ctree_pred -def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain'): +def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain', ctree_pred=None): """Get the dtrees that correspond to the ctrees output by DPLP. Parameters @@ -160,6 +160,9 @@ def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain'): Relation converter, from fine- to coarse-grained labels. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- @@ -167,8 +170,8 @@ def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain'): RST dtree for each document. """ dtree_pred = dict() - - ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv) + if ctree_pred is None: + ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv) for doc_name, ct_pred in ctree_pred.items(): dtree_pred[doc_name] = RstDepTree.from_rst_tree( ct_pred, nary_enc=nary_enc) diff --git a/evals/li_qi.py b/evals/li_qi.py index abf1929..2df67d2 100644 --- a/evals/li_qi.py +++ b/evals/li_qi.py @@ -90,7 +90,7 @@ def load_li_qi_ctrees(out_dir, rel_conv): return ctree_pred -def load_li_qi_dtrees(out_dir, rel_conv, nary_enc='chain'): +def load_li_qi_dtrees(out_dir, rel_conv, nary_enc='chain', ctree_pred=None): """Get the dtrees that correspond to the ctrees output by Li Qi's parser. Parameters @@ -99,21 +99,26 @@ def load_li_qi_dtrees(out_dir, rel_conv, nary_enc='chain'): Path to the base directory containing the output files. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- dtree_pred: dict(str, RstDepTree) RST dtree for each document. 
""" - # load predicted trees - data_pred = load_li_qi_output_files(out_dir) - # filenames = data_pred['filenames'] - doc_names_pred = data_pred['doc_names'] - rst_ctrees_pred = data_pred['rst_ctrees'] - + if ctree_pred is None: + # load predicted trees + data_pred = load_li_qi_output_files(out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + ctree_pred = {doc_name: ct_pred for doc_name, ct_pred + in itertools.izip(doc_names_pred, rst_ctrees_pred)} # build a dict from doc_name to ordered dtree (RstDepTree) dtree_pred = dict() - for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + for doc_name, ct_pred in ctree_pred.items(): # constituency tree # replace fine-grained labels with coarse-grained labels ; # the files we have already contain the coarse labels, except their diff --git a/evals/ours.py b/evals/ours.py index 938a53c..2e50c2f 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -117,7 +117,7 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): else: dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) dt_pred.origin = mk_key(doc_name) - # add nuclearity: heuristic baseline + # add nuclearity: heuristic baseline WIP or true classifier dt_pred.nucs = nuc_clf.predict([dt_pred])[0] # add rank: heuristic baseline, needs edu2sent edu2sent = doc_name2edu2sent[doc_name] @@ -129,7 +129,8 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): return dtree_pred -def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): +def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf, + dtree_pred=None): """Load RST ctrees from attelo output files. Parameters @@ -142,13 +143,18 @@ def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): Classifier to predict nuclearity rnk_clf: RankClassifier Classifier to predict attachment ranking + dtree_pred : dict(str, RstDepTree), optional + RST d-trees, indexed by doc_name. If d-trees are provided this + way, `out_dir` is ignored. 
Returns ------- TODO """ - # load RST dtrees, with heuristics for nuc and rank - dtree_pred = load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf) + if dtree_pred is None: + # load RST dtrees, with heuristics for nuc and rank + dtree_pred = load_attelo_dtrees(output_file, edus_file, nuc_clf, + rnk_clf) # convert to RST ctrees ctree_pred = dict() for doc_name, dt_pred in dtree_pred.items(): diff --git a/evals/showdown.py b/evals/showdown.py index 9193612..4c311e6 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -11,6 +11,7 @@ import os from sklearn.datasets import load_svmlight_files +from sklearn.linear_model.logistic import LogisticRegressionCV from educe.rst_dt.annotation import _binarize, SimpleRSTTree from educe.rst_dt.corpus import (RstRelationConverter, @@ -215,7 +216,7 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', y_nuc_train.append(dt.nucs) y_rnk_train.append(dt.ranks) # nuclearity clf - if False: + if True: nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, constant=nuc_constant) nuc_clf.fit(X_train, y_nuc_train) @@ -236,7 +237,10 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', n_features=46731, zero_based=False ) - nuc_clf = RightBinaryNuclearityClassifier() + bin_clf = LogisticRegressionCV(Cs=10, # defaults to 10 + penalty='l1', solver='liblinear', + n_jobs=3) + nuc_clf = RightBinaryNuclearityClassifier(bin_clf=bin_clf) nuc_clf = nuc_clf.fit(X_nuc_train, y_nuc_train) # end WIP nuc_clf # rank clf @@ -362,69 +366,79 @@ def main(): for author_pred in authors_pred: # braud coling 2016 if author_pred == 'BPS16': + ctree_pred = load_braud_coling_ctrees(BRAUD_COLING_OUT_DIR, + REL_CONV) c_preds.append( - ('BPS16', load_braud_coling_ctrees( - BRAUD_COLING_OUT_DIR, REL_CONV)) + ('BPS16', ctree_pred) ) d_preds.append( ('BPS16', load_braud_coling_dtrees( - BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain')) + BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain', + ctree_pred=ctree_pred)) ) # braud eacl 2017 - mono if author_pred == 'BCS17_mono': + ctree_pred = load_braud_eacl_ctrees(BRAUD_EACL_MONO, REL_CONV, + sorted_doc_names) c_preds.append( - ('BCS17_mono', load_braud_eacl_ctrees( - BRAUD_EACL_MONO, REL_CONV, sorted_doc_names)) + ('BCS17_mono', ctree_pred) ) d_preds.append( ('BCS17_mono', load_braud_eacl_dtrees( BRAUD_EACL_MONO, REL_CONV, sorted_doc_names, - nary_enc='chain')) + nary_enc='chain', ctree_pred=ctree_pred)) ) # braud eacl 2017 - cross+dev if author_pred == 'BCS17_cross': + ctree_pred = load_braud_eacl_ctrees(BRAUD_EACL_CROSS_DEV, + REL_CONV, sorted_doc_names) c_preds.append( - ('BCS17_cross', load_braud_eacl_ctrees( - BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names)) + ('BCS17_cross', ctree_pred) ) d_preds.append( ('BCS17_cross', load_braud_eacl_dtrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, - nary_enc='chain')) + nary_enc='chain', ctree_pred=ctree_pred)) ) if author_pred == 'HHN16_HILDA': + ctree_pred = load_hayashi_hilda_ctrees(HAYASHI_HILDA_OUT_DIR, + REL_CONV) c_preds.append( - ('HHN16_HILDA', load_hayashi_hilda_ctrees( - HAYASHI_HILDA_OUT_DIR, REL_CONV)) + ('HHN16_HILDA', ctree_pred) ) d_preds.append( ('HHN16_HILDA', load_hayashi_hilda_dtrees( - HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) + HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain', + ctree_pred=ctree_pred)) ) if author_pred == 'HHN16_MST': + dtree_pred = load_hayashi_dep_dtrees( + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf) c_preds.append( ('HHN16_MST', load_hayashi_dep_ctrees( HAYASHI_MST_OUT_DIR, 
REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf)) + nuc_clf, rnk_clf, dtree_pred=dtree_pred)) ) d_preds.append( - ('HHN16_MST', load_hayashi_dep_dtrees( - HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf)) + ('HHN16_MST', dtree_pred) ) if author_pred == 'LLC16': + ctree_pred = load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV) c_preds.append( - ('LLC16', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) + ('LLC16', ctree_pred) ) d_preds.append( ('LLC16', load_li_qi_dtrees(LI_QI_OUT_DIR, REL_CONV, - nary_enc='chain')) + nary_enc='chain', + ctree_pred=ctree_pred)) ) if author_pred == 'li_sujian': + # FIXME load d-trees once, pass dtree_pred to the c-loader c_preds.append( ('li_sujian', load_li_sujian_dep_ctrees( LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, @@ -437,6 +451,7 @@ def main(): ) if author_pred == 'FH14_gSVM': + # FIXME load c-trees once, pass ctree_pred to the d-loader c_preds.append( ('FH14_gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) ) @@ -446,22 +461,26 @@ def main(): ) if author_pred == 'FH14_gCRF': + ctree_pred = load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV) c_preds.append( - ('FH14_gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) + ('FH14_gCRF', ctree_pred) ) d_preds.append( ('FH14_gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, - nary_enc='chain')) + nary_enc='chain', + ctree_pred=ctree_pred)) ) if author_pred == 'JCN15_1S1S': # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees + ctree_pred = load_codra_ctrees(CODRA_OUT_DIR, REL_CONV) c_preds.append( - ('JCN15_1S1S', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) + ('JCN15_1S1S', ctree_pred) ) d_preds.append( ('JCN15_1S1S', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, - nary_enc='chain')) + nary_enc='chain', + ctree_pred=ctree_pred)) ) # joty-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because codra outputs binary ctrees, @@ -470,13 +489,14 @@ def main(): if author_pred == 'JE14': # DPLP outputs RST ctrees in the form of lists of spans; # load_ji_dtrees maps them to RST dtrees + ctree_pred = load_ji_ctrees(JI_OUT_DIR, REL_CONV) c_preds.append( - ('JE14', load_ji_ctrees( - JI_OUT_DIR, REL_CONV)) + ('JE14', ctree_pred) ) d_preds.append( - ('JE14', load_ji_dtrees( - JI_OUT_DIR, REL_CONV, nary_enc='chain')) + ('JE14', load_ji_dtrees(JI_OUT_DIR, REL_CONV, + nary_enc='chain', + ctree_pred=ctree_pred)) ) # ji-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because DPLP outputs binary ctrees, @@ -498,47 +518,52 @@ def main(): # hence both encodings result in (the same) strictly ordered dtrees if author_pred == 'SHV15_D': + ctree_pred = load_surdeanu_ctrees(SURDEANU_LOG_FILE, REL_CONV) c_preds.append( - ('SHV15_D', load_surdeanu_ctrees( - SURDEANU_LOG_FILE, REL_CONV)) + ('SHV15_D', ctree_pred) ) d_preds.append( ('SHV15_D', load_surdeanu_dtrees( - SURDEANU_LOG_FILE, REL_CONV, nary_enc='chain')) + SURDEANU_LOG_FILE, REL_CONV, nary_enc='chain', + ctree_pred=ctree_pred)) ) if author_pred == 'ours-chain': # Eisner, predicted syntax, chain + dtree_pred = load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf) c_preds.append( ('ours-chain', load_attelo_ctrees( - EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) + EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( - ('ours-chain', load_attelo_dtrees( - EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) + ('ours-chain', dtree_pred) ) if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit + dtree_pred 
= load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, + EDUS_FILE, nuc_clf, rnk_clf) c_preds.append( ('ours-tree', load_attelo_ctrees( - EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( - ('ours-tree', load_attelo_dtrees( - EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) + ('ours-tree', dtree_pred) ) if author_pred == 'ours-tree-su': # Eisner, predicted syntax, tree + same-unit + dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, nuc_clf, rnk_clf) c_preds.append( - ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-tree-su', load_attelo_ctrees( + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( - ('ours-tree-su', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-tree-su', dtree_pred) ) # 2017-05-17 enable "gold" as parser, should give perfect scores if author_pred == 'gold': diff --git a/evals/surdeanu.py b/evals/surdeanu.py index 31111e8..7884e34 100644 --- a/evals/surdeanu.py +++ b/evals/surdeanu.py @@ -180,7 +180,8 @@ def load_surdeanu_ctrees(log_file, rel_conv): return _load_surdeanu_ctrees(f, rel_conv) -def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain'): +def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain', + ctree_pred=None): """Get the dtrees for the ctrees output by Surdeanu's parser. Parameters @@ -191,6 +192,9 @@ def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain'): Relation converter, from fine- to coarse-grained labels. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- @@ -198,8 +202,8 @@ def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain'): RST dtree for each document. """ dtree_pred = dict() - - ctree_pred = load_surdeanu_ctrees(log_file, rel_conv) + if ctree_pred is None: + ctree_pred = load_surdeanu_ctrees(log_file, rel_conv) for doc_name, ct_pred in ctree_pred.items(): dtree_pred[doc_name] = RstDepTree.from_rst_tree( ct_pred, nary_enc=nary_enc) diff --git a/evals/train_nuc_classifier.py b/evals/train_nuc_classifier.py index bb25adf..784e944 100644 --- a/evals/train_nuc_classifier.py +++ b/evals/train_nuc_classifier.py @@ -15,8 +15,7 @@ from sklearn.datasets import load_svmlight_file, load_svmlight_files from sklearn.model_selection import cross_val_score -from sklearn.linear_model.logistic import LogisticRegression -# from sklearn.model_selection import GridSearchCV +from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV from sklearn.preprocessing import LabelEncoder import matplotlib.pyplot as plt @@ -36,7 +35,8 @@ (dset_train, dset_test), zero_based=False ) - nuc_clf = LogisticRegression(penalty='l1', n_jobs=2) + nuc_clf = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=2) # train nuclearity classifier, cross-validate performance on train scores = cross_val_score(nuc_clf, X_train, y_train, cv=10) print(scores) @@ -92,7 +92,7 @@ class RightBinaryNuclearityClassifier(object): Binary classifier for right dependencies: NN vs NS. 
""" - def __init__(self, bin_clf=LogisticRegression(penalty='l1', n_jobs=2)): + def __init__(self, bin_clf=LogisticRegression(penalty='l1', solver='liblinear', n_jobs=2)): """Init""" self.bin_clf = bin_clf @@ -143,5 +143,6 @@ def predict(self, X): else: raise ValueError("Weird prediction: {}".format( y_pred)) + y.append(yi) return y From 96937967851d5d5711e058064fb934f4eff75b6a Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 19 Dec 2017 11:18:33 +0100 Subject: [PATCH 72/74] ENH rel_clf, nuc_clf, model_split='sent' --- evals/ours.py | 12 +- evals/prepare_nuc_dataset.py | 262 ++++++++++++++++++++++++---------- evals/prepare_rel_dataset.py | 256 +++++++++++++++++++++++++++++++++ evals/showdown.py | 170 ++++++++++++++++++---- evals/train_nuc_classifier.py | 196 +++++++++++++++++++------ evals/train_rel_relabeller.py | 201 ++++++++++++++++++++++++++ 6 files changed, 951 insertions(+), 146 deletions(-) create mode 100644 evals/prepare_rel_dataset.py create mode 100644 evals/train_rel_relabeller.py diff --git a/evals/ours.py b/evals/ours.py index 2e50c2f..6d651b4 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -57,7 +57,7 @@ def load_attelo_output_file(output_file): return edges_pred -def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): +def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): """Load RST dtrees from attelo output files. Parameters @@ -117,6 +117,10 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): else: dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) dt_pred.origin = mk_key(doc_name) + # 2017-12-14 relabel relations + if rel_clf is not None: + dt_pred.labels = rel_clf.predict([dt_pred])[0] + # end relabel relations # add nuclearity: heuristic baseline WIP or true classifier dt_pred.nucs = nuc_clf.predict([dt_pred])[0] # add rank: heuristic baseline, needs edu2sent @@ -129,7 +133,7 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): return dtree_pred -def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf, +def load_attelo_ctrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, dtree_pred=None): """Load RST ctrees from attelo output files. @@ -153,8 +157,8 @@ def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf, """ if dtree_pred is None: # load RST dtrees, with heuristics for nuc and rank - dtree_pred = load_attelo_dtrees(output_file, edus_file, nuc_clf, - rnk_clf) + dtree_pred = load_attelo_dtrees(output_file, edus_file, + rel_clf, nuc_clf, rnk_clf) # convert to RST ctrees ctree_pred = dict() for doc_name, dt_pred in dtree_pred.items(): diff --git a/evals/prepare_nuc_dataset.py b/evals/prepare_nuc_dataset.py index 97e1c6e..e9d534f 100644 --- a/evals/prepare_nuc_dataset.py +++ b/evals/prepare_nuc_dataset.py @@ -24,7 +24,7 @@ from educe.rst_dt.deptree import RstDepTree -def main(corpus, dataset, out_dir, nary_enc): +def main(corpus, dataset, out_dir, nary_enc, model_split): """Do prepare the nuclearity dataset. Parameters @@ -35,6 +35,12 @@ def main(corpus, dataset, out_dir, nary_enc): Path to the existing dataset labelled with coarse relations. out_dir : str Path to the output folder. + nary_enc : str, one of {'chain', 'tree'} + Encoding for n-ary nodes. + model_split : str, one of {'none', 'sent', 'sent-para'} + If not 'none', use distinct models for subsets of instances: + * 'sent': intra- vs inter-sentential, + * 'sent-para': intra-sentential, intra-paragraph, rest (doc-level). 
""" # (re-)create a d-corpus from the RST-DT c-corpus corpus_subset = os.path.basename(dataset).split('.')[0] @@ -59,80 +65,181 @@ def main(corpus, dataset, out_dir, nary_enc): # description of the EDUs (.edu_input) pairings = dataset + '.pairings' # edu_desc = dataset + '.edu_input' - new_dataset = os.path.join(out_dir, os.path.basename(dataset)) - new_pairs = os.path.join(out_dir, os.path.basename(pairings)) - if ((os.path.abspath(new_dataset) == os.path.abspath(dataset) or - os.path.abspath(new_pairs) == os.path.abspath(pairings))): - raise ValueError("I won't let you erase your base dataset") - with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: - with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: - with codecs.open(new_dataset, mode='wb', encoding='utf-8') as data_out: - with codecs.open(new_pairs, mode='wb', encoding='utf-8') as pairs_out: - # read header line in svmlight file - header = f_data.readline() - header_prefix = '# labels: ' - assert header.startswith(header_prefix) - labels = header[len(header_prefix):].split() - int2lbl = dict(enumerate(labels, start=1)) - lbl2int = {lbl: i for i, lbl in int2lbl.items()} - unrelated = lbl2int["UNRELATED"] - root = lbl2int["ROOT"] - # write labels in header of new svmlight file, as an - # ordered list mapped to {1, 2} - print(header_prefix + ' '.join((NUC_N, NUC_S)), - file=data_out) - # stream through lines - for pair, line in itertools.izip(f_pairs, f_data): - # read candidate pair of EDUs - src_id, tgt_id = pair.strip().split('\t') - if src_id == 'ROOT': - continue - # now both src_id and tgt_id are of form "docname_int" - # ex: "wsj_0600.out_1" - src_idx = int(src_id.rsplit('_', 1)[1]) - doc_name, tgt_idx = tgt_id.rsplit('_', 1) - tgt_idx = int(tgt_idx) - if tgt_idx < src_idx: - # skip left dependencies: by construction, - # their nuclearity can only be Satellite - # (SN edges) - continue - # print(doc_name, src_id, tgt_id, src_idx, tgt_idx) - # read corresponding ref class (label), feature vector - lbl_idx, feat_vector = line.strip().split(' ', 1) - lbl_idx = int(lbl_idx) # lbl currently encoded as int - if lbl_idx in (unrelated, root): - continue - try: - lbl = int2lbl[lbl_idx] - except KeyError: - # the test set in RST-DT 1.0 has an error: - # wsj_1189.out [8-9] is labelled "span" instead of - # "Consequence" ; some runs used this erroneous - # version, hence had a class "0" (unknown) for - # this line in the dataset - if ((doc_name == 'wsj_1189.out' and - src_idx == 7 and - tgt_idx == 9)): - lbl = 'cause' - lbl_idx = lbl2int[lbl] - else: - print(doc_name, src_idx, tgt_idx) - raise - # print(src_id, tgt_id, lbl) - dtree = rst_dcorpus[doc_name] - assert dtree.heads[tgt_idx] == src_idx - assert dtree.labels[tgt_idx] == lbl - if dtree.nucs[tgt_idx] == NUC_N: - nuc_idx = 1 - elif dtree.nucs[tgt_idx] == NUC_S: - nuc_idx = 2 - else: - raise ValueError("weird nuclearity {}".format( - dtree.nucs[tgt_idx])) - print(str(nuc_idx) + ' ' + feat_vector, + if model_split == 'none': + new_dataset = os.path.join(out_dir, os.path.basename(dataset)) + new_pairs = os.path.join(out_dir, os.path.basename(pairings)) + if ((os.path.abspath(new_dataset) == os.path.abspath(dataset) or + os.path.abspath(new_pairs) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset, mode='wb', encoding='utf-8') as data_out: + with 
codecs.open(new_pairs, mode='wb', encoding='utf-8') as pairs_out: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file, as an + # ordered list mapped to {1, 2} + print(header_prefix + ' '.join((NUC_N, NUC_S)), file=data_out) - print(pair.strip(), file=pairs_out) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form "docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + if tgt_idx < src_idx: + # skip left dependencies: by construction, + # their nuclearity can only be Satellite + # (SN edges) + continue + # print(doc_name, src_id, tgt_id, src_idx, tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + if dtree.nucs[tgt_idx] == NUC_N: + nuc_idx = 1 + elif dtree.nucs[tgt_idx] == NUC_S: + nuc_idx = 2 + else: + raise ValueError("weird nuclearity {}".format( + dtree.nucs[tgt_idx])) + print(str(nuc_idx) + ' ' + feat_vector, + file=data_out) + print(pair.strip(), file=pairs_out) + elif model_split == 'sent': + # 2 datasets: intra- and inter-sentential + new_dataset = ( + os.path.join(out_dir + '_intrasent', os.path.basename(dataset)), + os.path.join(out_dir + '_intersent', os.path.basename(dataset)) + ) + new_pairs = ( + os.path.join(out_dir + '_intrasent', os.path.basename(pairings)), + os.path.join(out_dir + '_intersent', os.path.basename(pairings)) + ) + if ((os.path.abspath(new_dataset[0]) == os.path.abspath(dataset) or + os.path.abspath(new_pairs[0]) == os.path.abspath(pairings) or + os.path.abspath(new_dataset[1]) == os.path.abspath(dataset) or + os.path.abspath(new_pairs[1]) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset[0], mode='wb', encoding='utf-8') as data_out_intra: + with codecs.open(new_pairs[0], mode='wb', encoding='utf-8') as pairs_out_intra: + with codecs.open(new_dataset[1], mode='wb', encoding='utf-8') as data_out_inter: + with codecs.open(new_pairs[1], mode='wb', encoding='utf-8') as pairs_out_inter: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert 
header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file, as an + # ordered list mapped to {1, 2} + print(header_prefix + ' '.join((NUC_N, NUC_S)), + file=data_out_intra) + print(header_prefix + ' '.join((NUC_N, NUC_S)), + file=data_out_inter) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form "docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + if tgt_idx < src_idx: + # skip left dependencies: by construction, + # their nuclearity can only be Satellite + # (SN edges) + continue + # print(doc_name, src_id, tgt_id, src_idx, tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + if dtree.nucs[tgt_idx] == NUC_N: + nuc_idx = 1 + elif dtree.nucs[tgt_idx] == NUC_S: + nuc_idx = 2 + else: + raise ValueError("weird nuclearity {}".format( + dtree.nucs[tgt_idx])) + if ((' 269:' in feat_vector or + ' 303:' in feat_vector)): + # 269 is same_sentence_intra_right + # 303 is same_sentence_intra_left + # FIXME find a cleaner way + print(str(nuc_idx) + ' ' + feat_vector, + file=data_out_intra) + print(pair.strip(), + file=pairs_out_intra) + else: + # inter-sentential + print(str(nuc_idx) + ' ' + feat_vector, + file=data_out_inter) + print(pair.strip(), + file=pairs_out_inter) if __name__ == "__main__": @@ -163,5 +270,10 @@ def main(corpus, dataset, out_dir, nary_enc): help='Encoding for n-ary nodes', choices=['chain', 'tree'], default='chain') + parser.add_argument('--model_split', + help='Separate models for subsets of instances', + choices=['none', 'sent', 'sent-para'], + default='none') args = parser.parse_args() - main(args.corpus, args.dataset, args.out_dir, args.nary_enc) + main(args.corpus, args.dataset, args.out_dir, args.nary_enc, + args.model_split) diff --git a/evals/prepare_rel_dataset.py b/evals/prepare_rel_dataset.py new file mode 100644 index 0000000..0bd6a5b --- /dev/null +++ b/evals/prepare_rel_dataset.py @@ -0,0 +1,256 @@ +"""This utility script outputs a dataset of the relation of RST edges. + +Given the path to the RST-DT corpus and a dataset of candidate RST +dependencies labelled with their gold coarse (class) RST relation (or +none if they are unrelated), produce a filtered version of the dataset +for the task of relation labelling. 
+ +As of 2017-12-14, we filter out the instances for unrelated pairs of EDUs +and dependencies headed by the fake root. +The resulting dataset describes a n-ary classification problem whose +labelset is the set of (coarse-grained) classes of RST relations. +""" + +from __future__ import absolute_import, print_function + +import argparse +import codecs +import itertools +import os + +from educe.rst_dt.annotation import NUC_N, NUC_S +from educe.rst_dt.corpus import RstRelationConverter, RELMAP_112_18_FILE +from educe.rst_dt.dep_corpus import read_corpus +from educe.rst_dt.deptree import RstDepTree + + +def main(corpus, dataset, out_dir, nary_enc, model_split): + """Do prepare the RST relation dataset. + + Parameters + ---------- + corpus : str + Path to the RST-DT "main" corpus. + dataset : str + Path to the existing dataset labelled with coarse relations. + out_dir : str + Path to the output folder. + model_split : str, one of {'none', 'sent', 'sent-para'} + If not 'none', use distinct models for subsets of instances: + * 'sent': intra- vs inter-sentential, + * 'sent-para': intra-sentential, intra-paragraph, rest (doc-level). + """ + # (re-)create a d-corpus from the RST-DT c-corpus + corpus_subset = os.path.basename(dataset).split('.')[0] + if corpus_subset not in ('TRAINING', 'TEST'): + raise ValueError("dataset must be a filepath that starts with" + "one of {'TRAINING', 'TEST'}") + if corpus_subset == 'TRAINING': + section = 'train' + else: # 'TEST' + section = 'test' + rst_ccorpus = read_corpus(corpus, section=section) + rel_conv = RstRelationConverter(RELMAP_112_18_FILE).convert_dtree + rst_dcorpus = dict() # FileId.doc -> RstDepTree + for doc_key, rst_ctree in rst_ccorpus[section].items(): + rst_dtree = RstDepTree.from_rst_tree(rst_ctree, nary_enc=nary_enc) + rst_dtree_coarse = rel_conv(rst_dtree) + rst_dcorpus[doc_key.doc] = rst_dtree_coarse + # for each candidate dependency in the dataset, read the nuclearity + # from the RST d-corpus + # Nota: we stream through the dataset to avoid loading it entirely in + # memory ; we don't need to open the vocabulary file (.vocab), nor the + # description of the EDUs (.edu_input) + pairings = dataset + '.pairings' + # edu_desc = dataset + '.edu_input' + if model_split == 'none': + new_dataset = os.path.join(out_dir, os.path.basename(dataset)) + new_pairs = os.path.join(out_dir, os.path.basename(pairings)) + if ((os.path.abspath(new_dataset) == os.path.abspath(dataset) or + os.path.abspath(new_pairs) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset, mode='wb', encoding='utf-8') as data_out: + with codecs.open(new_pairs, mode='wb', encoding='utf-8') as pairs_out: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file, here + # we just copy the existing header (even if it has + # ROOT and UNRELATED that should never appear here) + print(header, file=data_out) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = 
pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form "docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + print(str(lbl_idx) + ' ' + feat_vector, + file=data_out) + print(pair.strip(), file=pairs_out) + elif model_split == 'sent': + # 2 datasets: intra- and inter-sentential + new_dataset = ( + os.path.join(out_dir + '_intrasent', os.path.basename(dataset)), + os.path.join(out_dir + '_intersent', os.path.basename(dataset)) + ) + new_pairs = ( + os.path.join(out_dir + '_intrasent', os.path.basename(pairings)), + os.path.join(out_dir + '_intersent', os.path.basename(pairings)) + ) + if ((os.path.abspath(new_dataset[0]) == os.path.abspath(dataset) or + os.path.abspath(new_pairs[0]) == os.path.abspath(pairings) or + os.path.abspath(new_dataset[1]) == os.path.abspath(dataset) or + os.path.abspath(new_pairs[1]) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset[0], mode='wb', encoding='utf-8') as data_out_intra: + with codecs.open(new_pairs[0], mode='wb', encoding='utf-8') as pairs_out_intra: + with codecs.open(new_dataset[1], mode='wb', encoding='utf-8') as data_out_inter: + with codecs.open(new_pairs[1], mode='wb', encoding='utf-8') as pairs_out_inter: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file + print(header, file=data_out_intra) + print(header, file=data_out_inter) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form "docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some 
runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + if ((' 269:' in feat_vector or + ' 303:' in feat_vector) and + (' 103:' in feat_vector or + ' 158:' in feat_vector or + ' 234:' in feat_vector or + ' 314:' in feat_vector)): + # 269 is same_sentence_intra_right + # 303 is same_sentence_intra_left ; + # 103 is same_para_inter_right + # 158 is same_para_inter_left + # 234 is same_para_intra_right + # 314 is same_para_intra_left + # FIXME find a cleaner way + print(str(lbl_idx) + ' ' + feat_vector, + file=data_out_intra) + print(pair.strip(), + file=pairs_out_intra) + else: + # inter-sentential + print(str(lbl_idx) + ' ' + feat_vector, + file=data_out_inter) + print(pair.strip(), + file=pairs_out_inter) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Prepare a relation dataset.' + ) + parser.add_argument('--corpus', + help='Path to the RST-DT "main" corpus', + default=os.path.join( + os.path.expanduser('~'), + 'corpora/rst-dt/rst_discourse_treebank/data', + 'RSTtrees-WSJ-main-1.01' + )) + parser.add_argument('--dataset', + help='Base file of the dataset', + default=os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse', + 'TRAINING.relations.sparse' + )) + parser.add_argument('--out_dir', + help='Output folder', + default=os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL' + )) + parser.add_argument('--nary_enc', + help='Encoding for n-ary nodes', + choices=['chain', 'tree'], + default='chain') + parser.add_argument('--model_split', + help='Separate models for subsets of instances', + choices=['none', 'sent', 'sent-para'], + default='none') + args = parser.parse_args() + main(args.corpus, args.dataset, args.out_dir, args.nary_enc, + args.model_split) diff --git a/evals/showdown.py b/evals/showdown.py index 4c311e6..af6f117 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -7,11 +7,12 @@ import argparse import codecs +from collections import defaultdict import itertools import os from sklearn.datasets import load_svmlight_files -from sklearn.linear_model.logistic import LogisticRegressionCV +from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV from educe.rst_dt.annotation import _binarize, SimpleRSTTree from educe.rst_dt.corpus import (RstRelationConverter, @@ -51,6 +52,7 @@ from evals.surdeanu import load_surdeanu_ctrees, load_surdeanu_dtrees # 2017-12-12 nuc_clf WIP from evals.train_nuc_classifier import RightBinaryNuclearityClassifier +from evals.train_rel_relabeller import RelationRelabeller # end WIP nuc_clf # RST corpus @@ -204,6 +206,25 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', # flavours of dtree dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) dtree_true[doc_name] = dt_true + # 2017-12-18 WIP print spiders in d-trees, see if some could be + # solved with para_idx + rnk_deps = defaultdict(list) # gov -> list of (rnk, dep) + for i, (gov, rnk, nuc, lbl) in enumerate( + zip(dt_true.heads[1:], dt_true.ranks[1:], dt_true.nucs[1:], + dt_true.labels[1:]), + start=1): + rnk_deps[gov].append((rnk, i)) + ordered_deps = {k: sorted(v) for k, v in 
rnk_deps.items()} + for gov, ord_deps in sorted(ordered_deps.items()): + if ((any(x[1] < gov for x in ord_deps) and + any(x[1] > gov for x in ord_deps))): + if doc_name.startswith('wsj_06'): + print(doc_name, gov, ord_deps) + elif doc_name.startswith('file'): + pass + else: + raise ValueError("spider!") + # end 2017-12-18 WIP spiders # fit classifiers for nuclearity and rank (DIRTY) # NB: both are (dummily) fit on weakly ordered dtrees X_train = [] @@ -215,8 +236,68 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', X_train.append(dt) y_nuc_train.append(dt.nucs) y_rnk_train.append(dt.ranks) + # 2017-12-14 WIP relation relabeller + if False: + model_split = 'sent' # {'none', 'sent'} + if model_split == 'none': + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL' + ) + dset_rel_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_rel_test = os.path.join(dset_folder, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_rel_train, y_rel_train, X_rel_test, y_rel_test = load_svmlight_files( + (dset_rel_train, dset_rel_test), + n_features=46731, + zero_based=False + ) + elif model_split == 'sent': + # * intra + dset_folder_intra = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL_intrasent' + ) + dset_train_intra = os.path.join(dset_folder_intra, 'TRAINING.relations.sparse') + dset_test_intra = os.path.join(dset_folder_intra, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_rel_train_intra, y_rel_train_intra, X_rel_test_intra, y_rel_test_intra = load_svmlight_files( + (dset_train_intra, dset_test_intra), + n_features=46731, + zero_based=False + ) + # * inter + dset_folder_inter = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL_intersent' + ) + dset_train_inter = os.path.join(dset_folder_inter, 'TRAINING.relations.sparse') + dset_test_inter = os.path.join(dset_folder_inter, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_rel_train_inter, y_rel_train_inter, X_rel_test_inter, y_rel_test_inter = load_svmlight_files( + (dset_train_inter, dset_test_inter), + n_features=46731, + zero_based=False + ) + # put together intra and inter + X_rel_train = (X_rel_train_intra, X_rel_train_inter) + y_rel_train = (y_rel_train_intra, y_rel_train_inter) + # TODO the same for {X,y}_rel_test ? 
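+            # NOTE: a split TEST set appears unnecessary for prediction:
+            # RelationRelabeller.predict() looks up each edge's feature
+            # vector in the unsplit TEST set (via pair_map) and routes it
+            # to the intra or inter model from its feature values, so the
+            # intra/inter TEST sets would only serve to score the two
+            # models separately.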
+ else: + raise ValueError("what model_split?") + # common call + mul_clf = LogisticRegressionCV(Cs=10, # defaults to 10, + penalty='l1', solver='liblinear', + n_jobs=3) + rel_clf = RelationRelabeller(mul_clf=mul_clf, model_split=model_split) + rel_clf = rel_clf.fit(X_rel_train, y_rel_train) + else: + rel_clf = None + # end 2017-12-14 relations relabeller # nuclearity clf if True: + # TODO see whether intra/inter-sentential would be good + # for the dummy nuc clf nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, constant=nuc_constant) nuc_clf.fit(X_train, y_nuc_train) @@ -225,22 +306,59 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', # shiny new nuc_clf ; still very hacky # import the nuclearity TRAIN and TEST sets generated from # the svmlight feature vectors (ahem) - dset_folder = os.path.join( - os.path.expanduser('~'), - 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' - ) - dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') - dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') - # FIXME read n_features from .vocab - X_nuc_train, y_nuc_train, X_nuc_test, y_nuc_test = load_svmlight_files( - (dset_train, dset_test), - n_features=46731, - zero_based=False - ) + model_split = 'sent' + # + if model_split == 'none': + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + ) + dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_nuc_train, y_nuc_train, X_nuc_test, y_nuc_test = load_svmlight_files( + (dset_train, dset_test), + n_features=46731, + zero_based=False + ) + elif model_split == 'sent': + # * intra + dset_folder_intra = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC_intrasent' + ) + dset_train_intra = os.path.join(dset_folder_intra, 'TRAINING.relations.sparse') + dset_test_intra = os.path.join(dset_folder_intra, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_nuc_train_intra, y_nuc_train_intra, X_nuc_test_intra, y_nuc_test_intra = load_svmlight_files( + (dset_train_intra, dset_test_intra), + n_features=46731, + zero_based=False + ) + # * inter + dset_folder_inter = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC_intersent' + ) + dset_train_inter = os.path.join(dset_folder_inter, 'TRAINING.relations.sparse') + dset_test_inter = os.path.join(dset_folder_inter, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_nuc_train_inter, y_nuc_train_inter, X_nuc_test_inter, y_nuc_test_inter = load_svmlight_files( + (dset_train_inter, dset_test_inter), + n_features=46731, + zero_based=False + ) + # put together intra and inter + X_nuc_train = (X_nuc_train_intra, X_nuc_train_inter) + y_nuc_train = (y_nuc_train_intra, y_nuc_train_inter) + # TODO the same for {X,y}_nuc_test ? 
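+            # NOTE: as for the relation datasets above, split TEST sets
+            # are not needed for prediction here:
+            # RightBinaryNuclearityClassifier.predict() fetches per-edge
+            # feature vectors from the unsplit TEST set, so intra/inter
+            # TEST sets would only serve to score the two models
+            # separately (as done in train_nuc_classifier's __main__).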
+ else: + raise ValueError("what model_split?") bin_clf = LogisticRegressionCV(Cs=10, # defaults to 10 penalty='l1', solver='liblinear', n_jobs=3) - nuc_clf = RightBinaryNuclearityClassifier(bin_clf=bin_clf) + nuc_clf = RightBinaryNuclearityClassifier(bin_clf=bin_clf, + model_split=model_split) nuc_clf = nuc_clf.fit(X_nuc_train, y_nuc_train) # end WIP nuc_clf # rank clf @@ -248,7 +366,7 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', strategy=rnk_strategy, prioritize_same_unit=rnk_prioritize_same_unit, order=order) rnk_clf.fit(X_train, y_rnk_train) - return nuc_clf, rnk_clf + return nuc_clf, rnk_clf, rel_clf # FIXME: @@ -336,8 +454,8 @@ def main(): # ones with nuclearity # * tie the order with the encoding for n-ary nodes order = 'weak' if nary_enc_pred == 'tree' else 'strict' - nuc_clf, rnk_clf = setup_dtree_postprocessor(nary_enc=nary_enc_pred, - order=order) + nuc_clf, rnk_clf, rel_clf = setup_dtree_postprocessor( + nary_enc=nary_enc_pred, order=order) # the eval compares parses for the test section of the RST corpus reader_test = RstReader(CD_TEST) @@ -531,11 +649,11 @@ def main(): if author_pred == 'ours-chain': # Eisner, predicted syntax, chain dtree_pred = load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf) + rel_clf, nuc_clf, rnk_clf) c_preds.append( - ('ours-chain', load_attelo_ctrees( - EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf, - dtree_pred=dtree_pred)) + ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + rel_clf, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( ('ours-chain', dtree_pred) @@ -543,12 +661,12 @@ def main(): if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit - dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, nuc_clf, rnk_clf) + dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, + rel_clf, nuc_clf, rnk_clf) c_preds.append( - ('ours-tree', load_attelo_ctrees( - EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf, - dtree_pred=dtree_pred)) + ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, + rel_clf, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( ('ours-tree', dtree_pred) diff --git a/evals/train_nuc_classifier.py b/evals/train_nuc_classifier.py index 784e944..3126882 100644 --- a/evals/train_nuc_classifier.py +++ b/evals/train_nuc_classifier.py @@ -10,44 +10,21 @@ import argparse import codecs from collections import defaultdict +import copy import itertools import os +import sys from sklearn.datasets import load_svmlight_file, load_svmlight_files -from sklearn.model_selection import cross_val_score from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV +from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelEncoder -import matplotlib.pyplot as plt from educe.rst_dt.annotation import NUC_N, NUC_S -if False: - # import the nuclearity TRAIN and TEST sets - dset_folder = os.path.join( - os.path.expanduser('~'), - 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' - ) - dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') - dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') - - X_train, y_train, X_test, y_test = load_svmlight_files( - (dset_train, dset_test), - zero_based=False - ) - nuc_clf = LogisticRegressionCV(penalty='l1', solver='liblinear', - n_jobs=2) - # train nuclearity classifier, cross-validate performance on train - scores = cross_val_score(nuc_clf, X_train, y_train, cv=10) - print(scores) - 
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) - # fit a - nuc_clf = nuc_clf.fit(X_train, y_train) - print(nuc_clf.score(X_test, y_test)) - - # 2017-12-06 non-dummy nuc_clf -# DIRTY load the feature vector for all candidate edges in the TEST +# DIRTY load the feature vectors of all candidate edges in the TEST # set feat_vecs = dict() dset_folder = os.path.join( @@ -90,24 +67,62 @@ class RightBinaryNuclearityClassifier(object): ---------- bin_clf : sklearn classifier Binary classifier for right dependencies: NN vs NS. + model_split : str, one of {'none', 'sent', 'sent-para'} + Distinct models for subsets of instances. """ - def __init__(self, bin_clf=LogisticRegression(penalty='l1', solver='liblinear', n_jobs=2)): + def __init__(self, bin_clf=LogisticRegression(penalty='l1', solver='liblinear', n_jobs=2), model_split='none'): """Init""" - self.bin_clf = bin_clf + self.model_split = model_split + if model_split == 'none': + self.bin_clf = bin_clf + elif model_split == 'sent': + self.bin_clf_intra = copy.deepcopy(bin_clf) + self.bin_clf_inter = copy.deepcopy(bin_clf) + else: + raise ValueError("model_split?") def fit(self, X, y): - """Fit""" - self.bin_clf = self.bin_clf.fit(X, y) - if True: # verbose - scores = cross_val_score(self.bin_clf, X, y, cv=10) - print(scores) - print("Accuracy: %0.2f (+/- %0.2f)" % ( - scores.mean(), scores.std() * 2)) + """Fit. + + FIXME X is currently expected to be a (flat) list of candidate + edges instead of a list of RstDepTrees. + """ + if self.model_split == 'none': + self.bin_clf = self.bin_clf.fit(X, y) + if True: # verbose + scores = cross_val_score(self.bin_clf, X, y, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + elif self.model_split == 'sent': + assert len(X) == 2 # intra, inter + assert len(y) == 2 # intra, inter + # * intra + self.bin_clf_intra = self.bin_clf_intra.fit(X[0], y[0]) + if True: # verbose + scores = cross_val_score(self.bin_clf_intra, X[0], y[0], cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + # * inter + self.bin_clf_inter = self.bin_clf_inter.fit(X[1], y[1]) + if True: # verbose + scores = cross_val_score(self.bin_clf_inter, X[1], y[1], cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + return self def predict(self, X): """Predict nuclearity of edges in RstDepTrees X from the TEST set. + + Parameters + ---------- + X : list of RstDepTree + D-trees ; the feature vectors of all edges are already + available from the global context. 
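+
+        Returns
+        -------
+        y : list of lists
+            Predicted nuclearity values, one list per d-tree in X.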
""" y = [] for dtree in X: @@ -131,11 +146,30 @@ def predict(self, X): line_idx = pair_map[doc_name][head][i] # X_test[line_idx,:] is a matrix with 1 row Xi = X_test[line_idx,:] - try: - y_pred = self.bin_clf.predict(Xi) - except ValueError: - print(Xi) - raise + if self.model_split == 'none': + try: + y_pred = self.bin_clf.predict(Xi) + except ValueError: + print(Xi) + raise + elif self.model_split == 'sent': + # same_sentence_intra_{right,left}: 269, 303 + # our vocab is 1-based but sklearn converts it to + # 0-based ; + # check it's not a left dep + assert Xi[0, 302] == 0 + # + if Xi[0, 268] == 1: + sel_clf = self.bin_clf_intra + else: + sel_clf = self.bin_clf_inter + # + try: + y_pred = sel_clf.predict(Xi) + except ValueError: + print(Xi) + raise + # append prediction if y_pred == 1: yi.append(NUC_N) elif y_pred == 2: @@ -146,3 +180,83 @@ def predict(self, X): y.append(yi) return y + + +if __name__ == "__main__": + model_split = 'sent' # {'none', 'sent'} + # eval on intra- and inter-sent + # * intra + dset_folder_intra = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC_intrasent' + ) + dset_train_intra = os.path.join(dset_folder_intra, 'TRAINING.relations.sparse') + dset_test_intra = os.path.join(dset_folder_intra, 'TEST.relations.sparse') + X_train_intra, y_train_intra, X_test_intra, y_test_intra = load_svmlight_files( + (dset_train_intra, dset_test_intra), + n_features=46731, + zero_based=False + ) + # * inter + dset_folder_inter = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC_intersent' + ) + dset_train_inter = os.path.join(dset_folder_inter, 'TRAINING.relations.sparse') + dset_test_inter = os.path.join(dset_folder_inter, 'TEST.relations.sparse') + X_train_inter, y_train_inter, X_test_inter, y_test_inter = load_svmlight_files( + (dset_train_inter, dset_test_inter), + n_features=46731, + zero_based=False + ) + # + if model_split == 'none': + # import the nuclearity TRAIN and TEST sets + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + ) + dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') + + X_train, y_train, X_test, y_test = load_svmlight_files( + (dset_train, dset_test), + n_features=46731, + zero_based=False + ) + nuc_clf = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=3) + # train nuclearity classifier, cross-validate performance on train + scores = cross_val_score(nuc_clf, X_train, y_train, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) + # fit a + nuc_clf = nuc_clf.fit(X_train, y_train) + print(nuc_clf.score(X_test, y_test)) + print('separate eval on intra then inter') + print(nuc_clf.score(X_test_intra, y_test_intra)) + print(nuc_clf.score(X_test_inter, y_test_inter)) + elif model_split == 'sent': + # fit distinct classifiers for intra- and inter-sentential + # * intra: train nuclearity classifier, cross-validate performance on train + nuc_clf_intra = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=3) + scores_intra = cross_val_score(nuc_clf_intra, X_train_intra, y_train_intra, + cv=10) + print(scores_intra) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores_intra.mean(), scores_intra.std() * 2)) + # + nuc_clf_intra = nuc_clf_intra.fit(X_train_intra, y_train_intra) + print(nuc_clf_intra.score(X_test_intra, y_test_intra)) + # * inter: train nuclearity classifier, 
cross-validate performance on train + nuc_clf_inter = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=3) + scores_inter = cross_val_score(nuc_clf_inter, X_train_inter, y_train_inter, + cv=10) + print(scores_inter) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores_inter.mean(), scores_inter.std() * 2)) + # + nuc_clf_inter = nuc_clf_inter.fit(X_train_inter, y_train_inter) + print(nuc_clf_inter.score(X_test_inter, y_test_inter)) diff --git a/evals/train_rel_relabeller.py b/evals/train_rel_relabeller.py new file mode 100644 index 0000000..4ccf661 --- /dev/null +++ b/evals/train_rel_relabeller.py @@ -0,0 +1,201 @@ +"""This utility script trains a (re)labeller for RST edges. + +Given the path to a relation labelling dataset, it trains a classifier +and evaluates it. +""" + +from __future__ import absolute_import, print_function + +import argparse +import codecs +from collections import defaultdict +import copy +import os + +from sklearn.datasets import load_svmlight_file, load_svmlight_files +from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV +from sklearn.model_selection import cross_val_score + +from educe.rst_dt.deptree import _ROOT_HEAD, _ROOT_LABEL + + +# build mapping from int to label (reverse label encoding) +dset_rel_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL' +) +dset_rel_train = os.path.join(dset_rel_folder, 'TRAINING.relations.sparse') +dset_rel_test = os.path.join(dset_rel_folder, 'TEST.relations.sparse') + +with codecs.open(dset_rel_train, mode='rb', encoding='utf-8') as f_train: + header = f_train.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + # DEBUG? explicit cast from unicode to str + labels = [str(lbl) for lbl in header[len(header_prefix):].split()] + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + # unrelated = lbl2int["UNRELATED"] + # root = lbl2int["ROOT"] + +# 2017-12-14 relation (re)labeller +# DIRTY load the feature vector for all *candidate* edges in the TEST +# set (for predict()) +feat_vecs = dict() +dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse' +) +dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') +# we use the original svmlight files whose label is the relation +# class (which we actually don't need here) +# FIXME read n_features from .vocab +X_test, y_lbl_test = load_svmlight_file(dset_test, n_features=46731, + zero_based=False) +# build mapping from doc_name, src_idx, tgt_idx to line number +# in X_test +pairs = dset_test + '.pairings' +pair_map = defaultdict(lambda: defaultdict(dict)) +with codecs.open(pairs, mode='rb', encoding='utf-8') as f_pairs: + for i, line in enumerate(f_pairs): + src_id, tgt_id = line.strip().split('\t') + src_idx = (0 if src_id == 'ROOT' + else int(src_id.rsplit('_', 1)[1])) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + # print(line) + # print(doc_name, src_idx, tgt_idx) + pair_map[doc_name][src_idx][tgt_idx] = i +# end DIRTY + + +if False: + # load the relation TRAIN and TEST sets + X_rel_train, y_rel_train, X_rel_test, y_rel_test = load_svmlight_files( + (dset_rel_train, dset_rel_test), + zero_based=False + ) + rel_clf = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=3) + # train relation classifier, cross-validate performance on train + scores = cross_val_score(rel_clf, X_rel_train, y_rel_train, cv=10) + print(scores) + print("Accuracy: 
%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) + # fit a + rel_clf = rel_clf.fit(X_rel_train, y_rel_train) + print(rel_clf.score(X_rel_test, y_rel_test)) + + +class RelationRelabeller(object): + """Predict the coarse-grained RST relation of dependencies. + + Dependencies headed by the fake root node are labelled "ROOT" by + convention. + + Parameters + ---------- + mul_clf : sklearn classifier + Multi-class classifier for RST (coarse-grained) relations. + """ + + def __init__(self, mul_clf=LogisticRegression(penalty='l1', solver='liblinear', n_jobs=3), model_split='none'): + """Init""" + self.model_split = model_split + if model_split == 'none': + self.mul_clf = mul_clf + elif model_split == 'sent': + self.mul_clf_intra = copy.deepcopy(mul_clf) + self.mul_clf_inter = copy.deepcopy(mul_clf) + else: + raise ValueError("model_split?") + + def fit(self, X, y): + """Fit. + + FIXME X is currently expected to be a (flat) list of candidate + edges instead of a list of RstDepTrees. + """ + if self.model_split == 'none': + self.mul_clf = self.mul_clf.fit(X, y) + if True: # verbose + scores = cross_val_score(self.mul_clf, X, y, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + elif self.model_split == 'sent': + assert len(X) == 2 # intra, inter + assert len(y) == 2 # intra, inter + # * intra + self.mul_clf_intra = self.mul_clf_intra.fit(X[0], y[0]) + if True: # verbose + scores = cross_val_score(self.mul_clf_intra, X[0], y[0], cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + # * inter + self.mul_clf_inter = self.mul_clf_inter.fit(X[1], y[1]) + if True: # verbose + scores = cross_val_score(self.mul_clf_inter, X[1], y[1], cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + + return self + + def predict(self, X): + """Predict relation of edges in RstDepTrees X from the TEST set. + """ + y = [] + for dtree in X: + doc_name = dtree.origin.doc + yi = [] + for i, (head, rel) in enumerate(zip(dtree.heads, dtree.labels)): + if i == 0: + # fake root !? 
maybe we shouldn't write anything + # here ; + # FIXME check how to be consistent throughout educe and + # eval code + # yi.append(_ROOT_LABEL) + yi.append(None) + elif head == 0: + # TODO check the expected value (consistency) + yi.append(_ROOT_LABEL) + else: + # regular edge + line_idx = pair_map[doc_name][head][i] + # X_test[line_idx,:] is a matrix with 1 row + Xi = X_test[line_idx,:] + if self.model_split == 'none': + try: + y_pred = self.mul_clf.predict(Xi) + except ValueError: + print(Xi) + raise + elif self.model_split == 'sent': + # same_sentence_intra_{right,left}: 269, 303 + # our vocab is 1-based but sklearn converts it to + # 0-based ; + # same_para_* : 103, 158, 234, 314 + if ((Xi[0, 268] == 1 or Xi[0, 302] == 1) and + (Xi[0, 102] == 1 or Xi[0, 157] == 1 or + Xi[0, 233] == 1 or Xi[0, 313] == 1)): + sel_clf = self.mul_clf_intra + else: + sel_clf = self.mul_clf_inter + # + try: + y_pred = sel_clf.predict(Xi) + except ValueError: + print(Xi) + raise + # append prediction + try: + yi.append(int2lbl[int(y_pred[0])]) + if False and rel != int2lbl[int(y_pred[0])]: + print(doc_name, head, i, + rel, int2lbl[int(y_pred[0])]) # DEBUG + except KeyError: + raise ValueError("Weird prediction: {}".format( + y_pred)) + y.append(yi) + return y From 4fb6eaa3935ab6145d30ff8c617624540c925c12 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 19 Dec 2017 22:50:35 +0100 Subject: [PATCH 73/74] FIX pass doc_edus to loaders --- evals/hayashi_deps.py | 36 ++++++++++++++++----------------- evals/ji.py | 31 +++++++++++++---------------- evals/ours.py | 27 +++++++++++++++++++++---- evals/showdown.py | 46 ++++++++++++++++--------------------------- 4 files changed, 72 insertions(+), 68 deletions(-) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index c5fd6b3..b6f40d2 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -9,20 +9,10 @@ from glob import glob from educe.learning.edu_input_format import load_edu_input_file -from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree, RstDtException from educe.rst_dt.dep2con import deptree_to_rst_tree -# load true ctrees, from the TEST section of the RST-DT, to get gold EDUs -RST_DT_DIR = '/home/mmorey/corpora/rst-dt/rst_discourse_treebank/data' -RST_TEST_DIR = os.path.join(RST_DT_DIR, 'RSTtrees-WSJ-main-1.0/TEST') -if not os.path.exists(RST_TEST_DIR): - raise ValueError('Unable to find RST test files at ', RST_TEST_DIR) -RST_TEST_READER = Reader(RST_TEST_DIR) -RST_TEST_CTREES_TRUE = {k.doc: v for k, v in RST_TEST_READER.slurp().items()} - - def _load_hayashi_dep_file(f, edus): """Do load. @@ -67,24 +57,27 @@ def load_hayashi_dep_file(fname, edus): return _load_hayashi_dep_file(f, edus) -def load_hayashi_dep_files(out_dir): +def load_hayashi_dep_files(out_dir, doc_edus): """Load dep files output by one of Hayashi et al.'s parser. Parameters ---------- out_dir: str Path to the folder containing the .dis files. + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). 
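+
+    Returns
+    -------
+    dtrees : dict(str, RstDepTree)
+        Predicted d-tree for each document, indexed by doc_name.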
""" dtrees = dict() for fname in glob(os.path.join(out_dir, '*.dis')): doc_name = os.path.splitext(os.path.basename(fname))[0] - edus = RST_TEST_CTREES_TRUE[doc_name].leaves() + edus = doc_edus[doc_name] dtrees[doc_name] = load_hayashi_dep_file(fname, edus) return dtrees -def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, - rnk_clf): +def load_hayashi_dep_dtrees(out_dir, rel_conv, doc_edus, edus_file_pat, + nuc_clf, rnk_clf): """Load the dtrees output by one of Hayashi et al.'s dep parsers. Parameters @@ -94,6 +87,9 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). edus_file_pat : str Pattern for the .edu_input files. nuc_clf : NuclearityClassifier @@ -108,7 +104,7 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, """ dtree_pred = dict() - dtrees = load_hayashi_dep_files(out_dir) + dtrees = load_hayashi_dep_files(out_dir, doc_edus) for doc_name, dt_pred in dtrees.items(): if rel_conv is not None: dt_pred = rel_conv(dt_pred) @@ -130,8 +126,8 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, return dtree_pred -def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, - rnk_clf, dtree_pred=None): +def load_hayashi_dep_ctrees(out_dir, rel_conv, doc_edus, edus_file_pat, + nuc_clf, rnk_clf, dtree_pred=None): """Load the ctrees for the dtrees output by one of Hayashi et al.'s dep parsers. @@ -142,6 +138,9 @@ def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). edus_file_pat : str Pattern for the .edu_input files. nuc_clf : NuclearityClassifier @@ -159,7 +158,8 @@ def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, """ ctree_pred = dict() if dtree_pred is None: - dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, + dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, doc_edus, + edus_file_pat, nuc_clf, rnk_clf) for doc_name, dt_pred in dtree_pred.items(): try: diff --git a/evals/ji.py b/evals/ji.py index c6ab6e8..1b1808c 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -11,35 +11,28 @@ from educe.annotation import Span from educe.corpus import FileId from educe.rst_dt.annotation import Node, RSTTree -from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree -from educe.rst_dt.rst_wsj_corpus import TEST_FOLDER -# original RST corpus -RST_CORPUS = os.path.join('/home/mmorey/corpora/rst-dt/rst_discourse_treebank/data') -RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) - -def load_ji_ctrees(ji_out_dir, rel_conv): +def load_ji_ctrees(ji_out_dir, rel_conv, doc_edus): """Load the ctrees output by DPLP as .brackets files. Parameters ---------- - ji_out_dir: str + ji_out_dir : str Path to the base directory containing the output files. + rel_conv : RstRelationConverter? + Relation converter. + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). Returns ------- ctree_pred: dict(str, RSTTree) RST ctree for each document. 
""" - # * load the text of the EDUs - # FIXME get the text of EDUs from the .merge files - corpus_dir = RST_MAIN_TEST - reader_true = Reader(corpus_dir) - ctree_true = reader_true.slurp() - doc_edus = {k.doc: ct_true.leaves() for k, ct_true - in ctree_true.items()} + # FIXME? get the text of EDUs from the .merge files? # * for each doc, load the predicted spans from the .brackets ctree_pred = dict() files_pred = os.path.join(ji_out_dir, '*.brackets') @@ -149,7 +142,8 @@ def load_ji_ctrees(ji_out_dir, rel_conv): return ctree_pred -def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain', ctree_pred=None): +def load_ji_dtrees(ji_out_dir, rel_conv, doc_edus, nary_enc='chain', + ctree_pred=None): """Get the dtrees that correspond to the ctrees output by DPLP. Parameters @@ -160,6 +154,9 @@ def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain', ctree_pred=None): Relation converter, from fine- to coarse-grained labels. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). ctree_pred : dict(str, RSTTree), optional RST c-trees, indexed by doc_name. If c-trees are provided this way, `out_dir` is ignored. @@ -171,7 +168,7 @@ def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain', ctree_pred=None): """ dtree_pred = dict() if ctree_pred is None: - ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv) + ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv, doc_edus) for doc_name, ct_pred in ctree_pred.items(): dtree_pred[doc_name] = RstDepTree.from_rst_tree( ct_pred, nary_enc=nary_enc) diff --git a/evals/ours.py b/evals/ours.py index 6d651b4..5a8f210 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -57,7 +57,8 @@ def load_attelo_output_file(output_file): return edges_pred -def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): +def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, + doc_edus=None): """Load RST dtrees from attelo output files. Parameters @@ -66,6 +67,13 @@ def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): Path to the file that contains attelo's output edus_file: string Path to the file that describes EDUs. + doc_edus : dict(str, list(EDU)), optional + Mapping from doc_name to the list of its EDUs (read from the + corpus). If None, each EDU is re-created using information in + the `.edu_input` file, otherwise EDUs are created but their text + is taken from `doc_edus`. + FIXME avoid creating "new" EDUs altogether if `doc_edus` is not + None. Returns ------- @@ -85,7 +93,10 @@ def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): # EDU info edu_num = int(att_edu.id.rsplit('_', 1)[1]) edu_span = EduceSpan(att_edu.start, att_edu.end) - edu_text = att_edu.text + if doc_edus is not None: + edu_text = doc_edus[doc_name][edu_num - 1].raw_text + else: + edu_text = att_edu.text educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text)) # map global id of EDU to num of EDU inside doc gid2num[att_edu.id] = edu_num @@ -134,7 +145,7 @@ def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): def load_attelo_ctrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, - dtree_pred=None): + doc_edus=None, dtree_pred=None): """Load RST ctrees from attelo output files. 
Parameters @@ -147,6 +158,13 @@ def load_attelo_ctrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, Classifier to predict nuclearity rnk_clf: RankClassifier Classifier to predict attachment ranking + doc_edus : dict(str, list(EDU)), optional + Mapping from doc_name to the list of its EDUs (read from the + corpus). If None, each EDU is re-created using information in + the `.edu_input` file, otherwise EDUs are created but their text + is taken from `doc_edus`. + FIXME avoid creating "new" EDUs altogether if `doc_edus` is not + None. dtree_pred : dict(str, RstDepTree), optional RST d-trees, indexed by doc_name. If d-trees are provided this way, `out_dir` is ignored. @@ -158,7 +176,8 @@ def load_attelo_ctrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, if dtree_pred is None: # load RST dtrees, with heuristics for nuc and rank dtree_pred = load_attelo_dtrees(output_file, edus_file, - rel_clf, nuc_clf, rnk_clf) + rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus) # convert to RST ctrees ctree_pred = dict() for doc_name, dt_pred in dtree_pred.items(): diff --git a/evals/showdown.py b/evals/showdown.py index af6f117..f6a7587 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -7,7 +7,6 @@ import argparse import codecs -from collections import defaultdict import itertools import os @@ -206,25 +205,6 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', # flavours of dtree dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) dtree_true[doc_name] = dt_true - # 2017-12-18 WIP print spiders in d-trees, see if some could be - # solved with para_idx - rnk_deps = defaultdict(list) # gov -> list of (rnk, dep) - for i, (gov, rnk, nuc, lbl) in enumerate( - zip(dt_true.heads[1:], dt_true.ranks[1:], dt_true.nucs[1:], - dt_true.labels[1:]), - start=1): - rnk_deps[gov].append((rnk, i)) - ordered_deps = {k: sorted(v) for k, v in rnk_deps.items()} - for gov, ord_deps in sorted(ordered_deps.items()): - if ((any(x[1] < gov for x in ord_deps) and - any(x[1] > gov for x in ord_deps))): - if doc_name.startswith('wsj_06'): - print(doc_name, gov, ord_deps) - elif doc_name.startswith('file'): - pass - else: - raise ValueError("spider!") - # end 2017-12-18 WIP spiders # fit classifiers for nuclearity and rank (DIRTY) # NB: both are (dummily) fit on weakly ordered dtrees X_train = [] @@ -460,6 +440,8 @@ def main(): # the eval compares parses for the test section of the RST corpus reader_test = RstReader(CD_TEST) corpus_test = reader_test.slurp() + doc_edus_test = {k.doc: ct_true.leaves() for k, ct_true + in corpus_test.items()} # reference: author_true can be any of the authors_pred (defaults to gold) ctree_true = dict() # ctrees @@ -533,12 +515,12 @@ def main(): if author_pred == 'HHN16_MST': dtree_pred = load_hayashi_dep_dtrees( - HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf) + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, doc_edus_test, + EDUS_FILE_PAT, nuc_clf, rnk_clf) c_preds.append( ('HHN16_MST', load_hayashi_dep_ctrees( - HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf, dtree_pred=dtree_pred)) + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, doc_edus_test, + EDUS_FILE_PAT, nuc_clf, rnk_clf, dtree_pred=dtree_pred)) ) d_preds.append( ('HHN16_MST', dtree_pred) @@ -607,12 +589,12 @@ def main(): if author_pred == 'JE14': # DPLP outputs RST ctrees in the form of lists of spans; # load_ji_dtrees maps them to RST dtrees - ctree_pred = load_ji_ctrees(JI_OUT_DIR, REL_CONV) + ctree_pred = load_ji_ctrees(JI_OUT_DIR, REL_CONV, doc_edus_test) c_preds.append( 
('JE14', ctree_pred) ) d_preds.append( - ('JE14', load_ji_dtrees(JI_OUT_DIR, REL_CONV, + ('JE14', load_ji_dtrees(JI_OUT_DIR, REL_CONV, doc_edus_test, nary_enc='chain', ctree_pred=ctree_pred)) ) @@ -649,10 +631,12 @@ def main(): if author_pred == 'ours-chain': # Eisner, predicted syntax, chain dtree_pred = load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf) + rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus_test) c_preds.append( ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus_test, dtree_pred=dtree_pred)) ) d_preds.append( @@ -662,10 +646,12 @@ def main(): if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf) + rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus_test) c_preds.append( ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus_test, dtree_pred=dtree_pred)) ) d_preds.append( @@ -674,10 +660,12 @@ def main(): if author_pred == 'ours-tree-su': # Eisner, predicted syntax, tree + same-unit dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, nuc_clf, rnk_clf) + EDUS_FILE, nuc_clf, rnk_clf, + doc_edus=doc_edus_test) c_preds.append( ('ours-tree-su', load_attelo_ctrees( EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, nuc_clf, rnk_clf, + doc_edus=doc_edus_test, dtree_pred=dtree_pred)) ) d_preds.append( From 9f068655250fb0ec47740a9855a50662690588c8 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 20 Dec 2017 10:48:09 +0100 Subject: [PATCH 74/74] FIX nary_enc_pred tied to each set of predictions, no longer CLI arg --- evals/showdown.py | 88 +++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index f6a7587..1fe101a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -361,9 +361,6 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=AUTHORS, help="Author(s) of the predictions") - parser.add_argument('--nary_enc_pred', default='tree', - choices=['tree', 'chain'], - help="Encoding of n-ary nodes for the predictions") # reference parser.add_argument('--author_true', default='gold', choices=AUTHORS + ['each'], # NEW generate sim matrix @@ -398,7 +395,6 @@ def main(): args = parser.parse_args() author_true = args.author_true authors_pred = args.authors_pred - nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree # display @@ -419,7 +415,8 @@ def main(): # heuristically determined values for _pred but also _true, and adds # three trivial spans eval_li_dep = args.eval_li_dep - + # nary_enc_true is used ; order_true currently is not (implicit in + # nary_enc_true) if binarize_true in ('right', 'right_mixed'): nary_enc_true = 'chain' order_true = 'strict' @@ -433,9 +430,14 @@ def main(): # 0. 
setup the postprocessors to flesh out unordered dtrees into ordered # ones with nuclearity # * tie the order with the encoding for n-ary nodes - order = 'weak' if nary_enc_pred == 'tree' else 'strict' - nuc_clf, rnk_clf, rel_clf = setup_dtree_postprocessor( - nary_enc=nary_enc_pred, order=order) + nuc_clf_chain, rnk_clf_chain, rel_clf_chain = setup_dtree_postprocessor( + nary_enc='chain', order='strict') + # FIXME explicit differenciation between (heuristic) classifiers for + # the "chain" vs "tree" transforms (2 parameters: nary_enc, order) ; + # nuc_clf, rnk_clf, rel_clf might contain implicit assumptions + # tied to the "chain" transform, might not be optimal for "tree" + nuc_clf_tree, rnk_clf_tree, rel_clf_tree = setup_dtree_postprocessor( + nary_enc='tree', order='weak') # the eval compares parses for the test section of the RST corpus reader_test = RstReader(CD_TEST) @@ -514,13 +516,15 @@ def main(): ) if author_pred == 'HHN16_MST': + # paper: {nary_enc_pred='chain', order='strict'} dtree_pred = load_hayashi_dep_dtrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, doc_edus_test, - EDUS_FILE_PAT, nuc_clf, rnk_clf) + EDUS_FILE_PAT, nuc_clf_chain, rnk_clf_chain) c_preds.append( ('HHN16_MST', load_hayashi_dep_ctrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, doc_edus_test, - EDUS_FILE_PAT, nuc_clf, rnk_clf, dtree_pred=dtree_pred)) + EDUS_FILE_PAT, nuc_clf_chain, rnk_clf_chain, + dtree_pred=dtree_pred)) ) d_preds.append( ('HHN16_MST', dtree_pred) @@ -538,16 +542,18 @@ def main(): ) if author_pred == 'li_sujian': - # FIXME load d-trees once, pass dtree_pred to the c-loader + # FIXME load d-trees once, pass dtree_pred to the c-loader ; + # paper says 'chain' transform, but it might be worth + # checking c_preds.append( ('li_sujian', load_li_sujian_dep_ctrees( LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf)) + nuc_clf_chain, rnk_clf_chain)) ) d_preds.append( ('li_sujian', load_li_sujian_dep_dtrees( LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf)) + nuc_clf_chain, rnk_clf_chain)) ) if author_pred == 'FH14_gSVM': @@ -630,14 +636,16 @@ def main(): if author_pred == 'ours-chain': # Eisner, predicted syntax, chain - dtree_pred = load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf, - doc_edus=doc_edus_test) + dtree_pred = load_attelo_dtrees( + EISNER_OUT_SYN_PRED, EDUS_FILE, + rel_clf_chain, nuc_clf_chain, rnk_clf_chain, + doc_edus=doc_edus_test) c_preds.append( - ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf, - doc_edus=doc_edus_test, - dtree_pred=dtree_pred)) + ('ours-chain', load_attelo_ctrees( + EISNER_OUT_SYN_PRED, EDUS_FILE, + rel_clf_chain, nuc_clf_chain, rnk_clf_chain, + doc_edus=doc_edus_test, + dtree_pred=dtree_pred)) ) d_preds.append( ('ours-chain', dtree_pred) @@ -645,26 +653,30 @@ def main(): if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit - dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf, - doc_edus=doc_edus_test) + dtree_pred = load_attelo_dtrees( + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, + rel_clf_tree, nuc_clf_tree, rnk_clf_tree, + doc_edus=doc_edus_test) c_preds.append( - ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf, - doc_edus=doc_edus_test, - dtree_pred=dtree_pred)) + ('ours-tree', load_attelo_ctrees( + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, + rel_clf_tree, nuc_clf_tree, rnk_clf_tree, + doc_edus=doc_edus_test, + dtree_pred=dtree_pred)) ) 
d_preds.append( ('ours-tree', dtree_pred) ) if author_pred == 'ours-tree-su': # Eisner, predicted syntax, tree + same-unit - dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, nuc_clf, rnk_clf, - doc_edus=doc_edus_test) + dtree_pred = load_attelo_dtrees( + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + rel_clf_tree, nuc_clf_tree, rnk_clf_tree, + doc_edus=doc_edus_test) c_preds.append( ('ours-tree-su', load_attelo_ctrees( - EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, nuc_clf, rnk_clf, + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + rel_clf_tree, nuc_clf_tree, rnk_clf_tree, doc_edus=doc_edus_test, dtree_pred=dtree_pred)) ) @@ -682,15 +694,17 @@ def main(): if False: # FIXME repair (or forget) these print('Eisner, predicted syntax + same-unit') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf) + load_deptrees_from_attelo_output( + ctree_true, dtree_true, + EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + rel_clf_chain, nuc_clf_chain, rnk_clf_chain) print('======================') print('Eisner, gold syntax') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_clf, rnk_clf) + load_deptrees_from_attelo_output( + ctree_true, dtree_true, + EISNER_OUT_SYN_GOLD, EDUS_FILE, + rel_clf_chain, nuc_clf_chain, rnk_clf_chain) print('======================') # dependency eval