irit-melodi · moreymat · Jun 27, 2016 · Jul 1, 2016 · Jul 29, 2016 · Jul 29, 2016
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,8 @@
+name: irit-rst-dt
+dependencies:
+  - python=2.7
+  - graphviz=2.38.0
+  - nltk
+  - scikit-learn
+  - pip:  # pydot is pip-installed, in editable mode, from a git repository
+    - "--editable=git+https://github.com/nlhepler/pydot.git#egg=pydot"
diff --git a/evals/__init__.py b/evals/__init__.py
diff --git a/evals/attelo_predictions_to_disdep.py b/evals/attelo_predictions_to_disdep.py
@@ -0,0 +1,114 @@
+"""Small utility script to convert predictions from attelo to dis_dep files.
+"""
+
+from __future__ import absolute_import, print_function
+
+from collections import defaultdict
+from glob import glob
+import os
+
+from attelo.io import load_edus, load_predictions
+from attelo.metrics.util import barebones_rst_deptree
+from attelo.table import UNRELATED
+from educe.corpus import FileId
+from educe.learning.disdep_format import dump_disdep_files
+from educe.rst_dt.dep2con import (DummyNuclearityClassifier,
+                                  InsideOutAttachmentRanker)
+
+
+def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir,
+                                       nary_enc_pred='tree'):
+    """Generate disdep files from a file dump of attelo predictions.
+
+    Parameters
+    ----------
+    edus_file_glob: str
+        Regex for `edu_input` file paths.
+    edges_file: str
+        Path to the file that contains attelo predictions (edges as
+        triples).
+    out_dir: str
+        Path to the output folder.
+    nary_enc_pred: one of {'chain', 'tree'}
+        Encoding for n-ary cnodes in the predicted dtree ; here it
+        currently triggers the strictness of the order assumed by the
+        dtree postprocessor: nary_enc_pred='chain' implies order='strict',
+        nary_enc_pred='tree' implies order='weak'.
+    """
+    order = 'weak' if nary_enc_pred == 'tree' else 'strict'
+    # set up heuristic classifiers for nuclearity and rank
+    nuc_clf = DummyNuclearityClassifier(strategy='unamb_else_most_frequent')
+    nuc_clf.fit([], [])  # dummy fit
+    rnk_clf = InsideOutAttachmentRanker(strategy='sdist-edist-rl',
+                                        prioritize_same_unit=True,
+                                        order=order)
+    rnk_clf.fit([], [])  # dummy fit
+
+    # load EDUs
+    doc_edus = dict()
+    id2doc = dict()
+    for edu_input_file in glob(edus_file_glob):
+        doc_name = os.path.basename(edu_input_file).rsplit('.', 4)[0]  # FRAGILE
+        edus = load_edus(edu_input_file)
+        assert doc_name == edus[0].grouping
+        # map doc_name to list of EDUs ; populate reverse mapping from
+        # EDU id to doc_name, so that we can dispatch edges to their
+        # document
+        # we keep the list of EDUs sorted as in edu_input, hence we
+        # assume edu_input follows the linear order of EDUs
+        doc_edus[doc_name] = edus
+        for edu in edus:
+            id2doc[edu.id] = doc_name
+    # load edges and dispatch them to their doc
+    edges_pred = load_predictions(edges_file)
+    # for each doc, list edges
+    doc_edges = defaultdict(list)
+    for gov_id, dep_id, lbl in edges_pred:
+        if lbl != UNRELATED:
+            doc_name = id2doc[dep_id]
+            doc_edges[doc_name].append((gov_id, dep_id, lbl))
+
+    # for each doc, get a full-fledged RstDepTree, nuclearity and ranking
+    # are currently determined heuristically
+    doc_dtree = dict()
+    for doc_name, edus in doc_edus.items():
+        # comply with current API for barebones_rst_deptree:
+        # for each doc, create a dict with one item (doc_name, list of edges)
+        dep_edges = doc_edges[doc_name]
+        # create a barebones RST dep tree: head and label only
+        dtree, edu2sent = barebones_rst_deptree(dep_edges, edus, strict=False)
+        # set its origin
+        dtree.origin = FileId(doc_name, None, None, None)
+        # flesh out with heuristically-determined nuclearity
+        dtree.nucs = nuc_clf.predict([dtree])[0]
+        # and heuristically-determined rank (needs edu2sent to prioritize
+        # intra-sentential attachments over inter-sentential ones)
+        dtree.sent_idx = edu2sent  # DIRTY
+        dtree.ranks = rnk_clf.predict([dtree])[0]
+        doc_dtree[doc_name] = dtree
+
+    # write the disdep files
+    dump_disdep_files(doc_dtree.values(), out_dir)
+
+
+if __name__ == '__main__':
+    nary_enc_pred = 'tree'
+    edus_file_glob = os.path.join('TMP', 'latest', 'data', 'TEST',
+                                  '*.edu-pairs.sparse.edu_input')
+    edges_file_glob = os.path.join(
+        'TMP', 'latest', 'scratch-current',
+        'combined',
+        # 'output.*'
+        'output.maxent-iheads-global-AD.L-jnt-eisner'
+    )
+    # attelo predictions are currently stored in one big file
+    edges_files = glob(edges_file_glob)
+    assert len(edges_files) == 1
+    edges_file = edges_files[0]
+    # paths to the resulting disdep files
+    out_dir = os.path.join('TMP_disdep', nary_enc_pred, 'ours', 'test')
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+    # do the conversion
+    attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir,
+                                       nary_enc_pred=nary_enc_pred)
diff --git a/evals/braud_coling.py b/evals/braud_coling.py
@@ -0,0 +1,160 @@
+"""Read the output of Braud et al.'s COLING parser.
+
+"""
+
+from __future__ import absolute_import, print_function
+
+import codecs
+from glob import glob
+import itertools
+import os
+
+from nltk import Tree
+
+from educe.annotation import Span
+from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree
+from educe.rst_dt.deptree import RstDepTree
+
+
+# *.mrg.pred file name -> original doc name (trailing numbers: undocumented)
+MRG_TO_RST = {
+    '12.mrg.pred': 'wsj_0644.out',  # 4
+    '4.mrg.pred': 'wsj_1129.out',  # 5
+    '26.mrg.pred': 'wsj_1197.out',  # 6
+    '24.mrg.pred': 'wsj_1113.out',  # 8
+    '14.mrg.pred': 'wsj_0684.out',  # 10
+    '32.mrg.pred': 'wsj_1354.out',  # 11
+    '18.mrg.pred': 'wsj_1183.out',  # 12
+    '29.mrg.pred': 'wsj_1346.out',  # 15
+    '28.mrg.pred': 'wsj_1169.out',  # 17
+    '37.mrg.pred': 'wsj_0667.out',  # 17
+    '19.mrg.pred': 'wsj_0607.out',  # 19
+    '7.mrg.pred': 'wsj_0654.out',  # 19
+    '16.mrg.pred': 'wsj_1325.out',  # 21
+    '25.mrg.pred': 'wsj_2375.out',  # 22
+    '31.mrg.pred': 'wsj_1380.out',  # 23
+    '1.mrg.pred': 'wsj_0623.out',  # 25
+    '15.mrg.pred': 'wsj_2373.out',  # 31
+    '30.mrg.pred': 'wsj_2336.out',  # 31
+    '3.mrg.pred': 'wsj_1365.out',  # 39
+    '34.mrg.pred': 'wsj_1148.out',  # 43
+    '11.mrg.pred': 'wsj_1306.out',  # 47
+    '10.mrg.pred': 'wsj_2354.out',  # 52
+    '35.mrg.pred': 'wsj_1126.out',  # 55
+    '0.mrg.pred': 'wsj_2385.out',  # 60
+    '2.mrg.pred': 'wsj_0632.out',  # 62
+    '20.mrg.pred': 'wsj_0602.out',  # 69
+    '27.mrg.pred': 'wsj_0627.out',  # 69
+    '13.mrg.pred': 'wsj_1189.out',  # 91
+    '6.mrg.pred': 'wsj_0616.out',  # 92
+    '36.mrg.pred': 'wsj_1307.out',  # 98
+    '33.mrg.pred': 'wsj_1142.out',  # 106
+    '9.mrg.pred': 'wsj_0655.out',  # 110
+    '21.mrg.pred': 'wsj_2386.out',  # 127
+    '23.mrg.pred': 'wsj_0689.out',  # 132
+    '8.mrg.pred': 'wsj_1387.out',  # 134
+    '17.mrg.pred': 'wsj_1331.out',  # 158
+    '22.mrg.pred': 'wsj_1376.out',  # 202
+    '5.mrg.pred': 'wsj_1146.out',  # 304
+}
+
+
+def tree_to_simple_rsttree(tree):
+    """Build a SimpleRSTTree from a NLTK Tree"""
+    origin = None  # or is it?
+    if not tree:
+        # no kid: EDU (+pre-terminal)
+        num = int(tree.label())
+        span = Span(num, num)  # FIXME
+        text = ''  # FIXME
+        edu = EDU(num, span, text, context=None, origin=origin)
+        # pre-terminal
+        edu_span = (num, num)
+        nuc = "leaf"
+        rel = "leaf"
+        node = Node(nuc, edu_span, span, rel, context=None)
+        return SimpleRSTTree(node, [edu], origin=origin)
+
+    # internal node
+    new_kids = [tree_to_simple_rsttree(kid) for kid in tree]
+    # node
+    nuc, rel = tree.label().split('-', 1)
+    # map to our coarse rel names
+    if rel == 'Textual-organization':
+        rel = 'Textual'
+    # end map
+    edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU)
+               else new_kids[0].label().edu_span[0])
+    edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU)
+               else new_kids[-1].label().edu_span[1])
+    edu_span = (edu_beg, edu_end)
+    char_beg = (new_kids[0].num if isinstance(new_kids[0], EDU)
+                  else new_kids[0].label().span.char_start)
+    char_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU)
+                else new_kids[-1].label().span.char_end)
+    span = Span(char_beg, char_end)
+    new_node = Node(nuc, edu_span, span, rel, context=None)
+    new_tree = SimpleRSTTree(new_node, new_kids, origin=origin)
+    return new_tree
+
+
+def _load_braud_coling_file(f):
+    """Do load file"""
+    tree = Tree.fromstring(f.read().strip())
+    simple_ctree = tree_to_simple_rsttree(tree)
+    return simple_ctree
+
+
+def load_braud_coling_file(fpath):
+    """Load a file."""
+    with codecs.open(fpath, 'rb', 'utf-8') as f:
+        return _load_braud_coling_file(f)
+
+
+def load_braud_coling_ctrees(out_dir, rel_conv):
+    """Load the ctrees output by Braud et al.'s parser
+
+    Parameters
+    ----------
+    out_dir : str
+        Path to the output directory.
+
+    rel_conv : TODO
+        Relation converter
+
+    Returns
+    -------
+    ctree_pred : dict(str, RSTTree)
+        RST c-tree for each document.
+    """
+    ctree_pred = dict()
+    for fpath in sorted(glob(os.path.join(out_dir, '*.mrg.pred'))):
+        fname = os.path.basename(fpath)
+        doc_name = MRG_TO_RST.get(fname, fname)
+        sct_pred = load_braud_coling_file(fpath)
+        # convert to regular RSTTree
+        ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred)
+        # convert relation labels
+        ct_pred = rel_conv(ct_pred)
+        # TODO check ct_true: assert that mrg.gold == .out.dis
+        ctree_pred[doc_name] = ct_pred
+    return ctree_pred
+
+
+def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain',
+                             ctree_pred=None):
+    """Do load dtrees.
+
+    Parameters
+    ----------
+    ctree_pred : dict(str, RSTTree), optional
+        RST c-trees, indexed by doc_name. If c-trees are provided this
+        way, `out_dir` is ignored.
+    """
+    dtree_pred = dict()
+    if ctree_pred is None:
+        ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv)
+    for doc_name, ct_pred in ctree_pred.items():
+        dt_pred = RstDepTree.from_rst_tree(ct_pred)
+        dtree_pred[doc_name] = dt_pred
+    return dtree_pred