From f55be6e7044b861ae129d276719d749fdb745d49 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 27 Jun 2016 17:21:16 +0200 Subject: [PATCH 01/74] WIP add comparative evaluations for different parsers --- evals/__init__.py | 0 evals/codra.py | 110 +++++++++++++++++++ evals/li2014.py | 117 ++++++++++++++++++++ evals/ours.py | 259 +++++++++++++++++++++++++++++++++++++++++++++ evals/showdown.py | 86 +++++++++++++++ evals/utils_wip.py | 248 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 820 insertions(+) create mode 100644 evals/__init__.py create mode 100644 evals/codra.py create mode 100644 evals/li2014.py create mode 100644 evals/ours.py create mode 100644 evals/showdown.py create mode 100644 evals/utils_wip.py diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/codra.py b/evals/codra.py new file mode 100644 index 0000000..800bdca --- /dev/null +++ b/evals/codra.py @@ -0,0 +1,110 @@ +"""Use the same evaluation procedure Evaluate the output of CODRA + +""" + +from __future__ import print_function + +import itertools +import os + +from educe.rst_dt.annotation import SimpleRSTTree, _binarize +from educe.rst_dt.codra import load_codra_output_files +from educe.rst_dt.corpus import (Reader as RstReader, + RstRelationConverter as RstRelationConverter) +from educe.rst_dt.deptree import RstDepTree + +from attelo.metrics.constituency import (parseval_detailed_report, + parseval_report) +from attelo.metrics.deptree import compute_uas_las + + +# RST corpus +CORPUS_DIR = os.path.abspath(os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '..', 'corpus', + 'RSTtrees-WSJ-main-1.0/')) +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree + + +def eval_codra_output(codra_out_dir): + """Load and evaluate the .dis files output by CODRA. + + This currently runs on the document-level files (.doc_dis). 
+ """ + # load reference trees + dtree_true = dict() # dependency trees + ctree_true = dict() # constituency trees + # FIXME: find ways to read the right (not necessarily TEST) section + # and only the required documents + rst_reader = RstReader(CD_TEST) + rst_corpus = rst_reader.slurp() + + for doc_id, rtree_true in sorted(rst_corpus.items()): + doc_name = doc_id.doc + + # transform into binary tree with coarse-grained labels + coarse_rtree_true = REL_CONV(rtree_true) + bin_rtree_true = _binarize(coarse_rtree_true) + ctree_true[doc_name] = bin_rtree_true + + # transform into dependency tree via SimpleRSTTree + bin_srtree_true = SimpleRSTTree.from_rst_tree(coarse_rtree_true) + dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) + dtree_true[doc_name] = dt_true + + # load predicted trees + data_pred = load_codra_output_files(codra_out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # gather predictions + dtree_pred = dict() # dependency trees + ctree_pred = dict() # constituency trees + + for doc_name, rst_ctree in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels + # 2016-06-27 useless, the files we have already contain the coarse + # labels + coarse_rtree_pred = REL_CONV(rst_ctree) + ctree_pred[doc_name] = coarse_rtree_pred + + # dependency tree + # conversion via SimpleRSTTree to RstDepTree + bin_srtree_pred = SimpleRSTTree.from_rst_tree(coarse_rtree_pred) + dt_pred = RstDepTree.from_simple_rst_tree(bin_srtree_pred) + dtree_pred[doc_name] = dt_pred + + # compare pred and true + common_doc_names = set(dtree_true.keys()) & set(dtree_pred.keys()) + + # dep scores + dtree_true_list = [dt for doc_name, dt in sorted(dtree_true.items()) + if doc_name in common_doc_names] + dtree_pred_list = [dt for doc_name, dt in sorted(dtree_pred.items()) + if doc_name in common_doc_names] + + score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, + dtree_pred_list) + print('UAS / LAS / LS : {:.4f} / {:.4f} / {:.4f}'.format( + score_uas, score_las, score_ls)) + + skipped_docs = set() + # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where + # needed + ctree_true = [ct for doc_name, ct in sorted(ctree_true.items()) + if doc_name not in skipped_docs] + ctree_pred = [ct for doc_name, ct in sorted(ctree_pred.items()) + if doc_name not in skipped_docs] + # compute and print PARSEVAL scores + print(parseval_report(ctree_true, ctree_pred, digits=4)) + # detailed report on S+N+R + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) diff --git a/evals/li2014.py b/evals/li2014.py new file mode 100644 index 0000000..d8c02a5 --- /dev/null +++ b/evals/li2014.py @@ -0,0 +1,117 @@ +"""Evaluation procedure used in the parser of (Li et al. 2014). + +This is a reimplementation of this evaluation procedure. +""" + +# FIXME legacy code brutally dumped here, broken +def twisted_eval_li2014(data_true, data_pred): + """Run Parseval on transformed gold trees, as in (Li et al., 2014). + + This applies a deterministic transform to the gold constituency tree + that basically re-orders attachments of a head EDU. + """ + # 1. ctrees_true -> dtrees_true or dtrees_twis (if the procedure + # is fishy) + # 2. dtrees_[true|twis] -> ctrees_twis + # RESUME HERE + # hint: ctrees_twis contain only NS nuclearity (...) 
+ + # TODO check exact conformance with the code of their parser: + # how rank and nuclearity are determined + data_true['rst_ctrees'] = [] + for dt_true in data_true['rst_dtrees']: + # FIXME map EDUs to sentences + dt_true.sent_idx = [edu_id2sent_idx[e.identifier()] + for e in dt_true.edus] + # TODO check that 'lllrrr' effectively corresponds to the strategy + # they apply + chn_bin_srtree_true = deptree_to_simple_rst_tree( + dt_true, MULTINUC_LBLS, strategy='lllrrr') + chn_bin_rtree_true = SimpleRSTTree.to_binary_rst_tree( + chn_bin_srtree_true) + bin_rtree_true = chn_bin_rtree_true + data_true['rst_ctrees'].append(bin_rtree_true) +# end FIXME + + +# FIXME currently broken, need to declare and fit classifiers for nuc and rank +# (nuc_classifier and rank_classifier) +# TODO move to ? +def eval_distortion_gold(corpus, nuc_strategy, rank_strategy, + prioritize_same_unit): + """Load an RstDepTree from the output of attelo. + + Parameters + ---------- + corpus: string + Path to the gold corpus to be evaluated + nuc_strategy: string + Strategy to predict nuclearity + rank_strategy: string + Strategy to predict attachment ranking + """ + # print parameters + print('corpus: {}\tnuc_strategy: {}\trank_strategy: {}'.format( + corpus, nuc_strategy, rank_strategy)) + + gold_orig = dict() + gold_twis = dict() + + # FIXME: find ways to read the right (not necessarily TEST) section + # and only the required documents + rst_reader = RstReader(corpus) + rst_corpus = rst_reader.slurp() + for doc_id, rtree_ref in sorted(rst_corpus.items()): + doc_name = doc_id.doc + + # original gold + # convert labels to coarse + coarse_rtree_ref = REL_CONV(rtree_ref) + # convert to binary tree + bin_rtree_ref = _binarize(coarse_rtree_ref) + gold_orig[doc_name] = bin_rtree_ref + + # distorted gold: forget nuclearity and order of attachment + # convert to RstDepTree via SimpleRSTTree + bin_srtree_ref = SimpleRSTTree.from_rst_tree(coarse_rtree_ref) + dt_ref = RstDepTree.from_simple_rst_tree(bin_srtree_ref) + # FIXME replace gold nuclearity and rank with predicted ones, + # using the given heuristics + # dt_ref.nucs = nuc_classifier.predict([dt_ref])[0] + # dt_ref.ranks = rank_classifier.predict([dt_ref])[0] + # end FIXME + # regenerate a binary RST tree + chn_bin_srtree_ref = deptree_to_simple_rst_tree(dt_ref) + chn_bin_rtree_ref = SimpleRSTTree.to_binary_rst_tree( + chn_bin_srtree_ref) + gold_twis[doc_name] = chn_bin_rtree_ref + + print(parseval_report(gold_orig, gold_twis, + metric_types=[x[0] for x in LBL_FNS], + digits=4)) + # detailed report on S+N+R + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) + + +def comparative_distortion_on_gold(): + """Evaluate the impact of forgetting nuclearity and rank in the gold. + + Quantify the distortion and loss when forgetting nuclearity and rank + in the gold and replacing them with deterministically-determined + values. + + Possible configurations are the cross-product of strategies to + heuristically determine rank and nuclearity. + """ + gold_corpus = CD_TRAIN # CD_TEST + nuc_strats = ["most_frequent_by_rel", + "unamb_else_most_frequent"] + rank_strats = ['lllrrr', + 'rrrlll', + 'lrlrlr', + 'rlrlrl'] + prioritize_same_units = [True, False] + for nuc_strat in nuc_strats: + for rank_strat in rank_strats: + eval_distortion_gold(gold_corpus, nuc_strat, rank_strat) diff --git a/evals/ours.py b/evals/ours.py new file mode 100644 index 0000000..750dd09 --- /dev/null +++ b/evals/ours.py @@ -0,0 +1,259 @@ +"""Evaluate our parsers. 
+ +""" + +from __future__ import print_function + +from collections import defaultdict +import os + +from educe.annotation import Span as EduceSpan +from educe.rst_dt.annotation import (EDU as EduceEDU, + SimpleRSTTree, _binarize) +from educe.rst_dt.corpus import (Reader as RstReader, + RstRelationConverter as RstRelationConverter) +from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, + DummyNuclearityClassifier, + InsideOutAttachmentRanker) +from educe.rst_dt.deptree import RstDepTree, RstDtException +# +from attelo.io import load_edus +from attelo.metrics.constituency import (parseval_detailed_report, + parseval_report) +from attelo.metrics.deptree import compute_uas_las +from attelo.table import UNRELATED # for load_attelo_output_file + + +# RST corpus +CORPUS_DIR = os.path.abspath(os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '..', 'corpus', + 'RSTtrees-WSJ-main-1.0/')) +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree + + +# move to attelo.datasets.attelo_out_format +def load_attelo_output_file(output_file): + """Load edges from an attelo output file. + + An attelo output file typically contains edges from several + documents. This function indexes edges by the name of their + document. + + Parameters + ---------- + output_file: string + Path to the attelo output file + + Returns + ------- + edges_pred: dict(string, [(string, string, string)]) + Predicted edges for each document, indexed by doc name + + Notes + ----- + See `attelo.io.load_predictions` that is almost equivalent to this + function. They are expected to converge some day into a better, + obvious in retrospect, function. + """ + edges_pred = defaultdict(list) + with open(output_file) as f: + for line in f: + src_id, tgt_id, lbl = line.strip().split('\t') + if lbl != UNRELATED: + # dirty hack: get doc name from EDU id + # e.g. (EDU id = wsj_0601_1) => (doc id = wsj_0601) + doc_name = tgt_id.rsplit('_', 1)[0] + edges_pred[doc_name].append((src_id, tgt_id, lbl)) + + return edges_pred + + +def load_deptrees_from_attelo_output(output_file, edus_file, + nuc_strategy, rank_strategy, + prioritize_same_unit=True, + skpd_docs=None): + """Load an RstDepTree from the output of attelo. 
+ + Parameters + ---------- + output_file: string + Path to the file that contains attelo's output + nuc_strategy: string + Strategy to predict nuclearity + rank_strategy: string + Strategy to predict attachment ranking + skpd_docs: set(string) + Names of documents that should be skipped to compute scores + + Returns + ------- + skipped_docs: set(string) + Names of documents that have been skipped to compute scores + """ + # load reference trees + dtree_true = dict() # dependency trees + ctree_true = dict() # constituency trees + # FIXME: find ways to read the right (not necessarily TEST) section + # and only the required documents + rst_reader = RstReader(CD_TEST) + rst_corpus = rst_reader.slurp() + for doc_id, rtree_true in sorted(rst_corpus.items()): + doc_name = doc_id.doc + + # transform into binary tree with coarse-grained labels + coarse_rtree_true = REL_CONV(rtree_true) + bin_rtree_true = _binarize(coarse_rtree_true) + ctree_true[doc_name] = bin_rtree_true + + # transform into dependency tree via SimpleRSTTree + bin_srtree_true = SimpleRSTTree.from_rst_tree(coarse_rtree_true) + dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) + dtree_true[doc_name] = dt_true + + # USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS + # load predicted trees + dtree_pred = dict() # predicted dtrees + ctree_pred = dict() # predicted ctrees + # load EDUs as they are known to attelo (sigh) + # and predicted edges on these EDUs + att_edus = load_edus(edus_file) + edges_pred = load_attelo_output_file(output_file) + # rebuild educe EDUs from their attelo description + # and group them by doc_name + educe_edus = defaultdict(list) + edu2sent_idx = defaultdict(dict) + gid2num = dict() + for att_edu in att_edus: + # doc name + doc_name = att_edu.grouping + # EDU info + edu_num = int(att_edu.id.rsplit('_', 1)[1]) + edu_span = EduceSpan(att_edu.start, att_edu.end) + edu_text = att_edu.text + educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text)) + # map global id of EDU to num of EDU inside doc + gid2num[att_edu.id] = edu_num + # map EDU to sentence + sent_idx = int(att_edu.subgrouping.split('_sent')[1]) + edu2sent_idx[doc_name][edu_num] = sent_idx + # sort EDUs by num + educe_edus = {doc_name: sorted(edus, key=lambda e: e.num) + for doc_name, edus in educe_edus.items()} + # rebuild educe-style edu2sent ; prepend 0 for the fake root + doc_name2edu2sent = {doc_name: ([0] + + [edu2sent_idx[doc_name][e.num] + for e in doc_educe_edus]) + for doc_name, doc_educe_edus in educe_edus.items()} + + # re-build predicted trees from predicted edges and educe EDUs + skipped_docs = set() # docs skipped because non-projective structures + + # classifiers for nuclearity and ranking + # FIXME declare, fit and predict upstream... 
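A sketch of the "upstream" fitting the FIXME above asks for: the same fitting as the stopgap code below, but run once on the TRAINING section rather than on the gold trees of the evaluated documents, using only names already imported in this module.

# sketch: fit the deterministic nuclearity / rank predictors on TRAINING
def fit_nuc_rank_on_train(nuc_strategy, rank_strategy, prioritize_same_unit):
    rst_corpus = RstReader(CD_TRAIN).slurp()
    X_train, y_nuc_train, y_rank_train = [], [], []
    for doc_id, rtree in sorted(rst_corpus.items()):
        # coarse labels, then dependency tree via SimpleRSTTree
        coarse_rtree = REL_CONV(rtree)
        dt = RstDepTree.from_simple_rst_tree(
            SimpleRSTTree.from_rst_tree(coarse_rtree))
        X_train.append(dt)
        y_nuc_train.append(dt.nucs)
        y_rank_train.append(dt.ranks)
    nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy)
    nuc_clf.fit(X_train, y_nuc_train)
    rank_clf = InsideOutAttachmentRanker(
        strategy=rank_strategy,
        prioritize_same_unit=prioritize_same_unit)
    rank_clf.fit(X_train, y_rank_train)
    return nuc_clf, rank_clf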
+ X_train = [] + y_nuc_train = [] + y_rank_train = [] + for doc_name, dt in sorted(dtree_true.items()): + X_train.append(dt) + y_nuc_train.append(dt.nucs) + y_rank_train.append(dt.ranks) + # nuclearity + nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy) + nuc_classifier.fit(X_train, y_nuc_train) + # ranking classifier + rank_classifier = InsideOutAttachmentRanker( + strategy=rank_strategy, + prioritize_same_unit=prioritize_same_unit) + rank_classifier.fit(X_train, y_rank_train) + + # rebuild RstDepTrees + for doc_name, es_pred in sorted(edges_pred.items()): + # get educe EDUs + doc_educe_edus = educe_edus[doc_name] + # create pred dtree + dt_pred = RstDepTree(doc_educe_edus) + for src_id, tgt_id, lbl in es_pred: + if src_id == 'ROOT': + if lbl == 'ROOT': + dt_pred.set_root(gid2num[tgt_id]) + else: + raise ValueError('Weird root label: {}'.format(lbl)) + else: + dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) + # NEW add nuclearity: heuristic baseline + if True: + dt_pred.nucs = nuc_classifier.predict([dt_pred])[0] + else: # EXPERIMENTAL use gold nuclearity + dt_pred.nucs = dtree_true[doc_name].nucs + # NEW add rank: some strategies require a mapping from EDU to sentence + # EXPERIMENTAL attach array of sentence index for each EDU in tree + edu2sent = doc_name2edu2sent[doc_name] + dt_pred.sent_idx = edu2sent + # end EXPERIMENTAL + if False: # DEBUG + print(doc_name) + dt_pred.ranks = rank_classifier.predict([dt_pred])[0] + # end NEW + dtree_pred[doc_name] = dt_pred + + # create pred ctree + try: + bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) + if False: # EXPERIMENTAL + # currently False to run on output that already has + # labels embedding nuclearity + bin_srtree_pred = SimpleRSTTree.incorporate_nuclearity_into_label( + bin_srtree_pred) + bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) + ctree_pred[doc_name] = bin_rtree_pred + except RstDtException as rst_e: + print(rst_e) + skipped_docs.add(doc_name) + if False: + print('\n'.join('{}: {}'.format(edu.text_span(), edu) + for edu in educe_edus[doc_name])) + # raise + # end USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS + + # compare gold with pred on doc_names + common_doc_names = set(dtree_true.keys()) & set(dtree_pred.keys()) + + # dep scores + dtree_true_list = [dt for doc_name, dt in sorted(dtree_true.items()) + if doc_name in common_doc_names] + dtree_pred_list = [dt for doc_name, dt in sorted(dtree_pred.items()) + if doc_name in common_doc_names] + + score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, + dtree_pred_list) + print('UAS / LAS / LS : {:.4f} / {:.4f} / {:.4f}'.format( + score_uas, score_las, score_ls)) + + # compute and print PARSEVAL scores + if skipped_docs: + print('Skipped {} docs over {}'.format(len(skipped_docs), + len(edges_pred))) + # also skip docs passed as argument + if skpd_docs is not None: + skipped_docs |= skpd_docs + # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where + # needed + ctree_true = [ct for doc_name, ct in sorted(ctree_true.items()) + if doc_name not in skipped_docs] + ctree_pred = [ct for doc_name, ct in sorted(ctree_pred.items()) + if doc_name not in skipped_docs] + + print(parseval_report(ctree_true, ctree_pred, + digits=4)) + # detailed report on S+N+R + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) + + return skipped_docs diff --git a/evals/showdown.py b/evals/showdown.py new file mode 100644 index 0000000..dfc81ee --- /dev/null +++ b/evals/showdown.py 
@@ -0,0 +1,86 @@ +"""This module evaluates the output of discourse parsers. + +Included are dependency and constituency tree metrics. +""" + +from __future__ import print_function + +import os + +# from educe.rst_dt.annotation import RSTTree, SimpleRSTTree, _binarize +from educe.rst_dt.corpus import RstRelationConverter # , Reader as RstReader + +# from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree) +# from educe.rst_dt.deptree import (RstDepTree, RstDtException) +# +# from attelo.metrics.constituency import (LBL_FNS, parseval_detailed_report, +# parseval_report) +# local to this package +from evals.codra import eval_codra_output +from evals.ours import load_deptrees_from_attelo_output + + +# RST corpus +CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.0/') +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree + + +# +# EVALUATIONS +# + +# * syntax: pred vs gold +EDUS_FILE = os.path.join('/home/mmorey/melodi', + 'irit-rst-dt/TMP/syn_gold_coarse', + 'TEST.relations.sparse.edu_input') +# outputs of parsers +EISNER_OUT_SYN_PRED = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/syn_pred_coarse', # lbl + 'scratch-current/combined', + 'output.maxent-iheads-global-AD.L-jnt-eisner') + +EISNER_OUT_SYN_GOLD = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/syn_gold_coarse', # lbl + 'scratch-current/combined', + 'output.maxent-iheads-global-AD.L-jnt-eisner') + +CODRA_OUT_DIR = '/home/mmorey/melodi/joty/Doc-level' + + + +# FIXME load gold trees here once and for all, pass them to each +# evaluation + +print('CODRA (Joty)') +eval_codra_output(CODRA_OUT_DIR) +print('=======================') + +print('Eisner, predicted syntax') +load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_strategy="unamb_else_most_frequent", + # nuc_strategy="most_frequent_by_rel", + rank_strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True) +print('======================') + +print('Eisner, gold syntax') +load_deptrees_from_attelo_output(EISNER_OUT_SYN_GOLD, EDUS_FILE, + nuc_strategy="unamb_else_most_frequent", + # nuc_strategy="most_frequent_by_rel", + rank_strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True) +print('======================') + + +# TODO use nuclearity classifier +# starting with baseline: DummyNuclearityClassifier, that assigns to each +# EDU the most frequent nuclearity of its (incoming) relation in the +# training corpus, i.e. 'S' for 'NS', 'N' for 'NN' diff --git a/evals/utils_wip.py b/evals/utils_wip.py new file mode 100644 index 0000000..bd1d1d0 --- /dev/null +++ b/evals/utils_wip.py @@ -0,0 +1,248 @@ +"""Various utility functions that are WIP. + +These functions are expected to move to educe or attelo when they +are mature. 
+""" + +from __future__ import print_function + +import os +import sys + +from educe.rst_dt.annotation import RSTTree +from educe.rst_dt.corpus import Reader as RstReader +from educe.rst_dt.dep2con import deptree_to_simple_rst_tree +from educe.rst_dt.deptree import RstDepTree, RstDtException +# +from evals.ours import load_attelo_output_file + + +# RST corpus +CORPUS_DIR = os.path.abspath(os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '..', 'corpus', + 'RSTtrees-WSJ-main-1.0/')) +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') + +# move to educe.rst_dt.datasets.rst_dis_format +STR_ROOT = '{nuc} (span {edu_span})' +STR_NODE = '{nuc} (span {edu_span}) (rel2par {rel})' +STR_LEAF = '{nuc} (leaf {edu_num}) (rel2par {rel}) (text _!{edu_txt}_!)' + + +def _str_node(tree): + """String for the top node of an RSTTree + + Parameters + ---------- + tree: educe.rst_dt.annotation.RSTTree + The tree whose top node we want to print + """ + node = tree.label() + # get fields + nuc = node.nuclearity + edu_span = node.edu_span + rel = node.rel + # leaf (in reality, we are at the pre-terminal) + if len(tree) == 1: + # get text from the real leaf (EDU) + txt = tree[0].text() + node_str = STR_LEAF.format(nuc=nuc, edu_num=edu_span[0], + rel=rel, edu_txt=txt) + # internal node + else: + edu_span_str = '{} {}'.format(str(edu_span[0]), str(edu_span[1])) + node_str = STR_NODE.format(nuc=nuc, edu_span=edu_span_str, + rel=rel) + + return node_str + + +def tree_str_gen(tree): + """Return a generator of strings, one per tree node""" + # init tree stack with the whole tree, nesting level 0 + tree_stack = [(tree, 0)] + + while tree_stack: + tree, lvl = tree_stack.pop() + yield '{lw}{node_str}'.format(lw=' ' * lvl, + node_str=_str_node(tree)) + tree_stack.extend(reversed([(subtree, lvl + 1) for subtree in tree + if isinstance(subtree, RSTTree)])) + # RESUME HERE: add opening (easy) and closing (trickier) parentheses + # TODO do not print relation (None) for ROOT + + +def _dump_rst_dis_file(out_file, ct_pred): + """Actually do dump. + + Parameters + ---------- + out_file: File + Output file + + ct_pred: RSTTree + Binary RST tree + """ + res_str = '\n'.join(tree_str_gen(ct_pred)) # or str(ct_pred) ? + out_file.write(res_str) + + +def dump_rst_dis_file(out_file, ctree): + """Dump a binary RST tree to a file. + + Parameters + ---------- + out_file: string + Path to the output file + + ctree: RSTTree + Binary RST tree + """ + with open(out_file, 'w') as f: + _dump_rst_dis_file(f, ctree) +# end educe.rst_dt.datasets.rst_dis_format + + +# move to educe.rst_dt.datasets.dep_dis_format ? +def dump_dep_dis_file(out_file, dtree): + """Dump a (RST) dependency tree to a file. + + Parameters + ---------- + out_file: string + Path to the output file + + dtree: RstDepTree + RST dependency tree + """ + with open(out_file, 'w') as f: + res = '\n'.join('{}\t{}'.format(hd, lbl) + for hd, lbl in zip(dtree.heads, dtree.labels)) + f.write(res) +# end attelo.datasets.dep_dis_format + + +# move to educe.rst_dt.attelo_out_format +# +# this function is only called by `convert_attelo_output_file_to_dis_files` +# +# FIXME: find ways to read the right (not necessarily TEST) section +# and only the required documents +def load_trees_from_attelo_output_file(att_output_file): + """Load predicted RST trees from attelo's output file. 
+ + Parameters + ---------- + att_output_file: string + Path to the file that contains attelo's output + + Returns + ------- + ctrees_pred: dict(string, SimpleRSTTree) + Predicted SimpleRSTTree for each document, indexed by its name + """ + # get predicted tree for each doc + # these currently come in the form of edges on attelo EDUs + edges_pred = load_attelo_output_file(att_output_file) + + # get educe EDUs + edus = dict() + # FIXME: parameterize this, cf. function-wide FIXME above + rst_reader = RstReader(CD_TEST) + rst_corpus = rst_reader.slurp() + for doc_id, rtree_true in sorted(rst_corpus.items()): + doc_name = doc_id.doc + edus[doc_name] = rtree_true.leaves() + + # re-build predicted trees from predicted edges and educe EDUs + dtree_pred = dict() # predicted dtrees + ctree_pred = dict() # predicted ctrees + skipped_docs = set() # docs skipped because non-projective structures + for doc_name, es_pred in sorted(edges_pred.items()): + # map from EDU id to EDU num + # EDU id should be common to educe and attelo + id2num = {edu.identifier(): edu.num for edu in edus[doc_name]} + # create pred dtree + dt_pred = RstDepTree(edus[doc_name]) + for src_id, tgt_id, lbl in es_pred: + if src_id == 'ROOT': + if lbl == 'ROOT': + dt_pred.set_root(id2num[tgt_id]) + else: + raise ValueError('Weird root label: {}'.format(lbl)) + else: + dt_pred.add_dependency(id2num[src_id], id2num[tgt_id], lbl) + dtree_pred[doc_name] = dt_pred + # create pred ctree + try: + ctree_pred[doc_name] = deptree_to_simple_rst_tree(dt_pred) + except RstDtException: + skipped_docs.add(doc_name) + if False: + print('\n'.join('{}: {}'.format(edu.text_span(), edu) + for edu in edus[doc_name])) + # raise + if skipped_docs: + print('Skipped {} docs over {}'.format(len(skipped_docs), + len(edges_pred))) + + return ctree_pred +# end educe.rst_dt.attelo_out_format + + +# move to educe.datasets.rst_dis_format +def convert_attelo_output_file_to_dis_files(output_dir, att_output_file): + """Convert attelo's output file to a set of dis files in output_dir. + + Parameters + ---------- + output_dir: string + Path of the directory for the dis files + output_file: string + Path to the file that contains attelo's output + + Returns + ------- + ctrees_pred: dict(string, SimpleRSTTree) + Predicted SimpleRSTTree for each document, indexed by its name + """ + if not os.path.exists(output_dir): + raise ValueError('Absent path: {}'.format(output_dir)) + + ctree_pred = load_trees_from_attelo_output_file(att_output_file) + # output each SimpleRSTTree to a dis file + for doc_name, ct_pred in ctree_pred.items(): + out_fname = os.path.join(output_dir, doc_name + '.dis') + dump_rst_dis_file(out_fname, ct_pred) + # DEBUG + sys.exit() +# end educe.datasets.rst_dis_format + + +# ?? +def load_gold(): + """Load gold structures from RST-WSJ/TEST. + + Returns + ------- + data: dictionary that should be akin to a sklearn Bunch, + with interesting keys 'filenames', 'doc_names', 'rst_ctrees', + 'rst_dtrees'. + """ + # TODO make this the only place where the gold is loaded + # shared between evals of both CODRA and attelo's outputs + filenames = [] # TODO + # load doc names and reference trees + rst_reader = RstReader(CD_TEST) + rst_corpus = rst_reader.slurp() + doc_names = [] + rst_ctrees = [] + for doc_id, rst_ctree in sorted(rst_corpus.items(), + key=lambda kv: kv[0].doc): + doc_names.append(doc_id.doc) + rst_ctrees.append(rst_ctree) + # RESUME HERE (or not) + raise NotImplementedError +# end ?? 
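The load_gold stub above ends on a NotImplementedError. A hedged sketch of a completion, consistent with how the other eval modules build their gold structures (coarse labels first, then dependency trees via SimpleRSTTree), could look like this; the fine-to-coarse converter is passed in because this module does not define REL_CONV itself.

from educe.rst_dt.annotation import SimpleRSTTree

def load_gold_sketch(rel_conv):
    """Sketch: gold test structures as the Bunch-like dict promised above.

    rel_conv is a fine-to-coarse label converter such as REL_CONV in
    evals/ours.py or evals/codra.py.
    """
    rst_corpus = RstReader(CD_TEST).slurp()
    doc_names, rst_ctrees, rst_dtrees = [], [], []
    for doc_id, rst_ctree in sorted(rst_corpus.items(),
                                    key=lambda kv: kv[0].doc):
        coarse_ctree = rel_conv(rst_ctree)
        doc_names.append(doc_id.doc)
        rst_ctrees.append(coarse_ctree)
        rst_dtrees.append(RstDepTree.from_simple_rst_tree(
            SimpleRSTTree.from_rst_tree(coarse_ctree)))
    return {'filenames': [],  # left as a TODO in the stub above
            'doc_names': doc_names,
            'rst_ctrees': rst_ctrees,
            'rst_dtrees': rst_dtrees}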
From 45709b001dd9ede90821e7775706f616c82e2d0e Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 1 Jul 2016 23:19:24 +0200 Subject: [PATCH 02/74] WIP special processing for same_unit --- irit_rst_dt/config/common.py | 23 +++++++++++++++++++++++ irit_rst_dt/harness.py | 3 ++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/irit_rst_dt/config/common.py b/irit_rst_dt/config/common.py index 54a6f2e..f224581 100644 --- a/irit_rst_dt/config/common.py +++ b/irit_rst_dt/config/common.py @@ -14,6 +14,8 @@ from attelo.learning.oracle import (AttachOracle, LabelOracle) from attelo.parser.full import (JointPipeline, PostlabelPipeline) +from attelo.parser.same_unit import (JointSameUnitPipeline, + SklearnSameUnitClassifier) def combined_key(*variants): @@ -115,6 +117,27 @@ def mk_joint(klearner, kdecoder): parser=Keyed(parser_key, parser)) +def mk_joint_su(klearner, kdecoder): + "return a joint decoding parser config with same-unit" + settings = _core_settings('AD.L-jnt_su', klearner) + parser_key = combined_key(settings, kdecoder) + key = combined_key(klearner, parser_key) + # su: use same kind of learner as "attach" + parser = JointSameUnitPipeline( + learner_attach=klearner.attach.payload, + learner_label=klearner.label.payload, + learner_su=( + SklearnSameUnitClassifier(klearner.attach.payload._learner) + if not isinstance(klearner.attach.payload, AttachOracle) + else klearner.attach.payload + ), + decoder=kdecoder.payload) + return EvaluationConfig(key=key, + settings=settings, + learner=klearner, + parser=Keyed(parser_key, parser)) + + def mk_post(klearner, kdecoder): "return a post label parser" settings = _core_settings('AD.L-pst', klearner) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index ef4fd60..eddf7d4 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -203,7 +203,8 @@ def _eval_model_path(subconf, mtype): else: return { 'attach': _eval_model_path(rconf, "attach"), - 'label': _eval_model_path(rconf, "relate") + 'label': _eval_model_path(rconf, "relate"), + 'su': _eval_model_path(rconf, "su"), } # ------------------------------------------------------ From 70385852fbfd893ce6697aaae00c86c7e6f245b3 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 29 Jul 2016 17:28:16 +0200 Subject: [PATCH 03/74] WIP comparative evals, first attempts at same-unit preproc --- evals/codra.py | 107 +++++++++++++++++++++++++++++++-- evals/ours.py | 45 +++++++++++++- evals/showdown.py | 31 ++++++++-- irit_rst_dt/cmd/gather.py | 111 +++++++++++++++++++++++++++-------- irit_rst_dt/config/common.py | 22 +++++++ irit_rst_dt/harness.py | 24 ++++---- irit_rst_dt/local.py | 42 ++++++++++--- 7 files changed, 327 insertions(+), 55 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index 800bdca..17dbacb 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -4,15 +4,23 @@ from __future__ import print_function +from collections import defaultdict import itertools import os +import numpy as np + from educe.rst_dt.annotation import SimpleRSTTree, _binarize from educe.rst_dt.codra import load_codra_output_files from educe.rst_dt.corpus import (Reader as RstReader, RstRelationConverter as RstRelationConverter) +from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, + DummyNuclearityClassifier, + InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree - +from educe.rst_dt.document_plus import align_edus_with_paragraphs +# +from attelo.io import load_edus from attelo.metrics.constituency import (parseval_detailed_report, parseval_report) from 
attelo.metrics.deptree import compute_uas_las @@ -32,7 +40,10 @@ REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree -def eval_codra_output(codra_out_dir): +def eval_codra_output(codra_out_dir, edus_file, + nuc_strategy, rank_strategy, + prioritize_same_unit=True, + detailed=False): """Load and evaluate the .dis files output by CODRA. This currently runs on the document-level files (.doc_dis). @@ -45,6 +56,22 @@ def eval_codra_output(codra_out_dir): rst_reader = RstReader(CD_TEST) rst_corpus = rst_reader.slurp() + # WIP 2016-06-29 sent_idx + att_edus = load_edus(edus_file) + edu2sent_idx = defaultdict(dict) + for att_edu in att_edus: + doc_name = att_edu.grouping + edu_num = int(att_edu.id.rsplit('_', 1)[1]) + sent_idx = int(att_edu.subgrouping.split('_sent')[1]) + edu2sent_idx[doc_name][edu_num] = sent_idx + # sort EDUs by num + # rebuild educe-style edu2sent ; prepend 0 for the fake root + doc_name2edu2sent = {doc_name: ([0] + + [s_idx for e_num, s_idx + in sorted(edu2sent.items())]) + for doc_name, edu2sent in edu2sent_idx.items()} + doc_name2edu2para = dict() + for doc_id, rtree_true in sorted(rst_corpus.items()): doc_name = doc_id.doc @@ -58,6 +85,32 @@ def eval_codra_output(codra_out_dir): dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) dtree_true[doc_name] = dt_true + # WIP 2016-06-29 para_idx + doc_edus = rtree_true.leaves() + doc_txt = doc_edus[0].context._text + # retrieve paragraph idx + doc_paras = doc_edus[0].context.paragraphs + if doc_paras is not None: + edu2para = align_edus_with_paragraphs( + doc_edus, doc_paras, doc_txt) + # yerk: interpolate values in edu2para where missing + edu2para_fix = [] + for edu_idx in edu2para: + if edu_idx is not None: + edu2para_fix.append(edu_idx) + else: + # interpolation strategy: copy the last regular value + # that has been seen + edu2para_fix.append(edu2para_fix[-1]) + edu2para = edu2para_fix + # end yerk: interpolate + edu2para = [0] + list(np.array(edu2para) + 1) + doc_name2edu2para[doc_name] = edu2para + else: + doc_name2edu2para[doc_name] = None + # end retrieve paragraph idx + + # load predicted trees data_pred = load_codra_output_files(codra_out_dir) # filenames = data_pred['filenames'] @@ -106,5 +159,51 @@ def eval_codra_output(codra_out_dir): # compute and print PARSEVAL scores print(parseval_report(ctree_true, ctree_pred, digits=4)) # detailed report on S+N+R - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) + if detailed: + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) + + if False: + # WIP 2016-06-29 use our deterministic classifiers for nuc and rank + # => estimate degradation on Joty's output => hint at ours + # FIXME declare, fit and predict upstream on the training corpus... 
+ # but currently fit is a no-op for both so this horror is in fact safe + X_train = [] + y_nuc_train = [] + y_rank_train = [] + for doc_name, dt in sorted(dtree_true.items()): + X_train.append(dt) + y_nuc_train.append(dt.nucs) + y_rank_train.append(dt.ranks) + # nuclearity + nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy) + nuc_classifier.fit(X_train, y_nuc_train) + # ranking classifier + rank_classifier = InsideOutAttachmentRanker( + strategy=rank_strategy, + prioritize_same_unit=prioritize_same_unit) + rank_classifier.fit(X_train, y_rank_train) + # rebuild ctrees + ctree_pred2 = dict() + for doc_name, dt_pred in sorted(dtree_pred.items()): + # set nuclearity + dt_pred.nucs = nuc_classifier.predict([dt_pred])[0] + # set ranking, needs sent_idx (WIP on para_idx) + edu2sent = doc_name2edu2sent[doc_name] + dt_pred.sent_idx = edu2sent + # 2016-06-28 same for edu2para + edu2para = doc_name2edu2para[doc_name] + dt_pred.para_idx = edu2para + dt_pred.ranks = rank_classifier.predict([dt_pred])[0] + # end NEW + bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) + bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) + ctree_pred2[doc_name] = bin_rtree_pred + # + skipped_docs = set() + ctree_pred2 = [ct for doc_name, ct in sorted(ctree_pred2.items()) + if doc_name not in skipped_docs] + print(parseval_report(ctree_true, ctree_pred2, digits=4)) + if detailed: + print(parseval_detailed_report(ctree_true, ctree_pred2, + metric_type='S+R')) diff --git a/evals/ours.py b/evals/ours.py index 750dd09..156c76a 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -7,6 +7,8 @@ from collections import defaultdict import os +import numpy as np + from educe.annotation import Span as EduceSpan from educe.rst_dt.annotation import (EDU as EduceEDU, SimpleRSTTree, _binarize) @@ -16,6 +18,7 @@ DummyNuclearityClassifier, InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree, RstDtException +from educe.rst_dt.document_plus import align_edus_with_paragraphs # from attelo.io import load_edus from attelo.metrics.constituency import (parseval_detailed_report, @@ -78,6 +81,7 @@ def load_attelo_output_file(output_file): def load_deptrees_from_attelo_output(output_file, edus_file, nuc_strategy, rank_strategy, prioritize_same_unit=True, + detailed=False, skpd_docs=None): """Load an RstDepTree from the output of attelo. 
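The paragraph-index block added to evals/codra.py above is duplicated in the evals/ours.py hunks below, and both copies carry a FIXME about refactoring it to a better place. A shared helper could be as small as this sketch; its only liberty is to fall back to paragraph 0 when the very first alignment value is missing, where the copy-the-last-value trick of the original block would fail.

import numpy as np
from educe.rst_dt.document_plus import align_edus_with_paragraphs

def edu2para_with_fake_root(doc_edus, doc_paras, doc_txt):
    """Return [0] + 1-based paragraph index per EDU, or None without paragraphs."""
    if doc_paras is None:
        return None
    edu2para = align_edus_with_paragraphs(doc_edus, doc_paras, doc_txt)
    edu2para_fix = []
    for para_idx in edu2para:
        if para_idx is not None:
            edu2para_fix.append(para_idx)
        else:
            # interpolation strategy: copy the last regular value seen
            edu2para_fix.append(edu2para_fix[-1] if edu2para_fix else 0)
    # shift by one and prepend 0 for the fake root EDU
    return [0] + list(np.array(edu2para_fix) + 1)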
@@ -97,6 +101,8 @@ def load_deptrees_from_attelo_output(output_file, edus_file, skipped_docs: set(string) Names of documents that have been skipped to compute scores """ + doc_name2edu2para = dict() + # load reference trees dtree_true = dict() # dependency trees ctree_true = dict() # constituency trees @@ -117,6 +123,36 @@ def load_deptrees_from_attelo_output(output_file, edus_file, dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) dtree_true[doc_name] = dt_true + # 2016-06-28 retrieve paragraph idx of each EDU + # FIXME refactor to get in a better way, in a better place + # currently, we take EDUs from the RSTTree and paragraphs from + # the RSTContext, so no left padding in either list ; + # the dtree contains the left padding EDU, so we compute the + # edu2paragraph alignment on real units only, shift by one, + # then prepend 0 + doc_edus = rtree_true.leaves() + doc_paras = doc_edus[0].context.paragraphs + doc_txt = doc_edus[0].context._text + if doc_paras is not None: + edu2para = align_edus_with_paragraphs( + doc_edus, doc_paras, doc_txt) + # yerk: interpolate values in edu2para where missing + edu2para_fix = [] + for edu_idx in edu2para: + if edu_idx is not None: + edu2para_fix.append(edu_idx) + else: + # interpolation strategy: copy the last regular value + # that has been seen + edu2para_fix.append(edu2para_fix[-1]) + edu2para = edu2para_fix + # end yerk: interpolate + edu2para = [0] + list(np.array(edu2para) + 1) + doc_name2edu2para[doc_name] = edu2para + else: + doc_name2edu2para[doc_name] = None + # end retrieve paragraph idx + # USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS # load predicted trees dtree_pred = dict() # predicted dtrees @@ -196,6 +232,10 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # EXPERIMENTAL attach array of sentence index for each EDU in tree edu2sent = doc_name2edu2sent[doc_name] dt_pred.sent_idx = edu2sent + # 2016-06-28 same for edu2para + edu2para = doc_name2edu2para[doc_name] + dt_pred.para_idx = edu2para + # assert len(edu2sent) == len(edu2para) # end EXPERIMENTAL if False: # DEBUG print(doc_name) @@ -253,7 +293,8 @@ def load_deptrees_from_attelo_output(output_file, edus_file, print(parseval_report(ctree_true, ctree_pred, digits=4)) # detailed report on S+N+R - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) + if detailed: + print(parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) return skipped_docs diff --git a/evals/showdown.py b/evals/showdown.py index dfc81ee..14c5a2f 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -46,6 +46,12 @@ 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') +EISNER_OUT_SYN_PRED_SU = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/latest', # lbl + 'scratch-current/combined', + 'output.maxent-AD.L-jnt_su-eisner') + EISNER_OUT_SYN_GOLD = os.path.join( '/home/mmorey/melodi', 'irit-rst-dt/TMP/syn_gold_coarse', # lbl @@ -56,11 +62,18 @@ -# FIXME load gold trees here once and for all, pass them to each -# evaluation +# FIXME: +# * [ ] load gold trees here once and for all, pass them to each evaluation +# * [ ] create summary table with one system per row, one metric per column, +# keep only the f-score (because for binary trees with manual segmentation +# precision = recall = f-score). 
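A hypothetical sketch of the summary table the second FIXME above asks for; it assumes the eval functions are refactored to return their f-scores as a dict of metric name to value (they currently only print reports) and uses the S/N/R metric types mentioned in the detailed reports.

def print_summary_table(scores, metrics=('S', 'N', 'R')):
    """Print one system per row, one metric (f-score only) per column.

    scores maps a system name to a dict of metric name -> f-score.
    """
    print('system'.ljust(28) + ''.join(m.rjust(8) for m in metrics))
    for system, by_metric in sorted(scores.items()):
        print(system.ljust(28)
              + ''.join('{:8.4f}'.format(by_metric[m]) for m in metrics))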
print('CODRA (Joty)') -eval_codra_output(CODRA_OUT_DIR) +eval_codra_output(CODRA_OUT_DIR, EDUS_FILE, + nuc_strategy="unamb_else_most_frequent", + rank_strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True, + detailed=True) print('=======================') print('Eisner, predicted syntax') @@ -68,7 +81,17 @@ nuc_strategy="unamb_else_most_frequent", # nuc_strategy="most_frequent_by_rel", rank_strategy='closest-intra-rl-inter-rl', - prioritize_same_unit=True) + prioritize_same_unit=True, + detailed=True) +print('======================') + +print('Eisner, predicted syntax + same-unit') +load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + nuc_strategy="unamb_else_most_frequent", + # nuc_strategy="most_frequent_by_rel", + rank_strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True, + detailed=True) print('======================') print('Eisner, gold syntax') diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index f7fcba6..e4db2cd 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -6,17 +6,12 @@ """ from __future__ import print_function -from os import path as fp import os from attelo.harness.util import call, force_symlink -from ..local import (TEST_CORPUS, - TRAINING_CORPUS, - PTB_DIR, - FEATURE_SET, - CORENLP_OUT_DIR, - LECSIE_DATA_DIR) +from ..local import (FEATURE_SET, LABEL_SET, TEST_CORPUS, TRAINING_CORPUS, + SAME_UNIT, PTB_DIR, CORENLP_OUT_DIR, LECSIE_DATA_DIR) from ..util import (current_tmp, latest_tmp) NAME = 'gather' @@ -40,16 +35,19 @@ def config_argparser(psr): psr.add_argument('--skip-training', action='store_true', help='only gather test data') - psr.add_argument('--coarse', - action='store_true', - help='use coarse-grained labels') psr.add_argument('--fix_pseudo_rels', - action='store_true', - help='fix pseudo-relation labels') + action='store_true', + help='fix pseudo-relation labels') + # WIP frag pairs + psr.add_argument('--resume-frag-pairs', + action='store_true', + help='resume extraction at frag-pairs') + # end WIP frag pairs psr.set_defaults(func=main) -def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, +def extract_features(corpus, output_dir, fix_pseudo_rels, instances, + frag_edus=None, vocab_path=None, label_path=None): """Extract instances from a corpus, store them in files. @@ -64,10 +62,10 @@ def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, Path to the corpus. output_dir: filepath Path to the output folder. - coarse: boolean, False by default - Use coarse-grained relation labels. fix_pseudo_rels: boolean, False by default Rewrite pseudo-relations to improve consistency (WIP). + instances: one of {'same-unit', 'all-pairs'} + Selection of instances to extract. 
vocab_path: filepath Path to a fixed vocabulary mapping, for feature extraction (needed if extracting test data: the same vocabulary should be @@ -83,6 +81,7 @@ def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, PTB_DIR, # TODO make this optional and exclusive from CoreNLP output_dir, '--feature_set', FEATURE_SET, + '--instances', instances, ] # NEW 2016-05-19 rewrite pseudo-relations if fix_pseudo_rels: @@ -91,7 +90,7 @@ def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, ]) # NEW 2016-05-03 use coarse- or fine-grained relation labels # NB "coarse" was the previous default - if coarse: + if LABEL_SET == 'coarse': cmd.extend([ '--coarse' ]) @@ -103,6 +102,8 @@ def extract_features(corpus, output_dir, coarse, fix_pseudo_rels, cmd.extend([ '--lecsie_data_dir', LECSIE_DATA_DIR, ]) + if frag_edus is not None: + cmd.extend(['--frag-edus', frag_edus]) if vocab_path is not None: cmd.extend(['--vocabulary', vocab_path]) if label_path is not None: @@ -117,22 +118,80 @@ def main(args): You shouldn't need to call this yourself if you're using `config_argparser` """ - if args.skip_training: + if args.skip_training or args.resume_frag_pairs: tdir = latest_tmp() else: tdir = current_tmp() - extract_features(TRAINING_CORPUS, tdir, args.coarse, - args.fix_pseudo_rels) + + fix_pseudo_rels = args.fix_pseudo_rels + + # same-unit + instances = 'same-unit' + su_prefix_train = '{}.{}'.format( + instances, os.path.basename(TRAINING_CORPUS)) + su_train_path = os.path.join(tdir, su_prefix_train) + su_label_path = su_train_path + '.relations.sparse' + su_vocab_path = su_label_path + '.vocab' if TEST_CORPUS is not None: - train_path = fp.join(tdir, fp.basename(TRAINING_CORPUS)) - label_path = train_path + '.relations.sparse' - vocab_path = label_path + '.vocab' - extract_features(TEST_CORPUS, tdir, args.coarse, - args.fix_pseudo_rels, + su_prefix_test = '{}.{}'.format( + instances, os.path.basename(TEST_CORPUS)) + su_test_path = os.path.join(tdir, su_prefix_test) + + if SAME_UNIT in ['joint', 'preproc'] and not args.resume_frag_pairs: + if not args.skip_training: + # * train + extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + instances) + if TEST_CORPUS is not None: + # * test + extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + instances, + vocab_path=su_vocab_path, + label_path=su_label_path) + + # all pairs + instances = 'all-pairs' + if not args.skip_training and not args.resume_frag_pairs: + extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + instances) + # path to the vocab and labelset gathered from the training set, + # we'll use these paths for the test set and for the frag-pairs + prefix_train = '{}.{}'.format( + instances, os.path.basename(TRAINING_CORPUS)) + train_path = os.path.join(tdir, prefix_train) + label_path = train_path + '.relations.sparse' + vocab_path = label_path + '.vocab' + if TEST_CORPUS is not None and not args.resume_frag_pairs: + extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + instances, vocab_path=vocab_path, label_path=label_path) + + # frag pairs: supplementary pairs from/to each fragmented EDU to + # the other fragmented EDUs and the EDUs that don't belong to any + # fragmented EDU + instances = 'frag-pairs' + # we use the vocabulary and labelset from "all-pairs" ; this is the + # simplest solution currently and it seems correct, but maybe we + # could extend "all-pairs" with these pairs when we learn the + # vocabulary? 
+ if not args.skip_training: + frag_edus_train = su_train_path + '.relations' + '.deps_true' + extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + instances, frag_edus=frag_edus_train, + vocab_path=vocab_path, + label_path=label_path) + if TEST_CORPUS is not None: + frag_edus_test = su_test_path + '.relations' + '.deps_true' + extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + instances, frag_edus=frag_edus_test, + vocab_path=vocab_path, + label_path=label_path) + # end frag pairs + with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream: call(["pip", "freeze"], stdout=stream) - if not args.skip_training: + + if not (args.skip_training or args.resume_frag_pairs): latest_dir = latest_tmp() - force_symlink(fp.basename(tdir), latest_dir) + force_symlink(os.path.basename(tdir), latest_dir) diff --git a/irit_rst_dt/config/common.py b/irit_rst_dt/config/common.py index f224581..2bd824f 100644 --- a/irit_rst_dt/config/common.py +++ b/irit_rst_dt/config/common.py @@ -15,6 +15,7 @@ from attelo.parser.full import (JointPipeline, PostlabelPipeline) from attelo.parser.same_unit import (JointSameUnitPipeline, + SameUnitJointPipeline, SklearnSameUnitClassifier) @@ -138,6 +139,27 @@ def mk_joint_su(klearner, kdecoder): parser=Keyed(parser_key, parser)) +def mk_su_joint(klearner, kdecoder): + "return a parser config with same-unit then joint decoding" + settings = _core_settings('su.AD.L-jnt', klearner) + parser_key = combined_key(settings, kdecoder) + key = combined_key(klearner, parser_key) + # su: use same kind of learner as "attach" + parser = SameUnitJointPipeline( + learner_su=( + SklearnSameUnitClassifier(klearner.attach.payload._learner) + if not isinstance(klearner.attach.payload, AttachOracle) + else klearner.attach.payload + ), + learner_attach=klearner.attach.payload, + learner_label=klearner.label.payload, + decoder=kdecoder.payload) + return EvaluationConfig(key=key, + settings=settings, + learner=klearner, + parser=Keyed(parser_key, parser)) + + def mk_post(klearner, kdecoder): "return a post label parser" settings = _core_settings('AD.L-pst', klearner) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index eddf7d4..9c81c1b 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -117,21 +117,18 @@ def mpack_paths(self, test_data, stripped=False): Returns ------- - path_to_edu_input : string - - path_to_pairings : string - - path_to_features : string - - path_to_vocab : string - - corpus_path : string - Path to corpus in order to access gold structures (WIP). + paths: tuple of file paths + Path to: edu_input, pairings, features, vocab, cdu_input, + cdu_pairings, cdu_features, corpus (to access gold + structures, WIP). 
""" - ext = 'relations.sparse' + ext = 'relations.edu-pairs.sparse' # path to data file in the evaluation dir dset = self.testset if test_data else self.dataset core_path = fp.join(self.eval_dir, "%s.%s" % (dset, ext)) + # 2016-07-28 pairs on fragmented EDUs + frag_ext = 'relations.frag-pairs.sparse' + frag_path = fp.join(self.eval_dir, "%s.%s" % (dset, frag_ext)) # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if test_data else TRAINING_CORPUS) @@ -140,6 +137,11 @@ def mpack_paths(self, test_data, stripped=False): core_path + '.pairings', (core_path + '.stripped') if stripped else core_path, core_path + '.vocab', + # fragmented EDUs + frag_path + '.cdu_input', + frag_path + '.pairings', + (frag_path + '.stripped') if stripped else frag_path, + # corpus corpus_path) def model_paths(self, rconf, fold, parser): diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index 4fad6a4..47b3022 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -49,6 +49,7 @@ decoder_last, decoder_local, mk_joint, + mk_joint_su, mk_post) # PATHS @@ -64,8 +65,8 @@ """Results over time we are making a point of saving""" # TRAINING_CORPUS = 'tiny' -# TRAINING_CORPUS = 'corpus/RSTtrees-WSJ-main-1.0/TRAINING' -TRAINING_CORPUS = 'corpus/RSTtrees-WSJ-double-1.0' +TRAINING_CORPUS = 'corpus/RSTtrees-WSJ-main-1.0/TRAINING' +# TRAINING_CORPUS = 'corpus/RSTtrees-WSJ-double-1.0' """Corpora for use in building/training models and running our incremental experiments. Later on we should consider using the held-out test data for something, but let's make a point of @@ -90,10 +91,11 @@ validation on the training data) """ -TEST_EVALUATION_KEY = None +# TEST_EVALUATION_KEY = None # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt-mst' # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt-eisner' -# TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt-eisner' +# TEST_EVALUATION_KEY = 'maxent-AD.L-jnt_su-eisner' +TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt_su-eisner' """Evaluation to use for testing. Leave this to None until you think it's OK to look at the test data. 
@@ -109,8 +111,9 @@ parsed/mrg/wsj) """ -CORENLP_OUT_DIR = None +# CORENLP_OUT_DIR = None # CORENLP_OUT_DIR = '/projets/melodi/corpus/rst-dt-corenlp-2015-01-29' +CORENLP_OUT_DIR = '/home/mmorey/corpora/rst-dt-corenlp-2015-01-29' """ Where to read parses from CoreNLP from """ @@ -126,6 +129,16 @@ Which feature set to use for feature extraction """ +LABEL_SET = 'coarse' # one of {'coarse', 'fine'} or a list of strings +""" +Which label set to use +""" + +SAME_UNIT = 'joint' # one of {'joint', 'preproc', 'no'} +""" +Whether to have a special processing for same-unit +""" + FIXED_FOLD_FILE = None # FIXED_FOLD_FILE = 'folds-TRAINING.json' """ @@ -253,6 +266,19 @@ def _core_parsers(klearner, unique_real_root=True): use_prob=True)), ] ] + # WIP with same-unit + if SAME_UNIT == 'joint': + joint.extend([ + mk_joint_su(klearner, d) for d in [ + # decoder_last(), + # DECODER_LOCAL, + # decoder_mst(), + Keyed('eisner', + EisnerDecoder(unique_real_root=unique_real_root, + use_prob=True)), + ] + ]) + # end WIP # postlabeling use_prob = klearner.attach.payload.can_predict_proba @@ -261,9 +287,9 @@ def _core_parsers(klearner, unique_real_root=True): # decoder_last() , # DECODER_LOCAL, # decoder_mst(), - Keyed('eisner', - EisnerDecoder(unique_real_root=unique_real_root, - use_prob=use_prob)), + # Keyed('eisner', + # EisnerDecoder(unique_real_root=unique_real_root, + # use_prob=use_prob)), ] ] From 5b74d1432939ab41aaf9ca0b805c67b570e639ff Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 29 Jul 2016 17:47:20 +0200 Subject: [PATCH 04/74] FIX update naming conventions for files --- irit_rst_dt/cmd/gather.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index e4db2cd..c808c44 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -64,7 +64,7 @@ def extract_features(corpus, output_dir, fix_pseudo_rels, instances, Path to the output folder. fix_pseudo_rels: boolean, False by default Rewrite pseudo-relations to improve consistency (WIP). - instances: one of {'same-unit', 'all-pairs'} + instances: one of {'same-unit', 'edu-pairs'} Selection of instances to extract. 
vocab_path: filepath Path to a fixed vocabulary mapping, for feature extraction @@ -128,13 +128,13 @@ def main(args): # same-unit instances = 'same-unit' su_prefix_train = '{}.{}'.format( - instances, os.path.basename(TRAINING_CORPUS)) + os.path.basename(TRAINING_CORPUS), instances) su_train_path = os.path.join(tdir, su_prefix_train) su_label_path = su_train_path + '.relations.sparse' su_vocab_path = su_label_path + '.vocab' if TEST_CORPUS is not None: su_prefix_test = '{}.{}'.format( - instances, os.path.basename(TEST_CORPUS)) + os.path.basename(TEST_CORPUS), instances) su_test_path = os.path.join(tdir, su_prefix_test) if SAME_UNIT in ['joint', 'preproc'] and not args.resume_frag_pairs: @@ -150,14 +150,14 @@ def main(args): label_path=su_label_path) # all pairs - instances = 'all-pairs' + instances = 'edu-pairs' if not args.skip_training and not args.resume_frag_pairs: extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, instances) # path to the vocab and labelset gathered from the training set, # we'll use these paths for the test set and for the frag-pairs prefix_train = '{}.{}'.format( - instances, os.path.basename(TRAINING_CORPUS)) + os.path.basename(TRAINING_CORPUS), instances) train_path = os.path.join(tdir, prefix_train) label_path = train_path + '.relations.sparse' vocab_path = label_path + '.vocab' @@ -171,9 +171,9 @@ def main(args): # the other fragmented EDUs and the EDUs that don't belong to any # fragmented EDU instances = 'frag-pairs' - # we use the vocabulary and labelset from "all-pairs" ; this is the + # we use the vocabulary and labelset from "edu-pairs" ; this is the # simplest solution currently and it seems correct, but maybe we - # could extend "all-pairs" with these pairs when we learn the + # could extend "edu-pairs" with these pairs when we learn the # vocabulary? if not args.skip_training: frag_edus_train = su_train_path + '.relations' + '.deps_true' From 20e8384b1e71cd804333b6aad882a1e38dab204f Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 29 Aug 2016 15:52:23 +0200 Subject: [PATCH 05/74] MAINT minor refactoring, same-unit --- irit_rst_dt/config/common.py | 2 +- irit_rst_dt/config/intra.py | 10 +-- irit_rst_dt/local.py | 130 ++++++++++++++++------------------- 3 files changed, 65 insertions(+), 77 deletions(-) diff --git a/irit_rst_dt/config/common.py b/irit_rst_dt/config/common.py index 2bd824f..abf3efd 100644 --- a/irit_rst_dt/config/common.py +++ b/irit_rst_dt/config/common.py @@ -145,7 +145,7 @@ def mk_su_joint(klearner, kdecoder): parser_key = combined_key(settings, kdecoder) key = combined_key(klearner, parser_key) # su: use same kind of learner as "attach" - parser = SameUnitJointPipeline( + parser = JointSameUnitPipeline( learner_su=( SklearnSameUnitClassifier(klearner.attach.payload._learner) if not isinstance(klearner.attach.payload, AttachOracle) diff --git a/irit_rst_dt/config/intra.py b/irit_rst_dt/config/intra.py index b130ab5..1921f7d 100644 --- a/irit_rst_dt/config/intra.py +++ b/irit_rst_dt/config/intra.py @@ -11,17 +11,17 @@ def combine_intra(econfs, kconf, primary='intra', verbose=False): Parameters ---------- econfs: IntraInterPair(EvaluationConfig) - + Evaluation configs for the intra and inter parsers. kconf: Keyed(parser constructor) - - primary: ['intra', 'inter'] - Treat the intra/inter config as the primary one for the key + Key for the whole intra/inter parser. + primary: one of {'intra', 'inter'} + Treat the intra or inter config as the primary one for the key. 
verbose: boolean, optional Verbosity of the intra/inter parser Returns ------- - econf : EvaluationConfig + econf: EvaluationConfig Evaluation configuration for the IntraInterParser. """ if primary == 'intra': diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index 47b3022..419c077 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -11,7 +11,13 @@ from os import path as fp import itertools as itr -from attelo.harness.config import (LearnerConfig, +from sklearn.linear_model import (LogisticRegression) +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier + +# attelo +from attelo.harness.config import (EvaluationConfig, + LearnerConfig, Keyed) # from attelo.decoding.astar import (AstarArgs, # AstarDecoder, @@ -26,12 +32,7 @@ FrontierToHeadParser, # SentOnlyParser, SoftParser) - -from sklearn.linear_model import (LogisticRegression) -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier - - +# this harness from .config.intra import (combine_intra) from .config.perceptron import (attach_learner_dp_pa, attach_learner_dp_perc, @@ -50,7 +51,10 @@ decoder_local, mk_joint, mk_joint_su, - mk_post) + mk_su_joint, + mk_post, + JointPipeline, + Settings) # PATHS @@ -278,6 +282,17 @@ def _core_parsers(klearner, unique_real_root=True): use_prob=True)), ] ]) + elif SAME_UNIT == 'preproc': + joint.extend([ + mk_su_joint(klearner, d) for d in [ + # decoder_last(), + # DECODER_LOCAL, + # decoder_mst(), + Keyed('eisner', + EisnerDecoder(unique_real_root=unique_real_root, + use_prob=True)), + ] + ]) # end WIP # postlabeling @@ -322,74 +337,26 @@ def _core_parsers(klearner, unique_real_root=True): HARNESS_NAME = 'irit-rst-dt' -# possibly obsolete -def _mk_basic_intras(klearner, kconf): - """Intra/inter parser based on a single core parser - """ - # NEW intra parsers are explicitly authorized to have more than one - # real root (necessary for the Eisner decoder, maybe other decoders too) - parsers = [IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(klearner, unique_real_root=False), - _core_parsers(klearner))] - return [combine_intra(p, kconf) for p in parsers] - - -def _mk_sorc_intras(klearner, kconf): - """Intra/inter parsers based on a single core parser - and a sentence oracle - """ - parsers = [IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(ORACLE, unique_real_root=False), - _core_parsers(klearner))] - return [combine_intra(p, kconf, primary='inter') for p in parsers] - - -def _mk_dorc_intras(klearner, kconf): - """Intra/inter parsers based on a single core parser - and a document oracle - """ - parsers = [IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(klearner, unique_real_root=False), - _core_parsers(ORACLE))] - return [combine_intra(p, kconf, primary='intra') for p in parsers] - - -def _mk_last_intras(klearner, kconf): - """Parsers using "last" for intra and a core decoder for inter. 
- """ - if ((not klearner.attach.payload.can_predict_proba or - not klearner.label.payload.can_predict_proba)): - return [] - - kconf = Keyed(key=combined_key('last', kconf), - payload=kconf.payload) - econf_last = mk_joint(klearner, decoder_last()) - parsers = [IntraInterPair(intra=econf_last, inter=y) for y in - _core_parsers(klearner)] - return [combine_intra(p, kconf, primary='inter') for p in parsers] -# end of possibly obsolete - - def _is_junk(econf): """ Any configuration for which this function returns True will be silently discarded """ # intrasential head to head mode only works with mst for now - has = econf.settings - kids = econf.settings.children - has_intra_oracle = has.intra and (kids.intra.oracle or kids.inter.oracle) - has_any_oracle = has.oracle or has_intra_oracle + has_intra_oracle = (econf.settings.intra + and (econf.settings.children.intra.oracle + or econf.settings.children.inter.oracle)) + has_any_oracle = econf.settings.oracle or has_intra_oracle - decoder_name = econf.parser.key[len(has.key) + 1:] + decoder_name = econf.parser.key[len(econf.settings.key) + 1:] # last with last-based intra decoders is a bit redundant - if has.intra and decoder_name == 'last': + if econf.settings.intra and decoder_name == 'last': return True # oracle would be redundant with sentence/doc oracles # FIXME the above is wrong for intra/inter parsers because gold edges # can fall out of the search space - if has.oracle and has_intra_oracle: + if econf.settings.oracle and has_intra_oracle: return True # FIXME should sometimes be False # toggle or comment to enable filtering in/out oracles @@ -404,6 +371,24 @@ def _evaluations(): res = [] # == one-step (global) parsers == + # WIP + # maxent, eisner, AD.L-jnt + maxent_klearner = LearnerConfig(attach=attach_learner_maxent(), + label=label_learner_maxent()) + res.append( + EvaluationConfig(key='maxent-AD.L-jnt-eisner-NEW', + settings=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None), + learner=maxent_klearner, + parser=Keyed('AD.L-jnt-eisner-NEW', + JointPipeline( + learner_attach=maxent_klearner.attach.payload, + learner_label=maxent_klearner.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)))) + ) + # end WIP learners = [] learners.extend(_LOCAL_LEARNERS) # current structured learners don't do probs, hence non-prob decoders @@ -412,8 +397,8 @@ def _evaluations(): # MST is disabled by default, as it does not output projective trees # nonprob_mst = MstDecoder(MstRootStrategy.fake_root, False) # learners.extend(l(nonprob_mst) for l in _STRUCTURED_LEARNERS) - global_parsers = itr.chain.from_iterable(_core_parsers(l) - for l in learners) + global_parsers = itr.chain.from_iterable( + _core_parsers(l) for l in learners) res.extend(global_parsers) # == two-step parsers: intra then inter-sentential == @@ -449,9 +434,12 @@ def _evaluations(): # NEW intra parsers are explicitly authorized (in fact, expected) # to have more than one real root ; this is necessary for the # Eisner decoder and probably others, with "hard" strategies - ii_pairs.extend(IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(intra_lnr, unique_real_root=True), # TODO add unique_real_root to hyperparameters in grid search - _core_parsers(inter_lnr, unique_real_root=True))) + # TODO add unique_real_root to hyperparameters in grid search + ii_pairs.extend( + IntraInterPair(intra=x, inter=y) for x, y in + zip(_core_parsers(intra_lnr, unique_real_root=True), + _core_parsers(inter_lnr, unique_real_root=True)) + ) # 
cross-product: pairs of parsers x intra-/inter- configs ii_parsers = [combine_intra(p, kconf, primary=('inter' if p.intra.settings.oracle @@ -484,9 +472,9 @@ def _want_details(econf): else: learners = [econf.learner] has_maxent = any('maxent' in l.key for l in learners) - has = econf.settings - kids = econf.settings.children - has_intra_oracle = has.intra and (kids.intra.oracle or kids.inter.oracle) + has_intra_oracle = (econf.settings.intra and + (econf.settings.children.intra.oracle or + econf.settings.children.inter.oracle)) return (has_maxent and ('mst' in econf.parser.key or 'astar' in econf.parser.key or 'eisner' in econf.parser.key) and From 9a153ada9cb55bae529dcc367874cb03e9f12d88 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sat, 3 Sep 2016 15:33:52 +0200 Subject: [PATCH 06/74] WIP one file per doc, same-unit --- irit_rst_dt/cmd/clean.py | 6 +-- irit_rst_dt/cmd/gather.py | 98 ++++++++++++++++++++++-------------- irit_rst_dt/config/common.py | 21 ++++---- irit_rst_dt/harness.py | 58 ++++++++++++--------- 4 files changed, 107 insertions(+), 76 deletions(-) diff --git a/irit_rst_dt/cmd/clean.py b/irit_rst_dt/cmd/clean.py index ad55823..2a9a019 100644 --- a/irit_rst_dt/cmd/clean.py +++ b/irit_rst_dt/cmd/clean.py @@ -34,10 +34,10 @@ def main(_): You shouldn't need to call this yourself if you're using `config_argparser` """ - for data_dir in sorted(subdirs(LOCAL_TMP)): - if fp.basename(data_dir) == "latest": + for base_dir in sorted(subdirs(LOCAL_TMP)): + if fp.basename(base_dir) == "latest": continue - for subdir in subdirs(data_dir): + for subdir in subdirs(base_dir): bname = fp.basename(subdir) if bname in ["eval-current", "eval-previous", "scratch-current", "scratch-previous"]: diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index c808c44..996f570 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -6,12 +6,17 @@ """ from __future__ import print_function +import itertools import os from attelo.harness.util import call, force_symlink +from attelo.learning.oracle import AttachOracle +from attelo.parser.intra import IntraInterParser +from attelo.parser.same_unit import SameUnitClassifierWrapper from ..local import (FEATURE_SET, LABEL_SET, TEST_CORPUS, TRAINING_CORPUS, - SAME_UNIT, PTB_DIR, CORENLP_OUT_DIR, LECSIE_DATA_DIR) + SAME_UNIT, PTB_DIR, CORENLP_OUT_DIR, LECSIE_DATA_DIR, + EVALUATIONS) from ..util import (current_tmp, latest_tmp) NAME = 'gather' @@ -125,26 +130,41 @@ def main(args): fix_pseudo_rels = args.fix_pseudo_rels + # 2016-09-01 put data files in {tdir}/data + tdir_data = os.path.join(tdir, 'data') + if not os.path.exists(tdir_data): + os.makedirs(tdir_data) # same-unit - instances = 'same-unit' - su_prefix_train = '{}.{}'.format( - os.path.basename(TRAINING_CORPUS), instances) - su_train_path = os.path.join(tdir, su_prefix_train) - su_label_path = su_train_path + '.relations.sparse' - su_vocab_path = su_label_path + '.vocab' - if TEST_CORPUS is not None: - su_prefix_test = '{}.{}'.format( - os.path.basename(TEST_CORPUS), instances) - su_test_path = os.path.join(tdir, su_prefix_test) - - if SAME_UNIT in ['joint', 'preproc'] and not args.resume_frag_pairs: + all_parsers = [] + for econf in EVALUATIONS: + parser = econf.parser[1] + if isinstance(parser, IntraInterParser): + all_parsers.extend(x[1] for x in itertools.chain( + parser._parsers.intra.steps, parser._parsers.inter.steps)) + else: + all_parsers.extend(x[1] for x in parser.steps) + same_unit_parsers = [x for x in all_parsers + if isinstance(x, SameUnitClassifierWrapper)] + 
same_unit_clfs = [x._learner_su for x in same_unit_parsers] + if same_unit_parsers and not args.resume_frag_pairs: + instances = 'same-unit' + su_prefix_train = '{}.relations.{}'.format( + os.path.basename(TRAINING_CORPUS), instances) + su_train_path = os.path.join(tdir_data, su_prefix_train) + su_label_path = su_train_path + '.labels' + su_vocab_path = su_train_path + '.sparse.vocab' + if TEST_CORPUS is not None: + su_prefix_test = '{}.{}'.format( + os.path.basename(TEST_CORPUS), instances) + su_test_path = os.path.join(tdir_data, su_prefix_test) + if not args.skip_training: # * train - extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, instances) if TEST_CORPUS is not None: # * test - extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, instances, vocab_path=su_vocab_path, label_path=su_label_path) @@ -152,17 +172,17 @@ def main(args): # all pairs instances = 'edu-pairs' if not args.skip_training and not args.resume_frag_pairs: - extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, + extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, instances) # path to the vocab and labelset gathered from the training set, # we'll use these paths for the test set and for the frag-pairs - prefix_train = '{}.{}'.format( + prefix_train = '{}.relations.{}'.format( os.path.basename(TRAINING_CORPUS), instances) - train_path = os.path.join(tdir, prefix_train) - label_path = train_path + '.relations.sparse' - vocab_path = label_path + '.vocab' + train_path = os.path.join(tdir_data, prefix_train) + label_path = train_path + '.labels' + vocab_path = train_path + '.sparse.vocab' if TEST_CORPUS is not None and not args.resume_frag_pairs: - extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, + extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, instances, vocab_path=vocab_path, label_path=label_path) @@ -171,25 +191,27 @@ def main(args): # the other fragmented EDUs and the EDUs that don't belong to any # fragmented EDU instances = 'frag-pairs' - # we use the vocabulary and labelset from "edu-pairs" ; this is the - # simplest solution currently and it seems correct, but maybe we - # could extend "edu-pairs" with these pairs when we learn the - # vocabulary? - if not args.skip_training: - frag_edus_train = su_train_path + '.relations' + '.deps_true' - extract_features(TRAINING_CORPUS, tdir, fix_pseudo_rels, - instances, frag_edus=frag_edus_train, - vocab_path=vocab_path, - label_path=label_path) - if TEST_CORPUS is not None: - frag_edus_test = su_test_path + '.relations' + '.deps_true' - extract_features(TEST_CORPUS, tdir, fix_pseudo_rels, - instances, frag_edus=frag_edus_test, - vocab_path=vocab_path, - label_path=label_path) + same_unit_types = set(('true' if isinstance(x, AttachOracle) + else 'pred') + for clf in same_unit_clfs) + for same_unit_type in sorted(same_unit_types): + # we use the vocabulary and labelset from "edu-pairs" ; this is the + # simplest solution currently and it seems correct, but maybe we + # could extend "edu-pairs" with these pairs when we learn the + # vocabulary? 
+ if not args.skip_training: + extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, + instances, frag_edus=same_unit_type, + vocab_path=vocab_path, + label_path=label_path) + if TEST_CORPUS is not None: + extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, + instances, frag_edus=same_unit_type, + vocab_path=vocab_path, + label_path=label_path) # end frag pairs - with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream: + with open(os.path.join(tdir_data, "versions-gather.txt"), "w") as stream: call(["pip", "freeze"], stdout=stream) if not (args.skip_training or args.resume_frag_pairs): diff --git a/irit_rst_dt/config/common.py b/irit_rst_dt/config/common.py index abf3efd..1ff9e9b 100644 --- a/irit_rst_dt/config/common.py +++ b/irit_rst_dt/config/common.py @@ -1,7 +1,9 @@ """Commonly used configuration options""" from collections import namedtuple +import copy import six + # from attelo.decoding.astar import (AstarArgs, # AstarDecoder, # Heuristic, @@ -15,8 +17,7 @@ from attelo.parser.full import (JointPipeline, PostlabelPipeline) from attelo.parser.same_unit import (JointSameUnitPipeline, - SameUnitJointPipeline, - SklearnSameUnitClassifier) + SameUnitJointPipeline) def combined_key(*variants): @@ -127,11 +128,9 @@ def mk_joint_su(klearner, kdecoder): parser = JointSameUnitPipeline( learner_attach=klearner.attach.payload, learner_label=klearner.label.payload, - learner_su=( - SklearnSameUnitClassifier(klearner.attach.payload._learner) - if not isinstance(klearner.attach.payload, AttachOracle) - else klearner.attach.payload - ), + # FIXME this copy does not really make sense here, but at least + # its type is correct + learner_su=copy.deepcopy(klearner.attach.payload), decoder=kdecoder.payload) return EvaluationConfig(key=key, settings=settings, @@ -146,11 +145,9 @@ def mk_su_joint(klearner, kdecoder): key = combined_key(klearner, parser_key) # su: use same kind of learner as "attach" parser = JointSameUnitPipeline( - learner_su=( - SklearnSameUnitClassifier(klearner.attach.payload._learner) - if not isinstance(klearner.attach.payload, AttachOracle) - else klearner.attach.payload - ), + # FIXME this copy does not really make sense here, but at least + # its type is correct + learner_su=copy.deepcopy(klearner.attach.payload), learner_attach=klearner.attach.payload, learner_label=klearner.label.payload, decoder=kdecoder.payload) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index 9c81c1b..3242a31 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -2,6 +2,7 @@ Paths to files used or generated by the test harness ''' from collections import Counter +from glob import glob from os import path as fp import sys @@ -42,13 +43,13 @@ def __init__(self): def run(self, runcfg): """Run the evaluation """ - data_dir = latest_tmp() - if not fp.exists(data_dir): + base_dir = latest_tmp() + if not fp.exists(base_dir): exit_ungathered() - eval_dir, scratch_dir = prepare_dirs(runcfg, data_dir) + eval_dir, scratch_dir = prepare_dirs(runcfg, base_dir) self.load(runcfg, eval_dir, scratch_dir) - evidence_of_gathered = self.mpack_paths(False)[0] - if not fp.exists(evidence_of_gathered): + evidence_of_gathered = self.mpack_paths(False)['edu_input'] + if not glob(evidence_of_gathered): exit_ungathered() evaluate_corpus(self) @@ -114,35 +115,46 @@ def mpack_paths(self, test_data, stripped=False): test_data: boolean If true, the returned paths point to self.testset else to self.dataset. 
+ stripped: boolean + TODO Returns ------- - paths: tuple of file paths - Path to: edu_input, pairings, features, vocab, cdu_input, - cdu_pairings, cdu_features, corpus (to access gold - structures, WIP). + paths: dict of file paths + Path to: edu_input, pairings, features, vocab, labels, + cdu_input, cdu_pairings, cdu_features, corpus (to access + gold structures, WIP). """ - ext = 'relations.edu-pairs.sparse' + base = 'relations.edu-pairs' + ext = base + '.sparse' # path to data file in the evaluation dir dset = self.testset if test_data else self.dataset - core_path = fp.join(self.eval_dir, "%s.%s" % (dset, ext)) + vocab_path = fp.join(self.eval_dir, "%s.%s.vocab" % (dset, ext)) + labels_path = fp.join(self.eval_dir, "%s.%s.labels" % (dset, base)) + core_path = fp.join(self.eval_dir, dset, "*.%s" % ext) # 2016-07-28 pairs on fragmented EDUs frag_ext = 'relations.frag-pairs.sparse' - frag_path = fp.join(self.eval_dir, "%s.%s" % (dset, frag_ext)) + frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if test_data else TRAINING_CORPUS) - # end WIP - return (core_path + '.edu_input', - core_path + '.pairings', - (core_path + '.stripped') if stripped else core_path, - core_path + '.vocab', - # fragmented EDUs - frag_path + '.cdu_input', - frag_path + '.pairings', - (frag_path + '.stripped') if stripped else frag_path, - # corpus - corpus_path) + # end gold RST trees + res = { + 'edu_input': core_path + '.edu_input', + 'pairings': core_path + '.pairings', + 'features': ((core_path + '.stripped') if stripped + else core_path), + 'vocab': vocab_path, + 'labels': labels_path, + # fragmented EDUs + 'cdu_input': frag_path + '.cdu_input', + 'cdu_pairings': frag_path + '.cdu_pairings', + 'cdu_features': ((frag_path + '.stripped') if stripped + else frag_path), + # corpus for gold RST trees + 'corpus': corpus_path, + } + return res def model_paths(self, rconf, fold, parser): """Paths to the learner(s) model(s). 
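Note on the new mpack_paths contract introduced above: with one data file per document, the 'edu_input', 'pairings' and 'features' entries of the returned dict are glob patterns rather than single file paths. A minimal sketch of how a caller could resolve them into per-document files follows; the helper name and the doc-name extraction are illustrative assumptions, not part of the harness.

    from glob import glob
    from os import path as fp

    def iter_doc_inputs(paths):
        """Yield (doc_name, edu_input, pairings, features) per document.

        `paths` is the dict returned by mpack_paths(); its 'edu_input'
        entry is now a glob pattern matching one file per document.
        """
        for edu_input in sorted(glob(paths['edu_input'])):
            # assumed layout: <doc_name>.relations.edu-pairs.sparse.edu_input
            doc_name = fp.basename(edu_input).rsplit('.', 4)[0]
            features = edu_input[:-len('.edu_input')]  # the .sparse file
            yield doc_name, edu_input, features + '.pairings', features
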
From 6888c259a4b137dbb7f825c292ab4e1a591cc4b3 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sun, 4 Sep 2016 19:04:48 +0200 Subject: [PATCH 07/74] WIP de-engineering local --- irit_rst_dt/config/intra.py | 6 +- irit_rst_dt/local.py | 259 ++++++++++++++++++++++++++---------- 2 files changed, 191 insertions(+), 74 deletions(-) diff --git a/irit_rst_dt/config/intra.py b/irit_rst_dt/config/intra.py index 1921f7d..7806d29 100644 --- a/irit_rst_dt/config/intra.py +++ b/irit_rst_dt/config/intra.py @@ -31,9 +31,9 @@ def combine_intra(econfs, kconf, primary='intra', verbose=False): else: raise ValueError("'primary' should be one of intra/inter: " + primary) - parsers = econfs.fmap(lambda e: e.parser.payload) - subsettings = econfs.fmap(lambda e: e.settings) - learners = econfs.fmap(lambda e: e.learner) + parsers = econfs.fmap(lambda e: e.parser.payload) # IntraInterPair + subsettings = econfs.fmap(lambda e: e.settings) # IntraInterPair + learners = econfs.fmap(lambda e: e.learner) # IntraInterPair settings = Settings(key=combined_key(kconf, econf.settings), intra=True, oracle=econf.settings.oracle, diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index 419c077..0cb9cb1 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -27,11 +27,15 @@ from attelo.decoding.mst import (MstDecoder, MstRootStrategy) from attelo.learning.local import (SklearnAttachClassifier, SklearnLabelClassifier) +from attelo.learning.oracle import AttachOracle from attelo.parser.intra import (IntraInterPair, HeadToHeadParser, FrontierToHeadParser, # SentOnlyParser, SoftParser) +from attelo.parser.same_unit import (JointSameUnitPipeline, + SameUnitJointPipeline) + # this harness from .config.intra import (combine_intra) from .config.perceptron import (attach_learner_dp_pa, @@ -252,7 +256,7 @@ def _structured(klearner): """ -def _core_parsers(klearner, unique_real_root=True): +def _core_parsers(klearner, unique_real_root=True, same_unit='no'): """Our basic parser configurations """ # joint @@ -271,7 +275,7 @@ def _core_parsers(klearner, unique_real_root=True): ] ] # WIP with same-unit - if SAME_UNIT == 'joint': + if same_unit == 'joint': joint.extend([ mk_joint_su(klearner, d) for d in [ # decoder_last(), @@ -282,7 +286,7 @@ def _core_parsers(klearner, unique_real_root=True): use_prob=True)), ] ]) - elif SAME_UNIT == 'preproc': + elif same_unit == 'preproc': joint.extend([ mk_su_joint(klearner, d) for d in [ # decoder_last(), @@ -376,78 +380,191 @@ def _evaluations(): maxent_klearner = LearnerConfig(attach=attach_learner_maxent(), label=label_learner_maxent()) res.append( - EvaluationConfig(key='maxent-AD.L-jnt-eisner-NEW', - settings=Settings(key='AD.L-jnt', - intra=False, - oracle=False, - children=None), - learner=maxent_klearner, - parser=Keyed('AD.L-jnt-eisner-NEW', - JointPipeline( - learner_attach=maxent_klearner.attach.payload, - learner_label=maxent_klearner.label.payload, - decoder=EisnerDecoder(unique_real_root=True, use_prob=True)))) + EvaluationConfig( + key='maxent-AD.L-jnt-eisner', + settings=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None), + learner=maxent_klearner, + parser=Keyed('AD.L-jnt-eisner', + JointPipeline( + learner_attach=maxent_klearner.attach.payload, + learner_label=maxent_klearner.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)))) + ) + + # maxent, eisner, AD.L-jnt then overwrite predicted "Same-Unit" + # FIXME "learner" might be wrong: this LearnerConfig has no mention of + # the same-unit classifier + maxent_su_learner = 
attach_learner_maxent() + # oracle_su_learner = Keyed('oracle', AttachOracle()) # alternative + res.append( + EvaluationConfig( + key='maxent-AD.L-jnt_su-eisner', + settings=Settings(key='AD.L-jnt_su', + intra=False, + oracle=False, + children=None), + # FIXME ("attach", "label"), lacks "same_unit" + learner=maxent_klearner, + parser=Keyed('AD.L-jnt_su-eisner', + JointSameUnitPipeline( + learner_attach=maxent_klearner.attach.payload, + learner_label=maxent_klearner.label.payload, + learner_su=maxent_su_learner.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)))) ) # end WIP - learners = [] - learners.extend(_LOCAL_LEARNERS) - # current structured learners don't do probs, hence non-prob decoders - nonprob_eisner = EisnerDecoder(use_prob=False) - learners.extend(l(nonprob_eisner) for l in _STRUCTURED_LEARNERS) - # MST is disabled by default, as it does not output projective trees - # nonprob_mst = MstDecoder(MstRootStrategy.fake_root, False) - # learners.extend(l(nonprob_mst) for l in _STRUCTURED_LEARNERS) - global_parsers = itr.chain.from_iterable( - _core_parsers(l) for l in learners) - res.extend(global_parsers) + + if False: # legacy code for one-step parsers + learners = [] + learners.extend(_LOCAL_LEARNERS) + # current structured learners don't do probs, hence non-prob decoders + nonprob_eisner = EisnerDecoder(use_prob=False) + learners.extend(l(nonprob_eisner) for l in _STRUCTURED_LEARNERS) + # MST is disabled by default, as it does not output projective trees + # nonprob_mst = MstDecoder(MstRootStrategy.fake_root, False) + # learners.extend(l(nonprob_mst) for l in _STRUCTURED_LEARNERS) + global_parsers = itr.chain.from_iterable( + _core_parsers(l, same_unit=SAME_UNIT) for l in learners) + res.extend(global_parsers) # == two-step parsers: intra then inter-sentential == - ii_learners = [] # (intra, inter) learners - ii_learners.extend((copy.deepcopy(klearner), copy.deepcopy(klearner)) - for klearner in _LOCAL_LEARNERS - if klearner != ORACLE) - # keep pointer to intra and inter oracles - ii_oracles = (copy.deepcopy(ORACLE), ORACLE_INTER) - ii_learners.append(ii_oracles) - # structured learners, cf. 
supra - intra_nonprob_eisner = EisnerDecoder(use_prob=False, - unique_real_root=True) - inter_nonprob_eisner = EisnerDecoder(use_prob=False, - unique_real_root=True) - ii_learners.extend((copy.deepcopy(l)(intra_nonprob_eisner), - copy.deepcopy(l)(inter_nonprob_eisner)) - for l in _STRUCTURED_LEARNERS) - # couples of learners with either sentence- or document-level oracle - sorc_ii_learners = [ - (ii_oracles[0], inter_lnr) for intra_lnr, inter_lnr in ii_learners - if (ii_oracles[0], inter_lnr) not in ii_learners - ] - dorc_ii_learners = [ - (intra_lnr, ii_oracles[1]) for intra_lnr, inter_lnr in ii_learners - if (intra_lnr, ii_oracles[1]) not in ii_learners - ] - # enumerate pairs of (intra, inter) parsers - ii_pairs = [] - for intra_lnr, inter_lnr in itr.chain(ii_learners, - sorc_ii_learners, - dorc_ii_learners): - # NEW intra parsers are explicitly authorized (in fact, expected) - # to have more than one real root ; this is necessary for the - # Eisner decoder and probably others, with "hard" strategies - # TODO add unique_real_root to hyperparameters in grid search - ii_pairs.extend( - IntraInterPair(intra=x, inter=y) for x, y in - zip(_core_parsers(intra_lnr, unique_real_root=True), - _core_parsers(inter_lnr, unique_real_root=True)) - ) - # cross-product: pairs of parsers x intra-/inter- configs - ii_parsers = [combine_intra(p, kconf, - primary=('inter' if p.intra.settings.oracle - else 'intra'), - verbose=_VERBOSE_INTRA_INTER) - for p, kconf - in itr.product(ii_pairs, _INTRA_INTER_CONFIGS)] - res.extend(ii_parsers) + # WIP explicit declaration + maxent_team_intra = LearnerConfig(attach=attach_learner_maxent(), + label=label_learner_maxent()) + # FIXME ? maybe sel_inter='global' implies that + # maxent_team_inter = LearnerConfig(attach=maxent_klearner.attach, label=maxent_klearner.label) + maxent_team_inter = LearnerConfig(attach=attach_learner_maxent(), + label=label_learner_maxent()) + res.append( + EvaluationConfig( + key='maxent-iheads-global-AD.L-jnt-eisner', + settings=Settings(key='iheads-global-AD.L-jnt', + intra=True, + oracle=False, + children=IntraInterPair( + intra=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None), + inter=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None))), + learner=IntraInterPair(intra=maxent_team_intra, + inter=maxent_team_inter), + parser=Keyed('iheads-global-AD.L-jnt-eisner', + HeadToHeadParser( + IntraInterPair( + intra=JointPipeline( + learner_attach=maxent_team_intra.attach.payload, + learner_label=maxent_team_intra.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)), + inter=JointPipeline( + learner_attach=maxent_team_inter.attach.payload, + learner_label=maxent_team_inter.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True))), + sel_inter='global', + verbose=_VERBOSE_INTRA_INTER))) + ) + + # maxent-iheads-global-AD.L-jnt_su-eisner + maxent_su_learner_intra = attach_learner_maxent() # WIP + res.append( + EvaluationConfig( + key='maxent-iheads-global-AD.L-jnt_su-eisner', + settings=Settings(key='iheads-global-AD.L-jnt_su', + intra=True, + oracle=False, + children=IntraInterPair( + intra=Settings(key='AD.L-jnt_su', + intra=False, + oracle=False, + children=None), + inter=Settings(key='AD.L-jnt', + intra=False, + oracle=False, + children=None))), + learner=IntraInterPair(intra=maxent_team_intra, + inter=maxent_team_inter), + parser=Keyed('iheads-global-AD.L-jnt_su-eisner', + HeadToHeadParser( + IntraInterPair( + intra=JointSameUnitPipeline( + 
learner_attach=maxent_team_intra.attach.payload, + learner_label=maxent_team_intra.label.payload, + learner_su=maxent_su_learner_intra.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True)), + inter=JointPipeline( + learner_attach=maxent_team_inter.attach.payload, + learner_label=maxent_team_inter.label.payload, + decoder=EisnerDecoder(unique_real_root=True, use_prob=True))), + sel_inter='global', + verbose=_VERBOSE_INTRA_INTER))) + ) + # end WIP + + if False: # disable legacy code for 2-step parsers + ii_learners = [] # (intra, inter) learners + ii_learners.extend((copy.deepcopy(klearner), copy.deepcopy(klearner)) + for klearner in _LOCAL_LEARNERS + if klearner != ORACLE) + # keep pointer to intra and inter oracles + ii_oracles = (copy.deepcopy(ORACLE), ORACLE_INTER) + ii_learners.append(ii_oracles) + # structured learners, cf. supra + intra_nonprob_eisner = EisnerDecoder(use_prob=False, + unique_real_root=True) + inter_nonprob_eisner = EisnerDecoder(use_prob=False, + unique_real_root=True) + + ii_learners.extend((copy.deepcopy(l)(intra_nonprob_eisner), + copy.deepcopy(l)(inter_nonprob_eisner)) + for l in _STRUCTURED_LEARNERS) + # couples of learners with either sentence- or document-level oracle + sorc_ii_learners = [ + (ii_oracles[0], inter_lnr) for intra_lnr, inter_lnr in ii_learners + if (ii_oracles[0], inter_lnr) not in ii_learners + ] + dorc_ii_learners = [ + (intra_lnr, ii_oracles[1]) for intra_lnr, inter_lnr in ii_learners + if (intra_lnr, ii_oracles[1]) not in ii_learners + ] + # enumerate pairs of (intra, inter) parsers + ii_pairs = [] + for intra_lnr, inter_lnr in itr.chain(ii_learners, + sorc_ii_learners, + dorc_ii_learners): + # NEW intra parsers are explicitly authorized (in fact, expected) + # to have more than one real root ; this is necessary for the + # Eisner decoder and probably others, with "hard" strategies + # TODO add unique_real_root to hyperparameters in grid search + intra_parsers = _core_parsers(intra_lnr, unique_real_root=True, + same_unit=SAME_UNIT) + # same-unit is undefined for inter, in the RST-DT corpus + # (at least in our implementation) + inter_parsers = _core_parsers(inter_lnr, unique_real_root=True, + same_unit='no') + if SAME_UNIT != 'no': + # inter_parsers would be twice less numerous than intra_parsers + # => dirty hack: double the inter parsers + inter_parsers = inter_parsers + inter_parsers + + ii_pairs.extend(IntraInterPair(intra=x, inter=y) for x, y + # FIXME should probably not be a zip(), cf dirty hack + # above + in zip(intra_parsers, inter_parsers) + ) + # cross-product: pairs of parsers x intra-/inter- configs + ii_parsers = [combine_intra(p, kconf, + primary=('inter' if p.intra.settings.oracle + else 'intra'), + verbose=_VERBOSE_INTRA_INTER) + for p, kconf + in itr.product(ii_pairs, _INTRA_INTER_CONFIGS)] + res.extend(ii_parsers) return [x for x in res if not _is_junk(x)] From fee674e89d71ebcc14ed80a4f1e3d0dcb36286f6 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 7 Sep 2016 16:23:07 +0200 Subject: [PATCH 08/74] NEW convert attelo predictions to disdep file --- evals/attelo_predictions_to_disdep.py | 101 ++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100755 evals/attelo_predictions_to_disdep.py diff --git a/evals/attelo_predictions_to_disdep.py b/evals/attelo_predictions_to_disdep.py new file mode 100755 index 0000000..231a233 --- /dev/null +++ b/evals/attelo_predictions_to_disdep.py @@ -0,0 +1,101 @@ +"""Small utility script to convert predictions from attelo to dis_dep files. 
+""" + +from __future__ import absolute_import, print_function + +from collections import defaultdict +from glob import glob +import os + +from attelo.io import load_edus, load_predictions +from attelo.metrics.util import barebones_rst_deptree +from attelo.table import UNRELATED +from educe.corpus import FileId +from educe.learning.disdep_format import dump_disdep_files +from educe.rst_dt.dep2con import (DummyNuclearityClassifier, + InsideOutAttachmentRanker) + + +def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): + """Generate disdep files from a file dump of attelo predictions. + + Parameters + ---------- + edus_file_glob: str + Regex for `edu_input` file paths. + edges_file: str + Path to the file that contains attelo predictions (edges as + triples). + out_dir: str + Path to the output folder. + """ + # set up heuristic classifiers for nuclearity and rank + nuc_clf = DummyNuclearityClassifier(strategy='unamb_else_most_frequent') + nuc_clf.fit([], []) # dummy fit + rnk_clf = InsideOutAttachmentRanker(strategy='closest-intra-rl-inter-rl', + prioritize_same_unit=True) + rnk_clf.fit([], []) # dummy fit + + # load EDUs + doc_edus = dict() + id2doc = dict() + for edu_input_file in glob(edus_file_glob): + doc_name = os.path.basename(edu_input_file).rsplit('.', 4)[0] # FRAGILE + edus = load_edus(edu_input_file) + assert doc_name == edus[0].grouping + # map doc_name to list of EDUs ; populate reverse mapping from + # EDU id to doc_name, so that we can dispatch edges to their + # document + # we keep the list of EDUs sorted as in edu_input, hence we + # assume edu_input follows the linear order of EDUs + doc_edus[doc_name] = edus + for edu in edus: + id2doc[edu.id] = doc_name + # load edges and dispatch them to their doc + edges_pred = load_predictions(edges_file) + # for each doc, list edges + doc_edges = defaultdict(list) + for gov_id, dep_id, lbl in edges_pred: + if lbl != UNRELATED: + doc_name = id2doc[dep_id] + doc_edges[doc_name].append((gov_id, dep_id, lbl)) + + # for each doc, get a full-fledged RstDepTree, nuclearity and ranking + # are currently determined heuristically + doc_dtree = dict() + for doc_name, edus in doc_edus.items(): + # comply with current API for barebones_rst_deptree: + # for each doc, create a dict with one item (doc_name, list of edges) + dep_edges = doc_edges[doc_name] + # create a barebones RST dep tree: head and label only + dtree, edu2sent = barebones_rst_deptree(dep_edges, edus, strict=False) + # set its origin + dtree.origin = FileId(doc_name, None, None, None) + # flesh out with heuristically-determined nuclearity + dtree.nucs = nuc_clf.predict([dtree])[0] + # and heuristically-determined rank (needs edu2sent to prioritize + # intra-sentential attachments over inter-sentential ones) + dtree.sent_idx = edu2sent # DIRTY + dtree.ranks = rnk_clf.predict([dtree])[0] + doc_dtree[doc_name] = dtree + + # write the disdep files + dump_disdep_files(doc_dtree.values(), out_dir) + + +if __name__ == '__main__': + edus_file_glob = os.path.join('TMP', 'latest', 'data', 'TEST', + '*.edu-pairs.sparse.edu_input') + edges_file_glob = os.path.join('TMP', 'latest', 'scratch-current', + 'combined', 'output.*') + out_dir = 'TMP_disdep_chain_pred_ours' + # attelo predictions are currently stored in one big file + edges_files = glob(edges_file_glob) + assert len(edges_files) == 1 + edges_file = edges_files[0] + # paths to the resulting disdep files + out_dir = os.path.join(out_dir, 'TEST') + if not os.path.exists(out_dir): + os.makedirs(out_dir) + # do the 
conversion + attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir) From d5e2a4319ce7ba0169b5eb2088dda528bd815849 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 7 Sep 2016 17:14:40 +0200 Subject: [PATCH 09/74] ENH new script to compute dependency scores on disdep files --- evals/disdep_eval.py | 77 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100755 evals/disdep_eval.py diff --git a/evals/disdep_eval.py b/evals/disdep_eval.py new file mode 100755 index 0000000..d33f439 --- /dev/null +++ b/evals/disdep_eval.py @@ -0,0 +1,77 @@ +"""Evaluation procedure for discourse dependency (disdep) files. + +Computes UAS and flavours of LAS for labels, nuclearity, rank and +their combinations. +""" + +from __future__ import absolute_import, print_function +import codecs +import csv +from glob import glob +import os + + +if __name__ == '__main__': + # TODO turn into argparse params + dir_true = os.path.join('TMP_disdep_chain_true', 'TEST') + dir_pred = os.path.join('TMP_disdep_chain_pred_ours', 'TEST') + # end TODO + files_true = {os.path.basename(f).rsplit('.')[0]: f + for f in glob(os.path.join(dir_true, '*.dis_dep'))} + files_pred = {os.path.basename(f).rsplit('.')[0]: f + for f in glob(os.path.join(dir_pred, '*.dis_dep'))} + assert sorted(files_true.keys()) == sorted(files_pred.keys()) + + cnt_tot = 0 # total deps + cnt_a = 0 # correct heads (attachments) + cnt_l = 0 # correct labels + cnt_n = 0 # correct nuclearity + cnt_r = 0 # correct ranks + cnt_al = 0 # correct labelled attachments + cnt_an = 0 # correct attachment + nuc + cnt_ar = 0 # correct attachment + rank + cnt_aln = 0 # correct attachment + label + nuc + cnt_alnr = 0 # correct attachment + label + nuc + rank + + for doc_name, f_true in files_true.items(): + f_pred = files_pred[doc_name] + with codecs.open(f_true, 'r', encoding='utf-8') as f_true: + with codecs.open(f_pred, 'r', encoding='utf-8') as f_pred: + reader_true = csv.reader(f_true, dialect=csv.excel_tab) + reader_pred = csv.reader(f_pred, dialect=csv.excel_tab) + for line_true, line_pred in zip(reader_true, reader_pred): + # i, txt, head, label, clabel, nuc, rank + assert line_true[0] == line_pred[0] # safety check + ok_a = line_true[2] == line_pred[2] + ok_l = line_true[4] == line_pred[4] # use clabel + ok_n = line_true[5] == line_pred[5] + ok_r = line_true[6] == line_pred[6] + # update running counters + cnt_tot += 1 + if ok_a: + cnt_a += 1 + if ok_l: + cnt_l += 1 + if ok_n: + cnt_n += 1 + if ok_r: + cnt_r += 1 + if ok_a and ok_l: + cnt_al += 1 + if ok_a and ok_n: + cnt_an += 1 + if ok_a and ok_r: + cnt_ar += 1 + if ok_a and ok_l and ok_n: + cnt_aln += 1 + if ok_a and ok_l and ok_n and ok_r: + cnt_alnr += 1 + print('\t'.join(['a', 'l', 'n', 'r', + 'al', 'an', 'ar', + 'aln', + 'alnr'])) + print('\t'.join('{:.4f}'.format(float(cnt_x) / cnt_tot) + for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, + cnt_al, cnt_an, cnt_ar, + cnt_aln, + cnt_alnr])) From d3e61283cb4713d82664485c4d29d2d0c9c9713b Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 8 Sep 2016 11:29:28 +0200 Subject: [PATCH 10/74] ENH dis_dep from various sources, general eval script for dis_dep --- evals/attelo_predictions_to_disdep.py | 3 +- evals/codra.py | 2 +- evals/dis2disdep.py | 103 +++++++++++++++++++++++ evals/{disdep_eval.py => eval_disdep.py} | 22 ++++- requirements.txt | 8 +- 5 files changed, 129 insertions(+), 9 deletions(-) create mode 100755 evals/dis2disdep.py rename evals/{disdep_eval.py => eval_disdep.py} (76%) diff --git 
a/evals/attelo_predictions_to_disdep.py b/evals/attelo_predictions_to_disdep.py index 231a233..f0e7169 100755 --- a/evals/attelo_predictions_to_disdep.py +++ b/evals/attelo_predictions_to_disdep.py @@ -88,13 +88,12 @@ def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): '*.edu-pairs.sparse.edu_input') edges_file_glob = os.path.join('TMP', 'latest', 'scratch-current', 'combined', 'output.*') - out_dir = 'TMP_disdep_chain_pred_ours' # attelo predictions are currently stored in one big file edges_files = glob(edges_file_glob) assert len(edges_files) == 1 edges_file = edges_files[0] # paths to the resulting disdep files - out_dir = os.path.join(out_dir, 'TEST') + out_dir = os.path.join('TMP_disdep', 'chain', 'ours', 'test') if not os.path.exists(out_dir): os.makedirs(out_dir) # do the conversion diff --git a/evals/codra.py b/evals/codra.py index 17dbacb..fb36048 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -2,7 +2,7 @@ """ -from __future__ import print_function +from __future__ import absolute_import, print_function from collections import defaultdict import itertools diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py new file mode 100755 index 0000000..2e4e418 --- /dev/null +++ b/evals/dis2disdep.py @@ -0,0 +1,103 @@ +"""Convert RST trees to their dependency version (.dis to .dis_dep). + +TODO +---- +* [ ] support the output of Ji & Eisenstein's parser ; need to convert + .brackets to .dis_dep (via .dis?) +* [ ] support intra-sentential level document parsing ; required to score + Joty's .sen_dis files + +""" +from __future__ import absolute_import, print_function +import argparse +import os + +from educe.corpus import FileId +from educe.learning.disdep_format import dump_disdep_files +from educe.rst_dt.codra import load_codra_output_files +from educe.rst_dt.corpus import Reader +from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, + TRAIN_FOLDER) + + +# original RST corpus +RST_CORPUS = os.path.join('/home/mmorey/corpora/rst_discourse_treebank/data') +RST_MAIN_TRAIN = os.path.join(RST_CORPUS, TRAIN_FOLDER) +RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) +RST_DOUBLE = os.path.join(RST_CORPUS, DOUBLE_FOLDER) +# output of Joty's parser +OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') +# output of Feng & Hirst's parser +OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/tmp/') +# output of Ji's parser +OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/test_input') + + +def main(): + """Main""" + parser = argparse.ArgumentParser( + description='Convert .dis files to .dis_dep' + ) + parser.add_argument('--nary_enc', default='chain', + choices=['chain', 'tree'], + help="Encoding for n-ary nodes") + parser.add_argument('--author', default='gold', + choices=['gold', 'silver', 'joty', 'feng', 'ji'], + help="Author of the version of the corpus") + parser.add_argument('--split', default='test', + choices=['train', 'test', 'double'], + help="Relevant part of the corpus") + parser.add_argument('--out_root', default='TMP_disdep', + help="Root directory for the output") + args = parser.parse_args() + # precise output path, by default: TMP_disdep/chain/gold/train + out_dir = os.path.join(args.out_root, args.nary_enc, args.author, args.split) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + # read RST trees + nary_enc = args.nary_enc + author = args.author + corpus_split = args.split + + if author == 'gold': + if corpus_split == 'train': + corpus_dir = 
RST_MAIN_TRAIN + elif corpus_split == 'test': + corpus_dir = RST_MAIN_TEST + elif corpus_split == 'double': + raise NotImplementedError("Gold trees for 'double'") + reader = Reader(corpus_dir) + rtrees = reader.slurp() + dtrees = {doc_name: RstDepTree.from_rst_tree(rtree, nary_enc=nary_enc) + for doc_name, rtree in rtrees.items()} + elif author == 'silver': + if corpus_split == 'double': + corpus_dir = RST_DOUBLE + else: + raise ValueError("'silver' annotation is available for the " + "'double' split only") + elif author == 'joty': + if corpus_split != 'test': + raise ValueError("The output of Joty's parser is available for " + "the 'test' split only") + data_pred = load_codra_output_files(OUT_JOTY, level='doc') + doc_names = data_pred['doc_names'] + rtrees = data_pred['rst_ctrees'] + dtrees = {doc_name: RstDepTree.from_rst_tree(rtree, nary_enc=nary_enc) + for doc_name, rtree in zip(doc_names, rtrees)} + # set reference to the document in the RstDepTree (required by + # dump_disdep_files) + for doc_name, dtree in dtrees.items(): + dtree.origin = FileId(doc_name, None, None, None) + elif author == 'feng': + # files_glob = os.path.join(OUT_FENG, '*.txt.dis') # FIXME + raise NotImplementedError("Output of Feng's parser") + elif author == 'ji': + raise NotImplementedError("Output of Ji's parser") + # do dump + dump_disdep_files(dtrees.values(), out_dir) + + +if __name__ == '__main__': + main() diff --git a/evals/disdep_eval.py b/evals/eval_disdep.py similarity index 76% rename from evals/disdep_eval.py rename to evals/eval_disdep.py index d33f439..e74467e 100755 --- a/evals/disdep_eval.py +++ b/evals/eval_disdep.py @@ -5,6 +5,7 @@ """ from __future__ import absolute_import, print_function +import argparse import codecs import csv from glob import glob @@ -12,9 +13,24 @@ if __name__ == '__main__': - # TODO turn into argparse params - dir_true = os.path.join('TMP_disdep_chain_true', 'TEST') - dir_pred = os.path.join('TMP_disdep_chain_pred_ours', 'TEST') + parser = argparse.ArgumentParser( + description="Evaluate dis_dep trees against a given reference") + parser.add_argument('author_pred', + choices=['gold', 'silver', + 'joty', 'feng', 'ji', + 'ours'], + help="Author of the predictions") + parser.add_argument('--author_true', default='gold', + choices=['gold', 'silver', + 'joty', 'feng', 'ji', + 'ours'], + help="Author of the reference") + args = parser.parse_args() + author_true = args.author_true + author_pred = args.author_pred + # TODO add argparse params for nary_enc and split + dir_true = os.path.join('TMP_disdep', 'chain', author_true, 'test') + dir_pred = os.path.join('TMP_disdep', 'chain', author_pred, 'test') # end TODO files_true = {os.path.basename(f).rsplit('.')[0]: f for f in glob(os.path.join(dir_true, '*.dis_dep'))} diff --git a/requirements.txt b/requirements.txt index 7d348b6..4735983 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ --e git+https://github.com/irit-melodi/educe.git#egg=educe --e git+https://github.com/irit-melodi/attelo.git#egg=attelo --e git+https://github.com/nlhepler/pydot.git#egg=pydot +# -e git+https://github.com/irit-melodi/educe.git#egg=educe +-e /home/mmorey/melodi/educe +# -e git+https://github.com/irit-melodi/attelo.git#egg=attelo +-e /home/mmorey/melodi/attelo +# -e git+https://github.com/nlhepler/pydot.git#egg=pydot -e . 
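The .dis_dep files produced by dis2disdep.py and consumed by eval_disdep.py are tab-separated, one row per EDU, with fields (index, text, head, label, coarse label, nuclearity, rank). A minimal reader sketch consistent with the comparison loop above; the DisdepRow and read_disdep names are illustrative only, not part of educe or attelo.

    import codecs
    import csv
    from collections import namedtuple

    DisdepRow = namedtuple(
        'DisdepRow', ['idx', 'txt', 'head', 'label', 'clabel', 'nuc', 'rank'])

    def read_disdep(f_path):
        """Read one .dis_dep file into a list of DisdepRow (one per EDU)."""
        with codecs.open(f_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f, dialect=csv.excel_tab)
            return [DisdepRow(*row) for row in reader]
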
From eb21923963a3536a9ede1551b2087b169b8e0cd8 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 8 Sep 2016 16:00:18 +0200 Subject: [PATCH 11/74] ENH support for the output of feng's parser, evaluate several parsers --- evals/dis2disdep.py | 15 ++++- evals/eval_disdep.py | 135 +++++++++++++++++++++++-------------------- 2 files changed, 86 insertions(+), 64 deletions(-) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index 2e4e418..d1d7966 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -15,6 +15,7 @@ from educe.corpus import FileId from educe.learning.disdep_format import dump_disdep_files from educe.rst_dt.codra import load_codra_output_files +from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, @@ -91,8 +92,18 @@ def main(): for doc_name, dtree in dtrees.items(): dtree.origin = FileId(doc_name, None, None, None) elif author == 'feng': - # files_glob = os.path.join(OUT_FENG, '*.txt.dis') # FIXME - raise NotImplementedError("Output of Feng's parser") + if corpus_split != 'test': + raise ValueError("The output of Feng & Hirst's parser is " + "available for the 'test' split only") + data_pred = load_feng_output_files(OUT_FENG) + doc_names = data_pred['doc_names'] + rtrees = data_pred['rst_ctrees'] + dtrees = {doc_name: RstDepTree.from_rst_tree(rtree, nary_enc=nary_enc) + for doc_name, rtree in zip(doc_names, rtrees)} + # set reference to the document in the RstDepTree (required by + # dump_disdep_files) + for doc_name, dtree in dtrees.items(): + dtree.origin = FileId(doc_name, None, None, None) elif author == 'ji': raise NotImplementedError("Output of Ji's parser") # do dump diff --git a/evals/eval_disdep.py b/evals/eval_disdep.py index e74467e..8310487 100755 --- a/evals/eval_disdep.py +++ b/evals/eval_disdep.py @@ -15,79 +15,90 @@ if __name__ == '__main__': parser = argparse.ArgumentParser( description="Evaluate dis_dep trees against a given reference") - parser.add_argument('author_pred', + parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'ji', 'ours'], - help="Author of the predictions") + help="Author(s) of the predictions") parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'ji', 'ours'], help="Author of the reference") + parser.add_argument('--nary_enc', default='chain', + choices=['tree', 'chain'], + help="Encoding of n-ary nodes") + # TODO add argparse param for split args = parser.parse_args() author_true = args.author_true - author_pred = args.author_pred - # TODO add argparse params for nary_enc and split - dir_true = os.path.join('TMP_disdep', 'chain', author_true, 'test') - dir_pred = os.path.join('TMP_disdep', 'chain', author_pred, 'test') - # end TODO + authors_pred = args.authors_pred + nary_enc = args.nary_enc + # reference + dir_true = os.path.join('TMP_disdep', nary_enc, author_true, 'test') files_true = {os.path.basename(f).rsplit('.')[0]: f for f in glob(os.path.join(dir_true, '*.dis_dep'))} - files_pred = {os.path.basename(f).rsplit('.')[0]: f - for f in glob(os.path.join(dir_pred, '*.dis_dep'))} - assert sorted(files_true.keys()) == sorted(files_pred.keys()) - - cnt_tot = 0 # total deps - cnt_a = 0 # correct heads (attachments) - cnt_l = 0 # correct labels - cnt_n = 0 # correct nuclearity - cnt_r = 0 # correct ranks - cnt_al = 0 # correct labelled attachments - cnt_an = 0 # correct attachment + nuc - cnt_ar = 0 # 
correct attachment + rank - cnt_aln = 0 # correct attachment + label + nuc - cnt_alnr = 0 # correct attachment + label + nuc + rank - - for doc_name, f_true in files_true.items(): - f_pred = files_pred[doc_name] - with codecs.open(f_true, 'r', encoding='utf-8') as f_true: - with codecs.open(f_pred, 'r', encoding='utf-8') as f_pred: - reader_true = csv.reader(f_true, dialect=csv.excel_tab) - reader_pred = csv.reader(f_pred, dialect=csv.excel_tab) - for line_true, line_pred in zip(reader_true, reader_pred): - # i, txt, head, label, clabel, nuc, rank - assert line_true[0] == line_pred[0] # safety check - ok_a = line_true[2] == line_pred[2] - ok_l = line_true[4] == line_pred[4] # use clabel - ok_n = line_true[5] == line_pred[5] - ok_r = line_true[6] == line_pred[6] - # update running counters - cnt_tot += 1 - if ok_a: - cnt_a += 1 - if ok_l: - cnt_l += 1 - if ok_n: - cnt_n += 1 - if ok_r: - cnt_r += 1 - if ok_a and ok_l: - cnt_al += 1 - if ok_a and ok_n: - cnt_an += 1 - if ok_a and ok_r: - cnt_ar += 1 - if ok_a and ok_l and ok_n: - cnt_aln += 1 - if ok_a and ok_l and ok_n and ok_r: - cnt_alnr += 1 - print('\t'.join(['a', 'l', 'n', 'r', + # table header + print('\t'.join(['parser', + 'a', 'l', 'n', 'r', 'al', 'an', 'ar', 'aln', - 'alnr'])) - print('\t'.join('{:.4f}'.format(float(cnt_x) / cnt_tot) - for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, - cnt_al, cnt_an, cnt_ar, - cnt_aln, - cnt_alnr])) + 'alnr', + 'support'])) + + for author_pred in authors_pred: + dir_pred = os.path.join('TMP_disdep', nary_enc, author_pred, 'test') + files_pred = {os.path.basename(f).rsplit('.')[0]: f + for f in glob(os.path.join(dir_pred, '*.dis_dep'))} + assert sorted(files_true.keys()) == sorted(files_pred.keys()) + + cnt_tot = 0 # total deps + cnt_a = 0 # correct heads (attachments) + cnt_l = 0 # correct labels + cnt_n = 0 # correct nuclearity + cnt_r = 0 # correct ranks + cnt_al = 0 # correct labelled attachments + cnt_an = 0 # correct attachment + nuc + cnt_ar = 0 # correct attachment + rank + cnt_aln = 0 # correct attachment + label + nuc + cnt_alnr = 0 # correct attachment + label + nuc + rank + + for doc_name, f_true in files_true.items(): + f_pred = files_pred[doc_name] + with codecs.open(f_true, 'r', encoding='utf-8') as f_true: + with codecs.open(f_pred, 'r', encoding='utf-8') as f_pred: + reader_true = csv.reader(f_true, dialect=csv.excel_tab) + reader_pred = csv.reader(f_pred, dialect=csv.excel_tab) + for line_true, line_pred in zip(reader_true, reader_pred): + # i, txt, head, label, clabel, nuc, rank + assert line_true[0] == line_pred[0] # safety check + ok_a = line_true[2] == line_pred[2] + ok_l = line_true[4] == line_pred[4] # use clabel + ok_n = line_true[5] == line_pred[5] + ok_r = line_true[6] == line_pred[6] + # update running counters + cnt_tot += 1 + if ok_a: + cnt_a += 1 + if ok_l: + cnt_l += 1 + if ok_n: + cnt_n += 1 + if ok_r: + cnt_r += 1 + if ok_a and ok_l: + cnt_al += 1 + if ok_a and ok_n: + cnt_an += 1 + if ok_a and ok_r: + cnt_ar += 1 + if ok_a and ok_l and ok_n: + cnt_aln += 1 + if ok_a and ok_l and ok_n and ok_r: + cnt_alnr += 1 + print('\t'.join([author_pred] + + ['{:.4f}'.format(float(cnt_x) / cnt_tot) + for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, + cnt_al, cnt_an, cnt_ar, + cnt_aln, + cnt_alnr]] + + [str(cnt_tot)])) From c8097a5ada15c68db1e140953dc5ec834d56166e Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 14 Sep 2016 18:22:16 +0200 Subject: [PATCH 12/74] WIP add nary_enc to params in gather and local --- irit_rst_dt/cmd/gather.py | 3 ++- irit_rst_dt/local.py | 5 +++++ 2 files 
changed, 7 insertions(+), 1 deletion(-) diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index 996f570..b986097 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -16,7 +16,7 @@ from ..local import (FEATURE_SET, LABEL_SET, TEST_CORPUS, TRAINING_CORPUS, SAME_UNIT, PTB_DIR, CORENLP_OUT_DIR, LECSIE_DATA_DIR, - EVALUATIONS) + NARY_ENC, EVALUATIONS) from ..util import (current_tmp, latest_tmp) NAME = 'gather' @@ -86,6 +86,7 @@ def extract_features(corpus, output_dir, fix_pseudo_rels, instances, PTB_DIR, # TODO make this optional and exclusive from CoreNLP output_dir, '--feature_set', FEATURE_SET, + '--nary_enc', NARY_ENC, # 2016-09-12 '--instances', instances, ] # NEW 2016-05-19 rewrite pseudo-relations diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index 0cb9cb1..bfe2691 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -147,6 +147,11 @@ Whether to have a special processing for same-unit """ +NARY_ENC = 'tree' # one of {'chain', 'tree'} +""" +Encoding for n-ary nodes in the ctree. +""" + FIXED_FOLD_FILE = None # FIXED_FOLD_FILE = 'folds-TRAINING.json' """ From a094aff82077e8e3955df8d29dfc4d9de936f235 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 14 Sep 2016 18:23:00 +0200 Subject: [PATCH 13/74] WIP cleaner evaluation: toggle for nary_enc, binarize_ref --- evals/codra.py | 16 ++++++++++------ evals/ours.py | 35 +++++++++++++++++++++-------------- evals/showdown.py | 42 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 65 insertions(+), 28 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index fb36048..c1ed324 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -41,8 +41,10 @@ def eval_codra_output(codra_out_dir, edus_file, + nary_enc, nuc_strategy, rank_strategy, prioritize_same_unit=True, + binarize_ref=False, detailed=False): """Load and evaluate the .dis files output by CODRA. 
@@ -77,12 +79,15 @@ def eval_codra_output(codra_out_dir, edus_file, # transform into binary tree with coarse-grained labels coarse_rtree_true = REL_CONV(rtree_true) - bin_rtree_true = _binarize(coarse_rtree_true) - ctree_true[doc_name] = bin_rtree_true + if binarize_ref: + bin_rtree_true = _binarize(coarse_rtree_true) + ct_true = bin_rtree_true + else: + ct_true = coarse_rtree_true + ctree_true[doc_name] = ct_true # transform into dependency tree via SimpleRSTTree - bin_srtree_true = SimpleRSTTree.from_rst_tree(coarse_rtree_true) - dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) dtree_true[doc_name] = dt_true # WIP 2016-06-29 para_idx @@ -131,8 +136,7 @@ def eval_codra_output(codra_out_dir, edus_file, # dependency tree # conversion via SimpleRSTTree to RstDepTree - bin_srtree_pred = SimpleRSTTree.from_rst_tree(coarse_rtree_pred) - dt_pred = RstDepTree.from_simple_rst_tree(bin_srtree_pred) + dt_pred = RstDepTree.from_rst_tree(coarse_rtree_pred, nary_enc='chain') dtree_pred[doc_name] = dt_pred # compare pred and true diff --git a/evals/ours.py b/evals/ours.py index 156c76a..90df9e6 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -15,6 +15,7 @@ from educe.rst_dt.corpus import (Reader as RstReader, RstRelationConverter as RstRelationConverter) from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, + deptree_to_rst_tree, DummyNuclearityClassifier, InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree, RstDtException @@ -79,8 +80,11 @@ def load_attelo_output_file(output_file): def load_deptrees_from_attelo_output(output_file, edus_file, + nary_enc, nuc_strategy, rank_strategy, prioritize_same_unit=True, + order='weak', + binarize_ref=False, detailed=False, skpd_docs=None): """Load an RstDepTree from the output of attelo. 
@@ -115,12 +119,15 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # transform into binary tree with coarse-grained labels coarse_rtree_true = REL_CONV(rtree_true) - bin_rtree_true = _binarize(coarse_rtree_true) - ctree_true[doc_name] = bin_rtree_true + if binarize_ref: + bin_rtree_true = _binarize(coarse_rtree_true) + ct_true = bin_rtree_true + else: + ct_true = coarse_rtree_true + ctree_true[doc_name] = ct_true - # transform into dependency tree via SimpleRSTTree - bin_srtree_true = SimpleRSTTree.from_rst_tree(coarse_rtree_true) - dt_true = RstDepTree.from_simple_rst_tree(bin_srtree_true) + # transform into dependency tree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) dtree_true[doc_name] = dt_true # 2016-06-28 retrieve paragraph idx of each EDU @@ -206,7 +213,8 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # ranking classifier rank_classifier = InsideOutAttachmentRanker( strategy=rank_strategy, - prioritize_same_unit=prioritize_same_unit) + prioritize_same_unit=prioritize_same_unit, + order=order) rank_classifier.fit(X_train, y_rank_train) # rebuild RstDepTrees @@ -245,14 +253,13 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # create pred ctree try: - bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) - if False: # EXPERIMENTAL - # currently False to run on output that already has - # labels embedding nuclearity - bin_srtree_pred = SimpleRSTTree.incorporate_nuclearity_into_label( - bin_srtree_pred) - bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) - ctree_pred[doc_name] = bin_rtree_pred + if False: + rtree_pred = deptree_to_rst_tree(dt_pred) + ctree_pred[doc_name] = rtree_pred + else: # legacy: via SimpleRSTTree, forces binarization + bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) + bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) + ctree_pred[doc_name] = bin_rtree_pred except RstDtException as rst_e: print(rst_e) skipped_docs.add(doc_name) diff --git a/evals/showdown.py b/evals/showdown.py index 14c5a2f..677c0bf 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -46,6 +46,14 @@ 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') +# 2016-09-14 "tree" transform, predicted syntax +EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/latest', # lbl + 'scratch-current/combined', + 'output.maxent-iheads-global-AD.L-jnt_su-eisner') +# end 2016-09-14 + EISNER_OUT_SYN_PRED_SU = os.path.join( '/home/mmorey/melodi', 'irit-rst-dt/TMP/latest', # lbl @@ -58,7 +66,7 @@ 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') -CODRA_OUT_DIR = '/home/mmorey/melodi/joty/Doc-level' +CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' @@ -70,28 +78,46 @@ print('CODRA (Joty)') eval_codra_output(CODRA_OUT_DIR, EDUS_FILE, + 'chain', nuc_strategy="unamb_else_most_frequent", - rank_strategy='closest-intra-rl-inter-rl', + rank_strategy='sdist-edist-rl', prioritize_same_unit=True, - detailed=True) + binarize_ref=False, + detailed=False) print('=======================') -print('Eisner, predicted syntax') +print('[chain] Eisner, predicted syntax') load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED, EDUS_FILE, + 'chain', nuc_strategy="unamb_else_most_frequent", # nuc_strategy="most_frequent_by_rel", - rank_strategy='closest-intra-rl-inter-rl', + rank_strategy='sdist-edist-rl', prioritize_same_unit=True, - detailed=True) + order='weak', + binarize_ref=False, + detailed=False) +print('======================') + 
+print('[tree] Eisner, predicted syntax + same-unit') +load_deptrees_from_attelo_output(EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + 'tree', + nuc_strategy="unamb_else_most_frequent", + # nuc_strategy="most_frequent_by_rel", + rank_strategy='sdist-edist-rl', + prioritize_same_unit=True, + order='weak', + binarize_ref=False, + detailed=False) print('======================') print('Eisner, predicted syntax + same-unit') load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + 'chain', nuc_strategy="unamb_else_most_frequent", # nuc_strategy="most_frequent_by_rel", - rank_strategy='closest-intra-rl-inter-rl', + rank_strategy='sdist-edist-rl', prioritize_same_unit=True, - detailed=True) + detailed=False) print('======================') print('Eisner, gold syntax') From d38d5df647759953eaf5b424f545dc04313b5b73 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 15 Sep 2016 16:09:48 +0200 Subject: [PATCH 14/74] WIP cleaner eval showdown --- evals/codra.py | 93 ++++---------------------- evals/ours.py | 97 ++++++--------------------- evals/showdown.py | 164 ++++++++++++++++++++++++++++++---------------- 3 files changed, 142 insertions(+), 212 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index c1ed324..7bc5275 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -6,17 +6,11 @@ from collections import defaultdict import itertools -import os import numpy as np -from educe.rst_dt.annotation import SimpleRSTTree, _binarize from educe.rst_dt.codra import load_codra_output_files -from educe.rst_dt.corpus import (Reader as RstReader, - RstRelationConverter as RstRelationConverter) -from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, - DummyNuclearityClassifier, - InsideOutAttachmentRanker) +from educe.rst_dt.dep2con import deptree_to_rst_tree from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.document_plus import align_edus_with_paragraphs # @@ -26,38 +20,14 @@ from attelo.metrics.deptree import compute_uas_las -# RST corpus -CORPUS_DIR = os.path.abspath(os.path.join( - os.path.dirname(os.path.realpath(__file__)), - '..', 'corpus', - 'RSTtrees-WSJ-main-1.0/')) -CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') -CD_TEST = os.path.join(CORPUS_DIR, 'TEST') -# relation converter (fine- to coarse-grained labels) -RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', - 'educe', 'rst_dt', - 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree - - -def eval_codra_output(codra_out_dir, edus_file, - nary_enc, - nuc_strategy, rank_strategy, - prioritize_same_unit=True, - binarize_ref=False, +def eval_codra_output(ctree_true, dtree_true, + codra_out_dir, edus_file, + nuc_clf, rnk_clf, detailed=False): """Load and evaluate the .dis files output by CODRA. This currently runs on the document-level files (.doc_dis). 
""" - # load reference trees - dtree_true = dict() # dependency trees - ctree_true = dict() # constituency trees - # FIXME: find ways to read the right (not necessarily TEST) section - # and only the required documents - rst_reader = RstReader(CD_TEST) - rst_corpus = rst_reader.slurp() - # WIP 2016-06-29 sent_idx att_edus = load_edus(edus_file) edu2sent_idx = defaultdict(dict) @@ -74,22 +44,7 @@ def eval_codra_output(codra_out_dir, edus_file, for doc_name, edu2sent in edu2sent_idx.items()} doc_name2edu2para = dict() - for doc_id, rtree_true in sorted(rst_corpus.items()): - doc_name = doc_id.doc - - # transform into binary tree with coarse-grained labels - coarse_rtree_true = REL_CONV(rtree_true) - if binarize_ref: - bin_rtree_true = _binarize(coarse_rtree_true) - ct_true = bin_rtree_true - else: - ct_true = coarse_rtree_true - ctree_true[doc_name] = ct_true - - # transform into dependency tree via SimpleRSTTree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) - dtree_true[doc_name] = dt_true - + for doc_name, rtree_true in sorted(ctree_true.items()): # WIP 2016-06-29 para_idx doc_edus = rtree_true.leaves() doc_txt = doc_edus[0].context._text @@ -115,7 +70,6 @@ def eval_codra_output(codra_out_dir, edus_file, doc_name2edu2para[doc_name] = None # end retrieve paragraph idx - # load predicted trees data_pred = load_codra_output_files(codra_out_dir) # filenames = data_pred['filenames'] @@ -129,13 +83,11 @@ def eval_codra_output(codra_out_dir, edus_file, for doc_name, rst_ctree in itertools.izip(doc_names_pred, rst_ctrees_pred): # constituency tree # replace fine-grained labels with coarse-grained labels - # 2016-06-27 useless, the files we have already contain the coarse - # labels - coarse_rtree_pred = REL_CONV(rst_ctree) + # no need to replace labels: the files we have already contain + # the coarse labels + coarse_rtree_pred = rst_ctree ctree_pred[doc_name] = coarse_rtree_pred - - # dependency tree - # conversion via SimpleRSTTree to RstDepTree + # convert to weakly-ordered dependency tree dt_pred = RstDepTree.from_rst_tree(coarse_rtree_pred, nary_enc='chain') dtree_pred[doc_name] = dt_pred @@ -167,42 +119,25 @@ def eval_codra_output(codra_out_dir, edus_file, print(parseval_detailed_report(ctree_true, ctree_pred, metric_type='S+R')) - if False: + if True: # WIP 2016-06-29 use our deterministic classifiers for nuc and rank # => estimate degradation on Joty's output => hint at ours - # FIXME declare, fit and predict upstream on the training corpus... 
- # but currently fit is a no-op for both so this horror is in fact safe - X_train = [] - y_nuc_train = [] - y_rank_train = [] - for doc_name, dt in sorted(dtree_true.items()): - X_train.append(dt) - y_nuc_train.append(dt.nucs) - y_rank_train.append(dt.ranks) # nuclearity - nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy) - nuc_classifier.fit(X_train, y_nuc_train) - # ranking classifier - rank_classifier = InsideOutAttachmentRanker( - strategy=rank_strategy, - prioritize_same_unit=prioritize_same_unit) - rank_classifier.fit(X_train, y_rank_train) # rebuild ctrees ctree_pred2 = dict() for doc_name, dt_pred in sorted(dtree_pred.items()): # set nuclearity - dt_pred.nucs = nuc_classifier.predict([dt_pred])[0] + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] # set ranking, needs sent_idx (WIP on para_idx) edu2sent = doc_name2edu2sent[doc_name] dt_pred.sent_idx = edu2sent # 2016-06-28 same for edu2para edu2para = doc_name2edu2para[doc_name] dt_pred.para_idx = edu2para - dt_pred.ranks = rank_classifier.predict([dt_pred])[0] + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] # end NEW - bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) - bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) - ctree_pred2[doc_name] = bin_rtree_pred + rtree_pred = deptree_to_rst_tree(dt_pred) + ctree_pred2[doc_name] = rtree_pred # skipped_docs = set() ctree_pred2 = [ct for doc_name, ct in sorted(ctree_pred2.items()) diff --git a/evals/ours.py b/evals/ours.py index 90df9e6..1633edf 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -5,19 +5,13 @@ from __future__ import print_function from collections import defaultdict -import os import numpy as np from educe.annotation import Span as EduceSpan -from educe.rst_dt.annotation import (EDU as EduceEDU, - SimpleRSTTree, _binarize) -from educe.rst_dt.corpus import (Reader as RstReader, - RstRelationConverter as RstRelationConverter) +from educe.rst_dt.annotation import (EDU as EduceEDU, SimpleRSTTree) from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, - deptree_to_rst_tree, - DummyNuclearityClassifier, - InsideOutAttachmentRanker) + deptree_to_rst_tree) from educe.rst_dt.deptree import RstDepTree, RstDtException from educe.rst_dt.document_plus import align_edus_with_paragraphs # @@ -28,20 +22,6 @@ from attelo.table import UNRELATED # for load_attelo_output_file -# RST corpus -CORPUS_DIR = os.path.abspath(os.path.join( - os.path.dirname(os.path.realpath(__file__)), - '..', 'corpus', - 'RSTtrees-WSJ-main-1.0/')) -CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') -CD_TEST = os.path.join(CORPUS_DIR, 'TEST') -# relation converter (fine- to coarse-grained labels) -RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', - 'educe', 'rst_dt', - 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree - - # move to attelo.datasets.attelo_out_format def load_attelo_output_file(output_file): """Load edges from an attelo output file. @@ -79,24 +59,25 @@ def load_attelo_output_file(output_file): return edges_pred -def load_deptrees_from_attelo_output(output_file, edus_file, - nary_enc, - nuc_strategy, rank_strategy, - prioritize_same_unit=True, - order='weak', - binarize_ref=False, +def load_deptrees_from_attelo_output(ctree_true, dtree_true, + output_file, edus_file, + nuc_clf, rnk_clf, detailed=False, skpd_docs=None): """Load an RstDepTree from the output of attelo. Parameters ---------- + ctree_true: dict(str, RSTTree) + Ground truth RST ctree. + dtree_true: dict(str, RstDepTree) + Ground truth RST (ordered) dtree. 
output_file: string Path to the file that contains attelo's output - nuc_strategy: string - Strategy to predict nuclearity - rank_strategy: string - Strategy to predict attachment ranking + nuc_clf: NuclearityClassifier + Classifier to predict nuclearity + rnk_clf: RankClassifier + Classifier to predict attachment ranking skpd_docs: set(string) Names of documents that should be skipped to compute scores @@ -108,28 +89,7 @@ def load_deptrees_from_attelo_output(output_file, edus_file, doc_name2edu2para = dict() # load reference trees - dtree_true = dict() # dependency trees - ctree_true = dict() # constituency trees - # FIXME: find ways to read the right (not necessarily TEST) section - # and only the required documents - rst_reader = RstReader(CD_TEST) - rst_corpus = rst_reader.slurp() - for doc_id, rtree_true in sorted(rst_corpus.items()): - doc_name = doc_id.doc - - # transform into binary tree with coarse-grained labels - coarse_rtree_true = REL_CONV(rtree_true) - if binarize_ref: - bin_rtree_true = _binarize(coarse_rtree_true) - ct_true = bin_rtree_true - else: - ct_true = coarse_rtree_true - ctree_true[doc_name] = ct_true - - # transform into dependency tree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) - dtree_true[doc_name] = dt_true - + for doc_name, rtree_true in sorted(ctree_true.items()): # 2016-06-28 retrieve paragraph idx of each EDU # FIXME refactor to get in a better way, in a better place # currently, we take EDUs from the RSTTree and paragraphs from @@ -198,25 +158,6 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # re-build predicted trees from predicted edges and educe EDUs skipped_docs = set() # docs skipped because non-projective structures - # classifiers for nuclearity and ranking - # FIXME declare, fit and predict upstream... 
- X_train = [] - y_nuc_train = [] - y_rank_train = [] - for doc_name, dt in sorted(dtree_true.items()): - X_train.append(dt) - y_nuc_train.append(dt.nucs) - y_rank_train.append(dt.ranks) - # nuclearity - nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy) - nuc_classifier.fit(X_train, y_nuc_train) - # ranking classifier - rank_classifier = InsideOutAttachmentRanker( - strategy=rank_strategy, - prioritize_same_unit=prioritize_same_unit, - order=order) - rank_classifier.fit(X_train, y_rank_train) - # rebuild RstDepTrees for doc_name, es_pred in sorted(edges_pred.items()): # get educe EDUs @@ -233,7 +174,7 @@ def load_deptrees_from_attelo_output(output_file, edus_file, dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) # NEW add nuclearity: heuristic baseline if True: - dt_pred.nucs = nuc_classifier.predict([dt_pred])[0] + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] else: # EXPERIMENTAL use gold nuclearity dt_pred.nucs = dtree_true[doc_name].nucs # NEW add rank: some strategies require a mapping from EDU to sentence @@ -247,13 +188,14 @@ def load_deptrees_from_attelo_output(output_file, edus_file, # end EXPERIMENTAL if False: # DEBUG print(doc_name) - dt_pred.ranks = rank_classifier.predict([dt_pred])[0] + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] # end NEW dtree_pred[doc_name] = dt_pred # create pred ctree try: - if False: + if True: # NEW 2016-09-14 + # direct conversion from ordered dtree to ctree rtree_pred = deptree_to_rst_tree(dt_pred) ctree_pred[doc_name] = rtree_pred else: # legacy: via SimpleRSTTree, forces binarization @@ -303,5 +245,6 @@ def load_deptrees_from_attelo_output(output_file, edus_file, if detailed: print(parseval_detailed_report(ctree_true, ctree_pred, metric_type='S+R')) - + # DEBUG + # end DEBUG return skipped_docs diff --git a/evals/showdown.py b/evals/showdown.py index 677c0bf..8f0ccea 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -7,11 +7,12 @@ import os -# from educe.rst_dt.annotation import RSTTree, SimpleRSTTree, _binarize -from educe.rst_dt.corpus import RstRelationConverter # , Reader as RstReader - -# from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree) -# from educe.rst_dt.deptree import (RstDepTree, RstDtException) +from educe.rst_dt.annotation import _binarize +from educe.rst_dt.corpus import (RstRelationConverter, + Reader as RstReader) +from educe.rst_dt.dep2con import (DummyNuclearityClassifier, + InsideOutAttachmentRanker) +from educe.rst_dt.deptree import RstDepTree # # from attelo.metrics.constituency import (LBL_FNS, parseval_detailed_report, # parseval_report) @@ -54,6 +55,7 @@ 'output.maxent-iheads-global-AD.L-jnt_su-eisner') # end 2016-09-14 + EISNER_OUT_SYN_PRED_SU = os.path.join( '/home/mmorey/melodi', 'irit-rst-dt/TMP/latest', # lbl @@ -69,67 +71,117 @@ CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' +# hyperparams +NUC_STRATEGY = 'unamb_else_most_frequent' +RNK_STRATEGY = 'sdist-edist-rl' +RNK_PRIORITY_SU = True +RNK_ORDER = 'weak' + # FIXME: -# * [ ] load gold trees here once and for all, pass them to each evaluation # * [ ] create summary table with one system per row, one metric per column, # keep only the f-score (because for binary trees with manual segmentation # precision = recall = f-score). 
-print('CODRA (Joty)') -eval_codra_output(CODRA_OUT_DIR, EDUS_FILE, - 'chain', - nuc_strategy="unamb_else_most_frequent", - rank_strategy='sdist-edist-rl', - prioritize_same_unit=True, - binarize_ref=False, - detailed=False) -print('=======================') - -print('[chain] Eisner, predicted syntax') -load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED, EDUS_FILE, - 'chain', - nuc_strategy="unamb_else_most_frequent", - # nuc_strategy="most_frequent_by_rel", - rank_strategy='sdist-edist-rl', - prioritize_same_unit=True, - order='weak', - binarize_ref=False, - detailed=False) -print('======================') - -print('[tree] Eisner, predicted syntax + same-unit') -load_deptrees_from_attelo_output(EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, - 'tree', - nuc_strategy="unamb_else_most_frequent", - # nuc_strategy="most_frequent_by_rel", - rank_strategy='sdist-edist-rl', - prioritize_same_unit=True, - order='weak', - binarize_ref=False, - detailed=False) -print('======================') +# 1. load train section of the RST corpus, fit (currently dummy) classifiers +# for nuclearity and rank +reader_train = RstReader(CD_TRAIN) +corpus_train = reader_train.slurp() +# gold RST trees +ctree_true = dict() # ctrees +ctree_bin_true = dict() # ctrees, binarized +dtree_true = dict() # dtrees from the original ctrees ('tree' transform) +dtree_bin_true = dict() # dtrees from the binarized ctrees ('chain' transform) +for doc_id, ct_true in sorted(corpus_train.items()): + doc_name = doc_id.doc + # flavours of ctree + ct_true = REL_CONV(ct_true) # map fine to coarse relations + ctree_true[doc_name] = ct_true + ct_bin_true = _binarize(ct_true) + ctree_bin_true[doc_name] = ct_bin_true + # flavours of dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc='tree') + dt_bin_true = RstDepTree.from_rst_tree(ct_true, nary_enc='chain') + # alt: + # dt_bin_true = RstDepTree.from_rst_tree(ct_bin_true, nary_enc='chain') + dtree_true[doc_name] = dt_true + dtree_bin_true[doc_name] = dt_bin_true +# fit classifiers for nuclearity and rank (DIRTY) +# NB: both are (dummily) fit on weakly ordered dtrees +X_train = [] +y_nuc_train = [] +y_rnk_train = [] +for doc_name, dt in sorted(dtree_true.items()): + X_train.append(dt) + y_nuc_train.append(dt.nucs) + y_rnk_train.append(dt.ranks) +# nuclearity clf +nuc_clf = DummyNuclearityClassifier(strategy=NUC_STRATEGY) +nuc_clf.fit(X_train, y_nuc_train) +# rank clf +rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, + prioritize_same_unit=RNK_PRIORITY_SU, + order=RNK_ORDER) +rnk_clf.fit(X_train, y_rnk_train) + +# load test section of the RST corpus +reader_test = RstReader(CD_TEST) +corpus_test = reader_test.slurp() +# gold RST trees +ctree_true = dict() # ctrees +ctree_bin_true = dict() # ctrees, binarized +dtree_true = dict() # dtrees from the original ctrees ('tree' transform) +dtree_bin_true = dict() # dtrees from the binarized ctrees ('chain' transform) +for doc_id, ct_true in sorted(corpus_test.items()): + doc_name = doc_id.doc + # flavours of ctree + ct_true = REL_CONV(ct_true) # map fine to coarse relations + ctree_true[doc_name] = ct_true + ct_bin_true = _binarize(ct_true) + ctree_bin_true[doc_name] = ct_bin_true + # flavours of dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc='tree') + dt_bin_true = RstDepTree.from_rst_tree(ct_true, nary_enc='chain') + # alt: + # dt_bin_true = RstDepTree.from_rst_tree(ct_bin_true, nary_enc='chain') + dtree_true[doc_name] = dt_true + dtree_bin_true[doc_name] = dt_bin_true + + +if True: + print('CODRA (Joty)') + 
eval_codra_output(ctree_true, dtree_true, + CODRA_OUT_DIR, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('=======================') + +if True: + print('[chain] Eisner, predicted syntax') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + +if True: + print('[tree] Eisner, predicted syntax + same-unit') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') print('Eisner, predicted syntax + same-unit') -load_deptrees_from_attelo_output(EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - 'chain', - nuc_strategy="unamb_else_most_frequent", - # nuc_strategy="most_frequent_by_rel", - rank_strategy='sdist-edist-rl', - prioritize_same_unit=True, +load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, detailed=False) print('======================') print('Eisner, gold syntax') -load_deptrees_from_attelo_output(EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_strategy="unamb_else_most_frequent", - # nuc_strategy="most_frequent_by_rel", - rank_strategy='closest-intra-rl-inter-rl', - prioritize_same_unit=True) +load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_GOLD, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) print('======================') - - -# TODO use nuclearity classifier -# starting with baseline: DummyNuclearityClassifier, that assigns to each -# EDU the most frequent nuclearity of its (incoming) relation in the -# training corpus, i.e. 'S' for 'NS', 'N' for 'NN' From 290fe5c54f87a0d58e63f3a39da4c71585dbc6ad Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 15 Sep 2016 16:26:55 +0200 Subject: [PATCH 15/74] FIX pass rel_conv and nary_enc to codra eval --- evals/codra.py | 15 +++++++++------ evals/showdown.py | 4 +++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index 7bc5275..d83318e 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -22,7 +22,9 @@ def eval_codra_output(ctree_true, dtree_true, codra_out_dir, edus_file, - nuc_clf, rnk_clf, + rel_conv=None, + nary_enc='chain', + nuc_clf=None, rnk_clf=None, detailed=False): """Load and evaluate the .dis files output by CODRA. 
@@ -80,15 +82,16 @@ def eval_codra_output(ctree_true, dtree_true, dtree_pred = dict() # dependency trees ctree_pred = dict() # constituency trees - for doc_name, rst_ctree in itertools.izip(doc_names_pred, rst_ctrees_pred): + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): # constituency tree # replace fine-grained labels with coarse-grained labels # no need to replace labels: the files we have already contain # the coarse labels - coarse_rtree_pred = rst_ctree - ctree_pred[doc_name] = coarse_rtree_pred + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred # convert to weakly-ordered dependency tree - dt_pred = RstDepTree.from_rst_tree(coarse_rtree_pred, nary_enc='chain') + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc='chain') dtree_pred[doc_name] = dt_pred # compare pred and true @@ -119,7 +122,7 @@ def eval_codra_output(ctree_true, dtree_true, print(parseval_detailed_report(ctree_true, ctree_pred, metric_type='S+R')) - if True: + if nuc_clf is not None and rnk_clf is not None: # WIP 2016-06-29 use our deterministic classifiers for nuc and rank # => estimate degradation on Joty's output => hint at ours # nuclearity diff --git a/evals/showdown.py b/evals/showdown.py index 8f0ccea..5cfa81a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -152,7 +152,9 @@ print('CODRA (Joty)') eval_codra_output(ctree_true, dtree_true, CODRA_OUT_DIR, EDUS_FILE, - nuc_clf, rnk_clf, + rel_conv=REL_CONV, + nary_enc='chain', + nuc_clf=nuc_clf, rnk_clf=rnk_clf, detailed=False) print('=======================') From 2fcdfc3041349609dd7a724c51a5a3de52d4ab18 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 16 Sep 2016 11:57:53 +0200 Subject: [PATCH 16/74] WIP more compact display in showdown --- evals/codra.py | 14 +-- evals/showdown.py | 312 ++++++++++++++++++++++++++++++---------------- 2 files changed, 206 insertions(+), 120 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index d83318e..21f1faa 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -94,19 +94,7 @@ def eval_codra_output(ctree_true, dtree_true, dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc='chain') dtree_pred[doc_name] = dt_pred - # compare pred and true - common_doc_names = set(dtree_true.keys()) & set(dtree_pred.keys()) - - # dep scores - dtree_true_list = [dt for doc_name, dt in sorted(dtree_true.items()) - if doc_name in common_doc_names] - dtree_pred_list = [dt for doc_name, dt in sorted(dtree_pred.items()) - if doc_name in common_doc_names] - - score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, - dtree_pred_list) - print('UAS / LAS / LS : {:.4f} / {:.4f} / {:.4f}'.format( - score_uas, score_las, score_ls)) + return ctree_pred, dtree_pred skipped_docs = set() # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where diff --git a/evals/showdown.py b/evals/showdown.py index 5cfa81a..10ed7ff 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -5,6 +5,7 @@ from __future__ import print_function +import argparse import os from educe.rst_dt.annotation import _binarize @@ -14,8 +15,10 @@ InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree # -# from attelo.metrics.constituency import (LBL_FNS, parseval_detailed_report, -# parseval_report) +from attelo.metrics.constituency import (parseval_detailed_report, + parseval_report) +from attelo.metrics.deptree import compute_uas_las + # local to this package from evals.codra import eval_codra_output from evals.ours import load_deptrees_from_attelo_output @@ -70,7 
+73,8 @@ CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' - +# level of detail for parseval +DETAILED = False # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' RNK_STRATEGY = 'sdist-edist-rl' @@ -78,112 +82,206 @@ RNK_ORDER = 'weak' +def setup_dtree_postprocessor(nary_enc): + """Setup the nuclearity and rank classifiers to flesh out dtrees.""" + # load train section of the RST corpus, fit (currently dummy) classifiers + # for nuclearity and rank + reader_train = RstReader(CD_TRAIN) + corpus_train = reader_train.slurp() + # gold RST trees + ctree_true = dict() # ctrees + dtree_true = dict() # dtrees from the original ctrees ('tree' transform) + + for doc_id, ct_true in sorted(corpus_train.items()): + doc_name = doc_id.doc + # flavours of ctree + ct_true = REL_CONV(ct_true) # map fine to coarse relations + ctree_true[doc_name] = ct_true + # flavours of dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) + dtree_true[doc_name] = dt_true + # fit classifiers for nuclearity and rank (DIRTY) + # NB: both are (dummily) fit on weakly ordered dtrees + X_train = [] + y_nuc_train = [] + y_rnk_train = [] + for doc_name, dt in sorted(dtree_true.items()): + X_train.append(dt) + y_nuc_train.append(dt.nucs) + y_rnk_train.append(dt.ranks) + # nuclearity clf + nuc_clf = DummyNuclearityClassifier(strategy=NUC_STRATEGY) + nuc_clf.fit(X_train, y_nuc_train) + # rank clf + rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, + prioritize_same_unit=RNK_PRIORITY_SU, + order=RNK_ORDER) + rnk_clf.fit(X_train, y_rnk_train) + return nuc_clf, rnk_clf + + # FIXME: # * [ ] create summary table with one system per row, one metric per column, # keep only the f-score (because for binary trees with manual segmentation # precision = recall = f-score). +def main(): + """Run the eval""" + parser = argparse.ArgumentParser( + description="Evaluate parsers' output against a given reference") + # predictions + parser.add_argument('authors_pred', nargs='+', + choices=['gold', 'silver', + 'joty', 'feng', 'ji', + 'ours_chain', 'ours_tree'], + help="Author(s) of the predictions") + parser.add_argument('--nary_enc_pred', default='tree', + choices=['tree', 'chain'], + help="Encoding of n-ary nodes for the predictions") + # reference + parser.add_argument('--author_true', default='gold', + choices=['gold', 'silver', + 'joty', 'feng', 'ji', + 'ours_chain', 'ours_tree'], + help="Author of the reference") + # * dtree eval + parser.add_argument('--nary_enc_true', default='tree', + choices=['tree', 'chain'], + help="Encoding of n-ary nodes for the reference") + # * ctree eval + parser.add_argument('--binarize_true', action='store_true', + help="Binarize the reference ctree for the eval") + + # + args = parser.parse_args() + author_true = args.author_true + nary_enc_true = args.nary_enc_true + authors_pred = args.authors_pred + nary_enc_pred = args.nary_enc_pred + binarize_true = args.binarize_true + + # 0. 
setup the postprocessors to flesh out unordered dtrees into ordered + # ones with nuclearity + nuc_clf, rnk_clf = setup_dtree_postprocessor(nary_enc_pred) + + # the eval compares parses for the test section of the RST corpus + reader_test = RstReader(CD_TEST) + corpus_test = reader_test.slurp() + + # reference + # current assumption: author_true is 'gold' + if author_true != 'gold': + raise NotImplementedError('Not yet') + + ctree_true = dict() # ctrees + dtree_true = dict() # dtrees from the original ctrees ('tree' transform) + for doc_id, ct_true in sorted(corpus_test.items()): + doc_name = doc_id.doc + # original reference ctree, with coarse labels + ct_true = REL_CONV(ct_true) # map fine to coarse relations + # corresponding dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) + dtree_true[doc_name] = dt_true + # binarize ctree if necessary + if binarize_true: + ct_true = _binarize(ct_true) + ctree_true[doc_name] = ct_true + + # predictions: [(parser_name, ([doc_names], [ct_pred], [dt_pred]))] + predictions = [] + if 'joty' in authors_pred: + # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees + predictions.append( + ('joty', eval_codra_output(ctree_true, dtree_true, + CODRA_OUT_DIR, EDUS_FILE, + rel_conv=REL_CONV, + nary_enc='chain', + nuc_clf=nuc_clf, rnk_clf=rnk_clf, + detailed=False)) + ) + + if 'ours_chain' in authors_pred: + print('[chain] Eisner, predicted syntax') + # attelo out: unordered dtree ; we pass a nuclearity and rank classifiers + # to get an ordered dtree ; + # need to map to ctree + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + if 'ours_tree' in authors_pred: + print('[tree] Eisner, predicted syntax + same-unit') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + if False: # FIXME repair (or forget) these + print('Eisner, predicted syntax + same-unit') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + print('Eisner, gold syntax') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_GOLD, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + # dependency eval + + # report + # * table format + digits = 4 + parser_names = ['joty'] + width = max(len(x) for x in parser_names) + + headers = ["UAS", "LAS", "LS"] + fmt = '%% %ds' % width # first col: parser name + fmt += ' ' + fmt += ' '.join(['% 9s' for _ in headers]) + fmt += '\n' + + headers = [""] + headers + report = fmt % tuple(headers) + report += '\n' + # end table format and header line + + # * table content + for parser_name, (ctree_pred, dtree_pred) in predictions: + doc_names = sorted(dtree_true.keys()) + dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, + dtree_pred_list) + # append to report + values = [parser_name] + for v in (score_uas, score_las, score_ls): + values += ["{0:0.{1}f}".format(v, digits)] + report += fmt % tuple(values) + # end table content + print(report) + # end report + + # constituency eval + for parser_name, (ctree_pred, dtree_pred) in predictions: + doc_names = 
sorted(ctree_true.keys()) + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + # FIXME + # compute and print PARSEVAL scores + print(parseval_report(ctree_true_list, ctree_pred_list, digits=4)) + # detailed report on S+N+R + if DETAILED: + print(parseval_detailed_report(ctree_true_list, ctree_pred_list, + metric_type='S+R')) + # end FIXME + -# 1. load train section of the RST corpus, fit (currently dummy) classifiers -# for nuclearity and rank -reader_train = RstReader(CD_TRAIN) -corpus_train = reader_train.slurp() -# gold RST trees -ctree_true = dict() # ctrees -ctree_bin_true = dict() # ctrees, binarized -dtree_true = dict() # dtrees from the original ctrees ('tree' transform) -dtree_bin_true = dict() # dtrees from the binarized ctrees ('chain' transform) -for doc_id, ct_true in sorted(corpus_train.items()): - doc_name = doc_id.doc - # flavours of ctree - ct_true = REL_CONV(ct_true) # map fine to coarse relations - ctree_true[doc_name] = ct_true - ct_bin_true = _binarize(ct_true) - ctree_bin_true[doc_name] = ct_bin_true - # flavours of dtree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc='tree') - dt_bin_true = RstDepTree.from_rst_tree(ct_true, nary_enc='chain') - # alt: - # dt_bin_true = RstDepTree.from_rst_tree(ct_bin_true, nary_enc='chain') - dtree_true[doc_name] = dt_true - dtree_bin_true[doc_name] = dt_bin_true -# fit classifiers for nuclearity and rank (DIRTY) -# NB: both are (dummily) fit on weakly ordered dtrees -X_train = [] -y_nuc_train = [] -y_rnk_train = [] -for doc_name, dt in sorted(dtree_true.items()): - X_train.append(dt) - y_nuc_train.append(dt.nucs) - y_rnk_train.append(dt.ranks) -# nuclearity clf -nuc_clf = DummyNuclearityClassifier(strategy=NUC_STRATEGY) -nuc_clf.fit(X_train, y_nuc_train) -# rank clf -rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, - prioritize_same_unit=RNK_PRIORITY_SU, - order=RNK_ORDER) -rnk_clf.fit(X_train, y_rnk_train) - -# load test section of the RST corpus -reader_test = RstReader(CD_TEST) -corpus_test = reader_test.slurp() -# gold RST trees -ctree_true = dict() # ctrees -ctree_bin_true = dict() # ctrees, binarized -dtree_true = dict() # dtrees from the original ctrees ('tree' transform) -dtree_bin_true = dict() # dtrees from the binarized ctrees ('chain' transform) -for doc_id, ct_true in sorted(corpus_test.items()): - doc_name = doc_id.doc - # flavours of ctree - ct_true = REL_CONV(ct_true) # map fine to coarse relations - ctree_true[doc_name] = ct_true - ct_bin_true = _binarize(ct_true) - ctree_bin_true[doc_name] = ct_bin_true - # flavours of dtree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc='tree') - dt_bin_true = RstDepTree.from_rst_tree(ct_true, nary_enc='chain') - # alt: - # dt_bin_true = RstDepTree.from_rst_tree(ct_bin_true, nary_enc='chain') - dtree_true[doc_name] = dt_true - dtree_bin_true[doc_name] = dt_bin_true - - -if True: - print('CODRA (Joty)') - eval_codra_output(ctree_true, dtree_true, - CODRA_OUT_DIR, EDUS_FILE, - rel_conv=REL_CONV, - nary_enc='chain', - nuc_clf=nuc_clf, rnk_clf=rnk_clf, - detailed=False) - print('=======================') - -if True: - print('[chain] Eisner, predicted syntax') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') - -if True: - print('[tree] Eisner, predicted syntax + same-unit') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_TREE_SYN_PRED_SU, 
EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') - -print('Eisner, predicted syntax + same-unit') -load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) -print('======================') - -print('Eisner, gold syntax') -load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) -print('======================') +if __name__ == '__main__': + main() From b2246b84619b619ec393adc54819e7a266ab399f Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 16 Sep 2016 18:17:00 +0200 Subject: [PATCH 17/74] WIP concise showdown: joty, ours --- evals/codra.py | 197 +++++++++++++++++++++++----------------------- evals/ours.py | 187 ++++++++++++++++--------------------------- evals/showdown.py | 72 +++++++++-------- 3 files changed, 204 insertions(+), 252 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index 21f1faa..f3b894e 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -20,120 +20,117 @@ from attelo.metrics.deptree import compute_uas_las -def eval_codra_output(ctree_true, dtree_true, - codra_out_dir, edus_file, - rel_conv=None, - nary_enc='chain', - nuc_clf=None, rnk_clf=None, - detailed=False): - """Load and evaluate the .dis files output by CODRA. +def load_codra_ctrees(codra_out_dir, rel_conv): + """Load the ctrees output by CODRA as .dis files. This currently runs on the document-level files (.doc_dis). - """ - # WIP 2016-06-29 sent_idx - att_edus = load_edus(edus_file) - edu2sent_idx = defaultdict(dict) - for att_edu in att_edus: - doc_name = att_edu.grouping - edu_num = int(att_edu.id.rsplit('_', 1)[1]) - sent_idx = int(att_edu.subgrouping.split('_sent')[1]) - edu2sent_idx[doc_name][edu_num] = sent_idx - # sort EDUs by num - # rebuild educe-style edu2sent ; prepend 0 for the fake root - doc_name2edu2sent = {doc_name: ([0] - + [s_idx for e_num, s_idx - in sorted(edu2sent.items())]) - for doc_name, edu2sent in edu2sent_idx.items()} - doc_name2edu2para = dict() - - for doc_name, rtree_true in sorted(ctree_true.items()): - # WIP 2016-06-29 para_idx - doc_edus = rtree_true.leaves() - doc_txt = doc_edus[0].context._text - # retrieve paragraph idx - doc_paras = doc_edus[0].context.paragraphs - if doc_paras is not None: - edu2para = align_edus_with_paragraphs( - doc_edus, doc_paras, doc_txt) - # yerk: interpolate values in edu2para where missing - edu2para_fix = [] - for edu_idx in edu2para: - if edu_idx is not None: - edu2para_fix.append(edu_idx) - else: - # interpolation strategy: copy the last regular value - # that has been seen - edu2para_fix.append(edu2para_fix[-1]) - edu2para = edu2para_fix - # end yerk: interpolate - edu2para = [0] + list(np.array(edu2para) + 1) - doc_name2edu2para[doc_name] = edu2para - else: - doc_name2edu2para[doc_name] = None - # end retrieve paragraph idx + Parameters + ---------- + codra_out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. 
+ """ # load predicted trees data_pred = load_codra_output_files(codra_out_dir) # filenames = data_pred['filenames'] doc_names_pred = data_pred['doc_names'] rst_ctrees_pred = data_pred['rst_ctrees'] - # gather predictions - dtree_pred = dict() # dependency trees + # build a dict from doc_name to ctree (RSTTree) ctree_pred = dict() # constituency trees - for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): # constituency tree - # replace fine-grained labels with coarse-grained labels - # no need to replace labels: the files we have already contain - # the coarse labels + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not if rel_conv is not None: ct_pred = rel_conv(ct_pred) ctree_pred[doc_name] = ct_pred - # convert to weakly-ordered dependency tree - dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc='chain') + + return ctree_pred + + +def load_codra_dtrees(codra_out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by CODRA. + + Parameters + ---------- + codra_out_dir: str + Path to the base directory containing the output files. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. + """ + # load predicted trees + data_pred = load_codra_output_files(codra_out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to ordered dtree (RstDepTree) + dtree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + # convert to an ordered dependency tree ; + # * 'tree' produces a weakly-ordered dtree strictly equivalent + # to the original ctree, + # * 'chain' produces a strictly-ordered dtree for which strict + # equivalence is not preserved + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) dtree_pred[doc_name] = dt_pred - return ctree_pred, dtree_pred - - skipped_docs = set() - # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where - # needed - ctree_true = [ct for doc_name, ct in sorted(ctree_true.items()) - if doc_name not in skipped_docs] - ctree_pred = [ct for doc_name, ct in sorted(ctree_pred.items()) - if doc_name not in skipped_docs] - # compute and print PARSEVAL scores - print(parseval_report(ctree_true, ctree_pred, digits=4)) - # detailed report on S+N+R - if detailed: - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) - - if nuc_clf is not None and rnk_clf is not None: - # WIP 2016-06-29 use our deterministic classifiers for nuc and rank - # => estimate degradation on Joty's output => hint at ours - # nuclearity - # rebuild ctrees - ctree_pred2 = dict() - for doc_name, dt_pred in sorted(dtree_pred.items()): - # set nuclearity - dt_pred.nucs = nuc_clf.predict([dt_pred])[0] - # set ranking, needs sent_idx (WIP on para_idx) - edu2sent = doc_name2edu2sent[doc_name] - dt_pred.sent_idx = edu2sent - # 2016-06-28 same for edu2para - edu2para = doc_name2edu2para[doc_name] - dt_pred.para_idx = edu2para - dt_pred.ranks = 
rnk_clf.predict([dt_pred])[0] - # end NEW - rtree_pred = deptree_to_rst_tree(dt_pred) - ctree_pred2[doc_name] = rtree_pred - # - skipped_docs = set() - ctree_pred2 = [ct for doc_name, ct in sorted(ctree_pred2.items()) - if doc_name not in skipped_docs] - print(parseval_report(ctree_true, ctree_pred2, digits=4)) - if detailed: - print(parseval_detailed_report(ctree_true, ctree_pred2, - metric_type='S+R')) + return dtree_pred + + +# TODO move this generic util to a more appropriate place. +# This implementation is quite ad-hoc, tailored for RST e.g. to retrieve +# the edu_num, so I would need to generalize this code first. +def get_edu2sent(att_edus): + """Get edu2sent mapping, from a list of attelo EDUs. + + Parameters + ---------- + att_edus: list of attelo EDUs + List of attelo EDUs, as produced by `load_edus`. + + Returns + ------- + doc_name2edu2sent: dict(str, [int]) + For each document, get the sentence index for every EDU. + + Example: + ``` + att_edus = load_edus(edus_file) + doc_name2edu2sent = get_edu2sent(att_edus) + for doc_name, edu2sent in doc_name2edu2sent.items(): + dtree[doc_name].edu2sent = edu2sent + ``` + + """ + edu2sent_idx = defaultdict(dict) + for att_edu in att_edus: + doc_name = att_edu.grouping + edu_num = int(att_edu.id.rsplit('_', 1)[1]) + sent_idx = int(att_edu.subgrouping.split('_sent')[1]) + edu2sent_idx[doc_name][edu_num] = sent_idx + # sort EDUs by num + # rebuild educe-style edu2sent ; prepend 0 for the fake root + doc_name2edu2sent = {doc_name: ([0] + + [s_idx for e_num, s_idx + in sorted(edu2sent.items())]) + for doc_name, edu2sent in edu2sent_idx.items()} + return doc_name2edu2sent diff --git a/evals/ours.py b/evals/ours.py index 1633edf..300b376 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -59,71 +59,22 @@ def load_attelo_output_file(output_file): return edges_pred -def load_deptrees_from_attelo_output(ctree_true, dtree_true, - output_file, edus_file, - nuc_clf, rnk_clf, - detailed=False, - skpd_docs=None): - """Load an RstDepTree from the output of attelo. +def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): + """Load RST dtrees from attelo output files. Parameters ---------- - ctree_true: dict(str, RSTTree) - Ground truth RST ctree. - dtree_true: dict(str, RstDepTree) - Ground truth RST (ordered) dtree. output_file: string Path to the file that contains attelo's output - nuc_clf: NuclearityClassifier - Classifier to predict nuclearity - rnk_clf: RankClassifier - Classifier to predict attachment ranking - skpd_docs: set(string) - Names of documents that should be skipped to compute scores + edus_file: string + Path to the file that describes EDUs. 
Returns ------- - skipped_docs: set(string) - Names of documents that have been skipped to compute scores + TODO """ - doc_name2edu2para = dict() - - # load reference trees - for doc_name, rtree_true in sorted(ctree_true.items()): - # 2016-06-28 retrieve paragraph idx of each EDU - # FIXME refactor to get in a better way, in a better place - # currently, we take EDUs from the RSTTree and paragraphs from - # the RSTContext, so no left padding in either list ; - # the dtree contains the left padding EDU, so we compute the - # edu2paragraph alignment on real units only, shift by one, - # then prepend 0 - doc_edus = rtree_true.leaves() - doc_paras = doc_edus[0].context.paragraphs - doc_txt = doc_edus[0].context._text - if doc_paras is not None: - edu2para = align_edus_with_paragraphs( - doc_edus, doc_paras, doc_txt) - # yerk: interpolate values in edu2para where missing - edu2para_fix = [] - for edu_idx in edu2para: - if edu_idx is not None: - edu2para_fix.append(edu_idx) - else: - # interpolation strategy: copy the last regular value - # that has been seen - edu2para_fix.append(edu2para_fix[-1]) - edu2para = edu2para_fix - # end yerk: interpolate - edu2para = [0] + list(np.array(edu2para) + 1) - doc_name2edu2para[doc_name] = edu2para - else: - doc_name2edu2para[doc_name] = None - # end retrieve paragraph idx - - # USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS - # load predicted trees dtree_pred = dict() # predicted dtrees - ctree_pred = dict() # predicted ctrees + # * setup... # load EDUs as they are known to attelo (sigh) # and predicted edges on these EDUs att_edus = load_edus(edus_file) @@ -155,9 +106,6 @@ def load_deptrees_from_attelo_output(ctree_true, dtree_true, for e in doc_educe_edus]) for doc_name, doc_educe_edus in educe_edus.items()} - # re-build predicted trees from predicted edges and educe EDUs - skipped_docs = set() # docs skipped because non-projective structures - # rebuild RstDepTrees for doc_name, es_pred in sorted(edges_pred.items()): # get educe EDUs @@ -172,36 +120,44 @@ def load_deptrees_from_attelo_output(ctree_true, dtree_true, raise ValueError('Weird root label: {}'.format(lbl)) else: dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) - # NEW add nuclearity: heuristic baseline - if True: - dt_pred.nucs = nuc_clf.predict([dt_pred])[0] - else: # EXPERIMENTAL use gold nuclearity - dt_pred.nucs = dtree_true[doc_name].nucs - # NEW add rank: some strategies require a mapping from EDU to sentence - # EXPERIMENTAL attach array of sentence index for each EDU in tree + # add nuclearity: heuristic baseline + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] + # add rank: heuristic baseline, needs edu2sent edu2sent = doc_name2edu2sent[doc_name] - dt_pred.sent_idx = edu2sent - # 2016-06-28 same for edu2para - edu2para = doc_name2edu2para[doc_name] - dt_pred.para_idx = edu2para - # assert len(edu2sent) == len(edu2para) - # end EXPERIMENTAL - if False: # DEBUG - print(doc_name) + dt_pred.sent_idx = edu2sent # DIRTY dt_pred.ranks = rnk_clf.predict([dt_pred])[0] - # end NEW + # store dtree_pred[doc_name] = dt_pred - # create pred ctree + return dtree_pred + + +def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): + """Load RST ctrees from attelo output files. + + Parameters + ---------- + output_file: string + Path to the file that contains attelo's output + edus_file: string + Path to the file that describes EDUs. 
+ nuc_clf: NuclearityClassifier + Classifier to predict nuclearity + rnk_clf: RankClassifier + Classifier to predict attachment ranking + + Returns + ------- + TODO + """ + # load RST dtrees, with heuristics for nuc and rank + dtree_pred = load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf) + # convert to RST ctrees + ctree_pred = dict() + for doc_name, dt_pred in dtree_pred.items(): try: - if True: # NEW 2016-09-14 - # direct conversion from ordered dtree to ctree - rtree_pred = deptree_to_rst_tree(dt_pred) - ctree_pred[doc_name] = rtree_pred - else: # legacy: via SimpleRSTTree, forces binarization - bin_srtree_pred = deptree_to_simple_rst_tree(dt_pred) - bin_rtree_pred = SimpleRSTTree.to_binary_rst_tree(bin_srtree_pred) - ctree_pred[doc_name] = bin_rtree_pred + rtree_pred = deptree_to_rst_tree(dt_pred) + ctree_pred[doc_name] = rtree_pred except RstDtException as rst_e: print(rst_e) skipped_docs.add(doc_name) @@ -209,42 +165,31 @@ def load_deptrees_from_attelo_output(ctree_true, dtree_true, print('\n'.join('{}: {}'.format(edu.text_span(), edu) for edu in educe_edus[doc_name])) # raise - # end USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS - # compare gold with pred on doc_names - common_doc_names = set(dtree_true.keys()) & set(dtree_pred.keys()) - - # dep scores - dtree_true_list = [dt for doc_name, dt in sorted(dtree_true.items()) - if doc_name in common_doc_names] - dtree_pred_list = [dt for doc_name, dt in sorted(dtree_pred.items()) - if doc_name in common_doc_names] - - score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, - dtree_pred_list) - print('UAS / LAS / LS : {:.4f} / {:.4f} / {:.4f}'.format( - score_uas, score_las, score_ls)) - - # compute and print PARSEVAL scores - if skipped_docs: - print('Skipped {} docs over {}'.format(len(skipped_docs), - len(edges_pred))) - # also skip docs passed as argument - if skpd_docs is not None: - skipped_docs |= skpd_docs - # convert dicts to aligned lists of SimpleRSTTrees, skipping docs where - # needed - ctree_true = [ct for doc_name, ct in sorted(ctree_true.items()) - if doc_name not in skipped_docs] - ctree_pred = [ct for doc_name, ct in sorted(ctree_pred.items()) - if doc_name not in skipped_docs] - - print(parseval_report(ctree_true, ctree_pred, - digits=4)) - # detailed report on S+N+R - if detailed: - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) - # DEBUG - # end DEBUG - return skipped_docs + return ctree_pred + + +def load_deptrees_from_attelo_output(ctree_true, dtree_true, + output_file, edus_file, + nuc_clf, rnk_clf, + detailed=False, + skpd_docs=None): + """Load an RstDepTree from the output of attelo. + + Parameters + ---------- + ctree_true: dict(str, RSTTree) + Ground truth RST ctree. + dtree_true: dict(str, RstDepTree) + Ground truth RST (ordered) dtree. 
+ skpd_docs: set(string) + Names of documents that should be skipped to compute scores + + Returns + ------- + skipped_docs: set(string) + Names of documents that have been skipped to compute scores + """ + # USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS + # load predicted trees + # end USE TO INCORPORATE CONSTITUENCY LOSS INTO STRUCTURED CLASSIFIERS diff --git a/evals/showdown.py b/evals/showdown.py index 10ed7ff..f21f294 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -20,8 +20,10 @@ from attelo.metrics.deptree import compute_uas_las # local to this package -from evals.codra import eval_codra_output -from evals.ours import load_deptrees_from_attelo_output +from evals.codra import load_codra_ctrees, load_codra_dtrees +from evals.ours import (load_deptrees_from_attelo_output, + load_attelo_ctrees, + load_attelo_dtrees) # RST corpus @@ -186,37 +188,45 @@ def main(): ct_true = _binarize(ct_true) ctree_true[doc_name] = ct_true - # predictions: [(parser_name, ([doc_names], [ct_pred], [dt_pred]))] - predictions = [] + + c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] + d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] if 'joty' in authors_pred: # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees - predictions.append( - ('joty', eval_codra_output(ctree_true, dtree_true, - CODRA_OUT_DIR, EDUS_FILE, - rel_conv=REL_CONV, - nary_enc='chain', - nuc_clf=nuc_clf, rnk_clf=rnk_clf, - detailed=False)) + c_preds.append( + ('joty', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) ) + d_preds.append( + ('joty', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + # joty-{chain,tree} would be the same except nary_enc='tree' ; + # the nary_enc does not matter because codra outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees if 'ours_chain' in authors_pred: - print('[chain] Eisner, predicted syntax') - # attelo out: unordered dtree ; we pass a nuclearity and rank classifiers - # to get an ordered dtree ; - # need to map to ctree - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') + # Eisner, predicted syntax, chain + c_preds.append( + ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-chain', load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf)) + ) if 'ours_tree' in authors_pred: - print('[tree] Eisner, predicted syntax + same-unit') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') + # Eisner, predicted syntax, tree + same-unit + c_preds.append( + ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) if False: # FIXME repair (or forget) these print('Eisner, predicted syntax + same-unit') @@ -238,8 +248,7 @@ def main(): # report # * table format digits = 4 - parser_names = ['joty'] - width = max(len(x) for x in parser_names) + width = max(len(parser_name) for parser_name, _ in d_preds) headers = ["UAS", "LAS", "LS"] fmt = '%% %ds' % width # first col: parser name @@ -253,14 +262,14 @@ def main(): # end table format and header line # * table content - for parser_name, (ctree_pred, dtree_pred) in 
predictions: + for parser_name, dtree_pred in d_preds: doc_names = sorted(dtree_true.keys()) dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, dtree_pred_list) # append to report - values = [parser_name] + values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] for v in (score_uas, score_las, score_ls): values += ["{0:0.{1}f}".format(v, digits)] report += fmt % tuple(values) @@ -269,12 +278,13 @@ def main(): # end report # constituency eval - for parser_name, (ctree_pred, dtree_pred) in predictions: + for parser_name, ctree_pred in c_preds: doc_names = sorted(ctree_true.keys()) ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] # FIXME # compute and print PARSEVAL scores + print(parser_name) print(parseval_report(ctree_true_list, ctree_pred_list, digits=4)) # detailed report on S+N+R if DETAILED: From 72a2956b32eb6f679dda611cb907a57b2089f0e8 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 16 Sep 2016 19:04:47 +0200 Subject: [PATCH 18/74] WIP tie order with nary_enc --- evals/showdown.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index f21f294..9abd50a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -81,11 +81,12 @@ NUC_STRATEGY = 'unamb_else_most_frequent' RNK_STRATEGY = 'sdist-edist-rl' RNK_PRIORITY_SU = True -RNK_ORDER = 'weak' def setup_dtree_postprocessor(nary_enc): """Setup the nuclearity and rank classifiers to flesh out dtrees.""" + # tie the order with the encoding for n-ary nodes + order = 'weak' if nary_enc == 'tree' else 'strict' # load train section of the RST corpus, fit (currently dummy) classifiers # for nuclearity and rank reader_train = RstReader(CD_TRAIN) @@ -117,7 +118,7 @@ def setup_dtree_postprocessor(nary_enc): # rank clf rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, prioritize_same_unit=RNK_PRIORITY_SU, - order=RNK_ORDER) + order=order) rnk_clf.fit(X_train, y_rnk_train) return nuc_clf, rnk_clf From e6c0a5b0563c20b3ee24e19320c41e82fb05f7e5 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 21 Sep 2016 17:48:22 +0200 Subject: [PATCH 19/74] WIP support output of ji --- evals/attelo_predictions_to_disdep.py | 28 ++++++-- evals/dis2disdep.py | 93 ++++++++++++++++++++++++++- evals/ours.py | 8 +-- evals/showdown.py | 44 ++++++++++--- irit_rst_dt/local.py | 3 +- 5 files changed, 149 insertions(+), 27 deletions(-) diff --git a/evals/attelo_predictions_to_disdep.py b/evals/attelo_predictions_to_disdep.py index f0e7169..2c5a6c3 100755 --- a/evals/attelo_predictions_to_disdep.py +++ b/evals/attelo_predictions_to_disdep.py @@ -16,7 +16,8 @@ InsideOutAttachmentRanker) -def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): +def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir, + nary_enc_pred='tree'): """Generate disdep files from a file dump of attelo predictions. Parameters @@ -28,12 +29,19 @@ def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): triples). out_dir: str Path to the output folder. + nary_enc_pred: one of {'chain', 'tree'} + Encoding for n-ary cnodes in the predicted dtree ; here it + currently triggers the strictness of the order assumed by the + dtree postprocessor: nary_enc_pred='chain' implies order='strict', + nary_enc_pred='tree' implies order='weak'. 
""" + order = 'weak' if nary_enc_pred == 'tree' else 'strict' # set up heuristic classifiers for nuclearity and rank nuc_clf = DummyNuclearityClassifier(strategy='unamb_else_most_frequent') nuc_clf.fit([], []) # dummy fit - rnk_clf = InsideOutAttachmentRanker(strategy='closest-intra-rl-inter-rl', - prioritize_same_unit=True) + rnk_clf = InsideOutAttachmentRanker(strategy='sdist-edist-rl', + prioritize_same_unit=True, + order=order) rnk_clf.fit([], []) # dummy fit # load EDUs @@ -84,17 +92,23 @@ def attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir): if __name__ == '__main__': + nary_enc_pred = 'tree' edus_file_glob = os.path.join('TMP', 'latest', 'data', 'TEST', '*.edu-pairs.sparse.edu_input') - edges_file_glob = os.path.join('TMP', 'latest', 'scratch-current', - 'combined', 'output.*') + edges_file_glob = os.path.join( + 'TMP', 'latest', 'scratch-current', + 'combined', + # 'output.*' + 'output.maxent-iheads-global-AD.L-jnt-eisner' + ) # attelo predictions are currently stored in one big file edges_files = glob(edges_file_glob) assert len(edges_files) == 1 edges_file = edges_files[0] # paths to the resulting disdep files - out_dir = os.path.join('TMP_disdep', 'chain', 'ours', 'test') + out_dir = os.path.join('TMP_disdep', nary_enc_pred, 'ours', 'test') if not os.path.exists(out_dir): os.makedirs(out_dir) # do the conversion - attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir) + attelo_predictions_to_disdep_files(edus_file_glob, edges_file, out_dir, + nary_enc_pred=nary_enc_pred) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index d1d7966..bb69c97 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -2,22 +2,24 @@ TODO ---- -* [ ] support the output of Ji & Eisenstein's parser ; need to convert - .brackets to .dis_dep (via .dis?) 
* [ ] support intra-sentential level document parsing ; required to score Joty's .sen_dis files """ from __future__ import absolute_import, print_function import argparse +from collections import defaultdict +from glob import glob import os +from educe.annotation import Span from educe.corpus import FileId from educe.learning.disdep_format import dump_disdep_files +from educe.rst_dt.annotation import Node, RSTTree from educe.rst_dt.codra import load_codra_output_files -from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) @@ -104,7 +106,92 @@ def main(): # dump_disdep_files) for doc_name, dtree in dtrees.items(): dtree.origin = FileId(doc_name, None, None, None) + elif author == 'ji': + if corpus_split != 'test': + raise ValueError("The output of Ji & Eisenstein's parser is " + "available for the 'test' split only") + # * load the text of the EDUs + # FIXME get the text of EDUs from the .merge files + corpus_dir = RST_MAIN_TEST + reader_true = Reader(corpus_dir) + ctree_true = reader_true.slurp() + doc_edus = {k.doc: ct_true.leaves() for k, ct_true + in ctree_true.items()} + # * for each doc, load the predicted spans from the .brackets + ctree_pred = dict() + files_pred = os.path.join(OUT_JI, '*.brackets') + for f_pred in sorted(glob(files_pred)): + doc_name = os.path.splitext(os.path.basename(f_pred))[0] + edus = {i: e for i, e in enumerate(doc_edus[doc_name], start=1)} + origin = FileId(doc_name, None, None, None) + # read spans + spans_pred = defaultdict(list) # predicted spans by length + with open(f_pred) as f: + for line in f: + # FIXME use a standard module: ast or pickle? + # drop surrounding brackets + opening bracket of edu span + line = line.strip()[2:-1] + edu_span, nuc_rel = line.split('), ') + edu_span = tuple(int(x) for x in edu_span.split(', ')) + nuc, rel = nuc_rel.split(', ') + edu_span_len = edu_span[1] - edu_span[0] + spans_pred[edu_span_len].append((edu_span, nuc, rel)) + # bottom-up construction of the RST ctree + # left_border -> list of RST ctree fragments, sorted by len + tree_frags = defaultdict(list) + for span_len, spans in sorted(spans_pred.items()): + for edu_span, nuc, rel in spans: + children = [] + edu_beg, edu_end = edu_span + if edu_beg == edu_end: + # leaf node + txt_span = edus[edu_beg].span + else: + # internal node + # * get the children (subtrees) + edu_cur = edu_beg + while edu_cur < edu_end: + kid_nxt = tree_frags[edu_cur][-1] + children.append(kid_nxt) + edu_cur = kid_nxt.label().edu_span[1] + 1 + # compute properties of this node + txt_span = Span(children[0].label().span.char_start, + children[-1].label().span.char_end) + # build node and RSTTree fragment + node = Node(nuc, edu_span, txt_span, rel, + context=None) # TODO context? 
+ tree_frags[edu_beg].append( + RSTTree(node, children, origin=origin)) + # build the top node + edu_nums = sorted(edus.keys()) + edu_span = (edu_nums[0], edu_nums[-1]) + print(doc_name, edu_span) + children = [] + edu_beg, edu_end = edu_span + edu_cur = edu_beg + while edu_cur < edu_end: + print(edu_cur) + kid_nxt = tree_frags[edu_cur][-1] + children.append(kid_nxt) + edu_cur = kid_nxt.label().edu_span[1] + 1 + txt_span = Span(children[0].label().span.char_start, + children[-1].label().span.char_end) + node = Node(nuc, edu_span, txt_span, 'Root', context=None) + tree_frags[edu_beg].append( + RSTTree(node, children, origin=origin)) + # now we should have a spanning ctree + ct_pred = tree_frags[1][-1] + # DEBUG + print(sorted(edus.keys())[0], + sorted(edus.keys())[-1]) + print(ct_pred.label().edu_span) # RESUME HERE + print(sorted(tree_frags.items())) + # end DEBUG + assert ct_pred.label().edu_span == (sorted(edus.keys())[0], + sorted(edus.keys())[-1]) + ctree_pred[doc_name] = ct_pred + raise NotImplementedError("Output of Ji's parser") # do dump dump_disdep_files(dtrees.values(), out_dir) diff --git a/evals/ours.py b/evals/ours.py index 300b376..0dbe1ce 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -16,9 +16,6 @@ from educe.rst_dt.document_plus import align_edus_with_paragraphs # from attelo.io import load_edus -from attelo.metrics.constituency import (parseval_detailed_report, - parseval_report) -from attelo.metrics.deptree import compute_uas_las from attelo.table import UNRELATED # for load_attelo_output_file @@ -160,7 +157,6 @@ def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): ctree_pred[doc_name] = rtree_pred except RstDtException as rst_e: print(rst_e) - skipped_docs.add(doc_name) if False: print('\n'.join('{}: {}'.format(edu.text_span(), edu) for edu in educe_edus[doc_name])) @@ -171,9 +167,7 @@ def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): def load_deptrees_from_attelo_output(ctree_true, dtree_true, output_file, edus_file, - nuc_clf, rnk_clf, - detailed=False, - skpd_docs=None): + nuc_clf, rnk_clf): """Load an RstDepTree from the output of attelo. Parameters diff --git a/evals/showdown.py b/evals/showdown.py index 9abd50a..52c096f 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -53,6 +53,12 @@ 'output.maxent-iheads-global-AD.L-jnt-eisner') # 2016-09-14 "tree" transform, predicted syntax +EISNER_OUT_TREE_SYN_PRED = os.path.join( + '/home/mmorey/melodi', + 'irit-rst-dt/TMP/latest', # lbl + 'scratch-current/combined', + 'output.maxent-iheads-global-AD.L-jnt-eisner') + EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( '/home/mmorey/melodi', 'irit-rst-dt/TMP/latest', # lbl @@ -77,6 +83,8 @@ # level of detail for parseval DETAILED = False +SPAN_SEL = None # None, 'leaves', 'non-leaves' +STRINGENT = False # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' RNK_STRATEGY = 'sdist-edist-rl' @@ -135,7 +143,7 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'ji', - 'ours_chain', 'ours_tree'], + 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', choices=['tree', 'chain'], @@ -161,6 +169,9 @@ def main(): authors_pred = args.authors_pred nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true + if binarize_true and nary_enc_true != 'chain': + raise ValueError("--binarize_true is compatible with " + "--nary_enc_true chain only") # 0. 
setup the postprocessors to flesh out unordered dtrees into ordered # ones with nuclearity @@ -181,13 +192,13 @@ def main(): doc_name = doc_id.doc # original reference ctree, with coarse labels ct_true = REL_CONV(ct_true) # map fine to coarse relations - # corresponding dtree - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) - dtree_true[doc_name] = dt_true - # binarize ctree if necessary if binarize_true: + # binarize ctree if required ct_true = _binarize(ct_true) ctree_true[doc_name] = ct_true + # corresponding dtree + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) + dtree_true[doc_name] = dt_true c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] @@ -219,15 +230,27 @@ def main(): if 'ours_tree' in authors_pred: # Eisner, predicted syntax, tree + same-unit c_preds.append( - ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, + ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) d_preds.append( - ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) + if 'ours_tree_su' in authors_pred: + # Eisner, predicted syntax, tree + same-unit + c_preds.append( + ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-tree-su', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) if False: # FIXME repair (or forget) these print('Eisner, predicted syntax + same-unit') @@ -286,11 +309,14 @@ def main(): # FIXME # compute and print PARSEVAL scores print(parser_name) - print(parseval_report(ctree_true_list, ctree_pred_list, digits=4)) + print(parseval_report(ctree_true_list, ctree_pred_list, digits=4, + span_sel=SPAN_SEL, + stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: print(parseval_detailed_report(ctree_true_list, ctree_pred_list, - metric_type='S+R')) + metric_type='S+R', + span_sel=SPAN_SEL)) # end FIXME diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index bfe2691..f805832 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -103,7 +103,8 @@ # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt-mst' # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt-eisner' # TEST_EVALUATION_KEY = 'maxent-AD.L-jnt_su-eisner' -TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt_su-eisner' +TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt-eisner' +# TEST_EVALUATION_KEY = 'maxent-iheads-global-AD.L-jnt_su-eisner' """Evaluation to use for testing. Leave this to None until you think it's OK to look at the test data. 
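
A note on how the pieces introduced in the two patches above fit together: the n-ary encoding and the ranker order are meant to be chosen jointly. The snippet below is a minimal sketch of that pairing, not part of the patch series; it only reuses the classes, strategies and dummy fit calls that already appear in the diffs, and the wrapper name make_postprocessors is hypothetical.

    from educe.rst_dt.dep2con import (DummyNuclearityClassifier,
                                      InsideOutAttachmentRanker)

    def make_postprocessors(nary_enc='tree'):
        # tie the order with the encoding for n-ary nodes:
        # 'tree' pairs with a weak order, 'chain' with a strict one
        order = 'weak' if nary_enc == 'tree' else 'strict'
        # heuristic classifiers for nuclearity and rank
        nuc_clf = DummyNuclearityClassifier(
            strategy='unamb_else_most_frequent')
        nuc_clf.fit([], [])  # dummy fit
        rnk_clf = InsideOutAttachmentRanker(strategy='sdist-edist-rl',
                                            prioritize_same_unit=True,
                                            order=order)
        rnk_clf.fit([], [])  # dummy fit
        return nuc_clf, rnk_clf

Both attelo_predictions_to_disdep.py and showdown.py follow this pattern before fleshing out the predicted attachments into ordered dependency trees with nuclearity.
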
From f185f3af6fe0c1663e22bb2c99421b0ed66b895e Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 22 Sep 2016 15:54:22 +0200 Subject: [PATCH 20/74] ENH added support for DPLP in eval --- evals/dis2disdep.py | 100 ++++------------------------- evals/ji.py | 152 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 88 deletions(-) create mode 100644 evals/ji.py diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index bb69c97..194abfc 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -8,33 +8,38 @@ """ from __future__ import absolute_import, print_function import argparse -from collections import defaultdict -from glob import glob import os -from educe.annotation import Span from educe.corpus import FileId from educe.learning.disdep_format import dump_disdep_files -from educe.rst_dt.annotation import Node, RSTTree from educe.rst_dt.codra import load_codra_output_files -from educe.rst_dt.corpus import Reader +from educe.rst_dt.corpus import Reader, RstRelationConverter from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) +from .ji import load_ji_dtrees + # original RST corpus RST_CORPUS = os.path.join('/home/mmorey/corpora/rst_discourse_treebank/data') RST_MAIN_TRAIN = os.path.join(RST_CORPUS, TRAIN_FOLDER) RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) RST_DOUBLE = os.path.join(RST_CORPUS, DOUBLE_FOLDER) + +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree + # output of Joty's parser OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') # output of Feng & Hirst's parser OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/tmp/') # output of Ji's parser -OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/test_input') +OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') def main(): @@ -111,88 +116,7 @@ def main(): if corpus_split != 'test': raise ValueError("The output of Ji & Eisenstein's parser is " "available for the 'test' split only") - # * load the text of the EDUs - # FIXME get the text of EDUs from the .merge files - corpus_dir = RST_MAIN_TEST - reader_true = Reader(corpus_dir) - ctree_true = reader_true.slurp() - doc_edus = {k.doc: ct_true.leaves() for k, ct_true - in ctree_true.items()} - # * for each doc, load the predicted spans from the .brackets - ctree_pred = dict() - files_pred = os.path.join(OUT_JI, '*.brackets') - for f_pred in sorted(glob(files_pred)): - doc_name = os.path.splitext(os.path.basename(f_pred))[0] - edus = {i: e for i, e in enumerate(doc_edus[doc_name], start=1)} - origin = FileId(doc_name, None, None, None) - # read spans - spans_pred = defaultdict(list) # predicted spans by length - with open(f_pred) as f: - for line in f: - # FIXME use a standard module: ast or pickle? 
- # drop surrounding brackets + opening bracket of edu span - line = line.strip()[2:-1] - edu_span, nuc_rel = line.split('), ') - edu_span = tuple(int(x) for x in edu_span.split(', ')) - nuc, rel = nuc_rel.split(', ') - edu_span_len = edu_span[1] - edu_span[0] - spans_pred[edu_span_len].append((edu_span, nuc, rel)) - # bottom-up construction of the RST ctree - # left_border -> list of RST ctree fragments, sorted by len - tree_frags = defaultdict(list) - for span_len, spans in sorted(spans_pred.items()): - for edu_span, nuc, rel in spans: - children = [] - edu_beg, edu_end = edu_span - if edu_beg == edu_end: - # leaf node - txt_span = edus[edu_beg].span - else: - # internal node - # * get the children (subtrees) - edu_cur = edu_beg - while edu_cur < edu_end: - kid_nxt = tree_frags[edu_cur][-1] - children.append(kid_nxt) - edu_cur = kid_nxt.label().edu_span[1] + 1 - # compute properties of this node - txt_span = Span(children[0].label().span.char_start, - children[-1].label().span.char_end) - # build node and RSTTree fragment - node = Node(nuc, edu_span, txt_span, rel, - context=None) # TODO context? - tree_frags[edu_beg].append( - RSTTree(node, children, origin=origin)) - # build the top node - edu_nums = sorted(edus.keys()) - edu_span = (edu_nums[0], edu_nums[-1]) - print(doc_name, edu_span) - children = [] - edu_beg, edu_end = edu_span - edu_cur = edu_beg - while edu_cur < edu_end: - print(edu_cur) - kid_nxt = tree_frags[edu_cur][-1] - children.append(kid_nxt) - edu_cur = kid_nxt.label().edu_span[1] + 1 - txt_span = Span(children[0].label().span.char_start, - children[-1].label().span.char_end) - node = Node(nuc, edu_span, txt_span, 'Root', context=None) - tree_frags[edu_beg].append( - RSTTree(node, children, origin=origin)) - # now we should have a spanning ctree - ct_pred = tree_frags[1][-1] - # DEBUG - print(sorted(edus.keys())[0], - sorted(edus.keys())[-1]) - print(ct_pred.label().edu_span) # RESUME HERE - print(sorted(tree_frags.items())) - # end DEBUG - assert ct_pred.label().edu_span == (sorted(edus.keys())[0], - sorted(edus.keys())[-1]) - ctree_pred[doc_name] = ct_pred - - raise NotImplementedError("Output of Ji's parser") + dtrees = load_ji_dtrees(OUT_JI, REL_CONV) # do dump dump_disdep_files(dtrees.values(), out_dir) diff --git a/evals/ji.py b/evals/ji.py new file mode 100644 index 0000000..2e5e38f --- /dev/null +++ b/evals/ji.py @@ -0,0 +1,152 @@ +"""Load the output of Ji's DPLP parser. + +""" + +from __future__ import absolute_import, print_function + +from collections import defaultdict +from glob import glob +import os + +from educe.annotation import Span +from educe.corpus import FileId +from educe.rst_dt.annotation import Node, RSTTree +from educe.rst_dt.corpus import Reader +from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.rst_wsj_corpus import TEST_FOLDER + +# original RST corpus +RST_CORPUS = os.path.join('/home/mmorey/corpora/rst_discourse_treebank/data') +RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) + + +def load_ji_ctrees(ji_out_dir, rel_conv): + """Load the ctrees output by DPLP as .brackets files. + + Parameters + ---------- + ji_out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. 
+ """ + # * load the text of the EDUs + # FIXME get the text of EDUs from the .merge files + corpus_dir = RST_MAIN_TEST + reader_true = Reader(corpus_dir) + ctree_true = reader_true.slurp() + doc_edus = {k.doc: ct_true.leaves() for k, ct_true + in ctree_true.items()} + # * for each doc, load the predicted spans from the .brackets + ctree_pred = dict() + files_pred = os.path.join(ji_out_dir, '*.brackets') + for f_pred in sorted(glob(files_pred)): + doc_name = os.path.splitext(os.path.basename(f_pred))[0] + edus = {i: e for i, e in enumerate(doc_edus[doc_name], start=1)} + origin = FileId(doc_name, None, None, None) + # read spans + spans_pred = defaultdict(list) # predicted spans by length + with open(f_pred) as f: + for line in f: + # FIXME use a standard module: ast? pickle? + # * drop surrounding brackets + opening bracket of edu span + line = line.strip()[2:-1] + edu_span, nuc_rel = line.split('), ') + edu_span = tuple(int(x) for x in edu_span.split(', ')) + nuc, rel = nuc_rel.split(', ') + # * remove quotes around nuc and rel + nuc = nuc[1:-1] + rel = rel[1:-1] + # + edu_span_len = edu_span[1] - edu_span[0] + spans_pred[edu_span_len].append((edu_span, nuc, rel)) + # bottom-up construction of the RST ctree + # left_border -> list of RST ctree fragments, sorted by len + tree_frags = defaultdict(list) + for span_len, spans in sorted(spans_pred.items()): + for edu_span, nuc, rel in spans: + children = [] + edu_beg, edu_end = edu_span + if edu_beg == edu_end: + # pre-terminal + txt_span = edus[edu_beg].span + # one child: leaf node: EDU + leaf = edus[edu_beg] + children.append(leaf) + else: + # internal node + # * get the children (subtrees) + edu_cur = edu_beg + while edu_cur <= edu_end: + kid_nxt = tree_frags[edu_cur][-1] + children.append(kid_nxt) + edu_cur = kid_nxt.label().edu_span[1] + 1 + # compute properties of this node + txt_span = Span(children[0].label().span.char_start, + children[-1].label().span.char_end) + # build node and RSTTree fragment + node = Node(nuc, edu_span, txt_span, rel, + context=None) # TODO context? + tree_frags[edu_beg].append( + RSTTree(node, children, origin=origin)) + # build the top node + edu_nums = sorted(edus.keys()) + edu_span = (edu_nums[0], edu_nums[-1]) + children = [] + edu_beg, edu_end = edu_span + edu_cur = edu_beg + while edu_cur <= edu_end: + kid_nxt = tree_frags[edu_cur][-1] + children.append(kid_nxt) + edu_cur = kid_nxt.label().edu_span[1] + 1 + txt_span = Span(children[0].label().span.char_start, + children[-1].label().span.char_end) + node = Node(nuc, edu_span, txt_span, 'Root', context=None) + tree_frags[edu_beg].append( + RSTTree(node, children, origin=origin)) + # now we should have a spanning ctree + ct_pred = tree_frags[1][-1] + assert ct_pred.label().edu_span == (sorted(edus.keys())[0], + sorted(edus.keys())[-1]) + # convert relation labels + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + # store the resulting RSTTree + ctree_pred[doc_name] = ct_pred + + return ctree_pred + + +def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by DPLP. + + Parameters + ---------- + ji_out_dir: str + Path to the base directory containing the output files. + rel_conv: TODO + Relation converter, from fine- to coarse-grained labels. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. 
+ """ + dtree_pred = dict() + + ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv) + for doc_name, ct_pred in ctree_pred.items(): + dtree_pred[doc_name] = RstDepTree.from_rst_tree( + ct_pred, nary_enc=nary_enc) + # set reference to the document in the RstDepTree (required by + # dump_disdep_files) + for doc_name, dt_pred in dtree_pred.items(): + dt_pred.origin = FileId(doc_name, None, None, None) + + return dtree_pred + From 2f1b13e5fb58ef1c6088c8a9deaa1ddfa7db07be Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 22 Sep 2016 16:21:07 +0200 Subject: [PATCH 21/74] FIX eval of dplp --- evals/dis2disdep.py | 2 +- evals/ji.py | 8 ++++++++ evals/showdown.py | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index 194abfc..d3140db 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -19,7 +19,7 @@ from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) -from .ji import load_ji_dtrees +from evals.ji import load_ji_dtrees # original RST corpus diff --git a/evals/ji.py b/evals/ji.py index 2e5e38f..9862abf 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -114,6 +114,14 @@ def load_ji_ctrees(ji_out_dir, rel_conv): # convert relation labels if rel_conv is not None: ct_pred = rel_conv(ct_pred) + # change "same_unit" (in Ji's output) into "same-unit" (in ours) + for pos in ct_pred.treepositions(): + t = ct_pred[pos] + if isinstance(t, RSTTree): + node = t.label() + # replace "same_unit" with "same-unit" + if node.rel == 'same_unit': + node.rel = 'same-unit' # store the resulting RSTTree ctree_pred[doc_name] = ct_pred diff --git a/evals/showdown.py b/evals/showdown.py index 52c096f..4707467 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -21,6 +21,7 @@ # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees +from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, load_attelo_dtrees) @@ -79,7 +80,10 @@ 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') +# output of Joty's parser CODRA CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' +# output of Ji's parser DPLP +JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') # level of detail for parseval DETAILED = False @@ -216,6 +220,20 @@ def main(): # the nary_enc does not matter because codra outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees + if 'ji' in authors_pred: + # DPLP outputs RST ctrees in the form of lists of spans; + # load_ji_dtrees maps them to RST dtrees + c_preds.append( + ('ji', load_ji_ctrees(JI_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('ji', load_ji_dtrees(JI_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + # ji-{chain,tree} would be the same except nary_enc='tree' ; + # the nary_enc does not matter because codra outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees + if 'ours_chain' in authors_pred: # Eisner, predicted syntax, chain c_preds.append( From 8acd9572ed0691316fd1e26603e72ca62372f20d Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 26 Sep 2016 10:12:40 +0200 Subject: [PATCH 22/74] ENH variant of parseval scores, per doc then averaged --- evals/showdown.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/evals/showdown.py b/evals/showdown.py index 4707467..7926cca 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -88,6 +88,9 @@ # level of detail for 
parseval DETAILED = False SPAN_SEL = None # None, 'leaves', 'non-leaves' +# "PER_DOC = True" computes p, r, f as in DPLP: compute scores per doc, +# then average over docs +PER_DOC = False # should be False, except for comparison with the DPLP paper STRINGENT = False # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' @@ -329,6 +332,7 @@ def main(): print(parser_name) print(parseval_report(ctree_true_list, ctree_pred_list, digits=4, span_sel=SPAN_SEL, + per_doc=PER_DOC, stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: From 1d31ed600dbd4b284a7106d1f90a29beedddd682 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 27 Sep 2016 17:42:17 +0200 Subject: [PATCH 23/74] ENH parseval for Feng's parser --- evals/feng.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++ evals/showdown.py | 13 +++++++ 2 files changed, 101 insertions(+) create mode 100644 evals/feng.py diff --git a/evals/feng.py b/evals/feng.py new file mode 100644 index 0000000..802ddbc --- /dev/null +++ b/evals/feng.py @@ -0,0 +1,88 @@ +"""Load the output of the parser from (Feng and Hirst, 2014). + +This is 99% a copy/paste from evals/joty.py . +I need to come up with a better API and refactor accordingly. +""" + +from __future__ import absolute_import, print_function + +import itertools + +from educe.rst_dt.feng import load_feng_output_files +from educe.rst_dt.deptree import RstDepTree + + +def load_feng_ctrees(out_dir, rel_conv): + """Load the ctrees output by Feng's parser as .dis files. + + This currently runs on the document-level files (.doc_dis). + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. + """ + # load predicted trees + data_pred = load_feng_output_files(out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to ctree (RSTTree) + ctree_pred = dict() # constituency trees + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + + return ctree_pred + + +def load_feng_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by Feng's parser. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. 
+ """ + # load predicted trees + data_pred = load_feng_output_files(out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to ordered dtree (RstDepTree) + dtree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + # convert to an ordered dependency tree ; + # * 'tree' produces a weakly-ordered dtree strictly equivalent + # to the original ctree, + # * 'chain' produces a strictly-ordered dtree for which strict + # equivalence is not preserved + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) + dtree_pred[doc_name] = dt_pred + + return dtree_pred diff --git a/evals/showdown.py b/evals/showdown.py index 7926cca..ad15d9d 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -21,6 +21,7 @@ # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees +from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, @@ -84,6 +85,8 @@ CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' # output of Ji's parser DPLP JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') +# Feng's parser +FENG_OUT_DIR = '/home/mmorey/melodi/rst/feng_hirst/tmp' # level of detail for parseval DETAILED = False @@ -210,6 +213,16 @@ def main(): c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] + + if 'feng' in authors_pred: + c_preds.append( + ('feng', load_feng_ctrees(FENG_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('feng', load_feng_dtrees(FENG_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + if 'joty' in authors_pred: # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees c_preds.append( From ff278d06ec995207a6d1474665130539a57df863 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 30 Sep 2016 19:46:21 +0200 Subject: [PATCH 24/74] WIP more evals, notably on spans from SimpleRSTTree --- evals/showdown.py | 57 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index ad15d9d..352d303 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -8,7 +8,7 @@ import argparse import os -from educe.rst_dt.annotation import _binarize +from educe.rst_dt.annotation import _binarize, SimpleRSTTree from educe.rst_dt.corpus import (RstRelationConverter, Reader as RstReader) from educe.rst_dt.dep2con import (DummyNuclearityClassifier, @@ -17,7 +17,7 @@ # from attelo.metrics.constituency import (parseval_detailed_report, parseval_report) -from attelo.metrics.deptree import compute_uas_las +from attelo.metrics.deptree import compute_uas_las, compute_uas_las_undirected # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees @@ -90,7 +90,7 @@ # level of detail for parseval DETAILED = False -SPAN_SEL = None # None, 'leaves', 'non-leaves' +SPAN_SEL = 'non-leaves' # None, 'leaves', 'non-leaves' # "PER_DOC = True" computes p, r, f as in DPLP: compute scores per doc, # then average over docs PER_DOC = False # 
should be False, except for comparison with the DPLP paper @@ -171,7 +171,8 @@ def main(): # * ctree eval parser.add_argument('--binarize_true', action='store_true', help="Binarize the reference ctree for the eval") - + parser.add_argument('--simple_rsttree', action='store_true', + help="Binarize ctree and move relations up") # args = parser.parse_args() author_true = args.author_true @@ -179,6 +180,7 @@ def main(): authors_pred = args.authors_pred nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true + simple_rsttree = args.simple_rsttree if binarize_true and nary_enc_true != 'chain': raise ValueError("--binarize_true is compatible with " "--nary_enc_true chain only") @@ -308,7 +310,7 @@ def main(): digits = 4 width = max(len(parser_name) for parser_name, _ in d_preds) - headers = ["UAS", "LAS", "LS"] + headers = ["UAS", "LAS", "LS", "UUAS", "ULAS"] fmt = '%% %ds' % width # first col: parser name fmt += ' ' fmt += ' '.join(['% 9s' for _ in headers]) @@ -324,11 +326,28 @@ def main(): doc_names = sorted(dtree_true.keys()) dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + # WIP print per doc eval + for doc_name, dt_true, dt_pred in zip( + doc_names, dtree_true_list, dtree_pred_list): + with open(parser_name + '/' + doc_name + '.d_eval', mode='w') as f: + print(', '.join('{:.4f}'.format(x) + for x in compute_uas_las( + [dt_true], [dt_pred])), + file=f) + # WIP scores for undirected edges + print(', '.join('{:.4f}'.format(x) + for x in compute_uas_las_undirected( + [dt_true], [dt_pred])), + file=f) + + # end WIP print score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, dtree_pred_list) + score_uuas, score_ulas = compute_uas_las_undirected(dtree_true_list, + dtree_pred_list) # append to report values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] - for v in (score_uas, score_las, score_ls): + for v in (score_uas, score_las, score_ls, score_uuas, score_ulas): values += ["{0:0.{1}f}".format(v, digits)] report += fmt % tuple(values) # end table content @@ -340,6 +359,32 @@ def main(): doc_names = sorted(ctree_true.keys()) ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + if simple_rsttree: + ctree_true_list = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_true_list] + ctree_pred_list = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_pred_list] + # WIP print SimpleRSTTrees + if not os.path.exists('gold'): + os.makedirs('gold') + for doc_name, ct in zip(doc_names, ctree_true_list): + with open('gold/' + ct.origin.doc, mode='w') as f: + print(ct, file=f) + if not os.path.exists(parser_name): + os.makedirs(parser_name) + for doc_name, ct in zip(doc_names, ctree_pred_list): + with open(parser_name + '/' + doc_name, mode='w') as f: + print(ct, file=f) + # WIP eval each tree in turn + for doc_name, ct_true, ct_pred in zip( + doc_names, ctree_true_list, ctree_pred_list): + with open(parser_name + '/' + doc_name + '.c_eval', mode='w') as f: + print(parseval_report([ct_true], [ct_pred], digits=4, + span_sel=SPAN_SEL, + per_doc=PER_DOC, + stringent=STRINGENT), + file=f) + # end WIP # FIXME # compute and print PARSEVAL scores print(parser_name) From e3a34c30e06466ab7cc21f312b2e72ff8500bc06 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 3 Oct 2016 11:26:36 +0200 Subject: [PATCH 25/74] FIX minor bugs in showdown --- evals/showdown.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) 
diff --git a/evals/showdown.py b/evals/showdown.py index 352d303..e265c13 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -57,13 +57,13 @@ # 2016-09-14 "tree" transform, predicted syntax EISNER_OUT_TREE_SYN_PRED = os.path.join( '/home/mmorey/melodi', - 'irit-rst-dt/TMP/latest', # lbl + 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( '/home/mmorey/melodi', - 'irit-rst-dt/TMP/latest', # lbl + 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt_su-eisner') # end 2016-09-14 @@ -327,6 +327,8 @@ def main(): dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] # WIP print per doc eval + if not os.path.exists(parser_name): + os.makedirs(parser_name) for doc_name, dt_true, dt_pred in zip( doc_names, dtree_true_list, dtree_pred_list): with open(parser_name + '/' + doc_name + '.d_eval', mode='w') as f: From 140e93b7eca558bb03a9e05e594e030896822cb8 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 5 Oct 2016 10:50:02 +0200 Subject: [PATCH 26/74] ENH add metric LAS+O in eval_disdep --- evals/eval_disdep.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/evals/eval_disdep.py b/evals/eval_disdep.py index 8310487..8cbd6f6 100755 --- a/evals/eval_disdep.py +++ b/evals/eval_disdep.py @@ -41,7 +41,7 @@ print('\t'.join(['parser', 'a', 'l', 'n', 'r', 'al', 'an', 'ar', - 'aln', + 'aln', 'alr', 'alnr', 'support'])) @@ -60,6 +60,7 @@ cnt_an = 0 # correct attachment + nuc cnt_ar = 0 # correct attachment + rank cnt_aln = 0 # correct attachment + label + nuc + cnt_alr = 0 # correct attachment + label + rank cnt_alnr = 0 # correct attachment + label + nuc + rank for doc_name, f_true in files_true.items(): @@ -93,12 +94,14 @@ cnt_ar += 1 if ok_a and ok_l and ok_n: cnt_aln += 1 + if ok_a and ok_l and ok_r: + cnt_alr += 1 if ok_a and ok_l and ok_n and ok_r: cnt_alnr += 1 print('\t'.join([author_pred] + ['{:.4f}'.format(float(cnt_x) / cnt_tot) for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, cnt_al, cnt_an, cnt_ar, - cnt_aln, + cnt_aln, cnt_alr, cnt_alnr]] + [str(cnt_tot)])) From 5ebc6e2a56ee9f96af26f38f4d91962509007328 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Oct 2016 14:40:02 +0200 Subject: [PATCH 27/74] ENH gcrf output --- evals/gcrf_tree_format.py | 208 ++++++++++++++++++++++++++++++++++++++ evals/showdown.py | 39 +++++-- 2 files changed, 236 insertions(+), 11 deletions(-) create mode 100644 evals/gcrf_tree_format.py diff --git a/evals/gcrf_tree_format.py b/evals/gcrf_tree_format.py new file mode 100644 index 0000000..4c7e379 --- /dev/null +++ b/evals/gcrf_tree_format.py @@ -0,0 +1,208 @@ +"""Module to load .tree files, output by Feng's gCRF parser. + +The .tree files contain binary constituency trees as bracketed strings. +They differ from the .dis files in that the relation label and +nuclearity are written on the top node instead of the daughter nodes, +plus edu spans are not explicitly written at each node. 
+""" + +from __future__ import absolute_import, print_function +import codecs +from glob import glob +import os +import re + +from nltk.tree import Tree + +from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree, Span +from educe.rst_dt.deptree import RstDepTree + + +TXT_RE = r"(?P.+)_!(?P.+)!_(?P.+)" +TXT_PATTERN = re.compile(TXT_RE, flags=re.DOTALL) + + +def reduce_preterminal(terminals, txt_offset, edu_offset): + """Create a pre-terminal from a list of terminals. + + Parameters + ---------- + terminals: list of str + List of terminals + + Returns + ------- + sct: SimpleRSTTree + Pre-terminal. + """ + edu_num = edu_offset + edu_txt = ' '.join(terminals) + assert edu_txt.startswith('_!') and edu_txt.endswith('!_') + edu_txt = edu_txt[2:-2] # shave off _! and !_ + edu_txt_span = Span(txt_offset, + txt_offset + len(edu_txt)) + edu = EDU(edu_num, edu_txt_span, edu_txt, + context=None, + origin=None) + # "pre-terminal" + pre_node = Node('leaf', (edu_num, edu_num), edu_txt_span, + 'leaf', context=None) + sct = SimpleRSTTree(pre_node, [edu]) + return sct + + +def nltk_to_simple(node, txt_offset=0, edu_offset=1): + """Convert an NLTK Tree to a SimpleRSTTree. + + Parameters + ---------- + node: Tree + Current tree node. + txt_offset: int, defaults to 0 + Current text offset. + edu_offset: int, defaults to 1 + Current EDU id offset. + + Returns + ------- + sct: SimpleRSTTree + Corresponding SimpleRSTTree. + """ + cur_txt_offset = txt_offset + cur_edu_offset = edu_offset + + # first, recurse: convert kids + new_kids = [] + for kid in node: + if isinstance(kid, Tree): + # convert gCRF .tree subtree to SimpleRSTTree + new_kid = nltk_to_simple(kid, txt_offset=cur_txt_offset, + edu_offset=cur_edu_offset) + # update current offsets + cur_txt_offset = new_kid.label().span.char_end + 1 + cur_edu_offset = new_kid.label().edu_span[1] + 1 + new_kids.append(new_kid) + else: + # kid is a terminal + # first, restore parentheses in the text + kid = kid.replace('-LRB-', '(').replace('-RRB-', ')') + # + if not new_kids or isinstance(new_kids[-1], SimpleRSTTree): + new_kids.append([]) + new_kids[-1].append(kid) + if kid.endswith('!_'): + new_kid = reduce_preterminal( + new_kids[-1], cur_txt_offset, cur_edu_offset) + new_kids[-1] = new_kid + # update current offsets + # * txt_offset: + 1 for whitespace or newline + cur_txt_offset = new_kid.label().span.char_end + 1 + # * edu_offset: + 1 for next EDU + cur_edu_offset = new_kid.label().edu_span[1] + 1 + # check that all have been converted + assert all(isinstance(x, SimpleRSTTree) for x in new_kids) + + # we can now compute the label ; the edu_span depends on the + # recursive calls + lbl = node.label() + rel, nuc = lbl.split('[', 1) # nuc = "N][S]" + nuc = nuc[0] + nuc[3] + edu_span = (new_kids[0].label().edu_span[0], + new_kids[-1].label().edu_span[1]) + txt_span = Span(new_kids[0].label().span.char_start, + new_kids[-1].label().span.char_end) + new_lbl = Node(nuc, edu_span, txt_span, rel) + return SimpleRSTTree(new_lbl, new_kids) + + +def _load_gcrf_tree_file(f): + """Do load""" + # replace parentheses in text to avoid confusion with parentheses + # denoting the bracketed tree structure + lines = [] + for line in f: + # replace non-breaking spaces... damn python 2 + if u"\u00a0" in line: + line = line.replace(u"\u00a0", u" ") + # + m = TXT_PATTERN.match(line) + if m is not None: + new_line = (m.group('prefix') + + '_!' 
+ + (m.group('text') + .replace('(', '-LRB-') + .replace(')', '-RRB-')) + + '!_' + + m.group('suffix')) + line = new_line + lines.append(line) + ct_str = ''.join(lines) + ct = Tree.fromstring(ct_str) + sct = nltk_to_simple(ct) + return sct + + +def load_gcrf_tree_file(fname): + """Load a gCRF tree file. + + Parameters + ---------- + fname: str + Path to the file to be loaded. + + Returns + ------- + ct: SimpleRSTTree + Binary constituency tree with relation label and nuclearity + moved one up. + """ + with codecs.open(fname, encoding='utf-8') as f: + ct = _load_gcrf_tree_file(f) + return ct + + +def load_gcrf_ctrees(out_dir, rel_conv): + """Load the ctrees output by gCRF as .tree files. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. + """ + ctree_pred = dict() + for f_tree in glob(os.path.join(out_dir, '*.tree')): + doc_name = os.path.splitext(os.path.basename(f_tree))[0] + sct_pred = load_gcrf_tree_file(f_tree) + ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_gcrf_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by gCRF. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. + """ + ctree_pred = load_gcrf_ctrees(out_dir, rel_conv) + dtree_pred = dict() + for doc_name, ct_pred in ctree_pred.items(): + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) + dtree_pred[doc_name] = dt_pred + return dtree_pred diff --git a/evals/showdown.py b/evals/showdown.py index e265c13..3c0e01c 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -6,6 +6,7 @@ from __future__ import print_function import argparse +import codecs import os from educe.rst_dt.annotation import _binarize, SimpleRSTTree @@ -22,6 +23,7 @@ # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees from evals.feng import load_feng_ctrees, load_feng_dtrees +from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, @@ -85,12 +87,14 @@ CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' # output of Ji's parser DPLP JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') -# Feng's parser -FENG_OUT_DIR = '/home/mmorey/melodi/rst/feng_hirst/tmp' +# Feng's parsers +FENG_DIR = '/home/mmorey/melodi/rst/feng_hirst/' +FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') +FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') # level of detail for parseval DETAILED = False -SPAN_SEL = 'non-leaves' # None, 'leaves', 'non-leaves' +SPAN_SEL = None # None, 'leaves', 'non-leaves' # "PER_DOC = True" computes p, r, f as in DPLP: compute scores per doc, # then average over docs PER_DOC = False # should be False, except for comparison with the DPLP paper @@ -152,7 +156,7 @@ def main(): # predictions parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', - 'joty', 'feng', 'ji', + 'joty', 'feng', 'feng2', 'ji', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the 
predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -161,7 +165,7 @@ def main(): # reference parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', - 'joty', 'feng', 'ji', + 'joty', 'feng', 'feng2', 'ji', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -218,10 +222,19 @@ def main(): if 'feng' in authors_pred: c_preds.append( - ('feng', load_feng_ctrees(FENG_OUT_DIR, REL_CONV)) + ('feng', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) ) d_preds.append( - ('feng', load_feng_dtrees(FENG_OUT_DIR, REL_CONV, + ('feng', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + + if 'feng2' in authors_pred: + c_preds.append( + ('gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, nary_enc='chain')) ) @@ -331,7 +344,8 @@ def main(): os.makedirs(parser_name) for doc_name, dt_true, dt_pred in zip( doc_names, dtree_true_list, dtree_pred_list): - with open(parser_name + '/' + doc_name + '.d_eval', mode='w') as f: + with codecs.open(parser_name + '/' + doc_name + '.d_eval', + mode='w', encoding='utf-8') as f: print(', '.join('{:.4f}'.format(x) for x in compute_uas_las( [dt_true], [dt_pred])), @@ -370,17 +384,20 @@ def main(): if not os.path.exists('gold'): os.makedirs('gold') for doc_name, ct in zip(doc_names, ctree_true_list): - with open('gold/' + ct.origin.doc, mode='w') as f: + with codecs.open('gold/' + ct.origin.doc, mode='w', + encoding='utf-8') as f: print(ct, file=f) if not os.path.exists(parser_name): os.makedirs(parser_name) for doc_name, ct in zip(doc_names, ctree_pred_list): - with open(parser_name + '/' + doc_name, mode='w') as f: + with codecs.open(parser_name + '/' + doc_name, mode='w', + encoding='utf-8') as f: print(ct, file=f) # WIP eval each tree in turn for doc_name, ct_true, ct_pred in zip( doc_names, ctree_true_list, ctree_pred_list): - with open(parser_name + '/' + doc_name + '.c_eval', mode='w') as f: + with codecs.open(parser_name + '/' + doc_name + '.c_eval', + mode='w', encoding='utf-8') as f: print(parseval_report([ct_true], [ct_pred], digits=4, span_sel=SPAN_SEL, per_doc=PER_DOC, From 0aa666a82bba6f900e3bb2e2c459f83c96d8202f Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Oct 2016 17:56:09 +0200 Subject: [PATCH 28/74] WIP repro: gCRF --- repro/gcrf/crf_classifier.py | 87 +++++++++ repro/gcrf/gold_segmenter.py | 112 +++++++++++ repro/gcrf/parse.py | 354 +++++++++++++++++++++++++++++++++++ repro/gcrf/preprocesser.py | 228 ++++++++++++++++++++++ 4 files changed, 781 insertions(+) create mode 100644 repro/gcrf/crf_classifier.py create mode 100644 repro/gcrf/gold_segmenter.py create mode 100644 repro/gcrf/parse.py create mode 100644 repro/gcrf/preprocesser.py diff --git a/repro/gcrf/crf_classifier.py b/repro/gcrf/crf_classifier.py new file mode 100644 index 0000000..58ee1ff --- /dev/null +++ b/repro/gcrf/crf_classifier.py @@ -0,0 +1,87 @@ +import os.path +import subprocess + +import paths + + +class CRFClassifier: + def __init__(self, name, model_type, model_path, model_file, verbose): + self.verbose = verbose + self.name = name + self.type = model_type + self.model_fname = model_file + self.model_path = model_path + + model_fpath = os.path.join(self.model_path, self.model_fname) + if not os.path.exists(model_fpath): + print ('The model path %s for CRF classifier %s does not exist.' 
+ % model_fpath) + raise OSError('Could not create classifier subprocess') + + self.classifier_cmd = [ + '%s/crfsuite-stdin' % paths.CRFSUITE_PATH, + 'tag', '-pi', + '-m', '%s' % model_fpath + ] +# print self.classifier_cmd + self.classifier = subprocess.Popen(self.classifier_cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + if self.classifier.poll(): + raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline()) + #self.cnt = 0 + + def classify(self, vectors): +# print '\n'.join(vectors) + "\n\n" + vectors_str = '\n'.join(vectors) + "\n\n" + + lines_out, lines_err = self.classifier.communicate(vectors_str) + + lines = [] + for line in lines_out.split('\n'): + if not line.strip(): + break + lines.append(line) + + # HACKY replace the subprocess closed by communicate() + self.classifier = subprocess.Popen(self.classifier_cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + if self.classifier.poll(): + raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline()) + # end HACKY + + if self.classifier.poll(): + raise OSError('crf_classifier subprocess died') + + predictions = [] + for line in lines[1:]: + line = line.strip() +# print line + if line != '': + fields = line.split(':') +# print fields + label = fields[0] + prob = float(fields[1]) + predictions.append((label, prob)) + + seq_prob = float(lines[0].split('\t')[1]) + + return seq_prob, predictions + + def poll(self): + """ + Checks that the classifier processes are still alive + """ + if self.classifier is None: + return True + return self.classifier.poll() is not None + + def unload(self): + if self.classifier is not None and not self.poll(): + self.classifier.stdin.write('\n') + print 'Successfully unloaded %s' % self.name diff --git a/repro/gcrf/gold_segmenter.py b/repro/gcrf/gold_segmenter.py new file mode 100644 index 0000000..b963367 --- /dev/null +++ b/repro/gcrf/gold_segmenter.py @@ -0,0 +1,112 @@ +"""Pseudo-segmenter for manual (gold) EDU segmentation. + +""" + +from __future__ import print_function +import os + +import utils.utils + + +class GoldSegmenter(object): + """Gold segmenter""" + + def __init__(self, root, _name='gold_segmenter', verbose=False): + self.root = root # root dir for gold .edu files + self.name = _name + self.verbose = verbose + + def segment(self, doc, filename): + """Segment a document. + + Parameters + ---------- + doc: Document + Internal representation of a document + filename: str + Name of the document + """ + # load true segmentation + doc_predictions = [] + fname_doc = os.path.basename(filename) + fname_edus = os.path.join(self.root, fname_doc + '.edus') + with open(fname_edus) as f_edus: + fedus_sentences = f_edus.readlines() + doc_predictions = [] + for sent in fedus_sentences: + toks = sent.strip().split(' ') + predictions = [] + for tok in toks[:-1]: + if tok == 'EDU_BREAK': + if predictions: + # "not predictions" should not happen, but + # apparently it does, e.g. wsj_1376: + # "EDU_BREAK It provides..." 
+ predictions[-1] = 1 + else: + predictions.append(0) + # set a marginal proba of 1.0 for each prediction + doc_predictions.append([(x, 1.0) for x in predictions]) + + # c/c + doc.edu_word_segmentation = [] + doc.cuts = [] + doc.edus = [] + # end c/c + + for sentence, predictions in zip(doc.sentences, doc_predictions): + self.segment_sentence(sentence, predictions) + + # c/c + doc.start_edu = 0 + doc.end_edu = len(doc.edus) + # end c/c + + def segment_sentence(self, sentence, predictions): + """Segment a sentence. + """ + # c/c from crf_segmenter + if len(sentence.tokens) == 1: + edus = [[sentence.tokens[0].word, sentence.raw_text[-3 : ]]] + + sentence.doc.cuts.append((len(sentence.doc.edus), len(sentence.doc.edus) + len(edus))) + sentence.start_edu = len(sentence.doc.edus) + sentence.end_edu = len(sentence.doc.edus) + len(edus) + sentence.doc.edu_word_segmentation.append([(0, 1)]) + sentence.doc.edus.extend(edus) + return + # end c/c + + # another c/c + edus = [] + edu_word_segmentations = [] + start = 0 + for i in range(len(predictions)): + pred = int(predictions[i][0]) + if pred == 1: +# print i, pred + edu_word_segmentations.append((start, i + 1)) + start = i + 1 + + edu_word_segmentations.append((start, len(sentence.tokens))) + + for (start_word, end_word) in edu_word_segmentations: + edu = [] + for j in range(start_word, end_word): + edu.extend(utils.utils.unescape_penn_special_word(sentence.tokens[j].word).split(' ')) + + if end_word == len(sentence.tokens): +# print sentence.raw_text + edu.append(sentence.raw_text[-3 : ]) + edus.append(edu) + + sentence.doc.cuts.append((len(sentence.doc.edus), len(sentence.doc.edus) + len(edus))) + sentence.start_edu = len(sentence.doc.edus) + sentence.end_edu = len(sentence.doc.edus) + len(edus) + sentence.doc.edu_word_segmentation.append(edu_word_segmentations) + sentence.doc.edus.extend(edus) + # end another c/c + + def unload(self): + """Unload ; a no-op here""" + pass diff --git a/repro/gcrf/parse.py b/repro/gcrf/parse.py new file mode 100644 index 0000000..419acf7 --- /dev/null +++ b/repro/gcrf/parse.py @@ -0,0 +1,354 @@ +''' +Created on 2014-01-17 + +@author: Vanessa Wei Feng +''' + +from segmenters.crf_segmenter import CRFSegmenter +from segmenters.gold_segmenter import GoldSegmenter # MM +from treebuilder.build_tree_CRF import CRFTreeBuilder + +from optparse import OptionParser + +import paths +import os.path +import sys +from document.doc import Document +import time +import traceback +from datetime import datetime + +from logs.log_writer import LogWriter +from prep.preprocesser import Preprocesser + +import utils.serialize + +class DiscourseParser(): + def __init__(self, options, output_dir = None, + log_writer = None): + self.verbose = options.verbose + self.skip_parsing = options.skip_parsing + self.global_features = options.global_features + self.save_preprocessed_doc = options.save_preprocessed_doc + + self.output_dir = os.path.join(paths.OUTPUT_PATH, output_dir if output_dir is not None else '') + if not os.path.exists(self.output_dir): + print 'Output directory %s not exists, creating it now.' % self.output_dir + os.makedirs(self.output_dir) + + self.log_writer = LogWriter(log_writer) + + self.feature_sets = 'gCRF' + + initStart = time.time() + + self.preprocesser = None + try: + self.preprocesser = Preprocesser() + except Exception, e: + print "*** Loading Preprocessing module failed..." 
+ print traceback.print_exc() + + raise e + # MM replace CRF segmenter with a fake one that loads segmentation + # from a file + load_prepared_seg = True + if load_prepared_seg: + self.segmenter = GoldSegmenter('../texts/results/test_batch_gold_seg') + else: + try: + self.segmenter = CRFSegmenter(_name = self.feature_sets, verbose = self.verbose, global_features = self.global_features) + except Exception, e: + print "*** Loading Segmentation module failed..." + print traceback.print_exc() + + raise e + + try: + if not self.skip_parsing: + self.treebuilder = CRFTreeBuilder(_name = self.feature_sets, verbose = self.verbose) + else: + self.treebuilder = None + except Exception, e: + print "*** Loading Tree-building module failed..." + print traceback.print_exc() + raise e + + + initEnd = time.time() + print 'Finished initialization in %.2f seconds.' % (initEnd - initStart) + print + + + def unload(self): + if self.preprocesser is not None: + self.preprocesser.unload() + + if not self.segmenter is None: + self.segmenter.unload() + + if not self.treebuilder is None: + self.treebuilder.unload() + + + def parse(self, filename): + if not os.path.exists(filename): + print '%s does not exist.' % filename + return + + self.log_writer.write('***** Parsing %s...' % filename) + + try: + core_filename = os.path.split(filename)[1] + serialized_doc_filename = os.path.join(self.output_dir, core_filename + '.doc.ser') + doc = None + if os.path.exists(serialized_doc_filename): + doc = utils.serialize.loadData(core_filename, self.output_dir, '.doc.ser') + + if doc is None or not doc.preprocessed: + preprocessStart = time.time() + doc = Document() + doc.preprocess(filename, self.preprocesser) + + preprocessEnd = time.time() + + print 'Finished preprocessing in %.2f seconds.' % (preprocessEnd - preprocessStart) + self.log_writer.write('Finished preprocessing in %.2f seconds.' % (preprocessEnd - preprocessStart)) + + if self.save_preprocessed_doc: + print 'Saved preprocessed document data to %s.' % serialized_doc_filename + utils.serialize.saveData(core_filename, doc, self.output_dir, '.doc.ser') + + else: + print 'Loaded saved serialized document data.' + + print + except Exception, e: + print "*** Preprocessing failed ***" + print traceback.print_exc() + + raise e + + try: + if not doc.segmented: + segStart = time.time() + + self.segmenter.segment(doc, filename) # MM added filename for GoldSegmenter + + if self.verbose: + print 'edus' + for e in doc.edus: + print e + print + print 'cuts' + for cut in doc.cuts: + print cut + print + print 'edu_word_segmentation' + + segEnd = time.time() + print 'Finished segmentation in %.2f seconds.' % (segEnd - segStart) + print 'Segmented into %d EDUs.' % len(doc.edus) + + + self.log_writer.write('Finished segmentation in %.2f seconds. Segmented into %d EDUs.' % ((segEnd - segStart), len(doc.edus))) + if self.save_preprocessed_doc: + print 'Saved segmented document data to %s.' % serialized_doc_filename + utils.serialize.saveData(core_filename, doc, self.output_dir, '.doc.ser') + else: + print 'Already segmented into %d EDUs.' 
% len(doc.edus) + + print + + if options.verbose: + for e in doc.edus: + print e + + + except Exception, e: + print "*** Segmentation failed ***" + print traceback.print_exc() + + raise e + + + try: + ''' Step 2: build text-level discourse tree ''' + if self.skip_parsing: + outfname = os.path.join(self.output_dir, core_filename + ".edus") + print 'Output EDU segmentation result to %s' % outfname + f_o = open(outfname, "w") + for sentence in doc.sentences: + sent_id = sentence.sent_id + edu_segmentation = doc.edu_word_segmentation[sent_id] + i = 0 + sent_out = [] + for (j, token) in enumerate(sentence.tokens): + sent_out.append(token.word) + if j < len(sentence.tokens) - 1 and j == edu_segmentation[i][1] - 1: + sent_out.append('EDU_BREAK') + i += 1 + f_o.write(' '.join(sent_out) + '\n') + + f_o.flush() + f_o.close() + else: + treeBuildStart = time.time() + # + outfname = os.path.join(self.output_dir, core_filename + ".tree") + + pt = self.treebuilder.build_tree(doc) + + print 'Finished tree building.' + + if pt is None: + print "No tree could be built..." + + if not self.treebuilder is None: + self.treebuilder.unload() + + return -1 + + # Unescape the parse tree + if pt: + doc.discourse_tree = pt + treeBuildEnd = time.time() + + # print out + print 'Finished tree building in %.2f seconds.' % (treeBuildEnd - treeBuildStart) + self.log_writer.write('Finished tree building in %.2f seconds.' % (treeBuildEnd - treeBuildStart)) + + for i in range(len(doc.edus)): + pt.__setitem__(pt.leaf_treeposition(i), '_!%s!_' % ' '.join(doc.edus[i])) + + out = pt.pprint() + print 'Output tree building result to %s.' % outfname + f_o = open(outfname, "w") + f_o.write(out) + f_o.close() + + + if self.save_preprocessed_doc: + print 'Saved fully processed document data to %s.' % serialized_doc_filename + utils.serialize.saveData(core_filename, doc, self.output_dir, '.doc.ser') + + print + except Exception, e: + print traceback.print_exc() + + raise e + + print '===================================================' + #return dists#, probs + +def main(options, args): + parser = None + try: + if options.output_dir: + output_dir = args[0] + start_arg = 1 + else: + output_dir = None + start_arg = 0 + + log_writer = None + if options.logging: + log_fname = os.path.join(paths.LOGS_PATH, 'log_%s.txt' % (output_dir if output_dir else datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))) + log_writer = open(log_fname, 'w') + + + if options.filelist: + file_fname = args[start_arg] + if not os.path.exists(file_fname) or not os.path.isfile(file_fname): + print 'The specified file list %s is not a file or does not exist' % file_fname + return + + parser = DiscourseParser(options = options, + output_dir = output_dir, + log_writer = log_writer) + + files = [] + skips = 0 + if options.filelist: + file_fname = args[start_arg] + for line in open(file_fname).readlines(): + fname = line.strip() + + if os.path.exists(fname): + if os.path.exists(os.path.join(parser.output_dir, os.path.split(fname)[1] + '.tree')): + skips += 1 + else: + files.append(fname) + else: + skips += 1 +# print 'Skip %s since it does not exist.' 
% fname + else: + fname = args[start_arg] +# print os.path.join(paths.tmp_folder, os.path.split(fname)[1] + '.xml') + if os.path.exists(fname): + if os.path.exists(os.path.join(parser.output_dir, os.path.split(fname)[1] + '.tree')): + skips += 1 + else: + files.append(fname) + else: + skips += 1 + + print 'Processing %d documents, skipping %d' % (len(files), skips) + + for (i, filename) in enumerate(files): + print 'Parsing %s, progress: %.2f (%d out of %d)' % (filename, i * 100.0 / len(files), i, len(files)) + + try: + parser.parse(filename) + + parser.log_writer.write('===================================================') + except Exception, e: + print 'Some error occurred, skipping the file' + raise e + + parser.unload() + + except Exception, e: + print traceback.print_exc() + if not parser is None: + parser.unload() + + + +v = '1.0' +if __name__ == '__main__': + usage = "Usage: %prog [options] input_file/dir" + + optParser = OptionParser(usage=usage, version="%prog " + v) + optParser.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="verbose mode") + optParser.add_option("-s", "--skip_parsing", + action="store_true", dest="skip_parsing", default=False, + help="Skip parsing, i.e., conduct segmentation only.") + optParser.add_option("-D", "--filelist", + action="store_true", dest="filelist", default=False, + help="parse all files specified in the filelist file, one file per line.") + optParser.add_option("-t", "--output_dir", + action="store_true", dest="output_dir", default=False, + help="Specify a directory for output files.") + optParser.add_option("-g", "--global_features", + action="store_true", dest="global_features", default=False, + help="Perform a second pass of EDU segmentation using global features.") + optParser.add_option("-l", "--logging", + action="store_true", dest="logging", default=False, + help="Perform logging while parsing.") + optParser.add_option("-e", "--save", + action="store_true", dest="save_preprocessed_doc", default=False, + help="Save preprocessed document into serialized file for future use.") + + + + (options, args) = optParser.parse_args() + if len(args) == 0: + optParser.print_help() + sys.exit(1) + + + main(options, args) + diff --git a/repro/gcrf/preprocesser.py b/repro/gcrf/preprocesser.py new file mode 100644 index 0000000..ed90fb4 --- /dev/null +++ b/repro/gcrf/preprocesser.py @@ -0,0 +1,228 @@ +''' +Created on 2014-01-18 + +@author: Wei +''' +import subprocess +import paths +from document.sentence import Sentence +from document.token import Token +from trees.lexicalized_tree import LexicalizedTree +import prep_utils +import os.path +from syntax_parser import SyntaxParser +from document.dependency import Dependency +import re + +class Preprocesser: + def __init__(self): + self.syntax_parser = None + + try: + self.syntax_parser = SyntaxParser() + except Exception, e: + raise e + + self.max_sentence_len = 100 + + def heuristic_sentence_splitting(self, raw_sent): + if len(raw_sent) == 0: + return [] + + if len(raw_sent.split()) <= self.max_sentence_len: + return [raw_sent] + + i = len(raw_sent) / 2 + j = i + k = i + 1 + boundaries = [';', ':', '!', '?'] + + results = [] + while j > 0 and k < len(raw_sent) - 1: + if raw_sent[j] in boundaries: + l_sent = raw_sent[ : j + 1] + r_sent = raw_sent[j + 1 : ].strip() + + if len(l_sent.split()) > 1 and len(r_sent.split()) > 1: + results.extend(self.heuristic_sentence_splitting(l_sent)) + results.extend(self.heuristic_sentence_splitting(r_sent)) + return results + else: + 
j -= 1 + k += 1 + elif raw_sent[k] in boundaries: + l_sent = raw_sent[ : k + 1] + r_sent = raw_sent[k + 1 : ].strip() + + if len(l_sent.split()) > 1 and len(r_sent.split()) > 1: + results.extend(self.heuristic_sentence_splitting(l_sent)) + results.extend(self.heuristic_sentence_splitting(r_sent)) + return results + else: + j -= 1 + k += 1 + else: + j -= 1 + k += 1 + + if len(results) == 0: + return [raw_sent] + + + def parse_single_sentence(self, raw_text): + return self.syntax_parser.parse_sentence(raw_text) + + + def process_single_sentence(self, doc, raw_text, end_of_para): + sentence = Sentence(len(doc.sentences), raw_text + ('' if not end_of_para else '

'), doc) + parse_tree_str, deps_str = self.parse_single_sentence(raw_text) + + parse = LexicalizedTree.parse(parse_tree_str, leaf_pattern = '(?<=\\s)[^\)\(]+') + sentence.set_unlexicalized_tree(parse) + + for (token_id, te) in enumerate(parse.leaves()): + word = te + token = Token(word, token_id + 1, sentence) + sentence.add_token(token) + + heads = self.get_heads(sentence, deps_str.split('\n')) + sentence.heads = heads + sentence.set_lexicalized_tree(prep_utils.create_lexicalized_tree(parse, heads)) + + doc.add_sentence(sentence) + + + def get_heads(self, sentence, dep_elems): + heads = [] + for token in sentence.tokens: + heads.append([token.word, token.get_PoS_tag(), 0]) + + for dep_e in dep_elems: + m = re.match('(.+?)\((.+?)-(\d+?), (.+?)-(\d+?)\)', dep_e) + if m: + relation = m.group(1) + gov_id = int(m.group(3)) + dep_id = int(m.group(5)) + + heads[dep_id - 1][2] = gov_id + sentence.add_dependency(Dependency(gov_id, dep_id, relation)) + + + return heads + + + def sentence_splitting(self, raw_filename, doc): + doc.sentences = [] + + cmd = 'perl %s/boundary.pl -d %s/HONORIFICS -i %s' % (paths.SSPLITTER_PATH, paths.SSPLITTER_PATH, os.path.abspath(raw_filename)) + + p = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) + output, errdata = p.communicate() + + if len(errdata) == 0: + raw_paras = output.strip().split('\n\n') + seg_sents = [] + for para_idx, raw_string in enumerate(raw_paras): + raw_sentences = raw_string.split('\n') + # MM + if (os.path.basename(raw_filename) == 'wsj_0655.out' + and para_idx == 8): + # the segmenter wrongly splits on "[{Mr.] [Ortega's}]" + # => repair by merging sentences + raw_sentences = ([raw_sentences[0] + ' ' + + raw_sentences[1]] + + raw_sentences[2:]) + elif (os.path.basename(raw_filename) == 'wsj_1169.out' + and para_idx == 0): + # "[Murata Mfg.] [Co.]" + raw_sentences = ([raw_sentences[0] + ' ' + + raw_sentences[1]] + + raw_sentences[2:]) + elif (os.path.basename(raw_filename) == 'wsj_1169.out' + and para_idx == 2): + # [G.m.b.] [H.] + raw_sentences = ([raw_sentences[0] + ' ' + + raw_sentences[1]] + + raw_sentences[2:]) + elif (os.path.basename(raw_filename) == 'wsj_1331.out' + and para_idx == 9): + # [all over again.] ['"] + raw_sentences = (raw_sentences[:1] + + [raw_sentences[1] + ' ' + + raw_sentences[2]]) + elif (os.path.basename(raw_filename) == 'wsj_1376.out' + and para_idx == 5): + # [society.] [. . .] + raw_sentences = (raw_sentences[:1] + + [raw_sentences[1] + ' ' + + raw_sentences[2]] + + raw_sentences[3:]) + elif (os.path.basename(raw_filename) == 'wsj_1376.out' + and para_idx == 6): + # [` Hello.] ['] (twice) + # move the trailing "'" up from the next raw sentence, + # and drop the whitespace after it + raw_sentences[3] = raw_sentences[3] + raw_sentences[4][0] + raw_sentences[4] = raw_sentences[4][2:] + # same for the next sentence + raw_sentences[4] = raw_sentences[4] + raw_sentences[5][0] + raw_sentences[5] = raw_sentences[5][2:] + elif (os.path.basename(raw_filename) == 'wsj_1376.out' + and para_idx == 21): + raw_sentences[0] = (raw_sentences[0] + ' ' + + raw_sentences[1] + ' ' + + raw_sentences[2]) + raw_sentences = raw_sentences[:1] + raw_sentences[3:] + elif (os.path.basename(raw_filename) == 'wsj_1380.out' + and para_idx == 6): + # [... Boston Inc. .] ['s First ...] 
+ raw_sentences[0] = (raw_sentences[0] + ' ' + + raw_sentences[1]) + raw_sentences = raw_sentences[:1] + elif (os.path.basename(raw_filename) == 'wsj_2385.out' + and para_idx in [4, 5, 12]): + # double dash is equivalent here to ":", hence same + # sentence, ex: [... Co. .][-- ...] + raw_sentences[0] = (raw_sentences[0] + ' ' + + raw_sentences[1]) + raw_sentences = raw_sentences[:1] + elif (os.path.basename(raw_filename) == 'wsj_2386.out' + and para_idx == 2): + raw_sentences[0] = (raw_sentences[0] + ' ' + + raw_sentences[1]) + raw_sentences = raw_sentences[:1] + raw_sentences[2:] + elif False: + print para_idx + print raw_sentences + # end MM + for (i, raw_sent) in enumerate(raw_sentences): + if len(raw_sent.split()) > self.max_sentence_len: + chunked_raw_sents = self.heuristic_sentence_splitting(raw_sent) + if len(chunked_raw_sents) == 1: + continue + + for (j, sent) in enumerate(chunked_raw_sents): + seg_sents.append((sent, i == len(raw_sentences) - 1 and j == len(chunked_raw_sents))) + else: + seg_sents.append((raw_sent, i == len(raw_sentences) - 1)) + # MM + if False and os.path.basename(raw_filename) == 'wsj_2386.out': + raise ValueError('gni') + # end MM + else: + raise NameError("*** Sentence splitter crashed, with trace %s..." % errdata) + + + for (i, (raw_text, end_of_para)) in enumerate(seg_sents): + if i % 10 == 0: + print 'Processing sentence %d out of %d' % (i, len(seg_sents)) + + self.process_single_sentence(doc, raw_text, end_of_para) + + def preprocess(self, raw_filename, doc): + self.sentence_splitting(raw_filename, doc) + + + def unload(self): + if self.syntax_parser: + self.syntax_parser.unload() From 6cb02c737a1f0baa90bbd1f37f6c9962d4cb9333 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Oct 2016 17:57:51 +0200 Subject: [PATCH 29/74] ENH disdep eval for gCRF, replace globals with options --- evals/dis2disdep.py | 16 ++++++++++++++-- evals/eval_disdep.py | 4 ++-- evals/showdown.py | 31 ++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index d3140db..bcca38e 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -19,6 +19,7 @@ from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) +from evals.gcrf_tree_format import load_gcrf_dtrees from evals.ji import load_ji_dtrees @@ -37,7 +38,9 @@ # output of Joty's parser OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') # output of Feng & Hirst's parser -OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/tmp/') +OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/phil/tmp/') +# output of Feng & Hirst's parser +OUT_FENG2 = os.path.join('/home/mmorey/melodi/rst/feng_hirst/gCRF_dist/texts/results/test_batch_gold_seg') # output of Ji's parser OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') @@ -51,7 +54,8 @@ def main(): choices=['chain', 'tree'], help="Encoding for n-ary nodes") parser.add_argument('--author', default='gold', - choices=['gold', 'silver', 'joty', 'feng', 'ji'], + choices=['gold', 'silver', + 'joty', 'feng', 'feng2', 'ji'], help="Author of the version of the corpus") parser.add_argument('--split', default='test', choices=['train', 'test', 'double'], @@ -112,6 +116,14 @@ def main(): for doc_name, dtree in dtrees.items(): dtree.origin = FileId(doc_name, None, None, None) + elif author == 'feng2': + if corpus_split != 'test': + raise ValueError("The output of Feng & Hirst's parser is " + "available for the 'test' split only") + 
dtrees = load_gcrf_dtrees(OUT_FENG2, REL_CONV) + for doc_name, dtree in dtrees.items(): + dtree.origin = FileId(doc_name, None, None, None) + elif author == 'ji': if corpus_split != 'test': raise ValueError("The output of Ji & Eisenstein's parser is " diff --git a/evals/eval_disdep.py b/evals/eval_disdep.py index 8cbd6f6..81be1b8 100755 --- a/evals/eval_disdep.py +++ b/evals/eval_disdep.py @@ -17,12 +17,12 @@ description="Evaluate dis_dep trees against a given reference") parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', - 'joty', 'feng', 'ji', + 'joty', 'feng', 'feng2', 'ji', 'ours'], help="Author(s) of the predictions") parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', - 'joty', 'feng', 'ji', + 'joty', 'feng', 'feng2', 'ji', 'ours'], help="Author of the reference") parser.add_argument('--nary_enc', default='chain', diff --git a/evals/showdown.py b/evals/showdown.py index 3c0e01c..0f128f6 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -94,10 +94,6 @@ # level of detail for parseval DETAILED = False -SPAN_SEL = None # None, 'leaves', 'non-leaves' -# "PER_DOC = True" computes p, r, f as in DPLP: compute scores per doc, -# then average over docs -PER_DOC = False # should be False, except for comparison with the DPLP paper STRINGENT = False # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' @@ -177,6 +173,11 @@ def main(): help="Binarize the reference ctree for the eval") parser.add_argument('--simple_rsttree', action='store_true', help="Binarize ctree and move relations up") + parser.add_argument('--span_sel', default='none', + choices=['none', 'leaves', 'non-leaves'], + help="Binarize ctree and move relations up") + parser.add_argument('--per_doc', action='store_true', + help="Doc-averaged scores (cf. 
Ji's eval)") # args = parser.parse_args() author_true = args.author_true @@ -185,6 +186,18 @@ def main(): nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree + span_sel = args.span_sel + if span_sel == 'none': + span_sel = None + if simple_rsttree: + # the point of evaluating on simple rst trees is to get leaves + # out of the way + span_sel = 'non-leaves' + # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc + # then average over docs + # it should be False, except for comparison with the DPLP paper + per_doc = args.per_doc + # if binarize_true and nary_enc_true != 'chain': raise ValueError("--binarize_true is compatible with " "--nary_enc_true chain only") @@ -399,8 +412,8 @@ def main(): with codecs.open(parser_name + '/' + doc_name + '.c_eval', mode='w', encoding='utf-8') as f: print(parseval_report([ct_true], [ct_pred], digits=4, - span_sel=SPAN_SEL, - per_doc=PER_DOC, + span_sel=span_sel, + per_doc=per_doc, stringent=STRINGENT), file=f) # end WIP @@ -408,14 +421,14 @@ def main(): # compute and print PARSEVAL scores print(parser_name) print(parseval_report(ctree_true_list, ctree_pred_list, digits=4, - span_sel=SPAN_SEL, - per_doc=PER_DOC, + span_sel=span_sel, + per_doc=per_doc, stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: print(parseval_detailed_report(ctree_true_list, ctree_pred_list, metric_type='S+R', - span_sel=SPAN_SEL)) + span_sel=span_sel)) # end FIXME From c721d8518768a8eb29f0719f227a41758154b59b Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 13 Oct 2016 12:51:25 +0200 Subject: [PATCH 30/74] FIX add option to load EDUs in gcrf parse.py --- repro/gcrf/parse.py | 90 +++++++++++++++++++------------------- repro/gcrf/preprocesser.py | 12 +++++ 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/repro/gcrf/parse.py b/repro/gcrf/parse.py index 419acf7..a0ca48f 100644 --- a/repro/gcrf/parse.py +++ b/repro/gcrf/parse.py @@ -4,34 +4,33 @@ @author: Vanessa Wei Feng ''' -from segmenters.crf_segmenter import CRFSegmenter -from segmenters.gold_segmenter import GoldSegmenter # MM -from treebuilder.build_tree_CRF import CRFTreeBuilder - -from optparse import OptionParser - -import paths import os.path import sys -from document.doc import Document import time import traceback from datetime import datetime +from optparse import OptionParser +import paths +import utils.serialize +from document.doc import Document from logs.log_writer import LogWriter from prep.preprocesser import Preprocesser +from segmenters.crf_segmenter import CRFSegmenter +from segmenters.gold_segmenter import GoldSegmenter # MM +from treebuilder.build_tree_CRF import CRFTreeBuilder -import utils.serialize class DiscourseParser(): - def __init__(self, options, output_dir = None, - log_writer = None): + def __init__(self, options, output_dir=None, log_writer=None): self.verbose = options.verbose self.skip_parsing = options.skip_parsing self.global_features = options.global_features self.save_preprocessed_doc = options.save_preprocessed_doc - self.output_dir = os.path.join(paths.OUTPUT_PATH, output_dir if output_dir is not None else '') + self.output_dir = os.path.join( + paths.OUTPUT_PATH, + output_dir if output_dir is not None else '') if not os.path.exists(self.output_dir): print 'Output directory %s not exists, creating it now.' 
% self.output_dir os.makedirs(self.output_dir) @@ -50,35 +49,37 @@ def __init__(self, options, output_dir = None, print traceback.print_exc() raise e - # MM replace CRF segmenter with a fake one that loads segmentation - # from a file - load_prepared_seg = True - if load_prepared_seg: - self.segmenter = GoldSegmenter('../texts/results/test_batch_gold_seg') + + # MM enable to load segmentation from .edus files + self.load_edus = options.load_edus + if self.load_edus: + # fake EDU segmenter that loads segmentation from files in a + # folder + self.segmenter = GoldSegmenter(self.load_edus) else: try: - self.segmenter = CRFSegmenter(_name = self.feature_sets, verbose = self.verbose, global_features = self.global_features) + self.segmenter = CRFSegmenter( + _name=self.feature_sets, verbose=self.verbose, + global_features=self.global_features) except Exception, e: print "*** Loading Segmentation module failed..." print traceback.print_exc() - raise e - + try: if not self.skip_parsing: - self.treebuilder = CRFTreeBuilder(_name = self.feature_sets, verbose = self.verbose) + self.treebuilder = CRFTreeBuilder( + _name=self.feature_sets, verbose=self.verbose) else: self.treebuilder = None except Exception, e: print "*** Loading Tree-building module failed..." print traceback.print_exc() raise e - - + initEnd = time.time() print 'Finished initialization in %.2f seconds.' % (initEnd - initStart) print - def unload(self): if self.preprocesser is not None: @@ -89,8 +90,7 @@ def unload(self): if not self.treebuilder is None: self.treebuilder.unload() - - + def parse(self, filename): if not os.path.exists(filename): print '%s does not exist.' % filename @@ -125,15 +125,17 @@ def parse(self, filename): print except Exception, e: print "*** Preprocessing failed ***" - print traceback.print_exc() - + print traceback.print_exc() raise e try: if not doc.segmented: segStart = time.time() - - self.segmenter.segment(doc, filename) # MM added filename for GoldSegmenter + if self.load_edus: + # MM GoldSegmenter needs a filename + self.segmenter.segment(doc, filename) + else: + self.segmenter.segment(doc) if self.verbose: print 'edus' @@ -149,8 +151,7 @@ def parse(self, filename): segEnd = time.time() print 'Finished segmentation in %.2f seconds.' % (segEnd - segStart) print 'Segmented into %d EDUs.' % len(doc.edus) - - + self.log_writer.write('Finished segmentation in %.2f seconds. Segmented into %d EDUs.' % ((segEnd - segStart), len(doc.edus))) if self.save_preprocessed_doc: print 'Saved segmented document data to %s.' 
% serialized_doc_filename @@ -163,15 +164,12 @@ def parse(self, filename): if options.verbose: for e in doc.edus: print e - - + except Exception, e: print "*** Segmentation failed ***" - print traceback.print_exc() - + print traceback.print_exc() raise e - - + try: ''' Step 2: build text-level discourse tree ''' if self.skip_parsing: @@ -256,7 +254,6 @@ def main(options, args): log_fname = os.path.join(paths.LOGS_PATH, 'log_%s.txt' % (output_dir if output_dir else datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))) log_writer = open(log_fname, 'w') - if options.filelist: file_fname = args[start_arg] if not os.path.exists(file_fname) or not os.path.isfile(file_fname): @@ -314,7 +311,6 @@ def main(options, args): parser.unload() - v = '1.0' if __name__ == '__main__': usage = "Usage: %prog [options] input_file/dir" @@ -339,16 +335,20 @@ def main(options, args): action="store_true", dest="logging", default=False, help="Perform logging while parsing.") optParser.add_option("-e", "--save", - action="store_true", dest="save_preprocessed_doc", default=False, + action="store_true", dest="save_preprocessed_doc", + default=False, help="Save preprocessed document into serialized file for future use.") - - + # MM add option to load segmentation from the .edus files that result + # from calling this parser with the --skip_parsing option + optParser.add_option('-r', '--load_edus', + dest='load_edus', default=False, + help="Read segmentation from .edus files in folder") + # end MM (options, args) = optParser.parse_args() if len(args) == 0: optParser.print_help() sys.exit(1) - - + main(options, args) diff --git a/repro/gcrf/preprocesser.py b/repro/gcrf/preprocesser.py index ed90fb4..0d5be7b 100644 --- a/repro/gcrf/preprocesser.py +++ b/repro/gcrf/preprocesser.py @@ -127,6 +127,8 @@ def sentence_splitting(self, raw_filename, doc): # MM if (os.path.basename(raw_filename) == 'wsj_0655.out' and para_idx == 8): + # this error is in the original text *and* is redone + # by the segmenter: # the segmenter wrongly splits on "[{Mr.] [Ortega's}]" # => repair by merging sentences raw_sentences = ([raw_sentences[0] + ' ' @@ -134,24 +136,30 @@ def sentence_splitting(self, raw_filename, doc): + raw_sentences[2:]) elif (os.path.basename(raw_filename) == 'wsj_1169.out' and para_idx == 0): + # this error is in the original text *and* is redone + # by the segmenter: # "[Murata Mfg.] [Co.]" raw_sentences = ([raw_sentences[0] + ' ' + raw_sentences[1]] + raw_sentences[2:]) elif (os.path.basename(raw_filename) == 'wsj_1169.out' and para_idx == 2): + # this error is in the original text *and* is redone + # by the segmenter: # [G.m.b.] [H.] raw_sentences = ([raw_sentences[0] + ' ' + raw_sentences[1]] + raw_sentences[2:]) elif (os.path.basename(raw_filename) == 'wsj_1331.out' and para_idx == 9): + # text is correct, only the segmenter makes an error: # [all over again.] ['"] raw_sentences = (raw_sentences[:1] + [raw_sentences[1] + ' ' + raw_sentences[2]]) elif (os.path.basename(raw_filename) == 'wsj_1376.out' and para_idx == 5): + # text is correct, only the segmenter makes an error: # [society.] [. . .] 
raw_sentences = (raw_sentences[:1] + [raw_sentences[1] + ' ' @@ -169,18 +177,21 @@ def sentence_splitting(self, raw_filename, doc): raw_sentences[5] = raw_sentences[5][2:] elif (os.path.basename(raw_filename) == 'wsj_1376.out' and para_idx == 21): + # error by the segmenter raw_sentences[0] = (raw_sentences[0] + ' ' + raw_sentences[1] + ' ' + raw_sentences[2]) raw_sentences = raw_sentences[:1] + raw_sentences[3:] elif (os.path.basename(raw_filename) == 'wsj_1380.out' and para_idx == 6): + # error by the segmenter # [... Boston Inc. .] ['s First ...] raw_sentences[0] = (raw_sentences[0] + ' ' + raw_sentences[1]) raw_sentences = raw_sentences[:1] elif (os.path.basename(raw_filename) == 'wsj_2385.out' and para_idx in [4, 5, 12]): + # error by the segmenter # double dash is equivalent here to ":", hence same # sentence, ex: [... Co. .][-- ...] raw_sentences[0] = (raw_sentences[0] + ' ' @@ -188,6 +199,7 @@ def sentence_splitting(self, raw_filename, doc): raw_sentences = raw_sentences[:1] elif (os.path.basename(raw_filename) == 'wsj_2386.out' and para_idx == 2): + # error by the segmenter raw_sentences[0] = (raw_sentences[0] + ' ' + raw_sentences[1]) raw_sentences = raw_sentences[:1] + raw_sentences[2:] From f931c4468d4c5e2fcf889feb6c3f1f3f0bd14b8f Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 13 Oct 2016 12:54:56 +0200 Subject: [PATCH 31/74] ENH add missing utils: conda env file, script to reinject gold EDU seg in .edus files --- repro/gcrf/environment.yml | 4 + repro/gcrf/gen_gold_edus.py | 179 ++++++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 repro/gcrf/environment.yml create mode 100644 repro/gcrf/gen_gold_edus.py diff --git a/repro/gcrf/environment.yml b/repro/gcrf/environment.yml new file mode 100644 index 0000000..bb816bb --- /dev/null +++ b/repro/gcrf/environment.yml @@ -0,0 +1,4 @@ +name: gcrf +dependencies: + - python=2.7 + - nltk=2.0.4 diff --git a/repro/gcrf/gen_gold_edus.py b/repro/gcrf/gen_gold_edus.py new file mode 100644 index 0000000..4eea6a4 --- /dev/null +++ b/repro/gcrf/gen_gold_edus.py @@ -0,0 +1,179 @@ +"""Generate .edus files for Feng's gCRF parser, with gold EDUs. + +""" + +from __future__ import absolute_import, print_function + +import argparse +from difflib import SequenceMatcher +from glob import glob +import os + +import numpy as np + +TXT_MAP = [ + (' .', '.'), + (' ,', ','), + (' %', '%'), + (' :', ':'), + ('-LRB-', '('), + ('-RRB-', ')'), + # non-breaking space + # FIXME switch to unicode where this is a unique char: u"\u00A0" + ('\xc2\xa0', ' '), + ("do n't", "don't"), + ('...', '. . .'), +] + + +def dump_gcrf_edus_gold(f_gold, f_pred, f_dest): + """Reinject gold segmentation into .edus files output by gCRF. + + Parameters + ---------- + f_gold: str + Path to the gold .edus file + f_pred: str + Path to the predicted .edus file + f_dest: str + Path to the output + """ + txt_gold = f_gold.read() + i_gold = 0 # pointer in txt_gold + + skip_toks = 0 # nb of tokens from _pred that have already been consumed + + for line in f_pred: + tokens_pred = line.split(' ') + # the newline character (marking the end of sentence) is appended + # to the last token + assert tokens_pred[-1][-1] == '\n' + # + for i, tok in enumerate(tokens_pred): + if skip_toks: + # skip tokens from _pred that have already been consumed + skip_toks -= 1 + continue + + while txt_gold[i_gold] == ' ': + # skip whitespaces in gold + i_gold += 1 + + if (tok[0] == '.' and tokens_pred[i - 1][-1] == '.' 
+ and txt_gold[i_gold] != '.'): + # preprocessing adds an extra full stop when the last + # token ends with one (e.g. for abbreviations: + # "Inc." => "Inc. .") + if len(tok) > 1: + # skip extra stop, resume normal matching procedure + tok = tok[1:] + else: + # token is exactly '.' => skip it + continue + + if tok == 'EDU_BREAK': + # predicted EDU break inside sentence + if txt_gold[i_gold] == '\n': + # also in gold => correctly predicted => leave it + print(tok, end=' ', file=f_dest) + i_gold += 1 + continue + else: + # not in gold => erroneously predicted => delete it + # (this is a silent operation) + continue + elif tok == '\n' and txt_gold[i_gold] == '\n': + # happens when the token before the newline was a copy of + # the punctuation added by preprocessing, removed above ; + # ex: "... Inc." => "... Inc. ." + print(tok, end='', file=f_dest) + i_gold += 1 + continue + + if txt_gold[i_gold:i_gold + 5] == '\n ': + # gold EDU break inside sentence, missing from predicted + print('EDU_BREAK', end=' ', file=f_dest) # FIXME to f_dest + i_gold += 5 + + # match token + # whitespaces inside tokens are non-breaking spaces: + # \xc2\xa0 in ascii, but we should really be processing + # them as unicode symbols... + tok_txt_gold = (tok + .replace('\xc2\xa0', ' ') + .replace('-LRB-', '(') + .replace('-RRB-', ')') + .replace('-LCB-', '{') + .replace('-RCB-', '}') + .replace('``', '"') + .replace("''", '"') + .replace('...', '. . .') + ) + if i < len(tokens_pred) - 1: + # all tokens except for the last of the sentence + if (txt_gold[i_gold:i_gold + len(tok_txt_gold)] + == tok_txt_gold): + # it is a match indeed + i_gold += len(tok_txt_gold) + # print token followed by a whitespace + print(tok, end=' ', file=f_dest) # FIXME to f_dest + continue + else: + print() + print('wow') + print(tokens_pred[i:]) + print(repr(txt_gold[i_gold:i_gold + len(tok_txt_gold)]), + repr(tok)) + raise ValueError('gni') + else: + # last token of the sentence + if (txt_gold[i_gold:i_gold + len(tok_txt_gold) + 1] + == tok_txt_gold[:-1] + ' ' + tok[-1]): + # gold has an extra whitespace before the newline + i_gold += len(tok_txt_gold) + 1 + # token but no following whitespace + print(tok, end='', file=f_dest) + elif (txt_gold[i_gold:i_gold + 7] == '. . . .' + and tok == '...\n'): + # pre-processing replaces '[. . .] [.]' with '...' 
; + # let's assume it's normal + i_gold += 7 + print(tok, end='', file=f_dest) + else: + print() + print('i-2', tokens_pred[i - 2]) + print('i-1', tokens_pred[i - 1]) + print('i', tokens_pred[i]) + print(repr(txt_gold[i_gold:i_gold + len(tok_txt_gold)]), + repr(tok)) + raise ValueError('pouet') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Generate .edus files with gold segmentation') + parser.add_argument('dir_gold', metavar='DIR', + help='folder with the gold files (.edus)') + parser.add_argument('dir_pred', metavar='DIR', + help='folder with the predicted files (.edus)') + parser.add_argument('dir_dest', metavar='DIR', + help='output folder') + + args = parser.parse_args() + + # setup output dir + if not os.path.exists(args.dir_dest): + os.makedirs(args.dir_dest) + + files_edus_gold = sorted(glob(os.path.join(args.dir_gold, '*.edus'))) + files_edus_pred = sorted(glob(os.path.join(args.dir_pred, '*.edus'))) + for file_gold, file_pred in zip(files_edus_gold, files_edus_pred): + print(file_gold) + assert os.path.basename(file_gold) == os.path.basename(file_pred) + file_dest = os.path.join(args.dir_dest, + os.path.basename(file_pred)) + + with open(file_gold) as f_gold: + with open(file_pred) as f_pred: + with open(file_dest, mode='w') as f_dest: + dump_gcrf_edus_gold(f_gold, f_pred, f_dest) From d3befbd2d4518a31d8ced724540713958ed35625 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 13 Oct 2016 17:34:36 +0200 Subject: [PATCH 32/74] ENH load gold EDU seg in dplp --- repro/dplp/buildedu.py | 186 ++++++++++++++++++++++++++++++++++++++++ repro/dplp/rstparser.py | 32 +++++++ 2 files changed, 218 insertions(+) create mode 100644 repro/dplp/buildedu.py create mode 100644 repro/dplp/rstparser.py diff --git a/repro/dplp/buildedu.py b/repro/dplp/buildedu.py new file mode 100644 index 0000000..e7517d7 --- /dev/null +++ b/repro/dplp/buildedu.py @@ -0,0 +1,186 @@ +## buildedu.py +## Author: Yangfeng Ji +## Date: 05-03-2015 +## Time-stamp: + +from os import listdir +from os.path import join, basename +from model.classifier import Classifier +from model.docreader import DocReader +from model.sample import SampleGenerator +from cPickle import load +import gzip + + +# MM +from glob import glob +import os + +DOC_EDUS = {os.path.splitext(os.path.basename(f))[0]: f + for f in glob(os.path.join( + '/home/mmorey/melodi/rst/ji_eisenstein', + 'DPLP/data/edus/*/*.edus'))} + + +def load_gold_edus(conll_file): + """Load gold EDUs for injection into a conll file. + + Parameters + ---------- + conll_file: str + Path to the conll file. + + Returns + ------- + edu_idc: list? of int + Index of the EDU for each token. 
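+
+    Notes
+    -----
+    In the current implementation the returned list actually holds one 0/1
+    flag per token: 1 if the token is the last token of its EDU, 0 otherwise
+    (see the construction of `result` below).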
+ """ + result = [] # 1 if token is the last of its EDU, 0 otherwise + + doc_name = os.path.splitext(os.path.basename(conll_file))[0] + # find corresponding file with gold EDUs + fname_edus = DOC_EDUS[doc_name] + edus = [] + with open(fname_edus) as f_edus: + for line in f_edus: + line = line.strip() + if not line: + continue + # non-empty line + edus.append(line) + # open conll file and align tokens + edu_idx = 0 + edu_txt = edus[edu_idx] # remaining text of current EDU + with open(conll_file) as f_conll: + for line in f_conll: + line = line.strip() + if not line: + continue + fields = line.split('\t') + wform_conll = fields[2] # word form + # try to read the same amount of characters off the current EDU + wform_edus = edu_txt[:len(wform_conll)] + try: + assert wform_edus == wform_conll + except AssertionError: + if len(wform_edus) < len(wform_conll): + # EDU boundary happens in the middle of a token: + # possible causes: error in the text of the original doc + # (missing whitespace, wrong version of quotes...), or + # a plain error of the segmenter + assert wform_conll.startswith(wform_edus) + # set the EDU boundary at the current token + result.append(1) + # remaining text + rem_txt = wform_conll[len(wform_edus):].strip() + # read the first characters off the next EDU + edu_idx += 1 + if edu_idx == len(edus): + edu_txt = '' + else: + edu_txt = edus[edu_idx] + # read the first characters off the beginning of the + # next EDU, assert that they match + assert edu_txt[:len(rem_txt)] == rem_txt + edu_txt = edu_txt[len(rem_txt):].lstrip() + else: + # we don't know how to handle this (yet) + print(wform_conll, wform_edus) + raise + else: + # print(fields + [edu_idx + 1]) + # update the state of edu_txt for the next iteration + edu_txt = edu_txt[len(wform_conll):].lstrip() + if not edu_txt: + # when the current EDU is exhausted, pass to the next + result.append(1) + edu_idx += 1 + if edu_idx == len(edus): + # normally, the text should be exhausted on both sides + # (.conll and .edus) at the same time ; + # if the .conll has extra text, the following should + # make the assertion above break at the next iteration + # of the loop + edu_txt = '' + else: + edu_txt = edus[edu_idx] + else: + result.append(0) + return result +# end MM + +def main(fmodel, fvocab, rpath, wpath): + clf = Classifier() + dr = DocReader() + clf.loadmodel(fmodel) + flist = [join(rpath,fname) for fname in listdir(rpath) if fname.endswith('conll')] + vocab = load(gzip.open(fvocab)) + for (fidx, fname) in enumerate(flist): + print "Processing file: {}".format(fname) + doc = dr.read(fname, withboundary=False) + # predict segmentation + if False: + sg = SampleGenerator(vocab) + sg.build(doc) + M, _ = sg.getmat() + predlabels = clf.predict(M) + else: + predlabels = load_gold_edus(fname) # RESUME HERE + doc = postprocess(doc, predlabels) + writedoc(doc, fname, wpath) + + +def postprocess(doc, predlabels): + """ Assign predlabels into doc + """ + tokendict = doc.tokendict + for gidx in tokendict.iterkeys(): + if predlabels[gidx] == 1: + tokendict[gidx].boundary = True + else: + tokendict[gidx].boundary = False + if tokendict[gidx].send: + tokendict[gidx].boundary = True + return doc + + +# def writedoc(doc, fname, wpath): +# """ Write doc into a file with the CoNLL-like format +# """ +# tokendict = doc.tokendict +# N = len(tokendict) +# fname = basename(fname) + '.edu' +# fname = join(wpath, fname) +# eduidx = 0 +# with open(fname, 'w') as fout: +# for gidx in range(N): +# fout.write(str(eduidx) + '\n') +# if 
tokendict[gidx].boundary: +# eduidx += 1 +# if tokendict[gidx].send: +# fout.write('\n') +# print 'Write segmentation: {}'.format(fname) + + +def writedoc(doc, fname, wpath): + """ Write file + """ + tokendict = doc.tokendict + N = len(tokendict) + fname = basename(fname).replace(".conll", ".merge") + fname = join(wpath, fname) + eduidx = 1 + with open(fname, 'w') as fout: + for gidx in range(N): + tok = tokendict[gidx] + line = str(tok.sidx) + "\t" + str(tok.tidx) + "\t" + line += tok.word + "\t" + tok.lemma + "\t" + line += tok.pos + "\t" + tok.deplabel + "\t" + line += str(tok.hidx) + "\t" + tok.ner + "\t" + line += tok.partialparse + "\t" + str(eduidx) + "\n" + fout.write(line) + # Boundary + if tok.boundary: + eduidx += 1 + if tok.send: + fout.write("\n") diff --git a/repro/dplp/rstparser.py b/repro/dplp/rstparser.py new file mode 100644 index 0000000..73b553b --- /dev/null +++ b/repro/dplp/rstparser.py @@ -0,0 +1,32 @@ +## main.py +## Author: Yangfeng Ji +## Date: 09-25-2015 +## Time-stamp: + +from code.evalparser import evalparser +from cPickle import load +import gzip, sys + +def main(path, draw=True): + with gzip.open("resources/bc3200.pickle.gz") as fin: + print 'Load Brown clusters for creating features ...' + bcvocab = load(fin) + evalparser(path=path, report=True, draw=draw, + bcvocab=bcvocab, + withdp=False) + + +if __name__ == '__main__': + if len(sys.argv) == 2: + path = sys.argv[1] + print 'Read files from: {}'.format(path) + main(path) + elif len(sys.argv) == 3: + path = sys.argv[1] + draw = eval(sys.argv[2]) + print 'Read files from {}'.format(path) + main(path, draw) + else: + print "Usage: python rstparser.py file_path [draw_rst_tree]" + print "\tfile_path - path to the segmented file" + From 484f7d58d5e6abbc044502c5bc9a6b384055bfa9 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 17 Oct 2016 17:05:55 +0200 Subject: [PATCH 33/74] ENH add support for output of hayashi et al's parsers, dep/li outputs --- evals/dis2disdep.py | 23 +++++++-- evals/eval_disdep.py | 35 +++++++++----- evals/hayashi_deps.py | 107 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 16 deletions(-) create mode 100644 evals/hayashi_deps.py diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index bcca38e..a41b53f 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -20,6 +20,7 @@ TRAIN_FOLDER) from evals.gcrf_tree_format import load_gcrf_dtrees +from evals.hayashi_deps import load_hayashi_dtrees from evals.ji import load_ji_dtrees @@ -33,8 +34,9 @@ RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', 'educe', 'rst_dt', 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree - +REL_CONV_BASE = RstRelationConverter(RELMAP_FILE) +REL_CONV = REL_CONV_BASE.convert_tree +REL_CONV_DTREE = REL_CONV_BASE.convert_dtree # output of Joty's parser OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') # output of Feng & Hirst's parser @@ -43,6 +45,9 @@ OUT_FENG2 = os.path.join('/home/mmorey/melodi/rst/feng_hirst/gCRF_dist/texts/results/test_batch_gold_seg') # output of Ji's parser OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') +# output of Hayashi et al.'s parsers +OUT_HAYASHI_HILDA = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/dep/li/') +OUT_HAYASHI_MST = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/cons/trans_li/') def main(): @@ -55,7 +60,8 @@ def main(): help="Encoding for n-ary nodes") parser.add_argument('--author', default='gold', choices=['gold', 
'silver', - 'joty', 'feng', 'feng2', 'ji'], + 'joty', 'feng', 'feng2', 'ji', + 'hayashi_hilda', 'hayashi_mst'], help="Author of the version of the corpus") parser.add_argument('--split', default='test', choices=['train', 'test', 'double'], @@ -129,6 +135,17 @@ def main(): raise ValueError("The output of Ji & Eisenstein's parser is " "available for the 'test' split only") dtrees = load_ji_dtrees(OUT_JI, REL_CONV) + elif author == 'hayashi_mst': + if corpus_split != 'test': + raise ValueError("The output of Hayashi et al.'s parser is " + "available for the 'test' split only") + dtrees = load_hayashi_dtrees(OUT_HAYASHI_MST, REL_CONV_DTREE) + elif author == 'hayashi_hilda': + if corpus_split != 'test': + raise ValueError("The output of Hayashi et al.'s parser is " + "available for the 'test' split only") + dtrees = load_hayashi_dtrees(OUT_HAYASHI_HILDA, REL_CONV_DTREE) + # do dump dump_disdep_files(dtrees.values(), out_dir) diff --git a/evals/eval_disdep.py b/evals/eval_disdep.py index 81be1b8..7f84965 100755 --- a/evals/eval_disdep.py +++ b/evals/eval_disdep.py @@ -18,11 +18,13 @@ parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', + 'hayashi_hilda', 'hayashi_mst', 'ours'], help="Author(s) of the predictions") parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', + 'hayashi_hilda', 'hayashi_mst', 'ours'], help="Author of the reference") parser.add_argument('--nary_enc', default='chain', @@ -38,12 +40,16 @@ files_true = {os.path.basename(f).rsplit('.')[0]: f for f in glob(os.path.join(dir_true, '*.dis_dep'))} # table header - print('\t'.join(['parser', - 'a', 'l', 'n', 'r', - 'al', 'an', 'ar', - 'aln', 'alr', - 'alnr', - 'support'])) + len_author_str = max(len(x) for x in authors_pred) + print('\t'.join([ + '{parser_name: <{width}}'.format( + parser_name='parser', width=len_author_str), + 'a', 'l', 'n', 'r', + 'al', 'an', 'ar', + 'aln', 'alr', + 'alnr', + 'support' + ])) for author_pred in authors_pred: dir_pred = os.path.join('TMP_disdep', nary_enc, author_pred, 'test') @@ -98,10 +104,13 @@ cnt_alr += 1 if ok_a and ok_l and ok_n and ok_r: cnt_alnr += 1 - print('\t'.join([author_pred] - + ['{:.4f}'.format(float(cnt_x) / cnt_tot) - for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, - cnt_al, cnt_an, cnt_ar, - cnt_aln, cnt_alr, - cnt_alnr]] - + [str(cnt_tot)])) + print('\t'.join( + ['{parser_name: <{width}}'.format( + parser_name=author_pred, width=len_author_str)] + + ['{:.4f}'.format(float(cnt_x) / cnt_tot) + for cnt_x in [cnt_a, cnt_l, cnt_n, cnt_r, + cnt_al, cnt_an, cnt_ar, + cnt_aln, cnt_alr, + cnt_alnr]] + + [str(cnt_tot)] + )) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py new file mode 100644 index 0000000..f613013 --- /dev/null +++ b/evals/hayashi_deps.py @@ -0,0 +1,107 @@ +"""Load dependencies output by Hayashi et al.'s parsers. + +This module enables to process files in auto_parse/{dep/li,cons/trans_li}. 
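+
+Each non-empty line of a dependency (.dis) file is expected to hold three
+whitespace-separated fields: dependent EDU index, governor EDU index and
+relation label (see `_load_hayashi_dep_file` below).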
+""" + +from __future__ import absolute_import, print_function + +import os +from glob import glob + +from educe.rst_dt.corpus import Reader +from educe.rst_dt.deptree import RstDepTree + + +# load true ctrees, from the TEST section of the RST-DT, to get gold EDUs +RST_DT_DIR = '/home/mmorey/corpora/rst-dt/rst_discourse_treebank/data' +RST_TEST_DIR = os.path.join(RST_DT_DIR, 'RSTtrees-WSJ-main-1.0/TEST') +if not os.path.exists(RST_TEST_DIR): + raise ValueError('Unable to find RST test files at ', RST_TEST_DIR) +RST_TEST_READER = Reader(RST_TEST_DIR) +RST_TEST_CTREES_TRUE = {k.doc: v for k, v in RST_TEST_READER.slurp().items()} + + +def _load_hayashi_dep_file(f, edus): + """Do load. + + Parameters + ---------- + f: File + dep file, open + edus: list of EDU + True EDUs in this document. + + Returns + ------- + dt: RstDepTree + Predicted dtree + """ + dt = RstDepTree(edus=edus, origin=None, nary_enc='tree') # FIXME origin + for line in f: + line = line.strip() + if not line: + continue + dep_idx, gov_idx, lbl = line.split() + dep_idx = int(dep_idx) + gov_idx = int(gov_idx) + dt.add_dependency(gov_idx, dep_idx, label=lbl) + return dt + + +def load_hayashi_dep_file(fname, edus): + """Load a file. + + Parameters + ---------- + fname: str + Path to the file + + Returns + ------- + dt: RstDepTree + Dependency tree corresponding to the content of this file. + """ + with open(fname) as f: + return _load_hayashi_dep_file(f, edus) + + +def load_hayashi_dep_files(out_dir): + """Load dep files output by one of Hayashi et al.'s parser. + + Parameters + ---------- + out_dir: str + Path to the folder containing the .dis files. + """ + dtrees = dict() + for fname in glob(os.path.join(out_dir, '*.dis')): + doc_name = os.path.splitext(os.path.basename(fname))[0] + edus = RST_TEST_CTREES_TRUE[doc_name].leaves() + dtrees[doc_name] = load_hayashi_dep_file(fname, edus) + return dtrees + + +def load_hayashi_dtrees(out_dir, rel_conv): + """Load the dtrees output by one of Hayashi et al.'s parser. + + Parameters + ---------- + out_dir: str + Path to the folder containing .dis files. + rel_conv: RstRelationConverter + Converter for relation labels (fine- to coarse-grained, plus + normalization). + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. 
+ """ + dtree_pred = dict() + + dtrees = load_hayashi_dep_files(out_dir) + for doc_name, dt_pred in dtrees.items(): + if rel_conv is not None: + dt_pred = rel_conv(dt_pred) + dtree_pred[doc_name] = dt_pred + return dtree_pred From d4f9418286e3c19fbe82c91faa840f2cd77ebf21 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 17 Oct 2016 18:47:56 +0200 Subject: [PATCH 34/74] FIX paths to hayashi outputs --- evals/dis2disdep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index a41b53f..fd552fa 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -46,8 +46,8 @@ # output of Ji's parser OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') # output of Hayashi et al.'s parsers -OUT_HAYASHI_HILDA = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/dep/li/') -OUT_HAYASHI_MST = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/cons/trans_li/') +OUT_HAYASHI_MST = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/dep/li/') +OUT_HAYASHI_HILDA = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/cons/trans_li/') def main(): From a4a4202f0fb50131ee6d2b2f23ac84fce349ba75 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 18 Oct 2016 16:14:40 +0200 Subject: [PATCH 35/74] ENH added conda environment.yml, fix local path to corenlp out --- environment.yml | 7 +++++++ irit_rst_dt/local.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..a1140ca --- /dev/null +++ b/environment.yml @@ -0,0 +1,7 @@ +name: irit-rst-dt +dependencies: + - python=2.7 + - nltk + - scikit-learn + - pip: + - "--editable=git+https://github.com/nlhepler/pydot.git#egg=pydot" diff --git a/irit_rst_dt/local.py b/irit_rst_dt/local.py index f805832..ab5c087 100644 --- a/irit_rst_dt/local.py +++ b/irit_rst_dt/local.py @@ -122,7 +122,7 @@ # CORENLP_OUT_DIR = None # CORENLP_OUT_DIR = '/projets/melodi/corpus/rst-dt-corenlp-2015-01-29' -CORENLP_OUT_DIR = '/home/mmorey/corpora/rst-dt-corenlp-2015-01-29' +CORENLP_OUT_DIR = '/home/mmorey/corpora/rst-dt/rst-dt-corenlp-2015-01-29' """ Where to read parses from CoreNLP from """ From a4b88275bb8897d34a02b036b02f502500330010 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 20 Oct 2016 17:20:05 +0200 Subject: [PATCH 36/74] ENH rst parseval metrics, now in educe --- evals/codra.py | 2 -- evals/li2014.py | 15 ++++++++++----- evals/showdown.py | 42 +++++++++++++++++++----------------------- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index f3b894e..eb9c6f6 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -15,8 +15,6 @@ from educe.rst_dt.document_plus import align_edus_with_paragraphs # from attelo.io import load_edus -from attelo.metrics.constituency import (parseval_detailed_report, - parseval_report) from attelo.metrics.deptree import compute_uas_las diff --git a/evals/li2014.py b/evals/li2014.py index d8c02a5..1135efc 100644 --- a/evals/li2014.py +++ b/evals/li2014.py @@ -3,6 +3,11 @@ This is a reimplementation of this evaluation procedure. """ +from educe.rst_dt.metrics.rst_parseval import (rst_parseval_report, + rst_parseval_detailed_report) + + + # FIXME legacy code brutally dumped here, broken def twisted_eval_li2014(data_true, data_pred): """Run Parseval on transformed gold trees, as in (Li et al., 2014). 
@@ -86,12 +91,12 @@ def eval_distortion_gold(corpus, nuc_strategy, rank_strategy, chn_bin_srtree_ref) gold_twis[doc_name] = chn_bin_rtree_ref - print(parseval_report(gold_orig, gold_twis, - metric_types=[x[0] for x in LBL_FNS], - digits=4)) + print(rst_parseval_report(gold_orig, gold_twis, + metric_types=[x[0] for x in LBL_FNS], + digits=4)) # detailed report on S+N+R - print(parseval_detailed_report(ctree_true, ctree_pred, - metric_type='S+R')) + print(rst_parseval_detailed_report(ctree_true, ctree_pred, + metric_type='S+R')) def comparative_distortion_on_gold(): diff --git a/evals/showdown.py b/evals/showdown.py index 0f128f6..6d578d1 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -15,9 +15,9 @@ from educe.rst_dt.dep2con import (DummyNuclearityClassifier, InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.metrics.rst_parseval import (rst_parseval_detailed_report, + rst_parseval_report) # -from attelo.metrics.constituency import (parseval_detailed_report, - parseval_report) from attelo.metrics.deptree import compute_uas_las, compute_uas_las_undirected # local to this package @@ -173,9 +173,6 @@ def main(): help="Binarize the reference ctree for the eval") parser.add_argument('--simple_rsttree', action='store_true', help="Binarize ctree and move relations up") - parser.add_argument('--span_sel', default='none', - choices=['none', 'leaves', 'non-leaves'], - help="Binarize ctree and move relations up") parser.add_argument('--per_doc', action='store_true', help="Doc-averaged scores (cf. Ji's eval)") # @@ -186,13 +183,7 @@ def main(): nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree - span_sel = args.span_sel - if span_sel == 'none': - span_sel = None - if simple_rsttree: - # the point of evaluating on simple rst trees is to get leaves - # out of the way - span_sel = 'non-leaves' + # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc # then average over docs # it should be False, except for comparison with the DPLP paper @@ -393,6 +384,10 @@ def main(): for x in ctree_true_list] ctree_pred_list = [SimpleRSTTree.from_rst_tree(x) for x in ctree_pred_list] + ctree_type = 'SimpleRST' + else: + ctree_type = 'RST' + # WIP print SimpleRSTTrees if not os.path.exists('gold'): os.makedirs('gold') @@ -411,24 +406,25 @@ def main(): doc_names, ctree_true_list, ctree_pred_list): with codecs.open(parser_name + '/' + doc_name + '.c_eval', mode='w', encoding='utf-8') as f: - print(parseval_report([ct_true], [ct_pred], digits=4, - span_sel=span_sel, - per_doc=per_doc, - stringent=STRINGENT), + print(rst_parseval_report([ct_true], [ct_pred], + ctree_type=ctree_type, + digits=4, + per_doc=per_doc, + stringent=STRINGENT), file=f) # end WIP # FIXME # compute and print PARSEVAL scores print(parser_name) - print(parseval_report(ctree_true_list, ctree_pred_list, digits=4, - span_sel=span_sel, - per_doc=per_doc, - stringent=STRINGENT)) + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type=ctree_type, digits=4, + per_doc=per_doc, + stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: - print(parseval_detailed_report(ctree_true_list, ctree_pred_list, - metric_type='S+R', - span_sel=span_sel)) + print(rst_parseval_detailed_report( + ctree_true_list, ctree_pred_list, ctree_type=ctree_type, + metric_type='S+R')) # end FIXME From b6104f20ce88f52e600e95deb5d9d2fa832c5a5f Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 25 Nov 2016 17:55:51 +0100 Subject: [PATCH 37/74] 
WIP add support for output of li2016 --- evals/li_qi.py | 132 +++++++++++++++++++++++++++++ evals/showdown.py | 205 +++++++++++++++++++++++++--------------------- 2 files changed, 242 insertions(+), 95 deletions(-) create mode 100644 evals/li_qi.py diff --git a/evals/li_qi.py b/evals/li_qi.py new file mode 100644 index 0000000..abf1929 --- /dev/null +++ b/evals/li_qi.py @@ -0,0 +1,132 @@ +"""Load the output of the parser from (Li et al. 2016). + +This is 99% a copy/paste from our own evals/joty.py. +I really, really need to come up with a better API and refactor accordingly. +""" + +from __future__ import absolute_import, print_function + +import codecs +import glob +import itertools +import os + +from educe.rst_dt.parse import parse_rst_dt_tree +from educe.rst_dt.deptree import RstDepTree + + +def load_li_qi_output_files(root_dir): + """Load ctrees output by Li Qi's parser on the TEST section of the RST-DT. + + Parameters + ---------- + root_dir: string + Path to the main folder containing the parser's output + + Returns + ------- + data: dict + Dictionary that should be akin to a sklearn Bunch, with + interesting keys 'filenames', 'doc_names' and 'rst_ctrees'. + + Notes + ----- + To ensure compatibility with the rest of the code base, doc_names + are automatically added the ".out" extension. This would not work + for fileX documents, but they are absent from the TEST section of + the RST-WSJ treebank. + """ + # map output filename to doc filename: + # here, remove prefix "parsed_" + # ex of filename: parsed_wsj_0602.out + out_filenames = sorted(glob.glob(os.path.join(root_dir, 'parsed_*'))) + doc_names = [os.path.basename(out_fn).split('_', 1)[1] + for out_fn in out_filenames] + # load the RST trees + rst_ctrees = [] + for out_fn in out_filenames: + with codecs.open(out_fn, 'r', 'utf-8') as f: + # TODO(?) add support for and use RSTContext + rst_ctree = parse_rst_dt_tree(f.read(), None) + rst_ctrees.append(rst_ctree) + + data = dict(filenames=out_filenames, + doc_names=doc_names, + rst_ctrees=rst_ctrees) + return data + + +def load_li_qi_ctrees(out_dir, rel_conv): + """Load the ctrees output by Li Qi's parser as .dis files. + + This currently runs on the document-level files (.doc_dis). + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. + """ + # load predicted trees + data_pred = load_li_qi_output_files(out_dir) + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + # map doc_name to ctree (RSTTree) + ctree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # ctree + # replace fine-grained labels with coarse-grained labels : + # the files we have already contain the coarse labels, except their + # initial letter is capitalized, except for same-unit and span, + # whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + + return ctree_pred + + +def load_li_qi_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Get the dtrees that correspond to the ctrees output by Li Qi's parser. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. 
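+
+    Examples
+    --------
+    Minimal sketch, with an illustrative result folder::
+
+        dtrees = load_li_qi_dtrees('li_qi/result', REL_CONV, nary_enc='chain')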
+ """ + # load predicted trees + data_pred = load_li_qi_output_files(out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to ordered dtree (RstDepTree) + dtree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + # constituency tree + # replace fine-grained labels with coarse-grained labels ; + # the files we have already contain the coarse labels, except their + # initial letter is capitalized whereas ours are not + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + # convert to an ordered dependency tree ; + # * 'tree' produces a weakly-ordered dtree strictly equivalent + # to the original ctree, + # * 'chain' produces a strictly-ordered dtree for which strict + # equivalence is not preserved + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) + dtree_pred[doc_name] = dt_pred + + return dtree_pred + diff --git a/evals/showdown.py b/evals/showdown.py index 6d578d1..c50f336 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -25,6 +25,7 @@ from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees from evals.ji import load_ji_ctrees, load_ji_dtrees +from evals.li_qi import load_li_qi_ctrees, load_li_qi_dtrees from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, load_attelo_dtrees) @@ -91,6 +92,8 @@ FENG_DIR = '/home/mmorey/melodi/rst/feng_hirst/' FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') +# Li Qi's parser +LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/li_qi/result' # level of detail for parseval DETAILED = False @@ -153,6 +156,7 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', + 'li_qi', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -162,6 +166,7 @@ def main(): parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', + 'li_qi', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -224,101 +229,111 @@ def main(): c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] - if 'feng' in authors_pred: - c_preds.append( - ('feng', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) - ) - d_preds.append( - ('feng', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, - nary_enc='chain')) - ) - - if 'feng2' in authors_pred: - c_preds.append( - ('gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) - ) - d_preds.append( - ('gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, - nary_enc='chain')) - ) - - if 'joty' in authors_pred: - # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees - c_preds.append( - ('joty', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) - ) - d_preds.append( - ('joty', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, - nary_enc='chain')) - ) - # joty-{chain,tree} would be the same except nary_enc='tree' ; - # the nary_enc does not matter because codra outputs binary ctrees, - # hence both encodings result in (the same) strictly ordered dtrees - - if 'ji' in authors_pred: - # DPLP outputs RST ctrees in the form of lists of spans; - # load_ji_dtrees maps them to RST dtrees - c_preds.append( - ('ji', 
load_ji_ctrees(JI_OUT_DIR, REL_CONV)) - ) - d_preds.append( - ('ji', load_ji_dtrees(JI_OUT_DIR, REL_CONV, - nary_enc='chain')) - ) - # ji-{chain,tree} would be the same except nary_enc='tree' ; - # the nary_enc does not matter because codra outputs binary ctrees, - # hence both encodings result in (the same) strictly ordered dtrees - - if 'ours_chain' in authors_pred: - # Eisner, predicted syntax, chain - c_preds.append( - ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf)) - ) - d_preds.append( - ('ours-chain', load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf)) - ) - - if 'ours_tree' in authors_pred: - # Eisner, predicted syntax, tree + same-unit - c_preds.append( - ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, - nuc_clf, rnk_clf)) - ) - d_preds.append( - ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, - nuc_clf, rnk_clf)) - ) - if 'ours_tree_su' in authors_pred: - # Eisner, predicted syntax, tree + same-unit - c_preds.append( - ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, - nuc_clf, rnk_clf)) - ) - d_preds.append( - ('ours-tree-su', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, - nuc_clf, rnk_clf)) - ) - - if False: # FIXME repair (or forget) these - print('Eisner, predicted syntax + same-unit') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') - - print('Eisner, gold syntax') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=False) - print('======================') + for author_pred in authors_pred: + if author_pred == 'li_qi': + c_preds.append( + ('li_qi', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('li_qi', load_li_qi_dtrees(LI_QI_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + + if author_pred == 'feng': + c_preds.append( + ('gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('gSVM', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + + if author_pred == 'feng2': + c_preds.append( + ('gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + + if author_pred == 'joty': + # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees + c_preds.append( + ('TSP 1-1', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('TSP 1-1', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + # joty-{chain,tree} would be the same except nary_enc='tree' ; + # the nary_enc does not matter because codra outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees + + if author_pred == 'ji': + # DPLP outputs RST ctrees in the form of lists of spans; + # load_ji_dtrees maps them to RST dtrees + c_preds.append( + ('DPLP', load_ji_ctrees(JI_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('DPLP', load_ji_dtrees(JI_OUT_DIR, REL_CONV, + nary_enc='chain')) + ) + # ji-{chain,tree} would be the same except nary_enc='tree' ; + # the nary_enc does not matter because codra outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees + + if author_pred == 'ours_chain': + # Eisner, predicted syntax, chain + c_preds.append( + ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf)) + ) + 
d_preds.append( + ('ours-chain', load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf)) + ) + + if author_pred == 'ours_tree': + # Eisner, predicted syntax, tree + same-unit + c_preds.append( + ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + if author_pred == 'ours_tree_su': + # Eisner, predicted syntax, tree + same-unit + c_preds.append( + ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('ours-tree-su', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, + nuc_clf, rnk_clf)) + ) + + if False: # FIXME repair (or forget) these + print('Eisner, predicted syntax + same-unit') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') + + print('Eisner, gold syntax') + load_deptrees_from_attelo_output(ctree_true, dtree_true, + EISNER_OUT_SYN_GOLD, EDUS_FILE, + nuc_clf, rnk_clf, + detailed=False) + print('======================') # dependency eval From 3d63a625f8d514374b89fd9e0a966743b6cd687b Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 6 Dec 2016 18:33:01 +0100 Subject: [PATCH 38/74] ENH add eval for Hayashi's HILDA + Li2014 dep parser output files --- evals/hayashi_cons.py | 154 +++++++++++++++++++++++++++ evals/ji.py | 2 +- evals/li_sujian.py | 239 ++++++++++++++++++++++++++++++++++++++++++ evals/ours.py | 12 +-- evals/showdown.py | 92 +++++++++++----- 5 files changed, 467 insertions(+), 32 deletions(-) create mode 100644 evals/hayashi_cons.py create mode 100644 evals/li_sujian.py diff --git a/evals/hayashi_cons.py b/evals/hayashi_cons.py new file mode 100644 index 0000000..6f76512 --- /dev/null +++ b/evals/hayashi_cons.py @@ -0,0 +1,154 @@ +"""Load RST c-trees output by Hayashi et al.'s reimplementation of HILDA. 
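+
+Internal nodes of the bracketed trees are labelled "nuc:rel" and leaves
+look like "leaf1_1_1" (EDU, sentence and paragraph indices); the trees
+are read with an NLTK reader, then rebuilt as educe RSTTree objects.
+
+Minimal usage sketch (output folder and relation converter assumed to be
+set up as in evals.showdown)::
+
+    ctree_pred = load_hayashi_hilda_ctrees(HAYASHI_HILDA_OUT_DIR, REL_CONV)
+    dtree_pred = load_hayashi_hilda_dtrees(HAYASHI_HILDA_OUT_DIR, REL_CONV,
+                                           nary_enc='chain')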
+ +""" + +from __future__ import absolute_import, print_function + +from collections import namedtuple +import codecs +import glob +import itertools +import os + +from nltk import Tree + +from educe.annotation import Span +from educe.rst_dt.annotation import EDU, Node, RSTTree +from educe.rst_dt.deptree import RstDepTree + + +node_struct = namedtuple('node_struct', ['nuc', 'rel', 'span']) + +def read_node(s): + """Helper applied when reading a node""" + nuc, rel = s.split(':') if s != 'Root' else (s, '---') + res = node_struct(nuc=nuc, rel=rel, span=(0, 0)) + return res + + +leaf_struct = namedtuple('leaf_struct', ['edu_id', 'sent_id', 'para_id']) + +def read_leaf(s): + """Helper applied when reading a leaf""" + edu_id, sent_id, para_id = s[4:].split('_') # ex: leaf1_1_1 + res = leaf_struct(edu_id=edu_id, sent_id=sent_id, + para_id=para_id) + return res + +def propagate_spans(t): + """Propagate spans bottom-up in our custom NLTK tree.""" + dft_span = Span(0, 0) # default text span + dft_text = '' + + lbl = t.label() + if all(isinstance(kid, Tree) for kid in t): + new_kids = [propagate_spans(kid) for kid in t] + edu_start = new_kids[0].label().edu_span[0] + edu_end = new_kids[-1].label().edu_span[1] + else: + # pre-terminal + assert len(t) == 1 + kid = t[0] + new_kid = EDU(int(kid.edu_id), dft_span, dft_text) + new_kids = [new_kid] + edu_start = new_kid.num + edu_end = new_kid.num + new_lbl = Node(lbl.nuc, (edu_start, edu_end), dft_span, lbl.rel) + new_tree = RSTTree(new_lbl, new_kids) + return new_tree + + +def load_hayashi_con_files(root_dir): + """Load the ctrees output by Hayashi et al.'s reimplementation of HILDA. + + The RST ctrees are supposedly document-level RST trees, with classes of + relations. + + Parameters + ---------- + out_dir: str + Path to the base directory containing the output files. + + Returns + ------- + data: dict + Dictionary that should be akin to a sklearn Bunch, with + interesting keys 'filenames', 'doc_names' and 'rst_ctrees'. + """ + # map output filename to doc filename + # ex of filename: wsj_0602.out.dis + out_filenames = sorted(glob.glob(os.path.join(root_dir, '*.dis'))) + doc_names = [os.path.basename(out_fn).rsplit('.', 1)[0] + for out_fn in out_filenames] + # load the RST trees + rst_ctrees = [] + for out_fn in out_filenames: + with codecs.open(out_fn, 'r', 'utf-8') as f: + tree_str = f.read() + tree_raw = Tree.fromstring(tree_str, read_node=read_node, + read_leaf=read_leaf) + # TODO(?) add support for and use RSTContext + rst_ctree = propagate_spans(tree_raw) + rst_ctrees.append(rst_ctree) + + data = dict(filenames=out_filenames, + doc_names=doc_names, + rst_ctrees=rst_ctrees) + return data + + +def load_hayashi_hilda_ctrees(out_dir, rel_conv): + """Load the ctrees output by Hayashi et al.'s HILDA. + + Parameters + ---------- + out_dir: str + Path to the folder containing .dis files. + rel_conv: RstRelationConverter + Converter for relation labels (fine- to coarse-grained, plus + normalization). + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. 
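+
+    Notes
+    -----
+    When `rel_conv` is not None, it is applied to each loaded ctree so
+    that its relation labels match the coarse-grained classes used in
+    the rest of the evaluation.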
+ """ + # load predicted ctrees + data_pred = load_hayashi_con_files(out_dir) + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + + # build a dict from doc_name to RST ctree + ctree_pred = dict() + for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + if rel_conv is not None: + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_hayashi_hilda_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Load the dtrees for the ctrees output by Hayashi et al.'s HILDA. + + Parameters + ---------- + out_dir: str + Path to the folder containing .dis files. + rel_conv: RstRelationConverter + Converter for relation labels (fine- to coarse-grained, plus + normalization). + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. + """ + # load predicted ctrees + ctree_pred = load_hayashi_hilda_ctrees(out_dir, rel_conv) + # convert to dtrees + dtree_pred = dict() + for doc_name, ct_pred in ctree_pred.items(): + dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) + dtree_pred[doc_name] = dt_pred + + return dtree_pred diff --git a/evals/ji.py b/evals/ji.py index 9862abf..3198a3f 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -16,7 +16,7 @@ from educe.rst_dt.rst_wsj_corpus import TEST_FOLDER # original RST corpus -RST_CORPUS = os.path.join('/home/mmorey/corpora/rst_discourse_treebank/data') +RST_CORPUS = os.path.join('/home/mmorey/corpora/rst-dt/rst_discourse_treebank/data') RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) diff --git a/evals/li_sujian.py b/evals/li_sujian.py new file mode 100644 index 0000000..1f0d89b --- /dev/null +++ b/evals/li_sujian.py @@ -0,0 +1,239 @@ +"""TODO + +""" + +from __future__ import absolute_import, print_function +from collections import Counter +from glob import glob +import os + +# educe +from educe.learning.edu_input_format import load_edu_input_file +from educe.rst_dt.corpus import (RstRelationConverter, + Reader as RstReader) +from educe.rst_dt.dep2con import deptree_to_rst_tree +from educe.rst_dt.deptree import NUC_S, RstDepTree, RstDtException +from educe.rst_dt.metrics.rst_parseval import rst_parseval_report +# attelo +from attelo.metrics.deptree import compute_uas_las as att_compute_uas_las +# local imports +from evals.showdown import EDUS_FILE, setup_dtree_postprocessor + + +# RST corpus +CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.0/') +CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') +CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +# relation converter (fine- to coarse-grained labels) +RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', + 'educe', 'rst_dt', + 'rst_112to18.txt') +REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree +# pattern for the .edu_input files of the docs from the test set +EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" + +# output of Li et al.'s parser +SAVE_DIR = "/home/mmorey/melodi/rst/li_sujian/TextLevelDiscourseParser/mybackup/mstparser-code-116-trunk/mstparser/save" +COARSE_FILES = [ + "136.0detailedOutVersion2.txt", + "151.0detailedOut.txt", + "164.0detailedOut.txt", + "177.0detailedOut.txt", + "335.0detailedOut.txt", + "37.0detailedOut.txt", + "424.0detailedOut.txt", + "448.0detailedOut.txt", + "455.0detailedOutVersion2.txt", + "513.0detailedOutVersion2.txt", + "529.0detailedOut.txt", + "615.0detailedOutVersion2.txt", + "712.0detailedOut.txt", + "917.0detailedOut.txt", +] +FINE_FILES = [ + "190.0detailedOut.txt", + 
"473.0detailedOutVersion2.txt", + "561.0detailedOut.txt", + "723.0detailedOut.txt", + "747.0detailedOutVersion2.txt", + "825.0detailedOut.txt", + "947.0detailedOut.txt", + "965.0detailedOutVersion2.txt", +] +# different format for predicted labels and description of EDU +COARSE_FEAT_FILES = [ + "441.0detailedOut.txt", +] + + +def load_output_file(out_file): + """Load an output file from Li et al.'s dep parser. + """ + doc_names = [] + heads_true = [] + labels_true = [] + heads_pred = [] + labels_pred = [] + with open(out_file) as f: + for line in f: + if line.startswith(".\\testdata"): + # file + doc_name = line.strip().split("\\")[2][:12] # drop .edus or else + # print(doc_name) + doc_names.append(doc_name) + heads_true.append([-1]) # initial pad for fake root + labels_true.append(['']) + heads_pred.append([-1]) + labels_pred.append(['']) + else: + edu_idx, hd_true, hd_pred, lbl_true, lbl_pred, edu_str = line.strip().split(' ', 5) + if lbl_pred == '': + # not sure whether this should be enabled + lbl_pred = 'Elaboration' + heads_true[-1].append(int(hd_true)) + labels_true[-1].append(lbl_true) + heads_pred[-1].append(int(hd_pred)) + labels_pred[-1].append(lbl_pred) + res = { + 'doc_names': doc_names, + 'heads_true': heads_true, + 'labels_true': labels_true, + 'heads_pred': heads_pred, + 'labels_pred': labels_pred, + } + return res + + +if __name__ == "__main__": + # load dep trees from corpus + reader_test = RstReader(CD_TEST) + corpus_test = reader_test.slurp() + + # setup conversion from c- to d-tree and back, and eval type + nary_enc = 'tree' + eval_li = True + + if eval_li: + order = 'strict' + nuc_strategy = 'constant' + nuc_constant = NUC_S + rnk_strategy = 'lllrrr' + rnk_prioritize_same_unit = False + TWIST_GOLD = True + ADD_TRIVIAL_SPANS = True + else: # comparable setup to what we use for our own parsers + order = 'weak' + nuc_strategy = "unamb_else_most_frequent" + nuc_constant = None + rnk_strategy = "sdist-edist-rl" + rnk_prioritize_same_unit = True + TWIST_GOLD = False + ADD_TRIVIAL_SPANS = False + + nuc_clf, rnk_clf = setup_dtree_postprocessor( + nary_enc=nary_enc, order=order, nuc_strategy=nuc_strategy, + nuc_constant=nuc_constant, rnk_strategy=rnk_strategy, + rnk_prioritize_same_unit=rnk_prioritize_same_unit) + + ctree_true = dict() + dtree_true = dict() + labelset_true = Counter() + for doc_id, ct_true in sorted(corpus_test.items()): + doc_name = doc_id.doc + ct_true = REL_CONV(ct_true) # map fine to coarse rels + ctree_true[doc_name] = ct_true + dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) + # dirty hack: lowercase ROOT + dt_true.labels = [x.lower() if x == 'ROOT' else x + for x in dt_true.labels] + + dtree_true[doc_name] = dt_true + labelset_true.update(dt_true.labels[1:]) + + # load parser output + for fname in COARSE_FILES: + dtree_pred = dict() + labelset_pred = Counter() + # + f_cur = os.path.join(SAVE_DIR, fname) + dep_bunch = load_output_file(f_cur) + doc_names = dep_bunch['doc_names'] + # load and process _pred + for doc_name, heads_pred, labels_pred in zip( + dep_bunch['doc_names'], dep_bunch['heads_pred'], + dep_bunch['labels_pred']): + # create dtree _pred + edus_data = load_edu_input_file(EDUS_FILE_PAT.format(doc_name), + edu_type='rst-dt') + edus = edus_data['edus'] + edu2sent = edus_data['edu2sent'] + dt_pred = RstDepTree(edus) + # add predicted edges + for dep_idx, (gov_idx, lbl) in enumerate(zip( + heads_pred[1:], labels_pred[1:]), start=1): + if lbl == '': + lbl = 'Elaboration' + # print(lbl) + lbl = lbl.lower() + labelset_pred[lbl] += 1 
+ dt_pred.add_dependency(gov_idx, dep_idx, lbl) + dt_pred.sent_idx = [0] + edu2sent # 0 for fake root + dirty + dtree_pred[doc_name] = dt_pred + # end WIP + expected_labelset = ['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'root', 'same-unit', 'summary', 'temporal', 'textual', 'topic-change', 'topic-comment'] + assert sorted(labelset_pred.keys()) == expected_labelset + # wsj_1189 has a weird "span" label in a multinuclear rel at [7--9] + # see footnote in Hayashi et al's SIGDIAL 2016 paper + assert sorted(labelset_true.keys()) == sorted( + expected_labelset + ['span']) + + # compute UAS and LAS on the _true values from the corpus and + # _pred Educe RstDepTrees re-built from their output files + dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + att_score_uas, att_score_las = att_compute_uas_las( + dtree_true_list, dtree_pred_list, include_ls=False, + include_las_n_o_no=False) + print("{}\tUAS={:.4f}\tLAS={:.4f} (attelo)".format( + fname, att_score_uas, att_score_las)) + + # build predicted c-trees using our heuristics for nuc and rank + ctree_pred = dict() + for doc_name, dt_pred in dtree_pred.items(): + # 1. enrich d-tree with nuc and order + # a. order: the procedure that generates spans produces a + # left-heavy branching: ((A B) C), which should be our + # "lllrrr" heuristic + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] + # b. nuclearity: heuristic baseline + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] + # 2. build _pred c-tree + try: + ct_pred = deptree_to_rst_tree(dt_pred) + ctree_pred[doc_name] = ct_pred + except RstDtException as rst_e: + print(rst_e) + raise + # 3. predict nuc and order in _true d-tree, replace the _true + # c-tree with a twisted one, like in their eval + if TWIST_GOLD: + dt_true = dtree_true[doc_name] + dt_true.sent_idx = [0] + edu2sent + dt_true.ranks = rnk_clf.predict([dt_true])[0] + dt_true.nucs = nuc_clf.predict([dt_true])[0] + ct_true = ctree_true[doc_name] + try: + ct_true = deptree_to_rst_tree(dt_true) + except RstDtException as rst_e: + print(rst_e) + raise + ctree_true[doc_name] = ct_true + + # compute RST-Parseval of these c-trees + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type='RST', digits=4, + per_doc=False, + add_trivial_spans=ADD_TRIVIAL_SPANS, + stringent=False)) diff --git a/evals/ours.py b/evals/ours.py index 0dbe1ce..f9d48bf 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -72,15 +72,12 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): """ dtree_pred = dict() # predicted dtrees # * setup... 
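+    # (attelo identifies EDUs by global ids; they are mapped back to
+    # educe EDUs and grouped by doc_name before the predicted edges are
+    # read into one RstDepTree per document)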
- # load EDUs as they are known to attelo (sigh) - # and predicted edges on these EDUs - att_edus = load_edus(edus_file) - edges_pred = load_attelo_output_file(output_file) - # rebuild educe EDUs from their attelo description - # and group them by doc_name + # load EDUs as they are known to attelo (sigh): rebuild educe EDUs + # from their attelo description and group them by doc_name educe_edus = defaultdict(list) edu2sent_idx = defaultdict(dict) gid2num = dict() + att_edus = load_edus(edus_file) for att_edu in att_edus: # doc name doc_name = att_edu.grouping @@ -103,7 +100,8 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): for e in doc_educe_edus]) for doc_name, doc_educe_edus in educe_edus.items()} - # rebuild RstDepTrees + # load predicted edges, on these EDUs, into RstDepTrees + edges_pred = load_attelo_output_file(output_file) for doc_name, es_pred in sorted(edges_pred.items()): # get educe EDUs doc_educe_edus = educe_edus[doc_name] diff --git a/evals/showdown.py b/evals/showdown.py index c50f336..9384f44 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -18,12 +18,16 @@ from educe.rst_dt.metrics.rst_parseval import (rst_parseval_detailed_report, rst_parseval_report) # -from attelo.metrics.deptree import compute_uas_las, compute_uas_las_undirected +from attelo.metrics.deptree import (compute_uas_las, + compute_uas_las_undirected) # local to this package from evals.codra import load_codra_ctrees, load_codra_dtrees from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees +from evals.hayashi_cons import (load_hayashi_hilda_ctrees, + load_hayashi_hilda_dtrees) +from evals.hayashi_deps import load_hayashi_dtrees from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.li_qi import load_li_qi_ctrees, load_li_qi_dtrees from evals.ours import (load_deptrees_from_attelo_output, @@ -94,20 +98,31 @@ FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') # Li Qi's parser LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/li_qi/result' +# Hayashi's HILDA +HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL' +HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') # level of detail for parseval DETAILED = False +EVAL_LI_DEP = True STRINGENT = False +# additional dependency metrics +INCLUDE_LS = False +UNDIRECTED_DEPS = False +EVAL_NUC_RANK = True # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' +NUC_CONSTANT = None # only useful for NUC_STRATEGY='constant' RNK_STRATEGY = 'sdist-edist-rl' RNK_PRIORITY_SU = True -def setup_dtree_postprocessor(nary_enc): +def setup_dtree_postprocessor(nary_enc='chain', order='strict', + nuc_strategy=NUC_STRATEGY, + nuc_constant=NUC_CONSTANT, + rnk_strategy=RNK_STRATEGY, + rnk_prioritize_same_unit=RNK_PRIORITY_SU): """Setup the nuclearity and rank classifiers to flesh out dtrees.""" - # tie the order with the encoding for n-ary nodes - order = 'weak' if nary_enc == 'tree' else 'strict' # load train section of the RST corpus, fit (currently dummy) classifiers # for nuclearity and rank reader_train = RstReader(CD_TRAIN) @@ -134,12 +149,13 @@ def setup_dtree_postprocessor(nary_enc): y_nuc_train.append(dt.nucs) y_rnk_train.append(dt.ranks) # nuclearity clf - nuc_clf = DummyNuclearityClassifier(strategy=NUC_STRATEGY) + nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, + constant=nuc_constant) nuc_clf.fit(X_train, y_nuc_train) # rank clf - rnk_clf = InsideOutAttachmentRanker(strategy=RNK_STRATEGY, - 
prioritize_same_unit=RNK_PRIORITY_SU, - order=order) + rnk_clf = InsideOutAttachmentRanker( + strategy=rnk_strategy, prioritize_same_unit=rnk_prioritize_same_unit, + order=order) rnk_clf.fit(X_train, y_rnk_train) return nuc_clf, rnk_clf @@ -156,7 +172,7 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', - 'li_qi', + 'li_qi', 'hayashi_hilda', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -166,7 +182,7 @@ def main(): parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', - 'li_qi', + 'li_qi', 'hayashi_hilda', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -200,7 +216,10 @@ def main(): # 0. setup the postprocessors to flesh out unordered dtrees into ordered # ones with nuclearity - nuc_clf, rnk_clf = setup_dtree_postprocessor(nary_enc_pred) + # * tie the order with the encoding for n-ary nodes + order = 'weak' if nary_enc_pred == 'tree' else 'strict' + nuc_clf, rnk_clf = setup_dtree_postprocessor(nary_enc=nary_enc_pred, + order=order) # the eval compares parses for the test section of the RST corpus reader_test = RstReader(CD_TEST) @@ -230,6 +249,16 @@ def main(): d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] for author_pred in authors_pred: + if author_pred == 'hayashi_hilda': + c_preds.append( + ('hayashi_hilda', load_hayashi_hilda_ctrees( + HAYASHI_HILDA_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('hayashi_hilda', load_hayashi_hilda_dtrees( + HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) + ) + if author_pred == 'li_qi': c_preds.append( ('li_qi', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) @@ -342,7 +371,13 @@ def main(): digits = 4 width = max(len(parser_name) for parser_name, _ in d_preds) - headers = ["UAS", "LAS", "LS", "UUAS", "ULAS"] + headers = ["UAS", "LAS"] + if INCLUDE_LS: + headers += ["LS"] + if EVAL_NUC_RANK: + headers += ["LAS+N", "LAS+O", "LAS+N+O"] + if UNDIRECTED_DEPS: + headers += ["UUAS", "ULAS"] fmt = '%% %ds' % width # first col: parser name fmt += ' ' fmt += ' '.join(['% 9s' for _ in headers]) @@ -367,22 +402,29 @@ def main(): mode='w', encoding='utf-8') as f: print(', '.join('{:.4f}'.format(x) for x in compute_uas_las( - [dt_true], [dt_pred])), + [dt_true], [dt_pred], + include_ls=INCLUDE_LS, + include_las_n_o_no=EVAL_NUC_RANK)), file=f) - # WIP scores for undirected edges - print(', '.join('{:.4f}'.format(x) - for x in compute_uas_las_undirected( - [dt_true], [dt_pred])), - file=f) - + if UNDIRECTED_DEPS: + # scores for undirected edges + print(', '.join('{:.4f}'.format(x) + for x in compute_uas_las_undirected( + [dt_true], [dt_pred])), + file=f) # end WIP print - score_uas, score_las, score_ls = compute_uas_las(dtree_true_list, - dtree_pred_list) - score_uuas, score_ulas = compute_uas_las_undirected(dtree_true_list, - dtree_pred_list) + + all_scores = [] + all_scores += list(compute_uas_las( + dtree_true_list, dtree_pred_list, include_ls=INCLUDE_LS, + include_las_n_o_no=EVAL_NUC_RANK)) + if UNDIRECTED_DEPS: + score_uuas, score_ulas = compute_uas_las_undirected( + dtree_true_list, dtree_pred_list) + all_scores += [score_uuas, score_ulas] # append to report values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] - for v in (score_uas, score_las, score_ls, score_uuas, score_ulas): + for v in all_scores: values += ["{0:0.{1}f}".format(v, digits)] report += fmt % tuple(values) # end 
table content @@ -425,6 +467,7 @@ def main(): ctree_type=ctree_type, digits=4, per_doc=per_doc, + add_trivial_spans=EVAL_LI_DEP, stringent=STRINGENT), file=f) # end WIP @@ -434,6 +477,7 @@ def main(): print(rst_parseval_report(ctree_true_list, ctree_pred_list, ctree_type=ctree_type, digits=4, per_doc=per_doc, + add_trivial_spans=EVAL_LI_DEP, stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: From b4bb2a10b6d18db8093dcb9422b5de3e6c43bf79 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 7 Dec 2016 14:35:50 +0100 Subject: [PATCH 39/74] MAINT enable to switch between coarse- and fine-grained rels in eval li dep --- evals/li_sujian.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/evals/li_sujian.py b/evals/li_sujian.py index 1f0d89b..b9f603b 100644 --- a/evals/li_sujian.py +++ b/evals/li_sujian.py @@ -109,16 +109,26 @@ def load_output_file(out_file): reader_test = RstReader(CD_TEST) corpus_test = reader_test.slurp() + # choice of predictions: granularity of relations + RST_RELS = 'coarse' + if RST_RELS == 'coarse': + PRED_FILES = COARSE_FILES + else: + PRED_FILES = FINE_FILES + # eval procedure: the one in the parser of Li et al. vs standard one + EVAL_LI = True + # setup conversion from c- to d-tree and back, and eval type nary_enc = 'tree' - eval_li = True - if eval_li: + if EVAL_LI: + # reconstruction of the c-tree order = 'strict' nuc_strategy = 'constant' nuc_constant = NUC_S rnk_strategy = 'lllrrr' rnk_prioritize_same_unit = False + # eval TWIST_GOLD = True ADD_TRIVIAL_SPANS = True else: # comparable setup to what we use for our own parsers @@ -140,7 +150,9 @@ def load_output_file(out_file): labelset_true = Counter() for doc_id, ct_true in sorted(corpus_test.items()): doc_name = doc_id.doc - ct_true = REL_CONV(ct_true) # map fine to coarse rels + if RST_RELS == 'coarse': + # map fine to coarse rels + ct_true = REL_CONV(ct_true) ctree_true[doc_name] = ct_true dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) # dirty hack: lowercase ROOT @@ -151,7 +163,7 @@ def load_output_file(out_file): labelset_true.update(dt_true.labels[1:]) # load parser output - for fname in COARSE_FILES: + for fname in PRED_FILES: dtree_pred = dict() labelset_pred = Counter() # @@ -180,12 +192,14 @@ def load_output_file(out_file): dt_pred.sent_idx = [0] + edu2sent # 0 for fake root + dirty dtree_pred[doc_name] = dt_pred # end WIP - expected_labelset = ['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'root', 'same-unit', 'summary', 'temporal', 'textual', 'topic-change', 'topic-comment'] - assert sorted(labelset_pred.keys()) == expected_labelset - # wsj_1189 has a weird "span" label in a multinuclear rel at [7--9] - # see footnote in Hayashi et al's SIGDIAL 2016 paper - assert sorted(labelset_true.keys()) == sorted( - expected_labelset + ['span']) + + if RST_RELS == 'coarse': + expected_labelset = ['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'root', 'same-unit', 'summary', 'temporal', 'textual', 'topic-change', 'topic-comment'] + assert sorted(labelset_pred.keys()) == expected_labelset + # wsj_1189 has a weird "span" label in a multinuclear rel at [7--9] + # see footnote in Hayashi et al's SIGDIAL 2016 paper + assert sorted(labelset_true.keys()) == sorted( + expected_labelset + ['span']) # compute UAS and LAS on 
the _true values from the corpus and # _pred Educe RstDepTrees re-built from their output files From 1856197490c4088204ead9bc6921ef9d1caed23a Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 9 Dec 2016 16:01:45 +0100 Subject: [PATCH 40/74] ENH+FIX hayashi_mst, li dep --- evals/hayashi_deps.py | 73 ++++++++++++++++++++++++++++++++++++++++--- evals/li_sujian.py | 35 +++++++++++---------- evals/showdown.py | 69 ++++++++++++++++++++++++++++------------ 3 files changed, 137 insertions(+), 40 deletions(-) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index f613013..00a776b 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -8,8 +8,10 @@ import os from glob import glob +from educe.learning.edu_input_format import load_edu_input_file from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.dep2con import deptree_to_rst_tree # load true ctrees, from the TEST section of the RST-DT, to get gold EDUs @@ -81,17 +83,28 @@ def load_hayashi_dep_files(out_dir): return dtrees -def load_hayashi_dtrees(out_dir, rel_conv): - """Load the dtrees output by one of Hayashi et al.'s parser. +def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, + rnk_clf): + """Load the dtrees output by one of Hayashi et al.'s dep parsers. Parameters ---------- - out_dir: str + out_dir : str Path to the folder containing .dis files. - rel_conv: RstRelationConverter + + rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). + edus_file_pat : str + Pattern for the .edu_input files. + + nuc_clf : NuclearityClassifier + Nuclearity classifier + + rnk_clf : RankClassifier + Rank classifier + Returns ------- dtree_pred: dict(str, RstDepTree) @@ -103,5 +116,57 @@ def load_hayashi_dtrees(out_dir, rel_conv): for doc_name, dt_pred in dtrees.items(): if rel_conv is not None: dt_pred = rel_conv(dt_pred) + # WIP add nuclearity and rank + edus_data = load_edu_input_file(edus_file_pat.format(doc_name), + edu_type='rst-dt') + edu2sent = edus_data['edu2sent'] + dt_pred.sent_idx = [0] + edu2sent # 0 for fake root ; DIRTY + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] + # end WIP dtree_pred[doc_name] = dt_pred + return dtree_pred + + +def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, + rnk_clf): + """Load the dtrees output by one of Hayashi et al.'s dep parsers. + + Parameters + ---------- + out_dir : str + Path to the folder containing .dis files. + + rel_conv : RstRelationConverter + Converter for relation labels (fine- to coarse-grained, plus + normalization). + + edus_file_pat : str + Pattern for the .edu_input files. + + nuc_clf : NuclearityClassifier + Nuclearity classifier + + rnk_clf : RankClassifier + Rank classifier + + Returns + ------- + ctree_pred: dict(str, RSTTree) + RST ctree for each document. 
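+
+    Notes
+    -----
+    The ctrees are not read directly from the output files: the dtrees
+    are loaded first, fleshed out with nuclearity and rank predicted by
+    `nuc_clf` and `rnk_clf`, then each dtree is converted to a ctree
+    with `deptree_to_rst_tree`.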
+ """ + ctree_pred = dict() + + dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, + nuc_clf, rnk_clf) + for doc_name, dt_pred in dtree_pred.items(): + try: + ct_pred = deptree_to_rst_tree(dt_pred) + except RstDtException: + print(doc_name) + raise + else: + ctree_pred[doc_name] = ct_pred + + return ctree_pred diff --git a/evals/li_sujian.py b/evals/li_sujian.py index b9f603b..6f80db4 100644 --- a/evals/li_sujian.py +++ b/evals/li_sujian.py @@ -17,7 +17,7 @@ # attelo from attelo.metrics.deptree import compute_uas_las as att_compute_uas_las # local imports -from evals.showdown import EDUS_FILE, setup_dtree_postprocessor +from evals.showdown import EDUS_FILE_PAT, setup_dtree_postprocessor # RST corpus @@ -29,8 +29,7 @@ 'educe', 'rst_dt', 'rst_112to18.txt') REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree -# pattern for the .edu_input files of the docs from the test set -EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" + # output of Li et al.'s parser SAVE_DIR = "/home/mmorey/melodi/rst/li_sujian/TextLevelDiscourseParser/mybackup/mstparser-code-116-trunk/mstparser/save" @@ -65,6 +64,9 @@ "441.0detailedOut.txt", ] +# default file(s) to include ; I picked a coarse-grained one with good scores +DEFAULT_FILES = ["712.0detailedOut.txt"] + def load_output_file(out_file): """Load an output file from Li et al.'s dep parser. @@ -112,14 +114,14 @@ def load_output_file(out_file): # choice of predictions: granularity of relations RST_RELS = 'coarse' if RST_RELS == 'coarse': - PRED_FILES = COARSE_FILES + PRED_FILES = DEFAULT_FILES # COARSE_FILES else: PRED_FILES = FINE_FILES # eval procedure: the one in the parser of Li et al. vs standard one - EVAL_LI = True + EVAL_LI = False # setup conversion from c- to d-tree and back, and eval type - nary_enc = 'tree' + nary_enc = 'chain' if EVAL_LI: # reconstruction of the c-tree @@ -201,16 +203,6 @@ def load_output_file(out_file): assert sorted(labelset_true.keys()) == sorted( expected_labelset + ['span']) - # compute UAS and LAS on the _true values from the corpus and - # _pred Educe RstDepTrees re-built from their output files - dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] - dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - att_score_uas, att_score_las = att_compute_uas_las( - dtree_true_list, dtree_pred_list, include_ls=False, - include_las_n_o_no=False) - print("{}\tUAS={:.4f}\tLAS={:.4f} (attelo)".format( - fname, att_score_uas, att_score_las)) - # build predicted c-trees using our heuristics for nuc and rank ctree_pred = dict() for doc_name, dt_pred in dtree_pred.items(): @@ -242,6 +234,17 @@ def load_output_file(out_file): print(rst_e) raise ctree_true[doc_name] = ct_true + + # compute UAS and LAS on the _true values from the corpus and + # _pred Educe RstDepTrees re-built from their output files + dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no = att_compute_uas_las( + dtree_true_list, dtree_pred_list, include_ls=False, + include_las_n_o_no=True) + print(("{}\tUAS={:.4f}\tLAS={:.4f}\tLAS+N={:.4f}\tLAS+O={:.4f}\t" + "LAS+N+O={:.4f}").format( + fname, sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no)) # compute RST-Parseval of these c-trees ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] diff --git a/evals/showdown.py b/evals/showdown.py index 9384f44..11cc0b9 100644 --- a/evals/showdown.py +++ 
b/evals/showdown.py @@ -27,7 +27,8 @@ from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees from evals.hayashi_cons import (load_hayashi_hilda_ctrees, load_hayashi_hilda_dtrees) -from evals.hayashi_deps import load_hayashi_dtrees +from evals.hayashi_deps import (load_hayashi_dep_dtrees, + load_hayashi_dep_ctrees) from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.li_qi import load_li_qi_ctrees, load_li_qi_dtrees from evals.ours import (load_deptrees_from_attelo_output, @@ -43,7 +44,9 @@ RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', 'educe', 'rst_dt', 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree +REL_CONV_BASE = RstRelationConverter(RELMAP_FILE) +REL_CONV = REL_CONV_BASE.convert_tree +REL_CONV_DTREE = REL_CONV_BASE.convert_dtree # @@ -51,9 +54,14 @@ # # * syntax: pred vs gold +# old-style .edu_input: whole test set EDUS_FILE = os.path.join('/home/mmorey/melodi', 'irit-rst-dt/TMP/syn_gold_coarse', 'TEST.relations.sparse.edu_input') + +# new style .edu_input: one file per doc in test set +EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" + # outputs of parsers EISNER_OUT_SYN_PRED = os.path.join( '/home/mmorey/melodi', @@ -101,10 +109,10 @@ # Hayashi's HILDA HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL' HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') +HAYASHI_MST_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/dep/li') # level of detail for parseval DETAILED = False -EVAL_LI_DEP = True STRINGENT = False # additional dependency metrics INCLUDE_LS = False @@ -172,7 +180,7 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', - 'li_qi', 'hayashi_hilda', + 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -182,7 +190,7 @@ def main(): parser.add_argument('--author_true', default='gold', choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', - 'li_qi', 'hayashi_hilda', + 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -194,8 +202,13 @@ def main(): help="Binarize the reference ctree for the eval") parser.add_argument('--simple_rsttree', action='store_true', help="Binarize ctree and move relations up") + # * non-standard evals parser.add_argument('--per_doc', action='store_true', help="Doc-averaged scores (cf. Ji's eval)") + parser.add_argument('--eval_li_dep', action='store_true', + help=("Evaluate as in the dep parser of Li et al. 
" + "2014: all relations are NS, spiders map to " + "left-heavy branching, three trivial spans ")) # args = parser.parse_args() author_true = args.author_true @@ -209,6 +222,11 @@ def main(): # then average over docs # it should be False, except for comparison with the DPLP paper per_doc = args.per_doc + # "eval_li_dep = True" replaces the original nuclearity and order with + # heuristically determined values for _pred but also _true, and adds + # three trivial spans + eval_li_dep = args.eval_li_dep + # if binarize_true and nary_enc_true != 'chain': raise ValueError("--binarize_true is compatible with " @@ -259,6 +277,18 @@ def main(): HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) ) + if author_pred == 'hayashi_mst': + c_preds.append( + ('hayashi_mst', load_hayashi_dep_ctrees( + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('hayashi_mst', load_hayashi_dep_dtrees( + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf)) + ) + if author_pred == 'li_qi': c_preds.append( ('li_qi', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) @@ -303,11 +333,12 @@ def main(): # DPLP outputs RST ctrees in the form of lists of spans; # load_ji_dtrees maps them to RST dtrees c_preds.append( - ('DPLP', load_ji_ctrees(JI_OUT_DIR, REL_CONV)) + ('DPLP', load_ji_ctrees( + JI_OUT_DIR, REL_CONV)) ) d_preds.append( - ('DPLP', load_ji_dtrees(JI_OUT_DIR, REL_CONV, - nary_enc='chain')) + ('DPLP', load_ji_dtrees( + JI_OUT_DIR, REL_CONV, nary_enc='chain')) ) # ji-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because codra outputs binary ctrees, @@ -316,25 +347,23 @@ def main(): if author_pred == 'ours_chain': # Eisner, predicted syntax, chain c_preds.append( - ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-chain', load_attelo_ctrees( + EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) d_preds.append( - ('ours-chain', load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-chain', load_attelo_dtrees( + EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) if author_pred == 'ours_tree': # Eisner, predicted syntax, tree + same-unit c_preds.append( - ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-tree', load_attelo_ctrees( + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) d_preds.append( - ('ours-tree', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-tree', load_attelo_dtrees( + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) if author_pred == 'ours_tree_su': # Eisner, predicted syntax, tree + same-unit @@ -467,7 +496,7 @@ def main(): ctree_type=ctree_type, digits=4, per_doc=per_doc, - add_trivial_spans=EVAL_LI_DEP, + add_trivial_spans=eval_li_dep, stringent=STRINGENT), file=f) # end WIP @@ -477,7 +506,7 @@ def main(): print(rst_parseval_report(ctree_true_list, ctree_pred_list, ctree_type=ctree_type, digits=4, per_doc=per_doc, - add_trivial_spans=EVAL_LI_DEP, + add_trivial_spans=eval_li_dep, stringent=STRINGENT)) # detailed report on S+N+R if DETAILED: From d2bd5c6f61b189e844a85f2ed408849b45d57012 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sat, 10 Dec 2016 17:45:52 +0100 Subject: [PATCH 41/74] ENH read output of braud's parsers --- evals/braud_coling.py | 149 ++++++++++++++++++++++++++++++++++++++++++ evals/braud_eacl.py | 120 ++++++++++++++++++++++++++++++++++ evals/showdown.py | 47 ++++++++++++- 3 files changed, 315 
insertions(+), 1 deletion(-) create mode 100644 evals/braud_coling.py create mode 100644 evals/braud_eacl.py diff --git a/evals/braud_coling.py b/evals/braud_coling.py new file mode 100644 index 0000000..36fad6a --- /dev/null +++ b/evals/braud_coling.py @@ -0,0 +1,149 @@ +"""Read the output of Braud et al.'s COLING parser. + +""" + +from __future__ import absolute_import, print_function + +import codecs +from glob import glob +import os + +from nltk import Tree + +from educe.annotation import Span +from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree +from educe.rst_dt.deptree import RstDepTree + + +# map *.mrg.pred files to the original doc names +MRG_TO_RST = { + '12.mrg.pred': 'wsj_0644.out', # 4 + '4.mrg.pred': 'wsj_1129.out', # 5 + '26.mrg.pred': 'wsj_1197.out', # 6 + '24.mrg.pred': 'wsj_1113.out', # 8 + '14.mrg.pred': 'wsj_0684.out', # 10 + '32.mrg.pred': 'wsj_1354.out', # 11 + '18.mrg.pred': 'wsj_1183.out', # 12 + '29.mrg.pred': 'wsj_1346.out', # 15 + '28.mrg.pred': 'wsj_1169.out', # 17 + '37.mrg.pred': 'wsj_0667.out', # 17 + '19.mrg.pred': 'wsj_0607.out', # 19 + '7.mrg.pred': 'wsj_0654.out', # 19 + '16.mrg.pred': 'wsj_1325.out', # 21 + '25.mrg.pred': 'wsj_2375.out', # 22 + '31.mrg.pred': 'wsj_1380.out', # 23 + '1.mrg.pred': 'wsj_0623.out', # 25 + '15.mrg.pred': 'wsj_2373.out', # 31 + '30.mrg.pred': 'wsj_2336.out', # 31 + '3.mrg.pred': 'wsj_1365.out', # 39 + '34.mrg.pred': 'wsj_1148.out', # 43 + '11.mrg.pred': 'wsj_1306.out', # 47 + '10.mrg.pred': 'wsj_2354.out', # 52 + '35.mrg.pred': 'wsj_1126.out', # 55 + '0.mrg.pred': 'wsj_2385.out', # 60 + '2.mrg.pred': 'wsj_0632.out', # 62 + '20.mrg.pred': 'wsj_0602.out', # 69 + '27.mrg.pred': 'wsj_0627.out', # 69 + '13.mrg.pred': 'wsj_1189.out', # 91 + '6.mrg.pred': 'wsj_0616.out', # 92 + '36.mrg.pred': 'wsj_1307.out', # 98 + '33.mrg.pred': 'wsj_1142.out', # 106 + '9.mrg.pred': 'wsj_0655.out', # 110 + '21.mrg.pred': 'wsj_2386.out', # 127 + '23.mrg.pred': 'wsj_0689.out', # 132 + '8.mrg.pred': 'wsj_1387.out', # 134 + '17.mrg.pred': 'wsj_1331.out', # 158 + '22.mrg.pred': 'wsj_1376.out', # 202 + '5.mrg.pred': 'wsj_1146.out', # 304 +} + + +def tree_to_simple_rsttree(tree): + """Build a SimpleRSTTree from a NLTK Tree""" + origin = None # or is it? 
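+    # leaves of the NLTK tree carry the EDU number; internal nodes carry
+    # a label of the form '<nuc>-<rel>', split on the first hyphen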
+ if not tree: + # no kid: EDU (+pre-terminal) + num = int(tree.label()) + span = Span(num, num) # FIXME + text = '' # FIXME + edu = EDU(num, span, text, context=None, origin=origin) + # pre-terminal + edu_span = (num, num) + nuc = "leaf" + rel = "leaf" + node = Node(nuc, edu_span, span, rel, context=None) + return SimpleRSTTree(node, [edu], origin=origin) + + # internal node + new_kids = [tree_to_simple_rsttree(kid) for kid in tree] + # node + nuc, rel = tree.label().split('-', 1) + edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().edu_span[0]) + edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().edu_span[1]) + edu_span = (edu_beg, edu_end) + char_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().span.char_start) + char_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().span.char_end) + span = Span(char_beg, char_end) + new_node = Node(nuc, edu_span, span, rel, context=None) + new_tree = SimpleRSTTree(new_node, new_kids, origin=origin) + return new_tree + + +def _load_braud_coling_file(f): + """Do load file""" + tree = Tree.fromstring(f.read().strip()) + simple_ctree = tree_to_simple_rsttree(tree) + return simple_ctree + + +def load_braud_coling_file(fpath): + """Load a file.""" + with codecs.open(fpath, 'rb', 'utf-8') as f: + return _load_braud_coling_file(f) + + +def load_braud_coling_ctrees(out_dir, rel_conv): + """Load the ctrees output by Braud et al.'s parser + + Parameters + ---------- + out_dir : str + Path to the output directory. + + rel_conv : TODO + Relation converter + + Returns + ------- + ctree_pred : dict(str, RSTTree) + RST c-tree for each document. + """ + ctree_pred = dict() + for fpath in sorted(glob(os.path.join(out_dir, '*.mrg.pred'))): + fname = os.path.basename(fpath) + doc_name = MRG_TO_RST.get(fname, fname) + sct_pred = load_braud_coling_file(fpath) + # convert to regular RSTTree + ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) + # convert relation labels + ct_pred = rel_conv(ct_pred) + # TODO check ct_true: assert that mrg.gold == .out.dis + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain'): + """Do load dtrees""" + dtree_pred = dict() + ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv) + for doc_name, ct_pred in ctree_pred.items(): + dt_pred = RstDepTree.from_rst_tree(ct_pred) + # print(dt_pred.labels) # DEBUG + # raise ValueError('debug me') + dtree_pred[doc_name] = dt_pred + # TODO load ctrees, convert + return dtree_pred diff --git a/evals/braud_eacl.py b/evals/braud_eacl.py new file mode 100644 index 0000000..89aea24 --- /dev/null +++ b/evals/braud_eacl.py @@ -0,0 +1,120 @@ +"""Read the output of Braud et al.'s EACL parsers. + +""" + +from __future__ import absolute_import, print_function + +import codecs +from glob import glob +import os + +from nltk import Tree + +from educe.annotation import Span +from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree +from educe.rst_dt.deptree import RstDepTree + + +def tree_to_simple_rsttree(tree, edu_num=1): + """Build a SimpleRSTTree from a NLTK Tree. 
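+
+    EDUs are numbered consecutively, left to right, starting from
+    `edu_num`; internal node labels encode nuclearity and relation in a
+    single string (e.g. 'NNTextualorganization').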
+ + Parameters + ---------- + edu_num : int, defaults to 1 + Number of the next EDU + """ + origin = None + + if tree.label() == 'EDU': + # EDU (+pre-terminal) + num = edu_num + span = Span(num, num) + # 'EDU ' + text = tree[0] + edu = EDU(num, span, text, context=None, origin=origin) + # pre-terminal + edu_span = (num, num) + nuc = "leaf" + rel = "leaf" + node = Node(nuc, edu_span, span, rel, context=None) + return SimpleRSTTree(node, [edu], origin=origin) + + new_kids = [] + for kid in tree: + new_kid = tree_to_simple_rsttree(kid, edu_num=edu_num) + edu_num = new_kid.label().edu_span[1] + 1 + new_kids.append(new_kid) + + # ROOT + if tree.label() == 'ROOT': + assert len(new_kids) == 1 + return new_kids[0] + + # internal node + # label: 'NNTextualorganization' + nuc = tree.label()[:2] + rel = tree.label()[2:] + # same as in braud_coling + edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().edu_span[0]) + edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().edu_span[1]) + edu_span = (edu_beg, edu_end) + char_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().span.char_start) + char_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().span.char_end) + span = Span(char_beg, char_end) + new_node = Node(nuc, edu_span, span, rel, context=None) + new_tree = SimpleRSTTree(new_node, new_kids, origin=origin) + return new_tree + + +def _load_braud_eacl_file(f): + """Do load SimpleRSTTrees from f""" + sctrees = [] + for line in f: + tree = Tree.fromstring(line.strip()) + sctree = tree_to_simple_rsttree(tree) + sctrees.append(sctree) + return sctrees + +def load_braud_eacl_file(fpath): + """Load SimpleRSTTrees from a file""" + with codecs.open(fpath, 'rb', 'utf-8') as f: + return _load_braud_eacl_file(f) + +def load_braud_eacl_ctrees(fpath, rel_conv, doc_names): + """Load the ctrees output by Braud et al.'s parser + + Parameters + ---------- + fpath : str + Path to the output file. + + rel_conv : TODO + Relation converter. + + Returns + ------- + ctree_pred : dict(str, RSTTree) + RST c-tree for each document. 
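+
+    Notes
+    -----
+    The output file contains one predicted tree per line; `doc_names`
+    must list the document names in the same order, so that each tree
+    can be mapped back to its document.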
+ """ + ctree_pred = dict() + sctree_pred = load_braud_eacl_file(fpath) + for doc_name, sct_pred in zip(doc_names, sctree_pred): + ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain'): + """Do load dtrees""" + dtree_pred = dict() + ctree_pred = load_braud_eacl_ctrees(fpath, rel_conv, doc_names) + for doc_name, ct_pred in ctree_pred.items(): + dt_pred = RstDepTree.from_rst_tree(ct_pred) + dtree_pred[doc_name] = dt_pred + # TODO load ctrees, convert + return dtree_pred diff --git a/evals/showdown.py b/evals/showdown.py index 11cc0b9..ab0df40 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -22,6 +22,10 @@ compute_uas_las_undirected) # local to this package +from evals.braud_coling import (load_braud_coling_ctrees, + load_braud_coling_dtrees) +from evals.braud_eacl import (load_braud_eacl_ctrees, + load_braud_eacl_dtrees) from evals.codra import load_codra_ctrees, load_codra_dtrees from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees @@ -110,6 +114,10 @@ HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL' HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') HAYASHI_MST_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/dep/li') +# Braud +BRAUD_COLING_OUT_DIR = '/home/mmorey/melodi/rst/braud/coling16/pred_trees' +BRAUD_EACL_MONO = '/home/mmorey/melodi/rst/braud/eacl16/best-en-mono/test_it8_beam16' +BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/braud/eacl16/best-en-cross+dev/test_it10_beam32' # level of detail for parseval DETAILED = False @@ -181,6 +189,8 @@ def main(): choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', 'li_qi', 'hayashi_hilda', 'hayashi_mst', + 'braud_coling', 'braud_eacl_mono', + 'braud_eacl_cross_dev', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -191,6 +201,8 @@ def main(): choices=['gold', 'silver', 'joty', 'feng', 'feng2', 'ji', 'li_qi', 'hayashi_hilda', 'hayashi_mst', + 'braud_coling', 'braud_eacl_mono', + 'braud_eacl_cross_dev', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -261,12 +273,45 @@ def main(): # corresponding dtree dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) dtree_true[doc_name] = dt_true - + # sorted doc_names, because braud_eacl put all predictions in one file + sorted_doc_names = sorted(dtree_true.keys()) c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] for author_pred in authors_pred: + if author_pred == 'braud_coling': + c_preds.append( + ('braud_coling', load_braud_coling_ctrees( + BRAUD_COLING_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('braud_coling', load_braud_coling_dtrees( + BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain')) + ) + + if author_pred == 'braud_eacl_mono': + c_preds.append( + ('braud_eacl_mono', load_braud_eacl_ctrees( + BRAUD_EACL_MONO, REL_CONV, sorted_doc_names)) + ) + d_preds.append( + ('braud_eacl_mono', load_braud_eacl_dtrees( + BRAUD_EACL_MONO, REL_CONV, sorted_doc_names, + nary_enc='chain')) + ) + + if author_pred == 'braud_eacl_cross_dev': + c_preds.append( + ('braud_eacl_cross_dev', load_braud_eacl_ctrees( + BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names)) + ) + d_preds.append( + 
('braud_eacl_cross_dev', load_braud_eacl_dtrees( + BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, + nary_enc='chain')) + ) + if author_pred == 'hayashi_hilda': c_preds.append( ('hayashi_hilda', load_hayashi_hilda_ctrees( From 9d80e94bcd2b13a04e4f8f37e59ae050a02f5ae7 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sat, 10 Dec 2016 18:23:34 +0100 Subject: [PATCH 42/74] WIP map braud output to same labelset --- evals/braud_coling.py | 10 +++++++++- evals/braud_eacl.py | 16 +++++++++++++++- evals/showdown.py | 6 ++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/evals/braud_coling.py b/evals/braud_coling.py index 36fad6a..25f9887 100644 --- a/evals/braud_coling.py +++ b/evals/braud_coling.py @@ -6,6 +6,7 @@ import codecs from glob import glob +import itertools import os from nltk import Tree @@ -78,6 +79,10 @@ def tree_to_simple_rsttree(tree): new_kids = [tree_to_simple_rsttree(kid) for kid in tree] # node nuc, rel = tree.label().split('-', 1) + # map to our coarse rel names + if rel == 'Textual-organization': + rel = 'Textual' + # end map edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) else new_kids[0].label().edu_span[0]) edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) @@ -145,5 +150,8 @@ def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain'): # print(dt_pred.labels) # DEBUG # raise ValueError('debug me') dtree_pred[doc_name] = dt_pred - # TODO load ctrees, convert + # DEBUG + all_labels = set(itertools.chain.from_iterable(dt_pred.labels for dt_pred in dtree_pred.values())) + print(out_dir, sorted(all_labels)) + # end DEBUG return dtree_pred diff --git a/evals/braud_eacl.py b/evals/braud_eacl.py index 89aea24..eacab66 100644 --- a/evals/braud_eacl.py +++ b/evals/braud_eacl.py @@ -5,6 +5,7 @@ from __future__ import absolute_import, print_function import codecs +import itertools from glob import glob import os @@ -54,6 +55,16 @@ def tree_to_simple_rsttree(tree, edu_num=1): # label: 'NNTextualorganization' nuc = tree.label()[:2] rel = tree.label()[2:] + # map to our coarse rel names + rel_map = { + 'MannerMeans': 'manner-means', + 'Sameunit': 'same-unit', + 'TopicChange': 'topic-change', + 'TopicComment': 'topic-comment', + } + rel = rel_map.get(rel, rel) + # end map + # same as in braud_coling edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) else new_kids[0].label().edu_span[0]) @@ -116,5 +127,8 @@ def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain'): for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) dtree_pred[doc_name] = dt_pred - # TODO load ctrees, convert + # DEBUG + all_labels = set(itertools.chain.from_iterable(dt_pred.labels for dt_pred in dtree_pred.values())) + print(fpath, sorted(all_labels)) + # end DEBUG return dtree_pred diff --git a/evals/showdown.py b/evals/showdown.py index ab0df40..bfac168 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -462,6 +462,12 @@ def main(): report += '\n' # end table format and header line + # DEBUG + import itertools + all_labels = set(itertools.chain.from_iterable(dt_true.labels for dt_true in dtree_true.values())) + print("TRUE", sorted(all_labels)) + # end DEBUG + # * table content for parser_name, dtree_pred in d_preds: doc_names = sorted(dtree_true.keys()) From e6c1e9c74ec64de3bf0592d82a9ec133cee32065 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 12 Dec 2016 17:30:41 +0100 Subject: [PATCH 43/74] ENH add check to eval: labelset_pred is a subset of labelset_true --- evals/braud_coling.py | 6 ---- 
evals/braud_eacl.py | 6 ++-- evals/feng.py | 26 ++++++++------- evals/gcrf_tree_format.py | 10 ++++++ evals/hayashi_deps.py | 5 +++ evals/ji.py | 11 ++++++- evals/showdown.py | 66 +++++++++++++-------------------------- 7 files changed, 63 insertions(+), 67 deletions(-) diff --git a/evals/braud_coling.py b/evals/braud_coling.py index 25f9887..625cb19 100644 --- a/evals/braud_coling.py +++ b/evals/braud_coling.py @@ -147,11 +147,5 @@ def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain'): ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv) for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) - # print(dt_pred.labels) # DEBUG - # raise ValueError('debug me') dtree_pred[doc_name] = dt_pred - # DEBUG - all_labels = set(itertools.chain.from_iterable(dt_pred.labels for dt_pred in dtree_pred.values())) - print(out_dir, sorted(all_labels)) - # end DEBUG return dtree_pred diff --git a/evals/braud_eacl.py b/evals/braud_eacl.py index eacab66..082efa5 100644 --- a/evals/braud_eacl.py +++ b/evals/braud_eacl.py @@ -90,11 +90,13 @@ def _load_braud_eacl_file(f): sctrees.append(sctree) return sctrees + def load_braud_eacl_file(fpath): """Load SimpleRSTTrees from a file""" with codecs.open(fpath, 'rb', 'utf-8') as f: return _load_braud_eacl_file(f) + def load_braud_eacl_ctrees(fpath, rel_conv, doc_names): """Load the ctrees output by Braud et al.'s parser @@ -127,8 +129,4 @@ def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain'): for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) dtree_pred[doc_name] = dt_pred - # DEBUG - all_labels = set(itertools.chain.from_iterable(dt_pred.labels for dt_pred in dtree_pred.values())) - print(fpath, sorted(all_labels)) - # end DEBUG return dtree_pred diff --git a/evals/feng.py b/evals/feng.py index 802ddbc..fd65acf 100644 --- a/evals/feng.py +++ b/evals/feng.py @@ -8,6 +8,8 @@ import itertools +from nltk import Tree + from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.deptree import RstDepTree @@ -42,6 +44,15 @@ def load_feng_ctrees(out_dir, rel_conv): # initial letter is capitalized whereas ours are not if rel_conv is not None: ct_pred = rel_conv(ct_pred) + # "normalize" names of classes of RST relations: + # "textual-organization" => "textual" + for pos in ct_pred.treepositions(): + t = ct_pred[pos] + if isinstance(t, Tree): + node = t.label() + if node.rel == 'textual-organization': + node.rel = 'textual' + # end normalize ctree_pred[doc_name] = ct_pred return ctree_pred @@ -62,21 +73,12 @@ def load_feng_dtrees(out_dir, rel_conv, nary_enc='chain'): dtree_pred: dict(str, RstDepTree) RST dtree for each document. 
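+
+    Notes
+    -----
+    The dtrees are derived from the ctrees returned by
+    `load_feng_ctrees`, using `nary_enc` to encode n-ary nodes.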
""" - # load predicted trees - data_pred = load_feng_output_files(out_dir) - # filenames = data_pred['filenames'] - doc_names_pred = data_pred['doc_names'] - rst_ctrees_pred = data_pred['rst_ctrees'] + # load predicted c-trees + ctree_pred = load_feng_ctrees(out_dir, rel_conv) # build a dict from doc_name to ordered dtree (RstDepTree) dtree_pred = dict() - for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): - # constituency tree - # replace fine-grained labels with coarse-grained labels ; - # the files we have already contain the coarse labels, except their - # initial letter is capitalized whereas ours are not - if rel_conv is not None: - ct_pred = rel_conv(ct_pred) + for doc_name, ct_pred in ctree_pred.items(): # convert to an ordered dependency tree ; # * 'tree' produces a weakly-ordered dtree strictly equivalent # to the original ctree, diff --git a/evals/gcrf_tree_format.py b/evals/gcrf_tree_format.py index 4c7e379..1b4fd0b 100644 --- a/evals/gcrf_tree_format.py +++ b/evals/gcrf_tree_format.py @@ -181,7 +181,17 @@ def load_gcrf_ctrees(out_dir, rel_conv): ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) if rel_conv is not None: ct_pred = rel_conv(ct_pred) + # "normalize" names of classes of RST relations: + # "textual-organization" => "textual" + for pos in ct_pred.treepositions(): + t = ct_pred[pos] + if isinstance(t, Tree): + node = t.label() + if node.rel == 'textual-organization': + node.rel = 'textual' + # end normalize ctree_pred[doc_name] = ct_pred + return ctree_pred diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index 00a776b..442e688 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -116,6 +116,11 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, for doc_name, dt_pred in dtrees.items(): if rel_conv is not None: dt_pred = rel_conv(dt_pred) + # normalize names of classes of RST relations: + # "root" is "ROOT" in my coarse labelset (TODO: make it consistent) + dt_pred.labels = ['ROOT' if x == 'root' else x + for x in dt_pred.labels] + # end normalize # WIP add nuclearity and rank edus_data = load_edu_input_file(edus_file_pat.format(doc_name), edu_type='rst-dt') diff --git a/evals/ji.py b/evals/ji.py index 3198a3f..e5c6a9c 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -114,7 +114,9 @@ def load_ji_ctrees(ji_out_dir, rel_conv): # convert relation labels if rel_conv is not None: ct_pred = rel_conv(ct_pred) - # change "same_unit" (in Ji's output) into "same-unit" (in ours) + # normalize names of classes of RST relations: + # "same_unit" => "same-unit" + # "topic" => "topic-change" or "topic-comment"? 
for pos in ct_pred.treepositions(): t = ct_pred[pos] if isinstance(t, RSTTree): @@ -122,6 +124,13 @@ def load_ji_ctrees(ji_out_dir, rel_conv): # replace "same_unit" with "same-unit" if node.rel == 'same_unit': node.rel = 'same-unit' + elif node.rel == 'topic': + # either "topic-comment" or "topic-change" ; + # I expect the parser to find "topic-comment" to + # be easier but apparently it has no consequence + # on the current output I reproduced + node.rel = 'topic-comment' + # end normalize # store the resulting RSTTree ctree_pred[doc_name] = ct_pred diff --git a/evals/showdown.py b/evals/showdown.py index bfac168..29a0af7 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -7,6 +7,7 @@ import argparse import codecs +import itertools import os from educe.rst_dt.annotation import _binarize, SimpleRSTTree @@ -41,7 +42,7 @@ # RST corpus -CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.0/') +CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.01/') CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') CD_TEST = os.path.join(CORPUS_DIR, 'TEST') # relation converter (fine- to coarse-grained labels) @@ -462,38 +463,27 @@ def main(): report += '\n' # end table format and header line - # DEBUG - import itertools - all_labels = set(itertools.chain.from_iterable(dt_true.labels for dt_true in dtree_true.values())) - print("TRUE", sorted(all_labels)) - # end DEBUG - # * table content + # _true + doc_names = sorted(dtree_true.keys()) + dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + labelset_true = set(itertools.chain.from_iterable( + x.labels for x in dtree_true_list)) + labelset_true.add("span") # RST-DT v.1.0 has an error in wsj_1189 7-9 + # _pred for parser_name, dtree_pred in d_preds: - doc_names = sorted(dtree_true.keys()) - dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - # WIP print per doc eval - if not os.path.exists(parser_name): - os.makedirs(parser_name) - for doc_name, dt_true, dt_pred in zip( - doc_names, dtree_true_list, dtree_pred_list): - with codecs.open(parser_name + '/' + doc_name + '.d_eval', - mode='w', encoding='utf-8') as f: - print(', '.join('{:.4f}'.format(x) - for x in compute_uas_las( - [dt_true], [dt_pred], - include_ls=INCLUDE_LS, - include_las_n_o_no=EVAL_NUC_RANK)), - file=f) - if UNDIRECTED_DEPS: - # scores for undirected edges - print(', '.join('{:.4f}'.format(x) - for x in compute_uas_las_undirected( - [dt_true], [dt_pred])), - file=f) - # end WIP print - + # check that labelset_pred is a subset of labelset_true + labelset_pred = set(itertools.chain.from_iterable( + x.labels for x in dtree_pred_list)) + try: + assert labelset_pred.issubset(labelset_true) + except AssertionError: + print(parser_name) + print('T - P', labelset_true - labelset_pred) + print('P - T', labelset_pred - labelset_true) + raise + # end check all_scores = [] all_scores += list(compute_uas_las( dtree_true_list, dtree_pred_list, include_ls=INCLUDE_LS, @@ -516,6 +506,7 @@ def main(): doc_names = sorted(ctree_true.keys()) ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + if simple_rsttree: ctree_true_list = [SimpleRSTTree.from_rst_tree(x) for x in ctree_true_list] @@ -538,20 +529,7 @@ def main(): with codecs.open(parser_name + '/' + doc_name, mode='w', encoding='utf-8') as f: print(ct, file=f) - # WIP eval each tree in turn - for doc_name, ct_true, ct_pred in zip( - doc_names, ctree_true_list, 
ctree_pred_list): - with codecs.open(parser_name + '/' + doc_name + '.c_eval', - mode='w', encoding='utf-8') as f: - print(rst_parseval_report([ct_true], [ct_pred], - ctree_type=ctree_type, - digits=4, - per_doc=per_doc, - add_trivial_spans=eval_li_dep, - stringent=STRINGENT), - file=f) - # end WIP - # FIXME + # compute and print PARSEVAL scores print(parser_name) print(rst_parseval_report(ctree_true_list, ctree_pred_list, From fde2149ca17b05170992520ffff6d9b2360bf752 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 13 Dec 2016 15:28:33 +0100 Subject: [PATCH 44/74] ENH updated with actual output from Ji and Surdeanu --- evals/ji.py | 10 ++- evals/showdown.py | 20 ++++- evals/surdeanu.py | 211 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+), 4 deletions(-) create mode 100644 evals/surdeanu.py diff --git a/evals/ji.py b/evals/ji.py index e5c6a9c..6f01512 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -122,14 +122,20 @@ def load_ji_ctrees(ji_out_dir, rel_conv): if isinstance(t, RSTTree): node = t.label() # replace "same_unit" with "same-unit" - if node.rel == 'same_unit': + if node.rel == 'same_unit': # DPLP v. 1 node.rel = 'same-unit' - elif node.rel == 'topic': + elif node.rel == 'topic': # DPLP v. 1 # either "topic-comment" or "topic-change" ; # I expect the parser to find "topic-comment" to # be easier but apparently it has no consequence # on the current output I reproduced node.rel = 'topic-comment' + elif node.rel == 'sameunit': # Ji's output + node.rel = 'same-unit' + elif node.rel == 'topicchange': # Ji's output + node.rel = 'topic-change' + elif node.rel == 'topiccomment': # Ji's output + node.rel = 'topic-comment' # end normalize # store the resulting RSTTree ctree_pred[doc_name] = ct_pred diff --git a/evals/showdown.py b/evals/showdown.py index 29a0af7..eb8ad09 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -39,7 +39,7 @@ from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, load_attelo_dtrees) - +from evals.surdeanu import load_surdeanu_ctrees, load_surdeanu_dtrees # RST corpus CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.01/') @@ -104,7 +104,8 @@ # output of Joty's parser CODRA CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' # output of Ji's parser DPLP -JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') +# JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein', 'DPLP/data/docs/test/') +JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein', 'official_output/outputs/') # Feng's parsers FENG_DIR = '/home/mmorey/melodi/rst/feng_hirst/' FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') @@ -119,6 +120,9 @@ BRAUD_COLING_OUT_DIR = '/home/mmorey/melodi/rst/braud/coling16/pred_trees' BRAUD_EACL_MONO = '/home/mmorey/melodi/rst/braud/eacl16/best-en-mono/test_it8_beam16' BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/braud/eacl16/best-en-cross+dev/test_it10_beam32' +# Surdeanu +SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/surdeanu/output/log' + # level of detail for parseval DETAILED = False @@ -192,6 +196,7 @@ def main(): 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'braud_coling', 'braud_eacl_mono', 'braud_eacl_cross_dev', + 'surdeanu', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -204,6 +209,7 @@ def main(): 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'braud_coling', 'braud_eacl_mono', 'braud_eacl_cross_dev', + 'surdeanu', 'ours_chain', 
'ours_tree'], help="Author of the reference") # * dtree eval @@ -390,6 +396,16 @@ def main(): # the nary_enc does not matter because codra outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees + if author_pred == 'surdeanu': + c_preds.append( + ('surdeanu', load_surdeanu_ctrees( + SURDEANU_LOG_FILE, REL_CONV)) + ) + d_preds.append( + ('surdeanu', load_surdeanu_dtrees( + SURDEANU_LOG_FILE, REL_CONV, nary_enc='chain')) + ) + if author_pred == 'ours_chain': # Eisner, predicted syntax, chain c_preds.append( diff --git a/evals/surdeanu.py b/evals/surdeanu.py new file mode 100644 index 0000000..31111e8 --- /dev/null +++ b/evals/surdeanu.py @@ -0,0 +1,211 @@ +"""Load RST trees output by Surdeanu et al.'s parser. + +This format differs from the verbose output of the parser: PM added +brackets so they are easier to read. +""" + +from __future__ import absolute_import, print_function +import codecs +import re + +from nltk import Tree + +from educe.annotation import Span +from educe.corpus import FileId +from educe.rst_dt.annotation import EDU, Node, SimpleRSTTree +from educe.rst_dt.deptree import RstDepTree + + +# timestamped line +TS_LINE = r"\d\d:\d\d:\d\d.\d\d\d \[run-main-0\].*" +TS_RE = re.compile(TS_LINE) + + +def tree_to_simple_rsttree(tree, edu_num=1): + """Build a SimpleRSTTree from an NLTK Tree (formatted a la Surdeanu). + + Parameters + ---------- + tree : nltk.Tree + Tree + + edu_num : int, defaults to 1 + Number of the next EDU + + Returns + ------- + sct : SimpleRSTTree + The corresponding SimpleRSTTree. + """ + origin = None + + if tree.label() == 'TEXT': + # EDU (+pre-terminal) + num = edu_num + span = Span(num, num) + # 'TEXT ' + text = '__'.join(tree) + edu = EDU(num, span, text, context=None, origin=origin) + # pre-terminal + edu_span = (num, num) + nuc = "leaf" + rel = "leaf" + node = Node(nuc, edu_span, span, rel, context=None) + return SimpleRSTTree(node, [edu], origin=origin) + + new_kids = [] + for kid in tree: + new_kid = tree_to_simple_rsttree(kid, edu_num=edu_num) + edu_num = new_kid.label().edu_span[1] + 1 + new_kids.append(new_kid) + + # internal node + # (modified) label: 'elaboration:NS' or 'joint' (no explicit nuc: NN) + if tree.label()[-3] == ':': + rel = tree.label()[:-3] + nuc = tree.label()[-2:] + else: + rel = tree.label() + nuc = 'NN' + # map to our coarse rel names + # TODO? 
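As an aside, the nuclearity convention handled just above ('rel:NS' or 'rel:SN' for mononuclear relations, a bare relation name for multinuclear ones) can be checked in isolation; the label strings below are toy examples:

for label in ('elaboration:NS', 'attribution:SN', 'joint'):
    if label[-3] == ':':
        rel, nuc = label[:-3], label[-2:]
    else:
        rel, nuc = label, 'NN'
    print('{} -> rel={}, nuc={}'.format(label, rel, nuc))
# elaboration:NS -> rel=elaboration, nuc=NS
# attribution:SN -> rel=attribution, nuc=SN
# joint -> rel=joint, nuc=NN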
+ # end map + # same as in braud_coling and braud_eacl + edu_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().edu_span[0]) + edu_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().edu_span[1]) + edu_span = (edu_beg, edu_end) + char_beg = (new_kids[0].num if isinstance(new_kids[0], EDU) + else new_kids[0].label().span.char_start) + char_end = (new_kids[-1].num if isinstance(new_kids[-1], EDU) + else new_kids[-1].label().span.char_end) + span = Span(char_beg, char_end) + new_node = Node(nuc, edu_span, span, rel, context=None) + new_tree = SimpleRSTTree(new_node, new_kids, origin=origin) + return new_tree + + +def _load_surdeanu_ctrees(log_file, rel_conv): + """Do load""" + doc_names = [] + nltk_ctrees = [] + ctree_pred = dict() # result + + ctree_cur = [] # lines for the current c-tree + state_cur = 0 # current state (finite state machine for dummies) + for line in log_file: + # DIRTY replace non-breaking spaces output by CoreNLP, as in + # educe.rst_dt.learning.doc_vectorizer + if isinstance(line, unicode): + line2 = line.replace(u'\xa0', u' ') + line = line2.encode('utf-8') + # end replace + + if state_cur == 0: + line = line.strip() + # skip initial lines until "Documents" + if line == "Documents": + state_cur = 1 + elif state_cur == 1: + line = line.strip() + # read list of document names + if line == "end Documents": + state_cur = 2 + else: + assert line.endswith('.dis') + doc_name = line[:-4] + doc_names.append(doc_name) + elif state_cur == 2: + # skip intermediate lines + if line.strip() == "System tree:": + state_cur = 3 + elif state_cur == 3: + if line.strip() == "System tree:": + if ctree_cur: + # parse the previous predicted c-tree ("System tree") + nltk_ct_pred = Tree.fromstring(''.join(ctree_cur)) + nltk_ctrees.append(nltk_ct_pred) + # reset accumulator + ctree_cur = [] + elif TS_RE.match(line): + # stop reading trees + state_cur = 4 + if ctree_cur: + # parse last predicted tree + nltk_ct_pred = Tree.fromstring(''.join(ctree_cur)) + nltk_ctrees.append(nltk_ct_pred) + ctree_cur = [] # reset (bc who wants side effects?) + else: + # accumulate lines for the next predicted c-tree + # we immediately replace " (LeftToRight)" with ":NS", + # " (RightToLeft)" with ":SN", otherwise it should be ":NN" + line = line.replace(" (LeftToRight)", ":NS").replace(" (RightToLeft)", ":SN").replace("TEXT:", "TEXT ") + ctree_cur.append(line) + elif state_cur == 4: + # just read on + continue + + # we got two predicted ctrees for each doc, with gold then predicted EDUs + # filter to keep only ctrees with gold EDUs, i.e. at even indices + nltk_ctrees = nltk_ctrees[::2] + # for each doc, create an RSTTree from the NLTK tree + for doc_name, nltk_ct_pred in zip(doc_names, nltk_ctrees): + # the c-tree read corresponds to a SimpleRstTree + sct_pred = tree_to_simple_rsttree(nltk_ct_pred) + ct_pred = SimpleRSTTree.to_binary_rst_tree(sct_pred) + ct_pred = rel_conv(ct_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def load_surdeanu_ctrees(log_file, rel_conv): + """Load c-trees output by Surdeanu's parser. + + Parameters + ---------- + log_file : str + Path to the log file with the document names followed by the + reference and predicted c-trees. + + rel_conv : RstRelationConverter + Converter to map fine-grained relation labels to classes. + + Returns + ------- + ctree_pred : dict(str, RSTTree) + Predicted c-tree for each doc. 
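The replacements applied while accumulating lines can be illustrated on a single made-up tree string; the real log format of Surdeanu et al.'s parser is assumed here to match what _load_surdeanu_ctrees expects:

from nltk import Tree

raw = "(elaboration (LeftToRight) (TEXT: He came back.) (TEXT: He was tired.))"
line = (raw.replace(" (LeftToRight)", ":NS")
           .replace(" (RightToLeft)", ":SN")
           .replace("TEXT:", "TEXT "))
tree = Tree.fromstring(line)
print(tree.label())  # elaboration:NS
print(len(tree))     # 2 (one subtree per EDU)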
+ """ + with codecs.open(log_file, mode='rb', encoding='utf-8') as f: + return _load_surdeanu_ctrees(f, rel_conv) + + +def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain'): + """Get the dtrees for the ctrees output by Surdeanu's parser. + + Parameters + ---------- + log_file: str + Path to the log file with the output. + rel_conv: TODO + Relation converter, from fine- to coarse-grained labels. + nary_enc: one of {'chain', 'tree'} + Encoding for n-ary nodes. + + Returns + ------- + dtree_pred: dict(str, RstDepTree) + RST dtree for each document. + """ + dtree_pred = dict() + + ctree_pred = load_surdeanu_ctrees(log_file, rel_conv) + for doc_name, ct_pred in ctree_pred.items(): + dtree_pred[doc_name] = RstDepTree.from_rst_tree( + ct_pred, nary_enc=nary_enc) + # set reference to the document in the RstDepTree (required by + # dump_disdep_files) + for doc_name, dt_pred in dtree_pred.items(): + dt_pred.origin = FileId(doc_name, None, None, None) + + return dtree_pred From 7552821dc5b1d3904bba221cb66109edf47fe354 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 13 Dec 2016 20:22:28 +0100 Subject: [PATCH 45/74] DOC minor typo --- evals/hayashi_deps.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index 442e688..cb812f5 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -136,7 +136,8 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, rnk_clf): - """Load the dtrees output by one of Hayashi et al.'s dep parsers. + """Load the ctrees for the dtrees output by one of Hayashi et al.'s + dep parsers. Parameters ---------- From 2db94f73d5726e40e4e6b668eef5d2942ed9943c Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 13 Dec 2016 20:47:10 +0100 Subject: [PATCH 46/74] ENH incorporate eval for li_sujian --- evals/li_sujian.py | 320 ++++++++++++++++++++++++--------------------- evals/showdown.py | 20 ++- 2 files changed, 190 insertions(+), 150 deletions(-) diff --git a/evals/li_sujian.py b/evals/li_sujian.py index 6f80db4..ee235bb 100644 --- a/evals/li_sujian.py +++ b/evals/li_sujian.py @@ -3,32 +3,15 @@ """ from __future__ import absolute_import, print_function -from collections import Counter -from glob import glob import os # educe from educe.learning.edu_input_format import load_edu_input_file -from educe.rst_dt.corpus import (RstRelationConverter, - Reader as RstReader) from educe.rst_dt.dep2con import deptree_to_rst_tree from educe.rst_dt.deptree import NUC_S, RstDepTree, RstDtException from educe.rst_dt.metrics.rst_parseval import rst_parseval_report # attelo from attelo.metrics.deptree import compute_uas_las as att_compute_uas_las -# local imports -from evals.showdown import EDUS_FILE_PAT, setup_dtree_postprocessor - - -# RST corpus -CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.0/') -CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') -CD_TEST = os.path.join(CORPUS_DIR, 'TEST') -# relation converter (fine- to coarse-grained labels) -RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', - 'educe', 'rst_dt', - 'rst_112to18.txt') -REL_CONV = RstRelationConverter(RELMAP_FILE).convert_tree # output of Li et al.'s parser @@ -64,8 +47,8 @@ "441.0detailedOut.txt", ] -# default file(s) to include ; I picked a coarse-grained one with good scores -DEFAULT_FILES = ["712.0detailedOut.txt"] +# default file to include ; I picked a coarse-grained one with good scores +DEFAULT_FILE = os.path.join(SAVE_DIR, 
"712.0detailedOut.txt") def load_output_file(out_file): @@ -106,41 +89,137 @@ def load_output_file(out_file): return res -if __name__ == "__main__": - # load dep trees from corpus - reader_test = RstReader(CD_TEST) - corpus_test = reader_test.slurp() +def load_li_sujian_dep_dtrees(out_file, rel_conv_dtree, edus_file_pat, + nuc_clf, rnk_clf): + """Load the dtrees output by Li Sujian et al.'s dep parser. + + Parameters + ---------- + out_file : str + Path to the file containing all the predictions. + + rel_conv_dtree : RstRelationConverter + Converter to map relation labels to (normalized) coarse-grained + classes. + + edus_file_pat : str + Pattern for the .edu_input files. + + nuc_clf : NuclearityClassifier + Nuclearity classifier + + rnk_clf : RankClassifier + Rank classifier + + Returns + ------- + dtree_pred : dict(str, RstDepTree) + RST dtree for each doc. + """ + dtree_pred = dict() + + dep_bunch = load_output_file(out_file) + # load and process _pred + for doc_name, heads_pred, labels_pred in zip( + dep_bunch['doc_names'], dep_bunch['heads_pred'], + dep_bunch['labels_pred']): + # create dtree _pred + edus_data = load_edu_input_file(edus_file_pat.format(doc_name), + edu_type='rst-dt') + edus = edus_data['edus'] + edu2sent = edus_data['edu2sent'] + dt_pred = RstDepTree(edus) + # add predicted edges + for dep_idx, (gov_idx, lbl) in enumerate(zip( + heads_pred[1:], labels_pred[1:]), start=1): + if lbl == '': + lbl = 'Elaboration' + lbl = lbl.lower() + dt_pred.add_dependency(gov_idx, dep_idx, lbl) + # map to relation classes + dt_pred = rel_conv_dtree(dt_pred) + dt_pred.labels = ['ROOT' if x == 'root' else x + for x in dt_pred.labels] + # attach edu2sent, for later use by rnk_clf + dt_pred.sent_idx = [0] + edu2sent # 0 for fake root + dirty + dtree_pred[doc_name] = dt_pred + # end WIP + + for doc_name in sorted(dtree_pred.keys()): + dt_pred = dtree_pred[doc_name] + # enrich d-tree with nuc and order + dt_pred.ranks = rnk_clf.predict([dt_pred])[0] + dt_pred.nucs = nuc_clf.predict([dt_pred])[0] + dtree_pred[doc_name] = dt_pred + + return dtree_pred + + +def load_li_sujian_dep_ctrees(out_file, rel_conv_dtree, edus_file_pat, + nuc_clf, rnk_clf): + """Load the ctrees for the dtrees output by Li Sujian et al.'s parser. + + Parameters + ---------- + out_file : str + Path to the file containing all the predictions. + + rel_conv_dtree : RstRelationConverter + Converter to map relation labels to (normalized) coarse-grained + classes. + + edus_file_pat : str + Pattern for the .edu_input files. - # choice of predictions: granularity of relations - RST_RELS = 'coarse' - if RST_RELS == 'coarse': - PRED_FILES = DEFAULT_FILES # COARSE_FILES - else: - PRED_FILES = FINE_FILES - # eval procedure: the one in the parser of Li et al. vs standard one - EVAL_LI = False + nuc_clf : NuclearityClassifier + Nuclearity classifier + rnk_clf : RankClassifier + Rank classifier + + Returns + ------- + ctree_pred : dict(str, RSTTree) + RST ctree for each doc. + """ + ctree_pred = dict() + + dtree_pred = load_li_sujian_dep_dtrees( + out_file, rel_conv_dtree, edus_file_pat, nuc_clf, rnk_clf) + for doc_name, dt_pred in sorted(dtree_pred.items()): + ct_pred = deptree_to_rst_tree(dt_pred) + ctree_pred[doc_name] = ct_pred + return ctree_pred + + +def twisted_eval(out_file, rel_conv_dtree, setup_dtree_postprocessor, + ctree_true, dtree_true, edus_file_pat): + """Perform a twisted eval. + + Parameters + ---------- + setup_dtree_postprocessor : function + Function that sets up nuc_clf and rnk_clf. 
+ + ctree_true : dict(str, RSTTree) + Gold ctrees + + dtree_true : dict(str, DepRstTree) + Gold dtrees + + out_file : str + Path to the output file. + """ # setup conversion from c- to d-tree and back, and eval type nary_enc = 'chain' - - if EVAL_LI: - # reconstruction of the c-tree - order = 'strict' - nuc_strategy = 'constant' - nuc_constant = NUC_S - rnk_strategy = 'lllrrr' - rnk_prioritize_same_unit = False - # eval - TWIST_GOLD = True - ADD_TRIVIAL_SPANS = True - else: # comparable setup to what we use for our own parsers - order = 'weak' - nuc_strategy = "unamb_else_most_frequent" - nuc_constant = None - rnk_strategy = "sdist-edist-rl" - rnk_prioritize_same_unit = True - TWIST_GOLD = False - ADD_TRIVIAL_SPANS = False + # reconstruction of the c-tree + order = 'strict' + nuc_strategy = 'constant' + nuc_constant = NUC_S + rnk_strategy = 'lllrrr' + rnk_prioritize_same_unit = False + # eval + add_trivial_spans = True nuc_clf, rnk_clf = setup_dtree_postprocessor( nary_enc=nary_enc, order=order, nuc_strategy=nuc_strategy, @@ -149,108 +228,51 @@ def load_output_file(out_file): ctree_true = dict() dtree_true = dict() - labelset_true = Counter() - for doc_id, ct_true in sorted(corpus_test.items()): - doc_name = doc_id.doc - if RST_RELS == 'coarse': - # map fine to coarse rels - ct_true = REL_CONV(ct_true) - ctree_true[doc_name] = ct_true - dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) + for doc_name, dt_true in sorted(dtree_true.items()): # dirty hack: lowercase ROOT dt_true.labels = [x.lower() if x == 'ROOT' else x for x in dt_true.labels] - dtree_true[doc_name] = dt_true - labelset_true.update(dt_true.labels[1:]) - # load parser output - for fname in PRED_FILES: - dtree_pred = dict() - labelset_pred = Counter() - # - f_cur = os.path.join(SAVE_DIR, fname) - dep_bunch = load_output_file(f_cur) - doc_names = dep_bunch['doc_names'] - # load and process _pred - for doc_name, heads_pred, labels_pred in zip( - dep_bunch['doc_names'], dep_bunch['heads_pred'], - dep_bunch['labels_pred']): - # create dtree _pred - edus_data = load_edu_input_file(EDUS_FILE_PAT.format(doc_name), - edu_type='rst-dt') - edus = edus_data['edus'] - edu2sent = edus_data['edu2sent'] - dt_pred = RstDepTree(edus) - # add predicted edges - for dep_idx, (gov_idx, lbl) in enumerate(zip( - heads_pred[1:], labels_pred[1:]), start=1): - if lbl == '': - lbl = 'Elaboration' - # print(lbl) - lbl = lbl.lower() - labelset_pred[lbl] += 1 - dt_pred.add_dependency(gov_idx, dep_idx, lbl) - dt_pred.sent_idx = [0] + edu2sent # 0 for fake root + dirty - dtree_pred[doc_name] = dt_pred - # end WIP + dtree_pred = load_li_sujian_dep_dtrees( + out_file, rel_conv_dtree, edus_file_pat, nuc_clf, rnk_clf) + ctree_pred = load_li_sujian_dep_ctrees( + out_file, rel_conv_dtree, edus_file_pat, nuc_clf, rnk_clf) + + # use our heuristics to replace the true nuc and order in + # dt_true with a predicted one, replace ct_true with its + # twisted version + for doc_name, dt_true in dtree_true.items(): + dt_pred = dtree_pred[doc_name] + # twiste dt_true + dt_true.sent_idx = dt_pred.sent_idx + dt_true.ranks = rnk_clf.predict([dt_true])[0] + dt_true.nucs = nuc_clf.predict([dt_true])[0] + # re-gen ct_true + try: + ct_true = deptree_to_rst_tree(dt_true) + except RstDtException as rst_e: + print(rst_e) + raise + ctree_true[doc_name] = ct_true + + # compute UAS and LAS on the _true values from the corpus and + # _pred Educe RstDepTrees re-built from their output files + doc_names = sorted(dtree_true.keys()) + dtree_true_list = [dtree_true[doc_name] for 
doc_name in doc_names] + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no = att_compute_uas_las( + dtree_true_list, dtree_pred_list, include_ls=False, + include_las_n_o_no=True) + print(("{}\tUAS={:.4f}\tLAS={:.4f}\tLAS+N={:.4f}\tLAS+O={:.4f}\t" + "LAS+N+O={:.4f}").format( + out_file, sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no)) - if RST_RELS == 'coarse': - expected_labelset = ['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'root', 'same-unit', 'summary', 'temporal', 'textual', 'topic-change', 'topic-comment'] - assert sorted(labelset_pred.keys()) == expected_labelset - # wsj_1189 has a weird "span" label in a multinuclear rel at [7--9] - # see footnote in Hayashi et al's SIGDIAL 2016 paper - assert sorted(labelset_true.keys()) == sorted( - expected_labelset + ['span']) - - # build predicted c-trees using our heuristics for nuc and rank - ctree_pred = dict() - for doc_name, dt_pred in dtree_pred.items(): - # 1. enrich d-tree with nuc and order - # a. order: the procedure that generates spans produces a - # left-heavy branching: ((A B) C), which should be our - # "lllrrr" heuristic - dt_pred.ranks = rnk_clf.predict([dt_pred])[0] - # b. nuclearity: heuristic baseline - dt_pred.nucs = nuc_clf.predict([dt_pred])[0] - # 2. build _pred c-tree - try: - ct_pred = deptree_to_rst_tree(dt_pred) - ctree_pred[doc_name] = ct_pred - except RstDtException as rst_e: - print(rst_e) - raise - # 3. predict nuc and order in _true d-tree, replace the _true - # c-tree with a twisted one, like in their eval - if TWIST_GOLD: - dt_true = dtree_true[doc_name] - dt_true.sent_idx = [0] + edu2sent - dt_true.ranks = rnk_clf.predict([dt_true])[0] - dt_true.nucs = nuc_clf.predict([dt_true])[0] - ct_true = ctree_true[doc_name] - try: - ct_true = deptree_to_rst_tree(dt_true) - except RstDtException as rst_e: - print(rst_e) - raise - ctree_true[doc_name] = ct_true - - # compute UAS and LAS on the _true values from the corpus and - # _pred Educe RstDepTrees re-built from their output files - dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] - dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no = att_compute_uas_las( - dtree_true_list, dtree_pred_list, include_ls=False, - include_las_n_o_no=True) - print(("{}\tUAS={:.4f}\tLAS={:.4f}\tLAS+N={:.4f}\tLAS+O={:.4f}\t" - "LAS+N+O={:.4f}").format( - fname, sc_uas, sc_las, sc_las_n, sc_las_o, sc_las_no)) - - # compute RST-Parseval of these c-trees - ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] - ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] - print(rst_parseval_report(ctree_true_list, ctree_pred_list, - ctree_type='RST', digits=4, - per_doc=False, - add_trivial_spans=ADD_TRIVIAL_SPANS, - stringent=False)) + # compute RST-Parseval of these c-trees + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type='RST', digits=4, + per_doc=False, + add_trivial_spans=add_trivial_spans, + stringent=False)) diff --git a/evals/showdown.py b/evals/showdown.py index eb8ad09..fe8b80c 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -36,6 +36,9 @@ load_hayashi_dep_ctrees) from evals.ji import load_ji_ctrees, load_ji_dtrees from evals.li_qi 
import load_li_qi_ctrees, load_li_qi_dtrees +from evals.li_sujian import (DEFAULT_FILE as LI_SUJIAN_OUT_FILE, + load_li_sujian_dep_ctrees, + load_li_sujian_dep_dtrees) from evals.ours import (load_deptrees_from_attelo_output, load_attelo_ctrees, load_attelo_dtrees) @@ -122,7 +125,8 @@ BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/braud/eacl16/best-en-cross+dev/test_it10_beam32' # Surdeanu SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/surdeanu/output/log' - +# Li Sujian dep parser +# imported, see above # level of detail for parseval DETAILED = False @@ -197,6 +201,7 @@ def main(): 'braud_coling', 'braud_eacl_mono', 'braud_eacl_cross_dev', 'surdeanu', + 'li_sujian', 'ours_chain', 'ours_tree', 'ours_tree_su'], help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', @@ -210,6 +215,7 @@ def main(): 'braud_coling', 'braud_eacl_mono', 'braud_eacl_cross_dev', 'surdeanu', + 'li_sujian', 'ours_chain', 'ours_tree'], help="Author of the reference") # * dtree eval @@ -350,6 +356,18 @@ def main(): nary_enc='chain')) ) + if author_pred == 'li_sujian': + c_preds.append( + ('li_sujian', load_li_sujian_dep_ctrees( + LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf)) + ) + d_preds.append( + ('li_sujian', load_li_sujian_dep_dtrees( + LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf)) + ) + if author_pred == 'feng': c_preds.append( ('gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) From 2a12e87d1aeb48770366b7297c815e7a94a18992 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 3 Jan 2017 15:37:34 +0100 Subject: [PATCH 47/74] MAINT move loaders for RST parsers from educe to ./evals --- evals/codra.py | 79 +++++++++++++++++++++++++++++++++++++-------- evals/dis2disdep.py | 48 +++++++++++++++++++-------- evals/feng.py | 55 +++++++++++++++++++++++++++++-- 3 files changed, 154 insertions(+), 28 deletions(-) diff --git a/evals/codra.py b/evals/codra.py index eb9c6f6..a586389 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -1,21 +1,73 @@ -"""Use the same evaluation procedure Evaluate the output of CODRA +"""This module enables to load the output of Joty's discourse parser CODRA. """ from __future__ import absolute_import, print_function +import codecs from collections import defaultdict +import glob import itertools +import os -import numpy as np - -from educe.rst_dt.codra import load_codra_output_files -from educe.rst_dt.dep2con import deptree_to_rst_tree from educe.rst_dt.deptree import RstDepTree -from educe.rst_dt.document_plus import align_edus_with_paragraphs -# -from attelo.io import load_edus -from attelo.metrics.deptree import compute_uas_las +from educe.rst_dt.parse import parse_rst_dt_tree + + +def load_codra_output_files(container_path, level='doc'): + """Load ctrees output by CODRA on the TEST section of RST-WSJ. + + Parameters + ---------- + container_path: string + Path to the main folder containing CODRA's output + + level: {'doc', 'sent'}, optional (default='doc') + Level of decoding: document-level or sentence-level + + Returns + ------- + data: dict + Dictionary that should be akin to a sklearn Bunch, with + interesting keys 'filenames', 'doc_names' and 'rst_ctrees'. + + Notes + ----- + To ensure compatibility with the rest of the code base, doc_names + are automatically added the ".out" extension. This would not work + for fileX documents, but they are absent from the TEST section of + the RST-WSJ treebank. 
+ """ + if level == 'doc': + file_ext = '.doc_dis' + elif level == 'sent': + file_ext = '.sen_dis' + else: + raise ValueError("level {} not in ['doc', 'sent']".format(level)) + + # find all files with the right extension + pathname = os.path.join(container_path, '*{}'.format(file_ext)) + # filenames are sorted by name to avoid having to realign data + # loaded with different functions + filenames = sorted(glob.glob(pathname)) # glob.glob() returns a list + + # find corresponding doc names + doc_names = [os.path.splitext(os.path.basename(filename))[0] + '.out' + for filename in filenames] + + # load the RST trees + rst_ctrees = [] + for filename in filenames: + with codecs.open(filename, 'r', 'utf-8') as f: + # TODO (?) add support for and use RSTContext + rst_ctree = parse_rst_dt_tree(f.read(), None) + rst_ctrees.append(rst_ctree) + + data = dict(filenames=filenames, + doc_names=doc_names, + rst_ctrees=rst_ctrees) + + return data def load_codra_ctrees(codra_out_dir, rel_conv): @@ -127,8 +179,9 @@ def get_edu2sent(att_edus): edu2sent_idx[doc_name][edu_num] = sent_idx # sort EDUs by num # rebuild educe-style edu2sent ; prepend 0 for the fake root - doc_name2edu2sent = {doc_name: ([0] - + [s_idx for e_num, s_idx - in sorted(edu2sent.items())]) - for doc_name, edu2sent in edu2sent_idx.items()} + doc_name2edu2sent = { + doc_name: ([0] + + [s_idx for e_num, s_idx in sorted(edu2sent.items())]) + for doc_name, edu2sent in edu2sent_idx.items() + } return doc_name2edu2sent diff --git a/evals/dis2disdep.py b/evals/dis2disdep.py index fd552fa..5825cfc 100755 --- a/evals/dis2disdep.py +++ b/evals/dis2disdep.py @@ -12,16 +12,19 @@ from educe.corpus import FileId from educe.learning.disdep_format import dump_disdep_files -from educe.rst_dt.codra import load_codra_output_files from educe.rst_dt.corpus import Reader, RstRelationConverter from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.rst_wsj_corpus import (DOUBLE_FOLDER, TEST_FOLDER, TRAIN_FOLDER) +from evals.codra import load_codra_output_files from evals.gcrf_tree_format import load_gcrf_dtrees -from evals.hayashi_deps import load_hayashi_dtrees +from evals.hayashi_cons import load_hayashi_hilda_dtrees +from evals.hayashi_deps import load_hayashi_dep_dtrees from evals.ji import load_ji_dtrees +from evals.showdown import (setup_dtree_postprocessor, NUC_STRATEGY, + NUC_CONSTANT, RNK_STRATEGY, RNK_PRIORITY_SU) # original RST corpus @@ -30,6 +33,11 @@ RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) RST_DOUBLE = os.path.join(RST_CORPUS, DOUBLE_FOLDER) +# get edu2sent, set up rnk_clf and nuc_clf to predict rank and order for +# the output of Hayashi's MST parser +# * new style .edu_input: one file per doc in test set +EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" + # relation converter (fine- to coarse-grained labels) RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', 'educe', 'rst_dt', @@ -39,15 +47,18 @@ REL_CONV_DTREE = REL_CONV_BASE.convert_dtree # output of Joty's parser OUT_JOTY = os.path.join('/home/mmorey/melodi/rst/joty/Doc-level/') -# output of Feng & Hirst's parser -OUT_FENG = os.path.join('/home/mmorey/melodi/rst/feng_hirst/phil/tmp/') -# output of Feng & Hirst's parser -OUT_FENG2 = os.path.join('/home/mmorey/melodi/rst/feng_hirst/gCRF_dist/texts/results/test_batch_gold_seg') +# output of Feng & Hirst's parsers +FENG_BASEDIR = '/home/mmorey/melodi/rst/feng_hirst' +OUT_FENG = os.path.join(FENG_BASEDIR, 'phil/tmp/') +OUT_FENG2 = 
os.path.join(FENG_BASEDIR, + 'gCRF_dist/texts/results/test_batch_gold_seg') # output of Ji's parser -OUT_JI = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein/DPLP/data/docs/test/') +JI_BASEDIR = '/home/mmorey/melodi/rst/ji_eisenstein' +OUT_JI = os.path.join(JI_BASEDIR, 'DPLP/data/docs/test/') # output of Hayashi et al.'s parsers -OUT_HAYASHI_MST = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/dep/li/') -OUT_HAYASHI_HILDA = os.path.join('/home/mmorey/melodi/rst/hayashi/SIGDIAL/auto_parse/cons/trans_li/') +HAYASHI_BASEDIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL/' +OUT_HAYASHI_MST = os.path.join(HAYASHI_BASEDIR, 'auto_parse/dep/li/') +OUT_HAYASHI_HILDA = os.path.join(HAYASHI_BASEDIR, 'auto_parse/cons/trans_li/') def main(): @@ -70,7 +81,8 @@ def main(): help="Root directory for the output") args = parser.parse_args() # precise output path, by default: TMP_disdep/chain/gold/train - out_dir = os.path.join(args.out_root, args.nary_enc, args.author, args.split) + out_dir = os.path.join(args.out_root, args.nary_enc, args.author, + args.split) if not os.path.exists(out_dir): os.makedirs(out_dir) # read RST trees @@ -139,13 +151,23 @@ def main(): if corpus_split != 'test': raise ValueError("The output of Hayashi et al.'s parser is " "available for the 'test' split only") - dtrees = load_hayashi_dtrees(OUT_HAYASHI_MST, REL_CONV_DTREE) + # setup nuc_clf, rnk_clf + nuc_clf, rnk_clf = setup_dtree_postprocessor( + nary_enc='tree', order='weak', + nuc_strategy=NUC_STRATEGY, + nuc_constant=NUC_CONSTANT, + rnk_strategy=RNK_STRATEGY, + rnk_prioritize_same_unit=RNK_PRIORITY_SU) + # end setup + dtrees = load_hayashi_dep_dtrees( + OUT_HAYASHI_MST, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf) elif author == 'hayashi_hilda': if corpus_split != 'test': raise ValueError("The output of Hayashi et al.'s parser is " "available for the 'test' split only") - dtrees = load_hayashi_dtrees(OUT_HAYASHI_HILDA, REL_CONV_DTREE) - + dtrees = load_hayashi_hilda_dtrees(OUT_HAYASHI_HILDA, REL_CONV) + # do dump dump_disdep_files(dtrees.values(), out_dir) diff --git a/evals/feng.py b/evals/feng.py index fd65acf..a9c60f0 100644 --- a/evals/feng.py +++ b/evals/feng.py @@ -1,4 +1,4 @@ -"""Load the output of the parser from (Feng and Hirst, 2014). +"""Load the output of the RST parser from (Feng and Hirst, 2014). This is 99% a copy/paste from evals/joty.py . I need to come up with a better API and refactor accordingly. @@ -6,12 +6,63 @@ from __future__ import absolute_import, print_function +import codecs +import glob import itertools +import os from nltk import Tree -from educe.rst_dt.feng import load_feng_output_files from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.parse import parse_rst_dt_tree + + +def load_feng_output_files(root_dir): + """Load ctrees output by Feng & Hirst's parser on the TEST section of + RST-WSJ. + + Parameters + ---------- + root_dir: string + Path to the main folder containing the parser's output + + Returns + ------- + data: dict + Dictionary that should be akin to a sklearn Bunch, with + interesting keys 'filenames', 'doc_names' and 'rst_ctrees'. + + Notes + ----- + To ensure compatibility with the rest of the code base, doc_names + are automatically added the ".out" extension. This would not work + for fileX documents, but they are absent from the TEST section of + the RST-WSJ treebank. 
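Note the difference with the CODRA loader above: CODRA's files end in '.doc_dis' (a single extension, so os.path.splitext suffices), while Feng & Hirst's files end in '.txt.dis', hence the rsplit on the last two dots. A small check on hypothetical file names:

import os.path

codra_fn = '/path/to/wsj_1365.doc_dis'
feng_fn = '/path/to/wsj_1365.txt.dis'
print(os.path.splitext(os.path.basename(codra_fn))[0] + '.out')  # wsj_1365.out
print(os.path.basename(feng_fn).rsplit('.', 2)[0] + '.out')      # wsj_1365.out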
+ """ + # find all files with the right extension + file_ext = '.txt.dis' + pathname = os.path.join(root_dir, '*{}'.format(file_ext)) + # filenames are sorted by name to avoid having to realign data + # loaded with different functions + filenames = sorted(glob.glob(pathname)) # glob.glob() returns a list + + # find corresponding doc names + doc_names = [os.path.basename(filename).rsplit('.', 2)[0] + '.out' + for filename in filenames] + + # load the RST trees + rst_ctrees = [] + for filename in filenames: + with codecs.open(filename, 'r', 'utf-8') as f: + # TODO (?) add support for and use RSTContext + rst_ctree = parse_rst_dt_tree(f.read(), None) + rst_ctrees.append(rst_ctree) + + data = dict(filenames=filenames, + doc_names=doc_names, + rst_ctrees=rst_ctrees) + + return data def load_feng_ctrees(out_dir, rel_conv): From 5750e89dc6eea7f7dd3e6e09a069c58c335cf3a0 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 3 Feb 2017 20:42:22 +0100 Subject: [PATCH 48/74] WIP disable frag pairs --- irit_rst_dt/cmd/gather.py | 46 ++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/irit_rst_dt/cmd/gather.py b/irit_rst_dt/cmd/gather.py index b986097..df37074 100644 --- a/irit_rst_dt/cmd/gather.py +++ b/irit_rst_dt/cmd/gather.py @@ -188,28 +188,30 @@ def main(args): vocab_path=vocab_path, label_path=label_path) - # frag pairs: supplementary pairs from/to each fragmented EDU to - # the other fragmented EDUs and the EDUs that don't belong to any - # fragmented EDU - instances = 'frag-pairs' - same_unit_types = set(('true' if isinstance(x, AttachOracle) - else 'pred') - for clf in same_unit_clfs) - for same_unit_type in sorted(same_unit_types): - # we use the vocabulary and labelset from "edu-pairs" ; this is the - # simplest solution currently and it seems correct, but maybe we - # could extend "edu-pairs" with these pairs when we learn the - # vocabulary? - if not args.skip_training: - extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, - instances, frag_edus=same_unit_type, - vocab_path=vocab_path, - label_path=label_path) - if TEST_CORPUS is not None: - extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, - instances, frag_edus=same_unit_type, - vocab_path=vocab_path, - label_path=label_path) + # WIP 2017-02-03 disable frag-pairs + if False: + # frag pairs: supplementary pairs from/to each fragmented EDU to + # the other fragmented EDUs and the EDUs that don't belong to any + # fragmented EDU + instances = 'frag-pairs' + same_unit_types = set(('true' if isinstance(x, AttachOracle) + else 'pred') + for clf in same_unit_clfs) + for same_unit_type in sorted(same_unit_types): + # we use the vocabulary and labelset from "edu-pairs" ; + # this is the simplest solution currently and it seems + # correct, but maybe we could extend "edu-pairs" with these + # pairs when we learn the vocabulary? 
+ if not args.skip_training: + extract_features(TRAINING_CORPUS, tdir_data, fix_pseudo_rels, + instances, frag_edus=same_unit_type, + vocab_path=vocab_path, + label_path=label_path) + if TEST_CORPUS is not None: + extract_features(TEST_CORPUS, tdir_data, fix_pseudo_rels, + instances, frag_edus=same_unit_type, + vocab_path=vocab_path, + label_path=label_path) # end frag pairs with open(os.path.join(tdir_data, "versions-gather.txt"), "w") as stream: From 5784da8b2b9dcb0abf541bb6920973f3ec9be2ae Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 7 Feb 2017 10:38:42 +0100 Subject: [PATCH 49/74] FIX paths for CDU-related files are now optional --- irit_rst_dt/harness.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index 3242a31..a555411 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -108,22 +108,28 @@ def create_folds(self, mpack): # paths # ------------------------------------------------------ - def mpack_paths(self, test_data, stripped=False): + def mpack_paths(self, test_data, stripped=False, with_cdus=False): """ Parameters ---------- - test_data: boolean + test_data : boolean If true, the returned paths point to self.testset else to self.dataset. - stripped: boolean + + stripped : boolean, defaults to False TODO + with_cdus : boolean, defaults to False + If True, generate CDUs (eg. for fragmented EDUs), pairings + on them and the corresponding feature vectors. + Returns ------- - paths: dict of file paths - Path to: edu_input, pairings, features, vocab, labels, - cdu_input, cdu_pairings, cdu_features, corpus (to access - gold structures, WIP). + paths : dict of (glob patterns of) file paths + Path to: edu_input, pairings, features, vocab, labels. + Also contains 'corpus' (to access gold structures, WIP for + RST-DT) ; if `with_cdus` is True, also cdu_input, + cdu_pairings, cdu_features. 
""" base = 'relations.edu-pairs' ext = base + '.sparse' From de586487b55b80ae3de20a4d469bd0e914086ac2 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 7 Feb 2017 10:44:00 +0100 Subject: [PATCH 50/74] FIX paths for CDU-related files are now (really) optional --- irit_rst_dt/harness.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index a555411..03801e4 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -138,9 +138,10 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): vocab_path = fp.join(self.eval_dir, "%s.%s.vocab" % (dset, ext)) labels_path = fp.join(self.eval_dir, "%s.%s.labels" % (dset, base)) core_path = fp.join(self.eval_dir, dset, "*.%s" % ext) - # 2016-07-28 pairs on fragmented EDUs - frag_ext = 'relations.frag-pairs.sparse' - frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) + if with_cdus: + # 2016-07-28 pairs on fragmented EDUs + frag_ext = 'relations.frag-pairs.sparse' + frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if test_data else TRAINING_CORPUS) @@ -153,10 +154,13 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): 'vocab': vocab_path, 'labels': labels_path, # fragmented EDUs - 'cdu_input': frag_path + '.cdu_input', - 'cdu_pairings': frag_path + '.cdu_pairings', - 'cdu_features': ((frag_path + '.stripped') if stripped - else frag_path), + 'cdu_input': (frag_path + '.cdu_input' if with_cdus + else None) + 'cdu_pairings': (frag_path + '.cdu_pairings' if with_cdus + else None) + 'cdu_features': (((frag_path + '.stripped') if stripped + else frag_path) if with_cdus + else None), # corpus for gold RST trees 'corpus': corpus_path, } From ad2dd783157f648d1ec5bb25d3deeeb65c757bf5 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 7 Feb 2017 10:48:54 +0100 Subject: [PATCH 51/74] FIX paths for CDU-related files are now (really) optional --- irit_rst_dt/harness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index 03801e4..b843168 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -155,9 +155,9 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): 'labels': labels_path, # fragmented EDUs 'cdu_input': (frag_path + '.cdu_input' if with_cdus - else None) + else None), 'cdu_pairings': (frag_path + '.cdu_pairings' if with_cdus - else None) + else None), 'cdu_features': (((frag_path + '.stripped') if stripped else frag_path) if with_cdus else None), From 056a7b151167b2e58226de14ce3d6dcae88f4272 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 13 Feb 2017 10:03:28 +0100 Subject: [PATCH 52/74] FIX disable cdu paths --- irit_rst_dt/harness.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index b843168..40f89ec 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -138,10 +138,6 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): vocab_path = fp.join(self.eval_dir, "%s.%s.vocab" % (dset, ext)) labels_path = fp.join(self.eval_dir, "%s.%s.labels" % (dset, base)) core_path = fp.join(self.eval_dir, dset, "*.%s" % ext) - if with_cdus: - # 2016-07-28 pairs on fragmented EDUs - frag_ext = 'relations.frag-pairs.sparse' - frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if 
test_data else TRAINING_CORPUS) @@ -153,17 +149,23 @@ def mpack_paths(self, test_data, stripped=False, with_cdus=False): else core_path), 'vocab': vocab_path, 'labels': labels_path, - # fragmented EDUs - 'cdu_input': (frag_path + '.cdu_input' if with_cdus - else None), - 'cdu_pairings': (frag_path + '.cdu_pairings' if with_cdus - else None), - 'cdu_features': (((frag_path + '.stripped') if stripped - else frag_path) if with_cdus - else None), # corpus for gold RST trees 'corpus': corpus_path, } + if with_cdus: + # 2016-07-28 fragmented EDUs + frag_ext = 'relations.frag-pairs.sparse' + frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) + res.update([ + ('cdu_input', (frag_path + '.cdu_input' if with_cdus + else None)), + ('cdu_pairings', (frag_path + '.cdu_pairings' if with_cdus + else None)), + ('cdu_features', (((frag_path + '.stripped') if stripped + else frag_path) if with_cdus + else None)), + ]) + return res def model_paths(self, rconf, fold, parser): From 86a73e9ee636fcd142dfc7c38046ac8e224158bb Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 13 Feb 2017 10:52:22 +0100 Subject: [PATCH 53/74] FIX add graphviz to environment.yml --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index a1140ca..417ecaa 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,7 @@ name: irit-rst-dt dependencies: - python=2.7 + - graphviz=2.38.0 - nltk - scikit-learn - pip: From 4165175fc29bad6d307154d2badb05a6da0d7d9a Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 11 Apr 2017 10:29:44 +0200 Subject: [PATCH 54/74] FIX evals paths --- evals/li_sujian.py | 2 +- evals/showdown.py | 36 +++++++++++++++++++----------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/evals/li_sujian.py b/evals/li_sujian.py index ee235bb..d84ae9f 100644 --- a/evals/li_sujian.py +++ b/evals/li_sujian.py @@ -15,7 +15,7 @@ # output of Li et al.'s parser -SAVE_DIR = "/home/mmorey/melodi/rst/li_sujian/TextLevelDiscourseParser/mybackup/mstparser-code-116-trunk/mstparser/save" +SAVE_DIR = "/home/mmorey/melodi/rst/replication/li_sujian/TextLevelDiscourseParser/mybackup/mstparser-code-116-trunk/mstparser/save" COARSE_FILES = [ "136.0detailedOutVersion2.txt", "151.0detailedOut.txt", diff --git a/evals/showdown.py b/evals/showdown.py index fe8b80c..497ea29 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -63,29 +63,31 @@ # * syntax: pred vs gold # old-style .edu_input: whole test set -EDUS_FILE = os.path.join('/home/mmorey/melodi', +EDUS_FILE = os.path.join('/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/syn_gold_coarse', 'TEST.relations.sparse.edu_input') # new style .edu_input: one file per doc in test set -EDUS_FILE_PAT = "TMP/latest/data/TEST/{}.relations.edu-pairs.sparse.edu_input" +# was: TMP/latest/data..., replaced latest with 2016-09-30T1701 but +# might be wrong (or it might have no consequence here) +EDUS_FILE_PAT = "TMP/2016-09-30T1701/data/TEST/{}.relations.edu-pairs.sparse.edu_input" # outputs of parsers EISNER_OUT_SYN_PRED = os.path.join( - '/home/mmorey/melodi', + '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/syn_pred_coarse', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') # 2016-09-14 "tree" transform, predicted syntax EISNER_OUT_TREE_SYN_PRED = os.path.join( - '/home/mmorey/melodi', + '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( - '/home/mmorey/melodi', 
+ '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt_su-eisner') @@ -93,38 +95,38 @@ EISNER_OUT_SYN_PRED_SU = os.path.join( - '/home/mmorey/melodi', + '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/latest', # lbl 'scratch-current/combined', 'output.maxent-AD.L-jnt_su-eisner') EISNER_OUT_SYN_GOLD = os.path.join( - '/home/mmorey/melodi', + '/home/mmorey/melodi/rst', 'irit-rst-dt/TMP/syn_gold_coarse', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') # output of Joty's parser CODRA -CODRA_OUT_DIR = '/home/mmorey/melodi/rst/joty/Doc-level' +CODRA_OUT_DIR = '/home/mmorey/melodi/rst/replication/joty/Doc-level' # output of Ji's parser DPLP -# JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein', 'DPLP/data/docs/test/') -JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/ji_eisenstein', 'official_output/outputs/') +# JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/replication/ji_eisenstein', 'DPLP/data/docs/test/') +JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/replication/ji_eisenstein', 'official_output/outputs/') # Feng's parsers -FENG_DIR = '/home/mmorey/melodi/rst/feng_hirst/' +FENG_DIR = '/home/mmorey/melodi/rst/replication/feng_hirst/' FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') # Li Qi's parser -LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/li_qi/result' +LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/replication/li_qi/result' # Hayashi's HILDA -HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/hayashi/SIGDIAL' +HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/replication/hayashi/SIGDIAL' HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') HAYASHI_MST_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/dep/li') # Braud -BRAUD_COLING_OUT_DIR = '/home/mmorey/melodi/rst/braud/coling16/pred_trees' -BRAUD_EACL_MONO = '/home/mmorey/melodi/rst/braud/eacl16/best-en-mono/test_it8_beam16' -BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/braud/eacl16/best-en-cross+dev/test_it10_beam32' +BRAUD_COLING_OUT_DIR = '/home/mmorey/melodi/rst/replication/braud/coling16/pred_trees' +BRAUD_EACL_MONO = '/home/mmorey/melodi/rst/replication/braud/eacl16/best-en-mono/test_it8_beam16' +BRAUD_EACL_CROSS_DEV = '/home/mmorey/melodi/rst/replication/braud/eacl16/best-en-cross+dev/test_it10_beam32' # Surdeanu -SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/surdeanu/output/log' +SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/replication/surdeanu/output/log' # Li Sujian dep parser # imported, see above From b95e184eb3e2d299b9e117ed3da22c6b09aa9fbc Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 11 Apr 2017 10:55:37 +0200 Subject: [PATCH 55/74] ENH showdown: param digits --- evals/showdown.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 497ea29..6c93b33 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -236,6 +236,9 @@ def main(): help=("Evaluate as in the dep parser of Li et al. 
" "2014: all relations are NS, spiders map to " "left-heavy branching, three trivial spans ")) + # * display options + parser.add_argument('--digits', type=int, default=3, + help='Precision (number of digits) of scores') # args = parser.parse_args() author_true = args.author_true @@ -244,6 +247,8 @@ def main(): nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree + # display + digits = args.digits # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc # then average over docs @@ -479,7 +484,6 @@ def main(): # report # * table format - digits = 4 width = max(len(parser_name) for parser_name, _ in d_preds) headers = ["UAS", "LAS"] @@ -569,15 +573,15 @@ def main(): # compute and print PARSEVAL scores print(parser_name) print(rst_parseval_report(ctree_true_list, ctree_pred_list, - ctree_type=ctree_type, digits=4, + ctree_type=ctree_type, digits=digits, per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) - # detailed report on S+N+R + # detailed report on R if DETAILED: print(rst_parseval_detailed_report( ctree_true_list, ctree_pred_list, ctree_type=ctree_type, - metric_type='S+R')) + metric_type='R')) # end FIXME From 1ecb3d33d80abd261fe4bd4c22d3da85c4559a32 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 11 Apr 2017 17:26:47 +0200 Subject: [PATCH 56/74] ENH param detailed, compact report --- evals/showdown.py | 115 ++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 44 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 6c93b33..708b2a9 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -17,6 +17,7 @@ InsideOutAttachmentRanker) from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.metrics.rst_parseval import (rst_parseval_detailed_report, + rst_parseval_compact_report, rst_parseval_report) # from attelo.metrics.deptree import (compute_uas_las, @@ -131,7 +132,6 @@ # imported, see above # level of detail for parseval -DETAILED = False STRINGENT = False # additional dependency metrics INCLUDE_LS = False @@ -239,6 +239,8 @@ def main(): # * display options parser.add_argument('--digits', type=int, default=3, help='Precision (number of digits) of scores') + parser.add_argument('--detailed', type=int, default=0, + help='Level of detail for evaluations') # args = parser.parse_args() author_true = args.author_true @@ -249,6 +251,8 @@ def main(): simple_rsttree = args.simple_rsttree # display digits = args.digits + # level of detail for evals + detailed = args.detailed # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc # then average over docs @@ -470,14 +474,14 @@ def main(): load_deptrees_from_attelo_output(ctree_true, dtree_true, EISNER_OUT_SYN_PRED_SU, EDUS_FILE, nuc_clf, rnk_clf, - detailed=False) + detailed=(detailed >= 3)) print('======================') print('Eisner, gold syntax') load_deptrees_from_attelo_output(ctree_true, dtree_true, EISNER_OUT_SYN_GOLD, EDUS_FILE, nuc_clf, rnk_clf, - detailed=False) + detailed=(detailed >= 3)) print('======================') # dependency eval @@ -542,47 +546,70 @@ def main(): # end report # constituency eval - for parser_name, ctree_pred in c_preds: - doc_names = sorted(ctree_true.keys()) - ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] - ctree_pred_list = [ctree_pred[doc_name] for doc_name in doc_names] - - if simple_rsttree: - ctree_true_list = [SimpleRSTTree.from_rst_tree(x) - for x in ctree_true_list] - ctree_pred_list = [SimpleRSTTree.from_rst_tree(x) - for x in 
ctree_pred_list] - ctree_type = 'SimpleRST' - else: - ctree_type = 'RST' - - # WIP print SimpleRSTTrees - if not os.path.exists('gold'): - os.makedirs('gold') - for doc_name, ct in zip(doc_names, ctree_true_list): - with codecs.open('gold/' + ct.origin.doc, mode='w', - encoding='utf-8') as f: - print(ct, file=f) - if not os.path.exists(parser_name): - os.makedirs(parser_name) - for doc_name, ct in zip(doc_names, ctree_pred_list): - with codecs.open(parser_name + '/' + doc_name, mode='w', - encoding='utf-8') as f: - print(ct, file=f) - - # compute and print PARSEVAL scores - print(parser_name) - print(rst_parseval_report(ctree_true_list, ctree_pred_list, - ctree_type=ctree_type, digits=digits, - per_doc=per_doc, - add_trivial_spans=eval_li_dep, - stringent=STRINGENT)) - # detailed report on R - if DETAILED: - print(rst_parseval_detailed_report( - ctree_true_list, ctree_pred_list, ctree_type=ctree_type, - metric_type='R')) - # end FIXME + ctree_type = 'SimpleRST' if simple_rsttree else 'RST' + + doc_names = sorted(ctree_true.keys()) + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + if simple_rsttree: + ctree_true_list = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_true_list] + # WIP print SimpleRSTTrees + if not os.path.exists('gold'): + os.makedirs('gold') + for doc_name, ct in zip(doc_names, ctree_true_list): + with codecs.open('gold/' + ct.origin.doc, mode='w', + encoding='utf-8') as f: + print(ct, file=f) + + # sort the predictions of each parser, so they match the order of + # documents and reference trees in _true + ctree_preds = [(parser_name, + [ctree_pred[doc_name] for doc_name in doc_names]) + for parser_name, ctree_pred in c_preds] + if simple_rsttree: + ctree_preds = [(parser_name, + [SimpleRSTTree.from_rst_tree(x) + for x in ctree_pred_list]) + for parser_name, ctree_pred_list in ctree_preds] + # generate report + if detailed == 0: + # compact report, f1-scores only + print(rst_parseval_compact_report(ctree_true_list, ctree_preds, + ctree_type=ctree_type, + metric_types=['S', 'N', 'R', 'F'], + digits=digits, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) + else: + # standard reports: 1 table per parser, 1 line per metric, + # cols = [p, r, f1, support_true, support_pred] + for parser_name, ctree_pred_list in ctree_preds: + # WIP print SimpleRSTTrees + if not os.path.exists(parser_name): + os.makedirs(parser_name) + for doc_name, ct in zip(doc_names, ctree_pred_list): + with codecs.open(parser_name + '/' + doc_name, mode='w', + encoding='utf-8') as f: + print(ct, file=f) + + # compute and print PARSEVAL scores + print(parser_name) + # metric_types=None includes the variants with head: + # S+H, N+H, R+H, F+H + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type=ctree_type, + metric_types=None, + digits=digits, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) + # detailed report on R + if detailed >= 2: + print(rst_parseval_detailed_report( + ctree_true_list, ctree_pred_list, ctree_type=ctree_type, + metric_type='R')) + # end FIXME if __name__ == '__main__': From 5b0291f0d86abf4d21f4ed62695d27b080c4ab9f Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Apr 2017 11:53:42 +0200 Subject: [PATCH 57/74] MAINT+ENH pylint showdown, parseval double --- evals/showdown.py | 90 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 10 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 708b2a9..b3a4d3e 100644 --- a/evals/showdown.py +++ 
b/evals/showdown.py @@ -49,6 +49,7 @@ CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.01/') CD_TRAIN = os.path.join(CORPUS_DIR, 'TRAINING') CD_TEST = os.path.join(CORPUS_DIR, 'TEST') +DOUBLE_DIR = os.path.join('corpus', 'RSTtrees-WSJ-double-1.0') # relation converter (fine- to coarse-grained labels) RELMAP_FILE = os.path.join('/home/mmorey/melodi/educe', 'educe', 'rst_dt', @@ -299,11 +300,12 @@ def main(): dtree_true[doc_name] = dt_true # sorted doc_names, because braud_eacl put all predictions in one file sorted_doc_names = sorted(dtree_true.keys()) - + c_preds = [] # predictions: [(parser_name, dict(doc_name, ct_pred))] d_preds = [] # predictions: [(parser_name, dict(doc_name, dt_pred))] for author_pred in authors_pred: + # braud coling 2016 if author_pred == 'braud_coling': c_preds.append( ('braud_coling', load_braud_coling_ctrees( @@ -312,8 +314,8 @@ def main(): d_preds.append( ('braud_coling', load_braud_coling_dtrees( BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain')) - ) - + ) + # braud eacl 2017 - mono if author_pred == 'braud_eacl_mono': c_preds.append( ('braud_eacl_mono', load_braud_eacl_ctrees( @@ -323,8 +325,8 @@ def main(): ('braud_eacl_mono', load_braud_eacl_dtrees( BRAUD_EACL_MONO, REL_CONV, sorted_doc_names, nary_enc='chain')) - ) - + ) + # braud eacl 2017 - cross+dev if author_pred == 'braud_eacl_cross_dev': c_preds.append( ('braud_eacl_cross_dev', load_braud_eacl_ctrees( @@ -334,7 +336,7 @@ def main(): ('braud_eacl_cross_dev', load_braud_eacl_dtrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, nary_enc='chain')) - ) + ) if author_pred == 'hayashi_hilda': c_preds.append( @@ -473,15 +475,13 @@ def main(): print('Eisner, predicted syntax + same-unit') load_deptrees_from_attelo_output(ctree_true, dtree_true, EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=(detailed >= 3)) + nuc_clf, rnk_clf) print('======================') print('Eisner, gold syntax') load_deptrees_from_attelo_output(ctree_true, dtree_true, EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_clf, rnk_clf, - detailed=(detailed >= 3)) + nuc_clf, rnk_clf) print('======================') # dependency eval @@ -611,6 +611,76 @@ def main(): metric_type='R')) # end FIXME + # 2017-04-11 compute agreement between human annotators, on DOUBLE + if 'silver' in authors_pred: + # read the annotation we'll consider as "silver" + reader_dbl = RstReader(DOUBLE_DIR) + corpus_dbl_pred = {k.doc: v for k, v in reader_dbl.slurp().items()} + docs_dbl = sorted(k for k in corpus_dbl_pred.keys()) + # collect the "true" annotation for the docs in double, from train + # and test + # (test has already been read at the beginning of this script) + corpus_test_dbl = {k.doc: v for k, v in corpus_test.items() + if k.doc in docs_dbl} + # read the docs from train that are in double + reader_train = RstReader(CD_TRAIN) + corpus_train = reader_train.slurp() + corpus_train_dbl = {k.doc: v for k, v in corpus_train.items() + if k.doc in docs_dbl} + # assemble the "true" version of the double subset + corpus_dbl_true = dict(corpus_test_dbl.items() + + corpus_train_dbl.items()) + assert (sorted(corpus_dbl_true.keys()) == + sorted(corpus_dbl_pred.keys())) + # extra check? 
+ for doc_name in docs_dbl: + leaf_spans_true = [x.text_span() for x + in corpus_dbl_true[doc_name].leaves()] + leaf_spans_pred = [x.text_span() for x + in corpus_dbl_pred[doc_name].leaves()] + if (leaf_spans_true != leaf_spans_pred): + print(doc_name, 'EEEE') + print('true - pred', + set(leaf_spans_true) - set(leaf_spans_pred)) + print('pred - true', + set(leaf_spans_pred) - set(leaf_spans_true)) + else: + print(doc_name, 'ok') + # end extra check + + # 48 docs in train, + # 5 docs in test: ['wsj_0627.out', 'wsj_0684.out', 'wsj_1129.out', + # 'wsj_1365.out', 'wsj_1387.out'] + # create parallel lists of ctrees for _true and _pred, mapped to + # coarse rels and binarized + # _pred: + ctree_dbl_pred = [corpus_dbl_pred[doc_name] for doc_name in docs_dbl] + ctree_dbl_pred = [REL_CONV(x) for x in ctree_dbl_pred] + if binarize_true: # maybe not? + ctree_dbl_pred = [_binarize(x) for x in ctree_dbl_pred] + if simple_rsttree: + ctree_dbl_pred = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_dbl_pred] + # _true: + ctree_dbl_true = [corpus_dbl_true[doc_name] for doc_name in docs_dbl] + ctree_dbl_true = [REL_CONV(x) for x in ctree_dbl_true] + if binarize_true: + ctree_dbl_true = [_binarize(x) for x in ctree_dbl_true] + if simple_rsttree: + ctree_dbl_true = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_dbl_true] + # generate report + ctree_dbl_preds = [('silver', ctree_dbl_pred)] + print(rst_parseval_compact_report(ctree_dbl_true, ctree_dbl_preds, + ctree_type=ctree_type, + span_type='chars', + metric_types=['S', 'N', 'R', 'F'], + digits=digits, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) + # end 2017-04-11 agreement between human annotators + if __name__ == '__main__': main() From c97f68aa64f5228db0aa55a7b4acb403b5c5ddd7 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 12 Apr 2017 16:10:17 +0200 Subject: [PATCH 58/74] FIX disable print of differing spans between RST double and main --- evals/showdown.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index b3a4d3e..2757dd8 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -633,19 +633,20 @@ def main(): assert (sorted(corpus_dbl_true.keys()) == sorted(corpus_dbl_pred.keys())) # extra check? 
- for doc_name in docs_dbl: - leaf_spans_true = [x.text_span() for x - in corpus_dbl_true[doc_name].leaves()] - leaf_spans_pred = [x.text_span() for x - in corpus_dbl_pred[doc_name].leaves()] - if (leaf_spans_true != leaf_spans_pred): - print(doc_name, 'EEEE') - print('true - pred', - set(leaf_spans_true) - set(leaf_spans_pred)) - print('pred - true', - set(leaf_spans_pred) - set(leaf_spans_true)) - else: - print(doc_name, 'ok') + if False: + for doc_name in docs_dbl: + leaf_spans_true = [x.text_span() for x + in corpus_dbl_true[doc_name].leaves()] + leaf_spans_pred = [x.text_span() for x + in corpus_dbl_pred[doc_name].leaves()] + if (leaf_spans_true != leaf_spans_pred): + print(doc_name, 'EEEE') + print('true - pred', + set(leaf_spans_true) - set(leaf_spans_pred)) + print('pred - true', + set(leaf_spans_pred) - set(leaf_spans_true)) + else: + print(doc_name, 'ok') # end extra check # 48 docs in train, From adf07c1d70a14654c0a5277dc0f8ef48c1614ba9 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 17 May 2017 11:40:53 +0200 Subject: [PATCH 59/74] ENH evals/showdown: changes in options and table display --- evals/showdown.py | 80 ++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 2757dd8..e4ad93a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -212,7 +212,8 @@ def main(): help="Encoding of n-ary nodes for the predictions") # reference parser.add_argument('--author_true', default='gold', - choices=['gold', 'silver', + choices=['each', # NEW generate sim matrix + 'gold', 'silver', 'joty', 'feng', 'feng2', 'ji', 'li_qi', 'hayashi_hilda', 'hayashi_mst', 'braud_coling', 'braud_eacl_mono', @@ -221,13 +222,13 @@ def main(): 'li_sujian', 'ours_chain', 'ours_tree'], help="Author of the reference") - # * dtree eval - parser.add_argument('--nary_enc_true', default='tree', - choices=['tree', 'chain'], - help="Encoding of n-ary nodes for the reference") - # * ctree eval - parser.add_argument('--binarize_true', action='store_true', - help="Binarize the reference ctree for the eval") + # * ctree/dtree eval: the value of binarize_true determines the values + # of nary_enc_true and order_true (the latter is yet unused) + parser.add_argument('--binarize_true', default='none', + choices=['none', 'right', 'right_mixed', 'left'], + help=("Binarization method for the reference ctree" + "in the eval ; defaults to 'none' for no " + "binarization")) parser.add_argument('--simple_rsttree', action='store_true', help="Binarize ctree and move relations up") # * non-standard evals @@ -240,18 +241,23 @@ def main(): # * display options parser.add_argument('--digits', type=int, default=3, help='Precision (number of digits) of scores') + parser.add_argument('--percent', action='store_true', + help='Scores are displayed as percentages (ex: 57.9)') parser.add_argument('--detailed', type=int, default=0, help='Level of detail for evaluations') # args = parser.parse_args() author_true = args.author_true - nary_enc_true = args.nary_enc_true authors_pred = args.authors_pred nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree # display digits = args.digits + percent = args.percent + if percent: + if digits < 3: + raise ValueError('--percent requires --digits >= 3') # level of detail for evals detailed = args.detailed @@ -264,10 +270,15 @@ def main(): # three trivial spans eval_li_dep = args.eval_li_dep - # - if binarize_true and nary_enc_true != 'chain': - raise 
ValueError("--binarize_true is compatible with " - "--nary_enc_true chain only") + if binarize_true in ('right', 'right_mixed'): + nary_enc_true = 'chain' + order_true = 'strict' + elif binarize_true == 'left': + nary_enc_true = 'tree' + order_true = 'strict' + else: # 'none' for no binarization of the reference tree + nary_enc_true = 'tree' + order_true = 'weak' # 0. setup the postprocessors to flesh out unordered dtrees into ordered # ones with nuclearity @@ -291,9 +302,9 @@ def main(): doc_name = doc_id.doc # original reference ctree, with coarse labels ct_true = REL_CONV(ct_true) # map fine to coarse relations - if binarize_true: + if binarize_true != "none": # binarize ctree if required - ct_true = _binarize(ct_true) + ct_true = _binarize(ct_true, branching=binarize_true) ctree_true[doc_name] = ct_true # corresponding dtree dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc_true) @@ -485,16 +496,18 @@ def main(): print('======================') # dependency eval - + dep_metrics = ["U"] + if EVAL_NUC_RANK: + dep_metrics += ['O', 'N'] + dep_metrics += ["R"] + if INCLUDE_LS: + dep_metrics += ["tag_R"] + if EVAL_NUC_RANK: + dep_metrics += ["R+N", "R+O", "F"] # report # * table format width = max(len(parser_name) for parser_name, _ in d_preds) - - headers = ["UAS", "LAS"] - if INCLUDE_LS: - headers += ["LS"] - if EVAL_NUC_RANK: - headers += ["LAS+N", "LAS+O", "LAS+N+O"] + headers = dep_metrics if UNDIRECTED_DEPS: headers += ["UUAS", "ULAS"] fmt = '%% %ds' % width # first col: parser name @@ -505,6 +518,8 @@ def main(): headers = [""] + headers report = fmt % tuple(headers) report += '\n' + # display percentages + dep_digits = digits - 2 if percent else digits # end table format and header line # * table content @@ -530,8 +545,8 @@ def main(): # end check all_scores = [] all_scores += list(compute_uas_las( - dtree_true_list, dtree_pred_list, include_ls=INCLUDE_LS, - include_las_n_o_no=EVAL_NUC_RANK)) + dtree_true_list, dtree_pred_list, metrics=dep_metrics, + doc_names=doc_names)) if UNDIRECTED_DEPS: score_uuas, score_ulas = compute_uas_las_undirected( dtree_true_list, dtree_pred_list) @@ -539,7 +554,9 @@ def main(): # append to report values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] for v in all_scores: - values += ["{0:0.{1}f}".format(v, digits)] + if percent: + v = v * 100.0 + values += ["{0:0.{1}f}".format(v, dep_digits)] report += fmt % tuple(values) # end table content print(report) @@ -578,6 +595,7 @@ def main(): ctree_type=ctree_type, metric_types=['S', 'N', 'R', 'F'], digits=digits, + percent=percent, per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) @@ -601,6 +619,7 @@ def main(): ctree_type=ctree_type, metric_types=None, digits=digits, + percent=percent, per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) @@ -657,16 +676,18 @@ def main(): # _pred: ctree_dbl_pred = [corpus_dbl_pred[doc_name] for doc_name in docs_dbl] ctree_dbl_pred = [REL_CONV(x) for x in ctree_dbl_pred] - if binarize_true: # maybe not? - ctree_dbl_pred = [_binarize(x) for x in ctree_dbl_pred] + if binarize_true != 'none': # maybe not? 
+ ctree_dbl_pred = [_binarize(x, branching=binarize_true) + for x in ctree_dbl_pred] if simple_rsttree: ctree_dbl_pred = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_pred] # _true: ctree_dbl_true = [corpus_dbl_true[doc_name] for doc_name in docs_dbl] ctree_dbl_true = [REL_CONV(x) for x in ctree_dbl_true] - if binarize_true: - ctree_dbl_true = [_binarize(x) for x in ctree_dbl_true] + if binarize_true != 'none': + ctree_dbl_true = [_binarize(x, branching=binarize_true) + for x in ctree_dbl_true] if simple_rsttree: ctree_dbl_true = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_true] @@ -677,6 +698,7 @@ def main(): span_type='chars', metric_types=['S', 'N', 'R', 'F'], digits=digits, + percent=percent, per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) From 2faff312d24fc63b5f28db72096098606a9e6dd9 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 17 May 2017 15:23:35 +0200 Subject: [PATCH 60/74] ENH showdown: use any author as _true --- evals/hayashi_deps.py | 2 +- evals/ours.py | 2 + evals/showdown.py | 155 ++++++++++++++++++++++++------------------ 3 files changed, 92 insertions(+), 67 deletions(-) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index cb812f5..cbde909 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -38,7 +38,7 @@ def _load_hayashi_dep_file(f, edus): dt: RstDepTree Predicted dtree """ - dt = RstDepTree(edus=edus, origin=None, nary_enc='tree') # FIXME origin + dt = RstDepTree(edus=edus, origin=None, nary_enc='chain') # FIXME origin for line in f: line = line.strip() if not line: diff --git a/evals/ours.py b/evals/ours.py index f9d48bf..938a53c 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -10,6 +10,7 @@ from educe.annotation import Span as EduceSpan from educe.rst_dt.annotation import (EDU as EduceEDU, SimpleRSTTree) +from educe.rst_dt.corpus import mk_key from educe.rst_dt.dep2con import (deptree_to_simple_rst_tree, deptree_to_rst_tree) from educe.rst_dt.deptree import RstDepTree, RstDtException @@ -115,6 +116,7 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): raise ValueError('Weird root label: {}'.format(lbl)) else: dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) + dt_pred.origin = mk_key(doc_name) # add nuclearity: heuristic baseline dt_pred.nucs = nuc_clf.predict([dt_pred])[0] # add rank: heuristic baseline, needs edu2sent diff --git a/evals/showdown.py b/evals/showdown.py index e4ad93a..daef76a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -143,6 +143,18 @@ NUC_CONSTANT = None # only useful for NUC_STRATEGY='constant' RNK_STRATEGY = 'sdist-edist-rl' RNK_PRIORITY_SU = True +# known 'authors' +AUTHORS = [ + 'gold', # RST-main + 'silver', # RST-double + 'JCN15_1S1S', 'FH14_gSVM', 'FH14_gCRF', 'JE14', + 'LLC16', 'HHN16_hilda', 'HHN16_mst', + 'BPS16', 'BCS17_mono', + 'BCS17_cross_dev', + 'SHV15_D', + 'li_sujian', + 'ours-chain', 'ours-tree', 'ours-tree-su' +] def setup_dtree_postprocessor(nary_enc='chain', order='strict', @@ -198,29 +210,14 @@ def main(): description="Evaluate parsers' output against a given reference") # predictions parser.add_argument('authors_pred', nargs='+', - choices=['gold', 'silver', - 'joty', 'feng', 'feng2', 'ji', - 'li_qi', 'hayashi_hilda', 'hayashi_mst', - 'braud_coling', 'braud_eacl_mono', - 'braud_eacl_cross_dev', - 'surdeanu', - 'li_sujian', - 'ours_chain', 'ours_tree', 'ours_tree_su'], + choices=AUTHORS, help="Author(s) of the predictions") parser.add_argument('--nary_enc_pred', default='tree', choices=['tree', 'chain'], help="Encoding 
of n-ary nodes for the predictions") # reference parser.add_argument('--author_true', default='gold', - choices=['each', # NEW generate sim matrix - 'gold', 'silver', - 'joty', 'feng', 'feng2', 'ji', - 'li_qi', 'hayashi_hilda', 'hayashi_mst', - 'braud_coling', 'braud_eacl_mono', - 'braud_eacl_cross_dev', - 'surdeanu', - 'li_sujian', - 'ours_chain', 'ours_tree'], + choices=AUTHORS + ['each'], # NEW generate sim matrix help="Author of the reference") # * ctree/dtree eval: the value of binarize_true determines the values # of nary_enc_true and order_true (the latter is yet unused) @@ -291,11 +288,7 @@ def main(): reader_test = RstReader(CD_TEST) corpus_test = reader_test.slurp() - # reference - # current assumption: author_true is 'gold' - if author_true != 'gold': - raise NotImplementedError('Not yet') - + # reference: author_true can be any of the authors_pred (defaults to gold) ctree_true = dict() # ctrees dtree_true = dict() # dtrees from the original ctrees ('tree' transform) for doc_id, ct_true in sorted(corpus_test.items()): @@ -317,66 +310,66 @@ def main(): for author_pred in authors_pred: # braud coling 2016 - if author_pred == 'braud_coling': + if author_pred == 'BPS16': c_preds.append( - ('braud_coling', load_braud_coling_ctrees( + ('BPS16', load_braud_coling_ctrees( BRAUD_COLING_OUT_DIR, REL_CONV)) ) d_preds.append( - ('braud_coling', load_braud_coling_dtrees( + ('BPS16', load_braud_coling_dtrees( BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain')) ) # braud eacl 2017 - mono - if author_pred == 'braud_eacl_mono': + if author_pred == 'BCS17_mono': c_preds.append( - ('braud_eacl_mono', load_braud_eacl_ctrees( + ('BCS17_mono', load_braud_eacl_ctrees( BRAUD_EACL_MONO, REL_CONV, sorted_doc_names)) ) d_preds.append( - ('braud_eacl_mono', load_braud_eacl_dtrees( + ('BCS17_mono', load_braud_eacl_dtrees( BRAUD_EACL_MONO, REL_CONV, sorted_doc_names, nary_enc='chain')) ) # braud eacl 2017 - cross+dev - if author_pred == 'braud_eacl_cross_dev': + if author_pred == 'BCS17_cross_dev': c_preds.append( - ('braud_eacl_cross_dev', load_braud_eacl_ctrees( + ('BCS17_cross_dev', load_braud_eacl_ctrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names)) ) d_preds.append( - ('braud_eacl_cross_dev', load_braud_eacl_dtrees( + ('BCS17_cross_dev', load_braud_eacl_dtrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, nary_enc='chain')) ) - if author_pred == 'hayashi_hilda': + if author_pred == 'HHN16_hilda': c_preds.append( - ('hayashi_hilda', load_hayashi_hilda_ctrees( + ('HHN16_hilda', load_hayashi_hilda_ctrees( HAYASHI_HILDA_OUT_DIR, REL_CONV)) ) d_preds.append( - ('hayashi_hilda', load_hayashi_hilda_dtrees( + ('HHN16_hilda', load_hayashi_hilda_dtrees( HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) ) - if author_pred == 'hayashi_mst': + if author_pred == 'HHN16_mst': c_preds.append( - ('hayashi_mst', load_hayashi_dep_ctrees( + ('HHN16_mst', load_hayashi_dep_ctrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, nuc_clf, rnk_clf)) ) d_preds.append( - ('hayashi_mst', load_hayashi_dep_dtrees( + ('HHN16_mst', load_hayashi_dep_dtrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, nuc_clf, rnk_clf)) ) - if author_pred == 'li_qi': + if author_pred == 'LLC16': c_preds.append( - ('li_qi', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) + ('LLC16', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) ) d_preds.append( - ('li_qi', load_li_qi_dtrees(LI_QI_OUT_DIR, REL_CONV, + ('LLC16', load_li_qi_dtrees(LI_QI_OUT_DIR, REL_CONV, nary_enc='chain')) ) @@ -392,63 +385,63 @@ def main(): nuc_clf, rnk_clf)) ) - if 
author_pred == 'feng': + if author_pred == 'FH14_gSVM': c_preds.append( - ('gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) + ('FH14_gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) ) d_preds.append( - ('gSVM', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, - nary_enc='chain')) + ('FH14_gSVM', load_feng_dtrees(FENG1_OUT_DIR, REL_CONV, + nary_enc='chain')) ) - if author_pred == 'feng2': + if author_pred == 'FH14_gCRF': c_preds.append( - ('gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) + ('FH14_gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) ) d_preds.append( - ('gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, - nary_enc='chain')) + ('FH14_gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, + nary_enc='chain')) ) - if author_pred == 'joty': + if author_pred == 'JCN15_1S1S': # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees c_preds.append( - ('TSP 1-1', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) + ('JCN15_1S1S', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) ) d_preds.append( - ('TSP 1-1', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, - nary_enc='chain')) + ('JCN15_1S1S', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, + nary_enc='chain')) ) # joty-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because codra outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees - if author_pred == 'ji': + if author_pred == 'JE14': # DPLP outputs RST ctrees in the form of lists of spans; # load_ji_dtrees maps them to RST dtrees c_preds.append( - ('DPLP', load_ji_ctrees( + ('JE14', load_ji_ctrees( JI_OUT_DIR, REL_CONV)) ) d_preds.append( - ('DPLP', load_ji_dtrees( + ('JE14', load_ji_dtrees( JI_OUT_DIR, REL_CONV, nary_enc='chain')) ) # ji-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because codra outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees - if author_pred == 'surdeanu': + if author_pred == 'SHV15_D': c_preds.append( - ('surdeanu', load_surdeanu_ctrees( + ('SHV15_D', load_surdeanu_ctrees( SURDEANU_LOG_FILE, REL_CONV)) ) d_preds.append( - ('surdeanu', load_surdeanu_dtrees( + ('SHV15_D', load_surdeanu_dtrees( SURDEANU_LOG_FILE, REL_CONV, nary_enc='chain')) ) - if author_pred == 'ours_chain': + if author_pred == 'ours-chain': # Eisner, predicted syntax, chain c_preds.append( ('ours-chain', load_attelo_ctrees( @@ -459,7 +452,7 @@ def main(): EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) - if author_pred == 'ours_tree': + if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit c_preds.append( ('ours-tree', load_attelo_ctrees( @@ -469,7 +462,7 @@ def main(): ('ours-tree', load_attelo_dtrees( EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) ) - if author_pred == 'ours_tree_su': + if author_pred == 'ours-tree-su': # Eisner, predicted syntax, tree + same-unit c_preds.append( ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, @@ -481,6 +474,14 @@ def main(): EDUS_FILE, nuc_clf, rnk_clf)) ) + # 2017-05-17 enable "gold" as parser, should give perfect scores + if author_pred == 'gold': + c_preds.append( + ('gold', ctree_true) + ) + d_preds.append( + ('gold', dtree_true) + ) if False: # FIXME repair (or forget) these print('Eisner, predicted syntax + same-unit') @@ -525,10 +526,18 @@ def main(): # * table content # _true doc_names = sorted(dtree_true.keys()) - dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] labelset_true = set(itertools.chain.from_iterable( - x.labels for x in 
dtree_true_list)) + x.labels for x in dtree_true.values())) labelset_true.add("span") # RST-DT v.1.0 has an error in wsj_1189 7-9 + # 2017-05-17 any author can be used as reference + # FIXME + # dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] + dtree_true_list = [] + for parser_name, dtree_pred in d_preds: + if parser_name == author_true: + dtree_true_list = [dtree_pred[doc_name] for doc_name in doc_names] + break + # end FIXME # _pred for parser_name, dtree_pred in d_preds: dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] @@ -539,8 +548,9 @@ def main(): assert labelset_pred.issubset(labelset_true) except AssertionError: print(parser_name) - print('T - P', labelset_true - labelset_pred) - print('P - T', labelset_pred - labelset_true) + print('T & P', sorted(labelset_true.intersection(labelset_pred))) + print('T - P', sorted(labelset_true - labelset_pred)) + print('P - T', sorted(labelset_pred - labelset_true)) raise # end check all_scores = [] @@ -566,7 +576,15 @@ def main(): ctree_type = 'SimpleRST' if simple_rsttree else 'RST' doc_names = sorted(ctree_true.keys()) - ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + # ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + # FIXME + ctree_true_list = [] + for parser_name, ctree_pred in c_preds: + if parser_name == author_true: + ctree_true_list = [ctree_pred[doc_name] for doc_name in doc_names] + break + # end FIXME + if simple_rsttree: ctree_true_list = [SimpleRSTTree.from_rst_tree(x) for x in ctree_true_list] @@ -632,6 +650,11 @@ def main(): # 2017-04-11 compute agreement between human annotators, on DOUBLE if 'silver' in authors_pred: + # 'silver' can be meaningfully compared to 'gold' only (too few + # documents otherwise) + if author_true != 'gold': + raise NotImplementedError('Not yet') + # read the annotation we'll consider as "silver" reader_dbl = RstReader(DOUBLE_DIR) corpus_dbl_pred = {k.doc: v for k, v in reader_dbl.slurp().items()} From b4ef3ec14e5253b7bc241ec81d2911653aafc937 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 17 May 2017 16:22:02 +0200 Subject: [PATCH 61/74] ENH showdown: compact reports use author_true --- evals/showdown.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index daef76a..4b62709 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -609,7 +609,7 @@ def main(): # generate report if detailed == 0: # compact report, f1-scores only - print(rst_parseval_compact_report(ctree_true_list, ctree_preds, + print(rst_parseval_compact_report(author_true, ctree_preds, ctree_type=ctree_type, metric_types=['S', 'N', 'R', 'F'], digits=digits, @@ -715,8 +715,9 @@ def main(): ctree_dbl_true = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_true] # generate report - ctree_dbl_preds = [('silver', ctree_dbl_pred)] - print(rst_parseval_compact_report(ctree_dbl_true, ctree_dbl_preds, + ctree_dbl_preds = [('silver', ctree_dbl_pred), + ('gold', ctree_dbl_true)] + print(rst_parseval_compact_report(author_true, ctree_dbl_preds, ctree_type=ctree_type, span_type='chars', metric_types=['S', 'N', 'R', 'F'], From 1b9487cf460348c2afb692c1202dfc9ce68ba19f Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 18 May 2017 12:07:01 +0200 Subject: [PATCH 62/74] ENH showdown: similarity matrix --- evals/showdown.py | 205 ++++++++++++++++++++++++++-------------------- 1 file changed, 115 insertions(+), 90 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 4b62709..167f6a1 100644 --- 
a/evals/showdown.py +++ b/evals/showdown.py @@ -18,7 +18,8 @@ from educe.rst_dt.deptree import RstDepTree from educe.rst_dt.metrics.rst_parseval import (rst_parseval_detailed_report, rst_parseval_compact_report, - rst_parseval_report) + rst_parseval_report, + rst_parseval_similarity) # from attelo.metrics.deptree import (compute_uas_las, compute_uas_las_undirected) @@ -532,69 +533,65 @@ def main(): # 2017-05-17 any author can be used as reference # FIXME # dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] - dtree_true_list = [] - for parser_name, dtree_pred in d_preds: - if parser_name == author_true: - dtree_true_list = [dtree_pred[doc_name] for doc_name in doc_names] - break - # end FIXME - # _pred - for parser_name, dtree_pred in d_preds: - dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - # check that labelset_pred is a subset of labelset_true - labelset_pred = set(itertools.chain.from_iterable( - x.labels for x in dtree_pred_list)) - try: - assert labelset_pred.issubset(labelset_true) - except AssertionError: - print(parser_name) - print('T & P', sorted(labelset_true.intersection(labelset_pred))) - print('T - P', sorted(labelset_true - labelset_pred)) - print('P - T', sorted(labelset_pred - labelset_true)) - raise - # end check - all_scores = [] - all_scores += list(compute_uas_las( - dtree_true_list, dtree_pred_list, metrics=dep_metrics, - doc_names=doc_names)) - if UNDIRECTED_DEPS: - score_uuas, score_ulas = compute_uas_las_undirected( - dtree_true_list, dtree_pred_list) - all_scores += [score_uuas, score_ulas] - # append to report - values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] - for v in all_scores: - if percent: - v = v * 100.0 - values += ["{0:0.{1}f}".format(v, dep_digits)] - report += fmt % tuple(values) - # end table content - print(report) - # end report + parsers_true = [author_true] if author_true != 'each' else authors_pred + for parser_true in parsers_true: + dtree_true_list = [] + for parser_name, dtree_pred in d_preds: + if parser_name == parser_true: + dtree_true_list = [dtree_pred[doc_name] for doc_name in doc_names] + break + # end FIXME + # _pred + for parser_name, dtree_pred in d_preds: + dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] + # check that labelset_pred is a subset of labelset_true + labelset_pred = set(itertools.chain.from_iterable( + x.labels for x in dtree_pred_list)) + try: + assert labelset_pred.issubset(labelset_true) + except AssertionError: + print(parser_name) + print('T & P', sorted(labelset_true.intersection(labelset_pred))) + print('T - P', sorted(labelset_true - labelset_pred)) + print('P - T', sorted(labelset_pred - labelset_true)) + raise + # end check + all_scores = [] + all_scores += list(compute_uas_las( + dtree_true_list, dtree_pred_list, metrics=dep_metrics, + doc_names=doc_names)) + if UNDIRECTED_DEPS: + score_uuas, score_ulas = compute_uas_las_undirected( + dtree_true_list, dtree_pred_list) + all_scores += [score_uuas, score_ulas] + # append to report + values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] + for v in all_scores: + if percent: + v = v * 100.0 + values += ["{0:0.{1}f}".format(v, dep_digits)] + report += fmt % tuple(values) + # end table content + print(report) + # end report # constituency eval ctree_type = 'SimpleRST' if simple_rsttree else 'RST' doc_names = sorted(ctree_true.keys()) - # ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] - # FIXME - ctree_true_list = [] - for parser_name, ctree_pred in c_preds: 
- if parser_name == author_true: - ctree_true_list = [ctree_pred[doc_name] for doc_name in doc_names] - break - # end FIXME - if simple_rsttree: - ctree_true_list = [SimpleRSTTree.from_rst_tree(x) - for x in ctree_true_list] - # WIP print SimpleRSTTrees - if not os.path.exists('gold'): - os.makedirs('gold') - for doc_name, ct in zip(doc_names, ctree_true_list): - with codecs.open('gold/' + ct.origin.doc, mode='w', - encoding='utf-8') as f: - print(ct, file=f) + if False: # back when 'gold' was the only possible ref + ctree_true_list = [ctree_true[doc_name] for doc_name in doc_names] + if simple_rsttree: + ctree_true_list = [SimpleRSTTree.from_rst_tree(x) + for x in ctree_true_list] + # WIP print SimpleRSTTrees + if not os.path.exists('gold'): + os.makedirs('gold') + for doc_name, ct in zip(doc_names, ctree_true_list): + with codecs.open('gold/' + ct.origin.doc, mode='w', + encoding='utf-8') as f: + print(ct, file=f) # sort the predictions of each parser, so they match the order of # documents and reference trees in _true @@ -606,48 +603,76 @@ def main(): [SimpleRSTTree.from_rst_tree(x) for x in ctree_pred_list]) for parser_name, ctree_pred_list in ctree_preds] + + # 2017-05-17 allow any parser to be ref # generate report if detailed == 0: - # compact report, f1-scores only - print(rst_parseval_compact_report(author_true, ctree_preds, + # 2017-05-17 WIP similarity matrix: author_true='each': restrict + # to the S metric only, so as to display a sim. matrix + if author_true == 'each': + metric_type = 'S' + print(rst_parseval_similarity(ctree_preds, ctree_type=ctree_type, - metric_types=['S', 'N', 'R', 'F'], + metric_type=metric_type, digits=digits, percent=percent, + print_support=False, per_doc=per_doc, add_trivial_spans=eval_li_dep, - stringent=STRINGENT)) + stringent=STRINGENT, + out_format='latex')) + else: + metric_types = ['S', 'N', 'R', 'F'] + # compact report, f1-scores only + print(rst_parseval_compact_report(author_true, ctree_preds, + ctree_type=ctree_type, + metric_types=metric_types, + digits=digits, + percent=percent, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) else: - # standard reports: 1 table per parser, 1 line per metric, - # cols = [p, r, f1, support_true, support_pred] - for parser_name, ctree_pred_list in ctree_preds: - # WIP print SimpleRSTTrees - if not os.path.exists(parser_name): - os.makedirs(parser_name) - for doc_name, ct in zip(doc_names, ctree_pred_list): - with codecs.open(parser_name + '/' + doc_name, mode='w', - encoding='utf-8') as f: - print(ct, file=f) - - # compute and print PARSEVAL scores - print(parser_name) - # metric_types=None includes the variants with head: - # S+H, N+H, R+H, F+H - print(rst_parseval_report(ctree_true_list, ctree_pred_list, - ctree_type=ctree_type, - metric_types=None, - digits=digits, - percent=percent, - per_doc=per_doc, - add_trivial_spans=eval_li_dep, - stringent=STRINGENT)) - # detailed report on R - if detailed >= 2: - print(rst_parseval_detailed_report( - ctree_true_list, ctree_pred_list, ctree_type=ctree_type, - metric_type='R')) + parsers_true = [author_true] if author_true != 'each' else authors_pred + for parser_true in parsers_true: + # standard reports: 1 table per parser, 1 line per metric, + # cols = [p, r, f1, support_true, support_pred] + # FIXME + ctree_true_list = [] + for parser_name, ctree_pred in c_preds: + if parser_name == parser_true: + ctree_true_list = [ctree_pred[doc_name] for doc_name in doc_names] + break # end FIXME + for parser_name, ctree_pred_list in ctree_preds: + 
# WIP print SimpleRSTTrees + if not os.path.exists(parser_name): + os.makedirs(parser_name) + for doc_name, ct in zip(doc_names, ctree_pred_list): + with codecs.open(parser_name + '/' + doc_name, mode='w', + encoding='utf-8') as f: + print(ct, file=f) + + # compute and print PARSEVAL scores + print(parser_name) + # metric_types=None includes the variants with head: + # S+H, N+H, R+H, F+H + print(rst_parseval_report(ctree_true_list, ctree_pred_list, + ctree_type=ctree_type, + metric_types=None, + digits=digits, + percent=percent, + per_doc=per_doc, + add_trivial_spans=eval_li_dep, + stringent=STRINGENT)) + # detailed report on R + if detailed >= 2: + print(rst_parseval_detailed_report( + ctree_true_list, ctree_pred_list, ctree_type=ctree_type, + metric_type='R')) + # end FIXME + # 2017-04-11 compute agreement between human annotators, on DOUBLE if 'silver' in authors_pred: # 'silver' can be meaningfully compared to 'gold' only (too few From ca02b4993c5de86fc698c9c001da61c382801678 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 18 May 2017 16:41:21 +0200 Subject: [PATCH 63/74] ENH showdown: dep_compact_report --- evals/showdown.py | 95 ++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 64 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 167f6a1..2aaba37 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -22,7 +22,7 @@ rst_parseval_similarity) # from attelo.metrics.deptree import (compute_uas_las, - compute_uas_las_undirected) + dep_compact_report) # local to this package from evals.braud_coling import (load_braud_coling_ctrees, @@ -137,7 +137,6 @@ STRINGENT = False # additional dependency metrics INCLUDE_LS = False -UNDIRECTED_DEPS = False EVAL_NUC_RANK = True # hyperparams NUC_STRATEGY = 'unamb_else_most_frequent' @@ -506,74 +505,21 @@ def main(): dep_metrics += ["tag_R"] if EVAL_NUC_RANK: dep_metrics += ["R+N", "R+O", "F"] - # report - # * table format - width = max(len(parser_name) for parser_name, _ in d_preds) - headers = dep_metrics - if UNDIRECTED_DEPS: - headers += ["UUAS", "ULAS"] - fmt = '%% %ds' % width # first col: parser name - fmt += ' ' - fmt += ' '.join(['% 9s' for _ in headers]) - fmt += '\n' - - headers = [""] + headers - report = fmt % tuple(headers) - report += '\n' - # display percentages - dep_digits = digits - 2 if percent else digits - # end table format and header line - - # * table content + # _true doc_names = sorted(dtree_true.keys()) labelset_true = set(itertools.chain.from_iterable( x.labels for x in dtree_true.values())) labelset_true.add("span") # RST-DT v.1.0 has an error in wsj_1189 7-9 # 2017-05-17 any author can be used as reference - # FIXME - # dtree_true_list = [dtree_true[doc_name] for doc_name in doc_names] - parsers_true = [author_true] if author_true != 'each' else authors_pred - for parser_true in parsers_true: - dtree_true_list = [] - for parser_name, dtree_pred in d_preds: - if parser_name == parser_true: - dtree_true_list = [dtree_pred[doc_name] for doc_name in doc_names] - break - # end FIXME - # _pred - for parser_name, dtree_pred in d_preds: - dtree_pred_list = [dtree_pred[doc_name] for doc_name in doc_names] - # check that labelset_pred is a subset of labelset_true - labelset_pred = set(itertools.chain.from_iterable( - x.labels for x in dtree_pred_list)) - try: - assert labelset_pred.issubset(labelset_true) - except AssertionError: - print(parser_name) - print('T & P', sorted(labelset_true.intersection(labelset_pred))) - print('T - P', sorted(labelset_true - labelset_pred)) - 
print('P - T', sorted(labelset_pred - labelset_true)) - raise - # end check - all_scores = [] - all_scores += list(compute_uas_las( - dtree_true_list, dtree_pred_list, metrics=dep_metrics, - doc_names=doc_names)) - if UNDIRECTED_DEPS: - score_uuas, score_ulas = compute_uas_las_undirected( - dtree_true_list, dtree_pred_list) - all_scores += [score_uuas, score_ulas] - # append to report - values = ['{pname: <{fill}}'.format(pname=parser_name, fill=width)] - for v in all_scores: - if percent: - v = v * 100.0 - values += ["{0:0.{1}f}".format(v, dep_digits)] - report += fmt % tuple(values) - # end table content - print(report) - # end report + if author_true != 'each': + parser_true = author_true + print(dep_compact_report(parser_true, d_preds, dep_metrics, + doc_names, labelset_true, + digits=digits, + percent=percent)) + else: + raise ValueError("Sim matrix on dependencies not implemented yet") # constituency eval ctree_type = 'SimpleRST' if simple_rsttree else 'RST' @@ -722,11 +668,17 @@ def main(): # create parallel lists of ctrees for _true and _pred, mapped to # coarse rels and binarized # _pred: + # * ctree ctree_dbl_pred = [corpus_dbl_pred[doc_name] for doc_name in docs_dbl] ctree_dbl_pred = [REL_CONV(x) for x in ctree_dbl_pred] if binarize_true != 'none': # maybe not? ctree_dbl_pred = [_binarize(x, branching=binarize_true) for x in ctree_dbl_pred] + # * dtree (as dict from doc_name to dtree !?) + dtree_dbl_pred = {doc_name: RstDepTree.from_rst_tree( + ct, nary_enc=nary_enc_true) + for doc_name, ct in zip(docs_dbl, ctree_dbl_pred)} + # * simple_rsttree (?) if simple_rsttree: ctree_dbl_pred = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_pred] @@ -736,10 +688,15 @@ def main(): if binarize_true != 'none': ctree_dbl_true = [_binarize(x, branching=binarize_true) for x in ctree_dbl_true] + # * dtree (as dict from doc_name to dtree !?) 
+ dtree_dbl_true = {doc_name: RstDepTree.from_rst_tree( + ct, nary_enc=nary_enc_true) + for doc_name, ct in zip(docs_dbl, ctree_dbl_true)} if simple_rsttree: ctree_dbl_true = [SimpleRSTTree.from_rst_tree(x) for x in ctree_dbl_true] # generate report + # * ctree eval ctree_dbl_preds = [('silver', ctree_dbl_pred), ('gold', ctree_dbl_true)] print(rst_parseval_compact_report(author_true, ctree_dbl_preds, @@ -751,6 +708,16 @@ def main(): per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT)) + # * dtree eval + if False: + # TODO cope with differences in segmentation + dtree_dbl_preds = [('silver', dtree_dbl_pred), + ('gold', dtree_dbl_true)] + print(dep_compact_report(author_true, dtree_dbl_preds, + dep_metrics, docs_dbl, + labelset_true, + digits=digits, + percent=percent)) # end 2017-04-11 agreement between human annotators From 3731b653246ed7a43bd0fe19865f1de158b43edf Mon Sep 17 00:00:00 2001 From: moreymat Date: Sat, 20 May 2017 11:12:08 +0200 Subject: [PATCH 64/74] FIX showdown: skip dep sim matrix --- evals/showdown.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/showdown.py b/evals/showdown.py index 2aaba37..e8591f9 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -519,7 +519,8 @@ def main(): digits=digits, percent=percent)) else: - raise ValueError("Sim matrix on dependencies not implemented yet") + pass + # raise ValueError("Sim matrix on dependencies not implemented yet") # constituency eval ctree_type = 'SimpleRST' if simple_rsttree else 'RST' From a056968694e6960c49d0561eaa21e2fa3500ddf3 Mon Sep 17 00:00:00 2001 From: moreymat Date: Sun, 21 May 2017 13:33:38 +0200 Subject: [PATCH 65/74] ENH showdown: pairwise dep similarity --- evals/showdown.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index e8591f9..6c540f9 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -22,7 +22,8 @@ rst_parseval_similarity) # from attelo.metrics.deptree import (compute_uas_las, - dep_compact_report) + dep_compact_report, + dep_similarity) # local to this package from evals.braud_coling import (load_braud_coling_ctrees, @@ -150,7 +151,7 @@ 'JCN15_1S1S', 'FH14_gSVM', 'FH14_gCRF', 'JE14', 'LLC16', 'HHN16_hilda', 'HHN16_mst', 'BPS16', 'BCS17_mono', - 'BCS17_cross_dev', + 'BCS17_cross', 'SHV15_D', 'li_sujian', 'ours-chain', 'ours-tree', 'ours-tree-su' @@ -331,13 +332,13 @@ def main(): nary_enc='chain')) ) # braud eacl 2017 - cross+dev - if author_pred == 'BCS17_cross_dev': + if author_pred == 'BCS17_cross': c_preds.append( - ('BCS17_cross_dev', load_braud_eacl_ctrees( + ('BCS17_cross', load_braud_eacl_ctrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names)) ) d_preds.append( - ('BCS17_cross_dev', load_braud_eacl_dtrees( + ('BCS17_cross', load_braud_eacl_dtrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, nary_enc='chain')) ) @@ -519,7 +520,9 @@ def main(): digits=digits, percent=percent)) else: - pass + print(dep_similarity(d_preds, doc_names, labelset_true, + dep_metric='U', digits=digits, percent=percent, + out_format='latex')) # raise ValueError("Sim matrix on dependencies not implemented yet") # constituency eval From 349d8e46b47624f903d50bc4826eaaf74a4ac6b0 Mon Sep 17 00:00:00 2001 From: moreymat Date: Mon, 5 Jun 2017 15:48:12 +0200 Subject: [PATCH 66/74] ENH evals.showdown: new parser WLW17 --- evals/ji.py | 6 ++++++ evals/showdown.py | 20 +++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/evals/ji.py b/evals/ji.py index 
6f01512..08fbd8b 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -136,6 +136,12 @@ def load_ji_ctrees(ji_out_dir, rel_conv): node.rel = 'topic-change' elif node.rel == 'topiccomment': # Ji's output node.rel = 'topic-comment' + elif node.rel == 'textual-organization': # WLW17 output + # we use 'textual' as the coarse label ; + # JE14 outputs textualorganization which is the + # fine label in our taxonomy, hence is mapped to + # textual beforehand + node.rel = 'textual' # end normalize # store the resulting RSTTree ctree_pred[doc_name] = ct_pred diff --git a/evals/showdown.py b/evals/showdown.py index 6c540f9..9e910a1 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -133,6 +133,8 @@ SURDEANU_LOG_FILE = '/home/mmorey/melodi/rst/replication/surdeanu/output/log' # Li Sujian dep parser # imported, see above +# Wang, Li and Wang at ACL 2017 +WLW17_OUT_DIR = '/home/mmorey/melodi/rst/replication/wang/rst-dt/RSTtrees-WSJ-main-1.0/TEST' # level of detail for parseval STRINGENT = False @@ -153,6 +155,7 @@ 'BPS16', 'BCS17_mono', 'BCS17_cross', 'SHV15_D', + 'WLW17', # Wang, Li and Wang, ACL17 'li_sujian', 'ours-chain', 'ours-tree', 'ours-tree-su' ] @@ -429,7 +432,22 @@ def main(): JI_OUT_DIR, REL_CONV, nary_enc='chain')) ) # ji-{chain,tree} would be the same except nary_enc='tree' ; - # the nary_enc does not matter because codra outputs binary ctrees, + # the nary_enc does not matter because DPLP outputs binary ctrees, + # hence both encodings result in (the same) strictly ordered dtrees + + if author_pred == 'WLW17': + # WLW17 outputs RST ctrees in the form of lists of spans, just + # like JE14 ; + # load_ji_dtrees maps them to RST dtrees + c_preds.append( + ('WLW17', load_ji_ctrees( + WLW17_OUT_DIR, REL_CONV)) + ) + d_preds.append( + ('WLW17', load_ji_dtrees( + WLW17_OUT_DIR, REL_CONV, nary_enc='chain')) + ) + # the nary_enc does not matter because WLW17 outputs binary ctrees, # hence both encodings result in (the same) strictly ordered dtrees if author_pred == 'SHV15_D': From 3bfca65b814614d0ea096d6e84e175d0ae218109 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 13 Jun 2017 15:44:06 +0200 Subject: [PATCH 67/74] FIX irit_rst_dt.harness: pick upstream/master to resolve conflict --- irit_rst_dt/harness.py | 55 ------------------------------------------ 1 file changed, 55 deletions(-) diff --git a/irit_rst_dt/harness.py b/irit_rst_dt/harness.py index 63ac448..726ef05 100644 --- a/irit_rst_dt/harness.py +++ b/irit_rst_dt/harness.py @@ -107,31 +107,6 @@ def create_folds(self, mpack): # ------------------------------------------------------ # paths # ------------------------------------------------------ - -<<<<<<< HEAD - def mpack_paths(self, test_data, stripped=False, with_cdus=False): - """ - Parameters - ---------- - test_data : boolean - If true, the returned paths point to self.testset else to - self.dataset. - - stripped : boolean, defaults to False - TODO - - with_cdus : boolean, defaults to False - If True, generate CDUs (eg. for fragmented EDUs), pairings - on them and the corresponding feature vectors. - - Returns - ------- - paths : dict of (glob patterns of) file paths - Path to: edu_input, pairings, features, vocab, labels. - Also contains 'corpus' (to access gold structures, WIP for - RST-DT) ; if `with_cdus` is True, also cdu_input, - cdu_pairings, cdu_features. -======= def mpack_paths(self, test_data, stripped=False): """Return a dict of paths needed to read a datapack. 
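(Reading aid, not part of the patch.) The conflict in mpack_paths is resolved by keeping the upstream/master version, which returns a flat dict of paths keyed by role. A minimal, self-contained sketch of that contract follows; the literal values are illustrative placeholders, not the exact file names the harness produces:

    # keys exposed by the resolved mpack_paths; the values below are
    # placeholders for illustration only
    paths = {
        'edu_input': 'TEST.relations.edu-pairs.sparse.edu_input',
        'pairings': 'TEST.relations.edu-pairs.sparse.pairings',
        'features': 'TEST.relations.edu-pairs.sparse',
        'vocab': 'TEST.relations.edu-pairs.sparse.vocab',
        'corpus': 'corpus/RSTtrees-WSJ-main-1.01/TEST',
    }
    for role in sorted(paths):
        print('{} -> {}'.format(role, paths[role]))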
@@ -150,7 +125,6 @@ def mpack_paths(self, test_data, stripped=False): Paths to files that enable to read a datapack. Useful keys are 'edu_input', 'pairings', 'features', 'vocab', 'corpus' (WIP, used to access gold structures). ->>>>>>> upstream/master """ base = 'relations.edu-pairs' ext = base + '.sparse' @@ -162,34 +136,6 @@ def mpack_paths(self, test_data, stripped=False): # WIP gold RST trees corpus_path = fp.abspath(TEST_CORPUS if test_data else TRAINING_CORPUS) -<<<<<<< HEAD - # end gold RST trees - res = { - 'edu_input': core_path + '.edu_input', - 'pairings': core_path + '.pairings', - 'features': ((core_path + '.stripped') if stripped - else core_path), - 'vocab': vocab_path, - 'labels': labels_path, - # corpus for gold RST trees - 'corpus': corpus_path, - } - if with_cdus: - # 2016-07-28 fragmented EDUs - frag_ext = 'relations.frag-pairs.sparse' - frag_path = fp.join(self.eval_dir, dset, "*.%s" % frag_ext) - res.update([ - ('cdu_input', (frag_path + '.cdu_input' if with_cdus - else None)), - ('cdu_pairings', (frag_path + '.cdu_pairings' if with_cdus - else None)), - ('cdu_features', (((frag_path + '.stripped') if stripped - else frag_path) if with_cdus - else None)), - ]) - - return res -======= # end WIP return { 'edu_input': core_path + '.edu_input', @@ -198,7 +144,6 @@ def mpack_paths(self, test_data, stripped=False): 'vocab': core_path + '.vocab', 'corpus': corpus_path } ->>>>>>> upstream/master def model_paths(self, rconf, fold, parser): """Paths to the learner(s) model(s). From 80345140bd591b6dcfe68c5116cfa01de8a88147 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 29 Nov 2017 17:41:45 +0100 Subject: [PATCH 68/74] ENH d-metrics: add N+O, rm R+O ; c-metrics: add +H --- evals/showdown.py | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index 9e910a1..e26ccaa 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -29,7 +29,7 @@ from evals.braud_coling import (load_braud_coling_ctrees, load_braud_coling_dtrees) from evals.braud_eacl import (load_braud_eacl_ctrees, - load_braud_eacl_dtrees) + load_braud_eacl_dtrees) from evals.codra import load_codra_ctrees, load_codra_dtrees from evals.feng import load_feng_ctrees, load_feng_dtrees from evals.gcrf_tree_format import load_gcrf_ctrees, load_gcrf_dtrees @@ -67,7 +67,8 @@ # * syntax: pred vs gold # old-style .edu_input: whole test set -EDUS_FILE = os.path.join('/home/mmorey/melodi/rst', +EDUS_FILE = os.path.join('/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/syn_gold_coarse', 'TEST.relations.sparse.edu_input') @@ -78,20 +79,23 @@ # outputs of parsers EISNER_OUT_SYN_PRED = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/syn_pred_coarse', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') # 2016-09-14 "tree" transform, predicted syntax EISNER_OUT_TREE_SYN_PRED = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') EISNER_OUT_TREE_SYN_PRED_SU = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/2016-09-12T0825', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt_su-eisner') @@ -99,28 +103,37 @@ EISNER_OUT_SYN_PRED_SU = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/latest', # lbl 
'scratch-current/combined', 'output.maxent-AD.L-jnt_su-eisner') EISNER_OUT_SYN_GOLD = os.path.join( - '/home/mmorey/melodi/rst', + '/home/mmorey', + 'melodi/rst', 'irit-rst-dt/TMP/syn_gold_coarse', # lbl 'scratch-current/combined', 'output.maxent-iheads-global-AD.L-jnt-eisner') # output of Joty's parser CODRA -CODRA_OUT_DIR = '/home/mmorey/melodi/rst/replication/joty/Doc-level' +CODRA_OUT_DIR = os.path.join( + '/home/mmorey', + 'melodi/rst/replication/joty/Doc-level' +) # output of Ji's parser DPLP # JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/replication/ji_eisenstein', 'DPLP/data/docs/test/') -JI_OUT_DIR = os.path.join('/home/mmorey/melodi/rst/replication/ji_eisenstein', 'official_output/outputs/') +JI_OUT_DIR = os.path.join('/home/mmorey', + 'melodi/rst/replication/ji_eisenstein', + 'official_output/outputs/') # Feng's parsers -FENG_DIR = '/home/mmorey/melodi/rst/replication/feng_hirst/' +FENG_DIR = os.path.join('/home/mmorey', + 'melodi/rst/replication/feng_hirst/') FENG1_OUT_DIR = os.path.join(FENG_DIR, 'phil', 'tmp') FENG2_OUT_DIR = os.path.join(FENG_DIR, 'gCRF_dist/texts/results/test_batch_gold_seg') # Li Qi's parser -LI_QI_OUT_DIR = '/home/mmorey/melodi/rst/replication/li_qi/result' +LI_QI_OUT_DIR = os.path.join('/home/mmorey', + 'melodi/rst/replication/li_qi/result') # Hayashi's HILDA HAYASHI_OUT_DIR = '/home/mmorey/melodi/rst/replication/hayashi/SIGDIAL' HAYASHI_HILDA_OUT_DIR = os.path.join(HAYASHI_OUT_DIR, 'auto_parse/cons/HILDA') @@ -134,7 +147,9 @@ # Li Sujian dep parser # imported, see above # Wang, Li and Wang at ACL 2017 -WLW17_OUT_DIR = '/home/mmorey/melodi/rst/replication/wang/rst-dt/RSTtrees-WSJ-main-1.0/TEST' +WLW17_OUT_DIR = os.path.join( + '/home/mmorey', + 'melodi/rst/replication/wang/rst-dt/RSTtrees-WSJ-main-1.0/TEST') # level of detail for parseval STRINGENT = False @@ -523,7 +538,7 @@ def main(): if INCLUDE_LS: dep_metrics += ["tag_R"] if EVAL_NUC_RANK: - dep_metrics += ["R+N", "R+O", "F"] + dep_metrics += ["N+O", "R+N", "F"] # 2017-11-29 disable "R+O" # _true doc_names = sorted(dtree_true.keys()) @@ -590,7 +605,10 @@ def main(): stringent=STRINGENT, out_format='latex')) else: - metric_types = ['S', 'N', 'R', 'F'] + metric_types = [ + 'S', 'N', 'R', 'F', + 'S+H', 'N+H', 'R+H', 'F+H', + ] # compact report, f1-scores only print(rst_parseval_compact_report(author_true, ctree_preds, ctree_type=ctree_type, From 1043db3e719e0e5a6f44050b827051455e737178 Mon Sep 17 00:00:00 2001 From: moreymat Date: Fri, 1 Dec 2017 17:37:59 +0100 Subject: [PATCH 69/74] FIX showdown: out_fmt, span metrics +H+K+HH --- evals/showdown.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index e26ccaa..994a9f0 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -166,7 +166,7 @@ 'gold', # RST-main 'silver', # RST-double 'JCN15_1S1S', 'FH14_gSVM', 'FH14_gCRF', 'JE14', - 'LLC16', 'HHN16_hilda', 'HHN16_mst', + 'LLC16', 'HHN16_HILDA', 'HHN16_MST', 'BPS16', 'BCS17_mono', 'BCS17_cross', 'SHV15_D', @@ -261,6 +261,9 @@ def main(): help='Scores are displayed as percentages (ex: 57.9)') parser.add_argument('--detailed', type=int, default=0, help='Level of detail for evaluations') + parser.add_argument('--out_fmt', default='text', + choices=['text', 'latex'], + help='Output format') # args = parser.parse_args() author_true = args.author_true @@ -276,6 +279,7 @@ def main(): raise ValueError('--percent requires --digits >= 3') # level of detail for evals detailed = args.detailed + out_fmt = 
args.out_fmt # "per_doc = True" computes p, r, f as in DPLP: compute scores per doc # then average over docs @@ -361,24 +365,24 @@ def main(): nary_enc='chain')) ) - if author_pred == 'HHN16_hilda': + if author_pred == 'HHN16_HILDA': c_preds.append( - ('HHN16_hilda', load_hayashi_hilda_ctrees( + ('HHN16_HILDA', load_hayashi_hilda_ctrees( HAYASHI_HILDA_OUT_DIR, REL_CONV)) ) d_preds.append( - ('HHN16_hilda', load_hayashi_hilda_dtrees( + ('HHN16_HILDA', load_hayashi_hilda_dtrees( HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) ) - if author_pred == 'HHN16_mst': + if author_pred == 'HHN16_MST': c_preds.append( - ('HHN16_mst', load_hayashi_dep_ctrees( + ('HHN16_MST', load_hayashi_dep_ctrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, nuc_clf, rnk_clf)) ) d_preds.append( - ('HHN16_mst', load_hayashi_dep_dtrees( + ('HHN16_MST', load_hayashi_dep_dtrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, nuc_clf, rnk_clf)) ) @@ -533,12 +537,12 @@ def main(): # dependency eval dep_metrics = ["U"] if EVAL_NUC_RANK: - dep_metrics += ['O', 'N'] + dep_metrics += ['O', 'N', 'O+N'] dep_metrics += ["R"] if INCLUDE_LS: dep_metrics += ["tag_R"] if EVAL_NUC_RANK: - dep_metrics += ["N+O", "R+N", "F"] # 2017-11-29 disable "R+O" + dep_metrics += ["R+N", "F"] # 2017-11-29 disable "R+O" # _true doc_names = sorted(dtree_true.keys()) @@ -551,11 +555,12 @@ def main(): print(dep_compact_report(parser_true, d_preds, dep_metrics, doc_names, labelset_true, digits=digits, - percent=percent)) + percent=percent, + out_format=out_fmt)) else: print(dep_similarity(d_preds, doc_names, labelset_true, dep_metric='U', digits=digits, percent=percent, - out_format='latex')) + out_format=out_fmt)) # raise ValueError("Sim matrix on dependencies not implemented yet") # constituency eval @@ -603,11 +608,15 @@ def main(): per_doc=per_doc, add_trivial_spans=eval_li_dep, stringent=STRINGENT, - out_format='latex')) + out_format=out_fmt)) else: metric_types = [ 'S', 'N', 'R', 'F', - 'S+H', 'N+H', 'R+H', 'F+H', + # 'S+H', 'N+H', 'R+H', 'F+H', + # 'S+K', 'N+K', 'R+K', 'F+K', + # 'S+HH', 'N+HH', 'R+HH', 'F+HH', + # 'S+K+HH', 'N+K+HH', 'R+K+HH', 'F+K+HH', + 'S+H+K+HH', 'N+H+K+HH', 'R+H+K+HH', 'F+H+K+HH', ] # compact report, f1-scores only print(rst_parseval_compact_report(author_true, ctree_preds, @@ -615,9 +624,11 @@ def main(): metric_types=metric_types, digits=digits, percent=percent, + print_support=False, per_doc=per_doc, add_trivial_spans=eval_li_dep, - stringent=STRINGENT)) + stringent=STRINGENT, + out_format=out_fmt)) else: parsers_true = [author_true] if author_true != 'each' else authors_pred for parser_true in parsers_true: From ff4177d66fccab94e025bb76f12778f5d3105ed5 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 12 Dec 2017 23:20:44 +0100 Subject: [PATCH 70/74] WIP nuc_clf --- evals/prepare_nuc_dataset.py | 167 ++++++++++++++++++++++++++++++++++ evals/showdown.py | 38 +++++++- evals/train_nuc_classifier.py | 147 ++++++++++++++++++++++++++++++ 3 files changed, 347 insertions(+), 5 deletions(-) create mode 100644 evals/prepare_nuc_dataset.py create mode 100644 evals/train_nuc_classifier.py diff --git a/evals/prepare_nuc_dataset.py b/evals/prepare_nuc_dataset.py new file mode 100644 index 0000000..97e1c6e --- /dev/null +++ b/evals/prepare_nuc_dataset.py @@ -0,0 +1,167 @@ +"""This utility script outputs a dataset of the nuclearity of RST edges. 
+ +Given the path to the RST-DT corpus and a dataset of candidate RST +dependencies labelled with their gold coarse (class) RST relation (or +none if they are unrelated), produce a similar dataset for the task +of nuclearity prediction. + +As of 2017-12-08, we filter out the instances for unrelated pairs of EDUs +and left-oriented dependencies, only keeping right-oriented dependencies +(except for "ROOT"). +The resulting dataset describes a binary classification problem. +""" + +from __future__ import absolute_import, print_function + +import argparse +import codecs +import itertools +import os + +from educe.rst_dt.annotation import NUC_N, NUC_S +from educe.rst_dt.corpus import RstRelationConverter, RELMAP_112_18_FILE +from educe.rst_dt.dep_corpus import read_corpus +from educe.rst_dt.deptree import RstDepTree + + +def main(corpus, dataset, out_dir, nary_enc): + """Do prepare the nuclearity dataset. + + Parameters + ---------- + corpus : str + Path to the RST-DT "main" corpus. + dataset : str + Path to the existing dataset labelled with coarse relations. + out_dir : str + Path to the output folder. + """ + # (re-)create a d-corpus from the RST-DT c-corpus + corpus_subset = os.path.basename(dataset).split('.')[0] + if corpus_subset not in ('TRAINING', 'TEST'): + raise ValueError("dataset must be a filepath that starts with" + "one of {'TRAINING', 'TEST'}") + if corpus_subset == 'TRAINING': + section = 'train' + else: # 'TEST' + section = 'test' + rst_ccorpus = read_corpus(corpus, section=section) + rel_conv = RstRelationConverter(RELMAP_112_18_FILE).convert_dtree + rst_dcorpus = dict() # FileId.doc -> RstDepTree + for doc_key, rst_ctree in rst_ccorpus[section].items(): + rst_dtree = RstDepTree.from_rst_tree(rst_ctree, nary_enc=nary_enc) + rst_dtree_coarse = rel_conv(rst_dtree) + rst_dcorpus[doc_key.doc] = rst_dtree_coarse + # for each candidate dependency in the dataset, read the nuclearity + # from the RST d-corpus + # Nota: we stream through the dataset to avoid loading it entirely in + # memory ; we don't need to open the vocabulary file (.vocab), nor the + # description of the EDUs (.edu_input) + pairings = dataset + '.pairings' + # edu_desc = dataset + '.edu_input' + new_dataset = os.path.join(out_dir, os.path.basename(dataset)) + new_pairs = os.path.join(out_dir, os.path.basename(pairings)) + if ((os.path.abspath(new_dataset) == os.path.abspath(dataset) or + os.path.abspath(new_pairs) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset, mode='wb', encoding='utf-8') as data_out: + with codecs.open(new_pairs, mode='wb', encoding='utf-8') as pairs_out: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file, as an + # ordered list mapped to {1, 2} + print(header_prefix + ' '.join((NUC_N, NUC_S)), + file=data_out) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form 
"docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + if tgt_idx < src_idx: + # skip left dependencies: by construction, + # their nuclearity can only be Satellite + # (SN edges) + continue + # print(doc_name, src_id, tgt_id, src_idx, tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + if dtree.nucs[tgt_idx] == NUC_N: + nuc_idx = 1 + elif dtree.nucs[tgt_idx] == NUC_S: + nuc_idx = 2 + else: + raise ValueError("weird nuclearity {}".format( + dtree.nucs[tgt_idx])) + print(str(nuc_idx) + ' ' + feat_vector, + file=data_out) + print(pair.strip(), file=pairs_out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Prepare a nuclearity dataset.' + ) + parser.add_argument('--corpus', + help='Path to the RST-DT "main" corpus', + default=os.path.join( + os.path.expanduser('~'), + 'corpora/rst-dt/rst_discourse_treebank/data', + 'RSTtrees-WSJ-main-1.01' + )) + parser.add_argument('--dataset', + help='Base file of the dataset', + default=os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse', + 'TRAINING.relations.sparse' + )) + parser.add_argument('--out_dir', + help='Output folder', + default=os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + )) + parser.add_argument('--nary_enc', + help='Encoding for n-ary nodes', + choices=['chain', 'tree'], + default='chain') + args = parser.parse_args() + main(args.corpus, args.dataset, args.out_dir, args.nary_enc) diff --git a/evals/showdown.py b/evals/showdown.py index 994a9f0..9193612 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -10,6 +10,8 @@ import itertools import os +from sklearn.datasets import load_svmlight_files + from educe.rst_dt.annotation import _binarize, SimpleRSTTree from educe.rst_dt.corpus import (RstRelationConverter, Reader as RstReader) @@ -46,6 +48,9 @@ load_attelo_ctrees, load_attelo_dtrees) from evals.surdeanu import load_surdeanu_ctrees, load_surdeanu_dtrees +# 2017-12-12 nuc_clf WIP +from evals.train_nuc_classifier import RightBinaryNuclearityClassifier +# end WIP nuc_clf # RST corpus CORPUS_DIR = os.path.join('corpus', 'RSTtrees-WSJ-main-1.01/') @@ -204,13 +209,36 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', y_nuc_train = [] y_rnk_train = [] for doc_name, dt in sorted(dtree_true.items()): + # print(dt.__dict__) + # raise ValueError('wip wip nuc_clf') X_train.append(dt) y_nuc_train.append(dt.nucs) y_rnk_train.append(dt.ranks) # nuclearity clf - nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, - constant=nuc_constant) - nuc_clf.fit(X_train, y_nuc_train) + if False: + nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, + constant=nuc_constant) + 
nuc_clf.fit(X_train, y_nuc_train) + else: + # 2017-12-12 WIP nuc_clf + # shiny new nuc_clf ; still very hacky + # import the nuclearity TRAIN and TEST sets generated from + # the svmlight feature vectors (ahem) + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + ) + dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_nuc_train, y_nuc_train, X_nuc_test, y_nuc_test = load_svmlight_files( + (dset_train, dset_test), + n_features=46731, + zero_based=False + ) + nuc_clf = RightBinaryNuclearityClassifier() + nuc_clf = nuc_clf.fit(X_nuc_train, y_nuc_train) + # end WIP nuc_clf # rank clf rnk_clf = InsideOutAttachmentRanker( strategy=rnk_strategy, prioritize_same_unit=rnk_prioritize_same_unit, @@ -612,11 +640,11 @@ def main(): else: metric_types = [ 'S', 'N', 'R', 'F', - # 'S+H', 'N+H', 'R+H', 'F+H', + 'S+H', 'N+H', 'R+H', 'F+H', # 'S+K', 'N+K', 'R+K', 'F+K', # 'S+HH', 'N+HH', 'R+HH', 'F+HH', # 'S+K+HH', 'N+K+HH', 'R+K+HH', 'F+K+HH', - 'S+H+K+HH', 'N+H+K+HH', 'R+H+K+HH', 'F+H+K+HH', + # 'S+H+K+HH', 'N+H+K+HH', 'R+H+K+HH', 'F+H+K+HH', ] # compact report, f1-scores only print(rst_parseval_compact_report(author_true, ctree_preds, diff --git a/evals/train_nuc_classifier.py b/evals/train_nuc_classifier.py new file mode 100644 index 0000000..bb25adf --- /dev/null +++ b/evals/train_nuc_classifier.py @@ -0,0 +1,147 @@ +"""This utility script trains a classifier for nuclearity of RST edges. + +Given the path to a nuclearity dataset, it trains a classifier and +evaluates it. +""" + + +from __future__ import absolute_import, print_function + +import argparse +import codecs +from collections import defaultdict +import itertools +import os + +from sklearn.datasets import load_svmlight_file, load_svmlight_files +from sklearn.model_selection import cross_val_score +from sklearn.linear_model.logistic import LogisticRegression +# from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import LabelEncoder +import matplotlib.pyplot as plt + +from educe.rst_dt.annotation import NUC_N, NUC_S + + +if False: + # import the nuclearity TRAIN and TEST sets + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + ) + dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') + + X_train, y_train, X_test, y_test = load_svmlight_files( + (dset_train, dset_test), + zero_based=False + ) + nuc_clf = LogisticRegression(penalty='l1', n_jobs=2) + # train nuclearity classifier, cross-validate performance on train + scores = cross_val_score(nuc_clf, X_train, y_train, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) + # fit a + nuc_clf = nuc_clf.fit(X_train, y_train) + print(nuc_clf.score(X_test, y_test)) + + +# 2017-12-06 non-dummy nuc_clf +# DIRTY load the feature vector for all candidate edges in the TEST +# set +feat_vecs = dict() +dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse' +) +dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') +# we use the original svmlight files whose label is the relation +# class (which we actually don't need here) +# FIXME read n_features from .vocab +X_test, y_lbl_test = load_svmlight_file(dset_test, n_features=46731, + zero_based=False) +# build mapping from doc_name, 
src_idx, tgt_idx to line number +# in X_test +pairs = dset_test + '.pairings' +pair_map = defaultdict(lambda: defaultdict(dict)) +with codecs.open(pairs, mode='rb', encoding='utf-8') as f_pairs: + for i, line in enumerate(f_pairs): + src_id, tgt_id = line.strip().split('\t') + src_idx = (0 if src_id == 'ROOT' + else int(src_id.rsplit('_', 1)[1])) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + # print(line) + # print(doc_name, src_idx, tgt_idx) + pair_map[doc_name][src_idx][tgt_idx] = i +# end DIRTY + + +class RightBinaryNuclearityClassifier(object): + """Predict the nuclearity of right-oriented dependencies (binary). + + The nuclearity of ordinary, right-oriented dependencies can be + either `NUC_S` or `NUC_N` (NS or NN relations). + Right-oriented dependencies from the fake root have nuclearity + `NUC_R` by convention ; Left-oriented dependencies have nuclearity + `NUC_S`. + + Parameters + ---------- + bin_clf : sklearn classifier + Binary classifier for right dependencies: NN vs NS. + """ + + def __init__(self, bin_clf=LogisticRegression(penalty='l1', n_jobs=2)): + """Init""" + self.bin_clf = bin_clf + + def fit(self, X, y): + """Fit""" + self.bin_clf = self.bin_clf.fit(X, y) + if True: # verbose + scores = cross_val_score(self.bin_clf, X, y, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + return self + + def predict(self, X): + """Predict nuclearity of edges in RstDepTrees X from the TEST set. + """ + y = [] + for dtree in X: + doc_name = dtree.origin.doc + yi = [] + for i, head in enumerate(dtree.heads): + if i == 0: + # fake root !? maybe we shouldn't write anything + # here ; + # FIXME check how to be consistent throughout educe and + # eval code + yi.append(NUC_N) + elif i < head: + # left edge: SN + yi.append(NUC_S) + elif head == 0: + # FIXME NUC_R for edges from the root? + yi.append(NUC_N) + else: + # right edge: NN or NS? + line_idx = pair_map[doc_name][head][i] + # X_test[line_idx,:] is a matrix with 1 row + Xi = X_test[line_idx,:] + try: + y_pred = self.bin_clf.predict(Xi) + except ValueError: + print(Xi) + raise + if y_pred == 1: + yi.append(NUC_N) + elif y_pred == 2: + yi.append(NUC_S) + else: + raise ValueError("Weird prediction: {}".format( + y_pred)) + y.append(yi) + return y From 822af14a0ee51d49906d7f304d3e62d0c3112050 Mon Sep 17 00:00:00 2001 From: moreymat Date: Thu, 14 Dec 2017 16:46:05 +0100 Subject: [PATCH 71/74] FIX load either c- or d-trees once, pass them to the other loader --- evals/braud_coling.py | 15 ++++- evals/braud_eacl.py | 15 ++++- evals/codra.py | 22 ++++--- evals/gcrf_tree_format.py | 8 ++- evals/hayashi_cons.py | 11 +++- evals/hayashi_deps.py | 21 +++---- evals/ji.py | 9 ++- evals/li_qi.py | 21 ++++--- evals/ours.py | 14 +++-- evals/showdown.py | 111 +++++++++++++++++++++------------- evals/surdeanu.py | 10 ++- evals/train_nuc_classifier.py | 9 +-- 12 files changed, 169 insertions(+), 97 deletions(-) diff --git a/evals/braud_coling.py b/evals/braud_coling.py index 625cb19..4856aac 100644 --- a/evals/braud_coling.py +++ b/evals/braud_coling.py @@ -141,10 +141,19 @@ def load_braud_coling_ctrees(out_dir, rel_conv): return ctree_pred -def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain'): - """Do load dtrees""" +def load_braud_coling_dtrees(out_dir, rel_conv, nary_enc='chain', + ctree_pred=None): + """Do load dtrees. + + Parameters + ---------- + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. 
If c-trees are provided this + way, `out_dir` is ignored. + """ dtree_pred = dict() - ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv) + if ctree_pred is None: + ctree_pred = load_braud_coling_ctrees(out_dir, rel_conv) for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) dtree_pred[doc_name] = dt_pred diff --git a/evals/braud_eacl.py b/evals/braud_eacl.py index 082efa5..e865a8e 100644 --- a/evals/braud_eacl.py +++ b/evals/braud_eacl.py @@ -122,10 +122,19 @@ def load_braud_eacl_ctrees(fpath, rel_conv, doc_names): return ctree_pred -def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain'): - """Do load dtrees""" +def load_braud_eacl_dtrees(fpath, rel_conv, doc_names, nary_enc='chain', + ctree_pred=None): + """Do load dtrees + + Parameters + ---------- + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. + """ dtree_pred = dict() - ctree_pred = load_braud_eacl_ctrees(fpath, rel_conv, doc_names) + if ctree_pred is None: + ctree_pred = load_braud_eacl_ctrees(fpath, rel_conv, doc_names) for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred) dtree_pred[doc_name] = dt_pred diff --git a/evals/codra.py b/evals/codra.py index a586389..11b5aea 100644 --- a/evals/codra.py +++ b/evals/codra.py @@ -105,7 +105,8 @@ def load_codra_ctrees(codra_out_dir, rel_conv): return ctree_pred -def load_codra_dtrees(codra_out_dir, rel_conv, nary_enc='chain'): +def load_codra_dtrees(codra_out_dir, rel_conv, nary_enc='chain', + ctree_pred=None): """Get the dtrees that correspond to the ctrees output by CODRA. Parameters @@ -114,21 +115,26 @@ def load_codra_dtrees(codra_out_dir, rel_conv, nary_enc='chain'): Path to the base directory containing the output files. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- dtree_pred: dict(str, RstDepTree) RST dtree for each document. """ - # load predicted trees - data_pred = load_codra_output_files(codra_out_dir) - # filenames = data_pred['filenames'] - doc_names_pred = data_pred['doc_names'] - rst_ctrees_pred = data_pred['rst_ctrees'] - + if ctree_pred is None: + # load predicted trees + data_pred = load_codra_output_files(codra_out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + ctree_pred = {doc_name: ct_pred for doc_name, ct_pred + in itertools.izip(doc_names_pred, rst_ctrees_pred)} # build a dict from doc_name to ordered dtree (RstDepTree) dtree_pred = dict() - for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + for doc_name, ct_pred in ctree_pred.items(): # constituency tree # replace fine-grained labels with coarse-grained labels ; # the files we have already contain the coarse labels, except their diff --git a/evals/gcrf_tree_format.py b/evals/gcrf_tree_format.py index 1b4fd0b..ba8fe27 100644 --- a/evals/gcrf_tree_format.py +++ b/evals/gcrf_tree_format.py @@ -195,7 +195,7 @@ def load_gcrf_ctrees(out_dir, rel_conv): return ctree_pred -def load_gcrf_dtrees(out_dir, rel_conv, nary_enc='chain'): +def load_gcrf_dtrees(out_dir, rel_conv, nary_enc='chain', ctree_pred=None): """Get the dtrees that correspond to the ctrees output by gCRF. 
Parameters @@ -204,13 +204,17 @@ def load_gcrf_dtrees(out_dir, rel_conv, nary_enc='chain'): Path to the base directory containing the output files. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- dtree_pred: dict(str, RstDepTree) RST dtree for each document. """ - ctree_pred = load_gcrf_ctrees(out_dir, rel_conv) + if ctree_pred is None: + ctree_pred = load_gcrf_ctrees(out_dir, rel_conv) dtree_pred = dict() for doc_name, ct_pred in ctree_pred.items(): dt_pred = RstDepTree.from_rst_tree(ct_pred, nary_enc=nary_enc) diff --git a/evals/hayashi_cons.py b/evals/hayashi_cons.py index 6f76512..7bdb9a7 100644 --- a/evals/hayashi_cons.py +++ b/evals/hayashi_cons.py @@ -127,7 +127,8 @@ def load_hayashi_hilda_ctrees(out_dir, rel_conv): return ctree_pred -def load_hayashi_hilda_dtrees(out_dir, rel_conv, nary_enc='chain'): +def load_hayashi_hilda_dtrees(out_dir, rel_conv, nary_enc='chain', + ctree_pred=None): """Load the dtrees for the ctrees output by Hayashi et al.'s HILDA. Parameters @@ -137,14 +138,18 @@ def load_hayashi_hilda_dtrees(out_dir, rel_conv, nary_enc='chain'): rel_conv: RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- dtree_pred: dict(str, RstDepTree) RST dtree for each document. """ - # load predicted ctrees - ctree_pred = load_hayashi_hilda_ctrees(out_dir, rel_conv) + if ctree_pred is None: + # load predicted ctrees + ctree_pred = load_hayashi_hilda_ctrees(out_dir, rel_conv) # convert to dtrees dtree_pred = dict() for doc_name, ct_pred in ctree_pred.items(): diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index cbde909..c5fd6b3 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -10,7 +10,7 @@ from educe.learning.edu_input_format import load_edu_input_file from educe.rst_dt.corpus import Reader -from educe.rst_dt.deptree import RstDepTree +from educe.rst_dt.deptree import RstDepTree, RstDtException from educe.rst_dt.dep2con import deptree_to_rst_tree @@ -91,17 +91,13 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, ---------- out_dir : str Path to the folder containing .dis files. - rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). - edus_file_pat : str Pattern for the .edu_input files. - nuc_clf : NuclearityClassifier Nuclearity classifier - rnk_clf : RankClassifier Rank classifier @@ -135,7 +131,7 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, - rnk_clf): + rnk_clf, dtree_pred=None): """Load the ctrees for the dtrees output by one of Hayashi et al.'s dep parsers. @@ -143,19 +139,18 @@ def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, ---------- out_dir : str Path to the folder containing .dis files. - rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). - edus_file_pat : str Pattern for the .edu_input files. - nuc_clf : NuclearityClassifier Nuclearity classifier - rnk_clf : RankClassifier Rank classifier + dtree_pred : dict(str, RstDepTree), optional + RST d-trees, indexed by doc_name. If d-trees are provided this + way, `out_dir` is ignored. 
Returns ------- @@ -163,9 +158,9 @@ def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, RST ctree for each document. """ ctree_pred = dict() - - dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, - nuc_clf, rnk_clf) + if dtree_pred is None: + dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, + nuc_clf, rnk_clf) for doc_name, dt_pred in dtree_pred.items(): try: ct_pred = deptree_to_rst_tree(dt_pred) diff --git a/evals/ji.py b/evals/ji.py index 08fbd8b..c6ab6e8 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -149,7 +149,7 @@ def load_ji_ctrees(ji_out_dir, rel_conv): return ctree_pred -def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain'): +def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain', ctree_pred=None): """Get the dtrees that correspond to the ctrees output by DPLP. Parameters @@ -160,6 +160,9 @@ def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain'): Relation converter, from fine- to coarse-grained labels. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- @@ -167,8 +170,8 @@ def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain'): RST dtree for each document. """ dtree_pred = dict() - - ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv) + if ctree_pred is None: + ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv) for doc_name, ct_pred in ctree_pred.items(): dtree_pred[doc_name] = RstDepTree.from_rst_tree( ct_pred, nary_enc=nary_enc) diff --git a/evals/li_qi.py b/evals/li_qi.py index abf1929..2df67d2 100644 --- a/evals/li_qi.py +++ b/evals/li_qi.py @@ -90,7 +90,7 @@ def load_li_qi_ctrees(out_dir, rel_conv): return ctree_pred -def load_li_qi_dtrees(out_dir, rel_conv, nary_enc='chain'): +def load_li_qi_dtrees(out_dir, rel_conv, nary_enc='chain', ctree_pred=None): """Get the dtrees that correspond to the ctrees output by Li Qi's parser. Parameters @@ -99,21 +99,26 @@ def load_li_qi_dtrees(out_dir, rel_conv, nary_enc='chain'): Path to the base directory containing the output files. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- dtree_pred: dict(str, RstDepTree) RST dtree for each document. 
""" - # load predicted trees - data_pred = load_li_qi_output_files(out_dir) - # filenames = data_pred['filenames'] - doc_names_pred = data_pred['doc_names'] - rst_ctrees_pred = data_pred['rst_ctrees'] - + if ctree_pred is None: + # load predicted trees + data_pred = load_li_qi_output_files(out_dir) + # filenames = data_pred['filenames'] + doc_names_pred = data_pred['doc_names'] + rst_ctrees_pred = data_pred['rst_ctrees'] + ctree_pred = {doc_name: ct_pred for doc_name, ct_pred + in itertools.izip(doc_names_pred, rst_ctrees_pred)} # build a dict from doc_name to ordered dtree (RstDepTree) dtree_pred = dict() - for doc_name, ct_pred in itertools.izip(doc_names_pred, rst_ctrees_pred): + for doc_name, ct_pred in ctree_pred.items(): # constituency tree # replace fine-grained labels with coarse-grained labels ; # the files we have already contain the coarse labels, except their diff --git a/evals/ours.py b/evals/ours.py index 938a53c..2e50c2f 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -117,7 +117,7 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): else: dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) dt_pred.origin = mk_key(doc_name) - # add nuclearity: heuristic baseline + # add nuclearity: heuristic baseline WIP or true classifier dt_pred.nucs = nuc_clf.predict([dt_pred])[0] # add rank: heuristic baseline, needs edu2sent edu2sent = doc_name2edu2sent[doc_name] @@ -129,7 +129,8 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): return dtree_pred -def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): +def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf, + dtree_pred=None): """Load RST ctrees from attelo output files. Parameters @@ -142,13 +143,18 @@ def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf): Classifier to predict nuclearity rnk_clf: RankClassifier Classifier to predict attachment ranking + dtree_pred : dict(str, RstDepTree), optional + RST d-trees, indexed by doc_name. If d-trees are provided this + way, `out_dir` is ignored. 
Returns ------- TODO """ - # load RST dtrees, with heuristics for nuc and rank - dtree_pred = load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf) + if dtree_pred is None: + # load RST dtrees, with heuristics for nuc and rank + dtree_pred = load_attelo_dtrees(output_file, edus_file, nuc_clf, + rnk_clf) # convert to RST ctrees ctree_pred = dict() for doc_name, dt_pred in dtree_pred.items(): diff --git a/evals/showdown.py b/evals/showdown.py index 9193612..4c311e6 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -11,6 +11,7 @@ import os from sklearn.datasets import load_svmlight_files +from sklearn.linear_model.logistic import LogisticRegressionCV from educe.rst_dt.annotation import _binarize, SimpleRSTTree from educe.rst_dt.corpus import (RstRelationConverter, @@ -215,7 +216,7 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', y_nuc_train.append(dt.nucs) y_rnk_train.append(dt.ranks) # nuclearity clf - if False: + if True: nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, constant=nuc_constant) nuc_clf.fit(X_train, y_nuc_train) @@ -236,7 +237,10 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', n_features=46731, zero_based=False ) - nuc_clf = RightBinaryNuclearityClassifier() + bin_clf = LogisticRegressionCV(Cs=10, # defaults to 10 + penalty='l1', solver='liblinear', + n_jobs=3) + nuc_clf = RightBinaryNuclearityClassifier(bin_clf=bin_clf) nuc_clf = nuc_clf.fit(X_nuc_train, y_nuc_train) # end WIP nuc_clf # rank clf @@ -362,69 +366,79 @@ def main(): for author_pred in authors_pred: # braud coling 2016 if author_pred == 'BPS16': + ctree_pred = load_braud_coling_ctrees(BRAUD_COLING_OUT_DIR, + REL_CONV) c_preds.append( - ('BPS16', load_braud_coling_ctrees( - BRAUD_COLING_OUT_DIR, REL_CONV)) + ('BPS16', ctree_pred) ) d_preds.append( ('BPS16', load_braud_coling_dtrees( - BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain')) + BRAUD_COLING_OUT_DIR, REL_CONV, nary_enc='chain', + ctree_pred=ctree_pred)) ) # braud eacl 2017 - mono if author_pred == 'BCS17_mono': + ctree_pred = load_braud_eacl_ctrees(BRAUD_EACL_MONO, REL_CONV, + sorted_doc_names) c_preds.append( - ('BCS17_mono', load_braud_eacl_ctrees( - BRAUD_EACL_MONO, REL_CONV, sorted_doc_names)) + ('BCS17_mono', ctree_pred) ) d_preds.append( ('BCS17_mono', load_braud_eacl_dtrees( BRAUD_EACL_MONO, REL_CONV, sorted_doc_names, - nary_enc='chain')) + nary_enc='chain', ctree_pred=ctree_pred)) ) # braud eacl 2017 - cross+dev if author_pred == 'BCS17_cross': + ctree_pred = load_braud_eacl_ctrees(BRAUD_EACL_CROSS_DEV, + REL_CONV, sorted_doc_names) c_preds.append( - ('BCS17_cross', load_braud_eacl_ctrees( - BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names)) + ('BCS17_cross', ctree_pred) ) d_preds.append( ('BCS17_cross', load_braud_eacl_dtrees( BRAUD_EACL_CROSS_DEV, REL_CONV, sorted_doc_names, - nary_enc='chain')) + nary_enc='chain', ctree_pred=ctree_pred)) ) if author_pred == 'HHN16_HILDA': + ctree_pred = load_hayashi_hilda_ctrees(HAYASHI_HILDA_OUT_DIR, + REL_CONV) c_preds.append( - ('HHN16_HILDA', load_hayashi_hilda_ctrees( - HAYASHI_HILDA_OUT_DIR, REL_CONV)) + ('HHN16_HILDA', ctree_pred) ) d_preds.append( ('HHN16_HILDA', load_hayashi_hilda_dtrees( - HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain')) + HAYASHI_HILDA_OUT_DIR, REL_CONV, nary_enc='chain', + ctree_pred=ctree_pred)) ) if author_pred == 'HHN16_MST': + dtree_pred = load_hayashi_dep_dtrees( + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, + nuc_clf, rnk_clf) c_preds.append( ('HHN16_MST', load_hayashi_dep_ctrees( HAYASHI_MST_OUT_DIR, 
REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf)) + nuc_clf, rnk_clf, dtree_pred=dtree_pred)) ) d_preds.append( - ('HHN16_MST', load_hayashi_dep_dtrees( - HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf)) + ('HHN16_MST', dtree_pred) ) if author_pred == 'LLC16': + ctree_pred = load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV) c_preds.append( - ('LLC16', load_li_qi_ctrees(LI_QI_OUT_DIR, REL_CONV)) + ('LLC16', ctree_pred) ) d_preds.append( ('LLC16', load_li_qi_dtrees(LI_QI_OUT_DIR, REL_CONV, - nary_enc='chain')) + nary_enc='chain', + ctree_pred=ctree_pred)) ) if author_pred == 'li_sujian': + # FIXME load d-trees once, pass dtree_pred to the c-loader c_preds.append( ('li_sujian', load_li_sujian_dep_ctrees( LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, @@ -437,6 +451,7 @@ def main(): ) if author_pred == 'FH14_gSVM': + # FIXME load c-trees once, pass ctree_pred to the d-loader c_preds.append( ('FH14_gSVM', load_feng_ctrees(FENG1_OUT_DIR, REL_CONV)) ) @@ -446,22 +461,26 @@ def main(): ) if author_pred == 'FH14_gCRF': + ctree_pred = load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV) c_preds.append( - ('FH14_gCRF', load_gcrf_ctrees(FENG2_OUT_DIR, REL_CONV)) + ('FH14_gCRF', ctree_pred) ) d_preds.append( ('FH14_gCRF', load_gcrf_dtrees(FENG2_OUT_DIR, REL_CONV, - nary_enc='chain')) + nary_enc='chain', + ctree_pred=ctree_pred)) ) if author_pred == 'JCN15_1S1S': # CODRA outputs RST ctrees ; eval_codra_output maps them to RST dtrees + ctree_pred = load_codra_ctrees(CODRA_OUT_DIR, REL_CONV) c_preds.append( - ('JCN15_1S1S', load_codra_ctrees(CODRA_OUT_DIR, REL_CONV)) + ('JCN15_1S1S', ctree_pred) ) d_preds.append( ('JCN15_1S1S', load_codra_dtrees(CODRA_OUT_DIR, REL_CONV, - nary_enc='chain')) + nary_enc='chain', + ctree_pred=ctree_pred)) ) # joty-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because codra outputs binary ctrees, @@ -470,13 +489,14 @@ def main(): if author_pred == 'JE14': # DPLP outputs RST ctrees in the form of lists of spans; # load_ji_dtrees maps them to RST dtrees + ctree_pred = load_ji_ctrees(JI_OUT_DIR, REL_CONV) c_preds.append( - ('JE14', load_ji_ctrees( - JI_OUT_DIR, REL_CONV)) + ('JE14', ctree_pred) ) d_preds.append( - ('JE14', load_ji_dtrees( - JI_OUT_DIR, REL_CONV, nary_enc='chain')) + ('JE14', load_ji_dtrees(JI_OUT_DIR, REL_CONV, + nary_enc='chain', + ctree_pred=ctree_pred)) ) # ji-{chain,tree} would be the same except nary_enc='tree' ; # the nary_enc does not matter because DPLP outputs binary ctrees, @@ -498,47 +518,52 @@ def main(): # hence both encodings result in (the same) strictly ordered dtrees if author_pred == 'SHV15_D': + ctree_pred = load_surdeanu_ctrees(SURDEANU_LOG_FILE, REL_CONV) c_preds.append( - ('SHV15_D', load_surdeanu_ctrees( - SURDEANU_LOG_FILE, REL_CONV)) + ('SHV15_D', ctree_pred) ) d_preds.append( ('SHV15_D', load_surdeanu_dtrees( - SURDEANU_LOG_FILE, REL_CONV, nary_enc='chain')) + SURDEANU_LOG_FILE, REL_CONV, nary_enc='chain', + ctree_pred=ctree_pred)) ) if author_pred == 'ours-chain': # Eisner, predicted syntax, chain + dtree_pred = load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + nuc_clf, rnk_clf) c_preds.append( ('ours-chain', load_attelo_ctrees( - EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) + EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( - ('ours-chain', load_attelo_dtrees( - EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) + ('ours-chain', dtree_pred) ) if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit + dtree_pred 
= load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, + EDUS_FILE, nuc_clf, rnk_clf) c_preds.append( ('ours-tree', load_attelo_ctrees( - EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( - ('ours-tree', load_attelo_dtrees( - EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf)) + ('ours-tree', dtree_pred) ) if author_pred == 'ours-tree-su': # Eisner, predicted syntax, tree + same-unit + dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, + EDUS_FILE, nuc_clf, rnk_clf) c_preds.append( - ('ours-tree-su', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-tree-su', load_attelo_ctrees( + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( - ('ours-tree-su', load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, - nuc_clf, rnk_clf)) + ('ours-tree-su', dtree_pred) ) # 2017-05-17 enable "gold" as parser, should give perfect scores if author_pred == 'gold': diff --git a/evals/surdeanu.py b/evals/surdeanu.py index 31111e8..7884e34 100644 --- a/evals/surdeanu.py +++ b/evals/surdeanu.py @@ -180,7 +180,8 @@ def load_surdeanu_ctrees(log_file, rel_conv): return _load_surdeanu_ctrees(f, rel_conv) -def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain'): +def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain', + ctree_pred=None): """Get the dtrees for the ctrees output by Surdeanu's parser. Parameters @@ -191,6 +192,9 @@ def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain'): Relation converter, from fine- to coarse-grained labels. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + ctree_pred : dict(str, RSTTree), optional + RST c-trees, indexed by doc_name. If c-trees are provided this + way, `out_dir` is ignored. Returns ------- @@ -198,8 +202,8 @@ def load_surdeanu_dtrees(log_file, rel_conv, nary_enc='chain'): RST dtree for each document. """ dtree_pred = dict() - - ctree_pred = load_surdeanu_ctrees(log_file, rel_conv) + if ctree_pred is None: + ctree_pred = load_surdeanu_ctrees(log_file, rel_conv) for doc_name, ct_pred in ctree_pred.items(): dtree_pred[doc_name] = RstDepTree.from_rst_tree( ct_pred, nary_enc=nary_enc) diff --git a/evals/train_nuc_classifier.py b/evals/train_nuc_classifier.py index bb25adf..784e944 100644 --- a/evals/train_nuc_classifier.py +++ b/evals/train_nuc_classifier.py @@ -15,8 +15,7 @@ from sklearn.datasets import load_svmlight_file, load_svmlight_files from sklearn.model_selection import cross_val_score -from sklearn.linear_model.logistic import LogisticRegression -# from sklearn.model_selection import GridSearchCV +from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV from sklearn.preprocessing import LabelEncoder import matplotlib.pyplot as plt @@ -36,7 +35,8 @@ (dset_train, dset_test), zero_based=False ) - nuc_clf = LogisticRegression(penalty='l1', n_jobs=2) + nuc_clf = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=2) # train nuclearity classifier, cross-validate performance on train scores = cross_val_score(nuc_clf, X_train, y_train, cv=10) print(scores) @@ -92,7 +92,7 @@ class RightBinaryNuclearityClassifier(object): Binary classifier for right dependencies: NN vs NS. 
""" - def __init__(self, bin_clf=LogisticRegression(penalty='l1', n_jobs=2)): + def __init__(self, bin_clf=LogisticRegression(penalty='l1', solver='liblinear', n_jobs=2)): """Init""" self.bin_clf = bin_clf @@ -143,5 +143,6 @@ def predict(self, X): else: raise ValueError("Weird prediction: {}".format( y_pred)) + y.append(yi) return y From 96937967851d5d5711e058064fb934f4eff75b6a Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 19 Dec 2017 11:18:33 +0100 Subject: [PATCH 72/74] ENH rel_clf, nuc_clf, model_split='sent' --- evals/ours.py | 12 +- evals/prepare_nuc_dataset.py | 262 ++++++++++++++++++++++++---------- evals/prepare_rel_dataset.py | 256 +++++++++++++++++++++++++++++++++ evals/showdown.py | 170 ++++++++++++++++++---- evals/train_nuc_classifier.py | 196 +++++++++++++++++++------ evals/train_rel_relabeller.py | 201 ++++++++++++++++++++++++++ 6 files changed, 951 insertions(+), 146 deletions(-) create mode 100644 evals/prepare_rel_dataset.py create mode 100644 evals/train_rel_relabeller.py diff --git a/evals/ours.py b/evals/ours.py index 2e50c2f..6d651b4 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -57,7 +57,7 @@ def load_attelo_output_file(output_file): return edges_pred -def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): +def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): """Load RST dtrees from attelo output files. Parameters @@ -117,6 +117,10 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): else: dt_pred.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl) dt_pred.origin = mk_key(doc_name) + # 2017-12-14 relabel relations + if rel_clf is not None: + dt_pred.labels = rel_clf.predict([dt_pred])[0] + # end relabel relations # add nuclearity: heuristic baseline WIP or true classifier dt_pred.nucs = nuc_clf.predict([dt_pred])[0] # add rank: heuristic baseline, needs edu2sent @@ -129,7 +133,7 @@ def load_attelo_dtrees(output_file, edus_file, nuc_clf, rnk_clf): return dtree_pred -def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf, +def load_attelo_ctrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, dtree_pred=None): """Load RST ctrees from attelo output files. @@ -153,8 +157,8 @@ def load_attelo_ctrees(output_file, edus_file, nuc_clf, rnk_clf, """ if dtree_pred is None: # load RST dtrees, with heuristics for nuc and rank - dtree_pred = load_attelo_dtrees(output_file, edus_file, nuc_clf, - rnk_clf) + dtree_pred = load_attelo_dtrees(output_file, edus_file, + rel_clf, nuc_clf, rnk_clf) # convert to RST ctrees ctree_pred = dict() for doc_name, dt_pred in dtree_pred.items(): diff --git a/evals/prepare_nuc_dataset.py b/evals/prepare_nuc_dataset.py index 97e1c6e..e9d534f 100644 --- a/evals/prepare_nuc_dataset.py +++ b/evals/prepare_nuc_dataset.py @@ -24,7 +24,7 @@ from educe.rst_dt.deptree import RstDepTree -def main(corpus, dataset, out_dir, nary_enc): +def main(corpus, dataset, out_dir, nary_enc, model_split): """Do prepare the nuclearity dataset. Parameters @@ -35,6 +35,12 @@ def main(corpus, dataset, out_dir, nary_enc): Path to the existing dataset labelled with coarse relations. out_dir : str Path to the output folder. + nary_enc : str, one of {'chain', 'tree'} + Encoding for n-ary nodes. + model_split : str, one of {'none', 'sent', 'sent-para'} + If not 'none', use distinct models for subsets of instances: + * 'sent': intra- vs inter-sentential, + * 'sent-para': intra-sentential, intra-paragraph, rest (doc-level). 
""" # (re-)create a d-corpus from the RST-DT c-corpus corpus_subset = os.path.basename(dataset).split('.')[0] @@ -59,80 +65,181 @@ def main(corpus, dataset, out_dir, nary_enc): # description of the EDUs (.edu_input) pairings = dataset + '.pairings' # edu_desc = dataset + '.edu_input' - new_dataset = os.path.join(out_dir, os.path.basename(dataset)) - new_pairs = os.path.join(out_dir, os.path.basename(pairings)) - if ((os.path.abspath(new_dataset) == os.path.abspath(dataset) or - os.path.abspath(new_pairs) == os.path.abspath(pairings))): - raise ValueError("I won't let you erase your base dataset") - with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: - with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: - with codecs.open(new_dataset, mode='wb', encoding='utf-8') as data_out: - with codecs.open(new_pairs, mode='wb', encoding='utf-8') as pairs_out: - # read header line in svmlight file - header = f_data.readline() - header_prefix = '# labels: ' - assert header.startswith(header_prefix) - labels = header[len(header_prefix):].split() - int2lbl = dict(enumerate(labels, start=1)) - lbl2int = {lbl: i for i, lbl in int2lbl.items()} - unrelated = lbl2int["UNRELATED"] - root = lbl2int["ROOT"] - # write labels in header of new svmlight file, as an - # ordered list mapped to {1, 2} - print(header_prefix + ' '.join((NUC_N, NUC_S)), - file=data_out) - # stream through lines - for pair, line in itertools.izip(f_pairs, f_data): - # read candidate pair of EDUs - src_id, tgt_id = pair.strip().split('\t') - if src_id == 'ROOT': - continue - # now both src_id and tgt_id are of form "docname_int" - # ex: "wsj_0600.out_1" - src_idx = int(src_id.rsplit('_', 1)[1]) - doc_name, tgt_idx = tgt_id.rsplit('_', 1) - tgt_idx = int(tgt_idx) - if tgt_idx < src_idx: - # skip left dependencies: by construction, - # their nuclearity can only be Satellite - # (SN edges) - continue - # print(doc_name, src_id, tgt_id, src_idx, tgt_idx) - # read corresponding ref class (label), feature vector - lbl_idx, feat_vector = line.strip().split(' ', 1) - lbl_idx = int(lbl_idx) # lbl currently encoded as int - if lbl_idx in (unrelated, root): - continue - try: - lbl = int2lbl[lbl_idx] - except KeyError: - # the test set in RST-DT 1.0 has an error: - # wsj_1189.out [8-9] is labelled "span" instead of - # "Consequence" ; some runs used this erroneous - # version, hence had a class "0" (unknown) for - # this line in the dataset - if ((doc_name == 'wsj_1189.out' and - src_idx == 7 and - tgt_idx == 9)): - lbl = 'cause' - lbl_idx = lbl2int[lbl] - else: - print(doc_name, src_idx, tgt_idx) - raise - # print(src_id, tgt_id, lbl) - dtree = rst_dcorpus[doc_name] - assert dtree.heads[tgt_idx] == src_idx - assert dtree.labels[tgt_idx] == lbl - if dtree.nucs[tgt_idx] == NUC_N: - nuc_idx = 1 - elif dtree.nucs[tgt_idx] == NUC_S: - nuc_idx = 2 - else: - raise ValueError("weird nuclearity {}".format( - dtree.nucs[tgt_idx])) - print(str(nuc_idx) + ' ' + feat_vector, + if model_split == 'none': + new_dataset = os.path.join(out_dir, os.path.basename(dataset)) + new_pairs = os.path.join(out_dir, os.path.basename(pairings)) + if ((os.path.abspath(new_dataset) == os.path.abspath(dataset) or + os.path.abspath(new_pairs) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset, mode='wb', encoding='utf-8') as data_out: + with 
codecs.open(new_pairs, mode='wb', encoding='utf-8') as pairs_out: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file, as an + # ordered list mapped to {1, 2} + print(header_prefix + ' '.join((NUC_N, NUC_S)), file=data_out) - print(pair.strip(), file=pairs_out) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form "docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + if tgt_idx < src_idx: + # skip left dependencies: by construction, + # their nuclearity can only be Satellite + # (SN edges) + continue + # print(doc_name, src_id, tgt_id, src_idx, tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + if dtree.nucs[tgt_idx] == NUC_N: + nuc_idx = 1 + elif dtree.nucs[tgt_idx] == NUC_S: + nuc_idx = 2 + else: + raise ValueError("weird nuclearity {}".format( + dtree.nucs[tgt_idx])) + print(str(nuc_idx) + ' ' + feat_vector, + file=data_out) + print(pair.strip(), file=pairs_out) + elif model_split == 'sent': + # 2 datasets: intra- and inter-sentential + new_dataset = ( + os.path.join(out_dir + '_intrasent', os.path.basename(dataset)), + os.path.join(out_dir + '_intersent', os.path.basename(dataset)) + ) + new_pairs = ( + os.path.join(out_dir + '_intrasent', os.path.basename(pairings)), + os.path.join(out_dir + '_intersent', os.path.basename(pairings)) + ) + if ((os.path.abspath(new_dataset[0]) == os.path.abspath(dataset) or + os.path.abspath(new_pairs[0]) == os.path.abspath(pairings) or + os.path.abspath(new_dataset[1]) == os.path.abspath(dataset) or + os.path.abspath(new_pairs[1]) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset[0], mode='wb', encoding='utf-8') as data_out_intra: + with codecs.open(new_pairs[0], mode='wb', encoding='utf-8') as pairs_out_intra: + with codecs.open(new_dataset[1], mode='wb', encoding='utf-8') as data_out_inter: + with codecs.open(new_pairs[1], mode='wb', encoding='utf-8') as pairs_out_inter: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert 
header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file, as an + # ordered list mapped to {1, 2} + print(header_prefix + ' '.join((NUC_N, NUC_S)), + file=data_out_intra) + print(header_prefix + ' '.join((NUC_N, NUC_S)), + file=data_out_inter) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form "docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + if tgt_idx < src_idx: + # skip left dependencies: by construction, + # their nuclearity can only be Satellite + # (SN edges) + continue + # print(doc_name, src_id, tgt_id, src_idx, tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + if dtree.nucs[tgt_idx] == NUC_N: + nuc_idx = 1 + elif dtree.nucs[tgt_idx] == NUC_S: + nuc_idx = 2 + else: + raise ValueError("weird nuclearity {}".format( + dtree.nucs[tgt_idx])) + if ((' 269:' in feat_vector or + ' 303:' in feat_vector)): + # 269 is same_sentence_intra_right + # 303 is same_sentence_intra_left + # FIXME find a cleaner way + print(str(nuc_idx) + ' ' + feat_vector, + file=data_out_intra) + print(pair.strip(), + file=pairs_out_intra) + else: + # inter-sentential + print(str(nuc_idx) + ' ' + feat_vector, + file=data_out_inter) + print(pair.strip(), + file=pairs_out_inter) if __name__ == "__main__": @@ -163,5 +270,10 @@ def main(corpus, dataset, out_dir, nary_enc): help='Encoding for n-ary nodes', choices=['chain', 'tree'], default='chain') + parser.add_argument('--model_split', + help='Separate models for subsets of instances', + choices=['none', 'sent', 'sent-para'], + default='none') args = parser.parse_args() - main(args.corpus, args.dataset, args.out_dir, args.nary_enc) + main(args.corpus, args.dataset, args.out_dir, args.nary_enc, + args.model_split) diff --git a/evals/prepare_rel_dataset.py b/evals/prepare_rel_dataset.py new file mode 100644 index 0000000..0bd6a5b --- /dev/null +++ b/evals/prepare_rel_dataset.py @@ -0,0 +1,256 @@ +"""This utility script outputs a dataset of the relation of RST edges. + +Given the path to the RST-DT corpus and a dataset of candidate RST +dependencies labelled with their gold coarse (class) RST relation (or +none if they are unrelated), produce a filtered version of the dataset +for the task of relation labelling. 
+ +As of 2017-12-14, we filter out the instances for unrelated pairs of EDUs +and dependencies headed by the fake root. +The resulting dataset describes a n-ary classification problem whose +labelset is the set of (coarse-grained) classes of RST relations. +""" + +from __future__ import absolute_import, print_function + +import argparse +import codecs +import itertools +import os + +from educe.rst_dt.annotation import NUC_N, NUC_S +from educe.rst_dt.corpus import RstRelationConverter, RELMAP_112_18_FILE +from educe.rst_dt.dep_corpus import read_corpus +from educe.rst_dt.deptree import RstDepTree + + +def main(corpus, dataset, out_dir, nary_enc, model_split): + """Do prepare the RST relation dataset. + + Parameters + ---------- + corpus : str + Path to the RST-DT "main" corpus. + dataset : str + Path to the existing dataset labelled with coarse relations. + out_dir : str + Path to the output folder. + model_split : str, one of {'none', 'sent', 'sent-para'} + If not 'none', use distinct models for subsets of instances: + * 'sent': intra- vs inter-sentential, + * 'sent-para': intra-sentential, intra-paragraph, rest (doc-level). + """ + # (re-)create a d-corpus from the RST-DT c-corpus + corpus_subset = os.path.basename(dataset).split('.')[0] + if corpus_subset not in ('TRAINING', 'TEST'): + raise ValueError("dataset must be a filepath that starts with" + "one of {'TRAINING', 'TEST'}") + if corpus_subset == 'TRAINING': + section = 'train' + else: # 'TEST' + section = 'test' + rst_ccorpus = read_corpus(corpus, section=section) + rel_conv = RstRelationConverter(RELMAP_112_18_FILE).convert_dtree + rst_dcorpus = dict() # FileId.doc -> RstDepTree + for doc_key, rst_ctree in rst_ccorpus[section].items(): + rst_dtree = RstDepTree.from_rst_tree(rst_ctree, nary_enc=nary_enc) + rst_dtree_coarse = rel_conv(rst_dtree) + rst_dcorpus[doc_key.doc] = rst_dtree_coarse + # for each candidate dependency in the dataset, read the nuclearity + # from the RST d-corpus + # Nota: we stream through the dataset to avoid loading it entirely in + # memory ; we don't need to open the vocabulary file (.vocab), nor the + # description of the EDUs (.edu_input) + pairings = dataset + '.pairings' + # edu_desc = dataset + '.edu_input' + if model_split == 'none': + new_dataset = os.path.join(out_dir, os.path.basename(dataset)) + new_pairs = os.path.join(out_dir, os.path.basename(pairings)) + if ((os.path.abspath(new_dataset) == os.path.abspath(dataset) or + os.path.abspath(new_pairs) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset, mode='wb', encoding='utf-8') as data_out: + with codecs.open(new_pairs, mode='wb', encoding='utf-8') as pairs_out: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file, here + # we just copy the existing header (even if it has + # ROOT and UNRELATED that should never appear here) + print(header, file=data_out) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = 
pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form "docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + print(str(lbl_idx) + ' ' + feat_vector, + file=data_out) + print(pair.strip(), file=pairs_out) + elif model_split == 'sent': + # 2 datasets: intra- and inter-sentential + new_dataset = ( + os.path.join(out_dir + '_intrasent', os.path.basename(dataset)), + os.path.join(out_dir + '_intersent', os.path.basename(dataset)) + ) + new_pairs = ( + os.path.join(out_dir + '_intrasent', os.path.basename(pairings)), + os.path.join(out_dir + '_intersent', os.path.basename(pairings)) + ) + if ((os.path.abspath(new_dataset[0]) == os.path.abspath(dataset) or + os.path.abspath(new_pairs[0]) == os.path.abspath(pairings) or + os.path.abspath(new_dataset[1]) == os.path.abspath(dataset) or + os.path.abspath(new_pairs[1]) == os.path.abspath(pairings))): + raise ValueError("I won't let you erase your base dataset") + with codecs.open(dataset, mode='rb', encoding='utf-8') as f_data: + with codecs.open(pairings, mode='rb', encoding='utf-8') as f_pairs: + with codecs.open(new_dataset[0], mode='wb', encoding='utf-8') as data_out_intra: + with codecs.open(new_pairs[0], mode='wb', encoding='utf-8') as pairs_out_intra: + with codecs.open(new_dataset[1], mode='wb', encoding='utf-8') as data_out_inter: + with codecs.open(new_pairs[1], mode='wb', encoding='utf-8') as pairs_out_inter: + # read header line in svmlight file + header = f_data.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + labels = header[len(header_prefix):].split() + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + unrelated = lbl2int["UNRELATED"] + root = lbl2int["ROOT"] + # write labels in header of new svmlight file + print(header, file=data_out_intra) + print(header, file=data_out_inter) + # stream through lines + for pair, line in itertools.izip(f_pairs, f_data): + # read candidate pair of EDUs + src_id, tgt_id = pair.strip().split('\t') + if src_id == 'ROOT': + continue + # now both src_id and tgt_id are of form "docname_int" + # ex: "wsj_0600.out_1" + src_idx = int(src_id.rsplit('_', 1)[1]) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + # read corresponding ref class (label), feature vector + lbl_idx, feat_vector = line.strip().split(' ', 1) + lbl_idx = int(lbl_idx) # lbl currently encoded as int + if lbl_idx in (unrelated, root): + continue + try: + lbl = int2lbl[lbl_idx] + except KeyError: + # the test set in RST-DT 1.0 has an error: + # wsj_1189.out [8-9] is labelled "span" instead of + # "Consequence" ; some 
runs used this erroneous + # version, hence had a class "0" (unknown) for + # this line in the dataset + if ((doc_name == 'wsj_1189.out' and + src_idx == 7 and + tgt_idx == 9)): + lbl = 'cause' + lbl_idx = lbl2int[lbl] + else: + print(doc_name, src_idx, tgt_idx) + raise + # print(src_id, tgt_id, lbl) + dtree = rst_dcorpus[doc_name] + assert dtree.heads[tgt_idx] == src_idx + assert dtree.labels[tgt_idx] == lbl + if ((' 269:' in feat_vector or + ' 303:' in feat_vector) and + (' 103:' in feat_vector or + ' 158:' in feat_vector or + ' 234:' in feat_vector or + ' 314:' in feat_vector)): + # 269 is same_sentence_intra_right + # 303 is same_sentence_intra_left ; + # 103 is same_para_inter_right + # 158 is same_para_inter_left + # 234 is same_para_intra_right + # 314 is same_para_intra_left + # FIXME find a cleaner way + print(str(lbl_idx) + ' ' + feat_vector, + file=data_out_intra) + print(pair.strip(), + file=pairs_out_intra) + else: + # inter-sentential + print(str(lbl_idx) + ' ' + feat_vector, + file=data_out_inter) + print(pair.strip(), + file=pairs_out_inter) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Prepare a relation dataset.' + ) + parser.add_argument('--corpus', + help='Path to the RST-DT "main" corpus', + default=os.path.join( + os.path.expanduser('~'), + 'corpora/rst-dt/rst_discourse_treebank/data', + 'RSTtrees-WSJ-main-1.01' + )) + parser.add_argument('--dataset', + help='Base file of the dataset', + default=os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse', + 'TRAINING.relations.sparse' + )) + parser.add_argument('--out_dir', + help='Output folder', + default=os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL' + )) + parser.add_argument('--nary_enc', + help='Encoding for n-ary nodes', + choices=['chain', 'tree'], + default='chain') + parser.add_argument('--model_split', + help='Separate models for subsets of instances', + choices=['none', 'sent', 'sent-para'], + default='none') + args = parser.parse_args() + main(args.corpus, args.dataset, args.out_dir, args.nary_enc, + args.model_split) diff --git a/evals/showdown.py b/evals/showdown.py index 4c311e6..af6f117 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -7,11 +7,12 @@ import argparse import codecs +from collections import defaultdict import itertools import os from sklearn.datasets import load_svmlight_files -from sklearn.linear_model.logistic import LogisticRegressionCV +from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV from educe.rst_dt.annotation import _binarize, SimpleRSTTree from educe.rst_dt.corpus import (RstRelationConverter, @@ -51,6 +52,7 @@ from evals.surdeanu import load_surdeanu_ctrees, load_surdeanu_dtrees # 2017-12-12 nuc_clf WIP from evals.train_nuc_classifier import RightBinaryNuclearityClassifier +from evals.train_rel_relabeller import RelationRelabeller # end WIP nuc_clf # RST corpus @@ -204,6 +206,25 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', # flavours of dtree dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) dtree_true[doc_name] = dt_true + # 2017-12-18 WIP print spiders in d-trees, see if some could be + # solved with para_idx + rnk_deps = defaultdict(list) # gov -> list of (rnk, dep) + for i, (gov, rnk, nuc, lbl) in enumerate( + zip(dt_true.heads[1:], dt_true.ranks[1:], dt_true.nucs[1:], + dt_true.labels[1:]), + start=1): + rnk_deps[gov].append((rnk, i)) + ordered_deps = {k: sorted(v) for k, v in 
rnk_deps.items()} + for gov, ord_deps in sorted(ordered_deps.items()): + if ((any(x[1] < gov for x in ord_deps) and + any(x[1] > gov for x in ord_deps))): + if doc_name.startswith('wsj_06'): + print(doc_name, gov, ord_deps) + elif doc_name.startswith('file'): + pass + else: + raise ValueError("spider!") + # end 2017-12-18 WIP spiders # fit classifiers for nuclearity and rank (DIRTY) # NB: both are (dummily) fit on weakly ordered dtrees X_train = [] @@ -215,8 +236,68 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', X_train.append(dt) y_nuc_train.append(dt.nucs) y_rnk_train.append(dt.ranks) + # 2017-12-14 WIP relation relabeller + if False: + model_split = 'sent' # {'none', 'sent'} + if model_split == 'none': + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL' + ) + dset_rel_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_rel_test = os.path.join(dset_folder, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_rel_train, y_rel_train, X_rel_test, y_rel_test = load_svmlight_files( + (dset_rel_train, dset_rel_test), + n_features=46731, + zero_based=False + ) + elif model_split == 'sent': + # * intra + dset_folder_intra = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL_intrasent' + ) + dset_train_intra = os.path.join(dset_folder_intra, 'TRAINING.relations.sparse') + dset_test_intra = os.path.join(dset_folder_intra, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_rel_train_intra, y_rel_train_intra, X_rel_test_intra, y_rel_test_intra = load_svmlight_files( + (dset_train_intra, dset_test_intra), + n_features=46731, + zero_based=False + ) + # * inter + dset_folder_inter = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL_intersent' + ) + dset_train_inter = os.path.join(dset_folder_inter, 'TRAINING.relations.sparse') + dset_test_inter = os.path.join(dset_folder_inter, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_rel_train_inter, y_rel_train_inter, X_rel_test_inter, y_rel_test_inter = load_svmlight_files( + (dset_train_inter, dset_test_inter), + n_features=46731, + zero_based=False + ) + # put together intra and inter + X_rel_train = (X_rel_train_intra, X_rel_train_inter) + y_rel_train = (y_rel_train_intra, y_rel_train_inter) + # TODO the same for {X,y}_rel_test ? 
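+            # NOTE: a split TEST set appears unnecessary for prediction:
+            # RelationRelabeller.predict() looks up each edge's feature
+            # vector in the unsplit TEST set (via pair_map) and routes it
+            # to the intra or inter model from its feature values, so the
+            # intra/inter TEST sets would only serve to score the two
+            # models separately.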
+ else: + raise ValueError("what model_split?") + # common call + mul_clf = LogisticRegressionCV(Cs=10, # defaults to 10, + penalty='l1', solver='liblinear', + n_jobs=3) + rel_clf = RelationRelabeller(mul_clf=mul_clf, model_split=model_split) + rel_clf = rel_clf.fit(X_rel_train, y_rel_train) + else: + rel_clf = None + # end 2017-12-14 relations relabeller # nuclearity clf if True: + # TODO see whether intra/inter-sentential would be good + # for the dummy nuc clf nuc_clf = DummyNuclearityClassifier(strategy=nuc_strategy, constant=nuc_constant) nuc_clf.fit(X_train, y_nuc_train) @@ -225,22 +306,59 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', # shiny new nuc_clf ; still very hacky # import the nuclearity TRAIN and TEST sets generated from # the svmlight feature vectors (ahem) - dset_folder = os.path.join( - os.path.expanduser('~'), - 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' - ) - dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') - dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') - # FIXME read n_features from .vocab - X_nuc_train, y_nuc_train, X_nuc_test, y_nuc_test = load_svmlight_files( - (dset_train, dset_test), - n_features=46731, - zero_based=False - ) + model_split = 'sent' + # + if model_split == 'none': + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + ) + dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_nuc_train, y_nuc_train, X_nuc_test, y_nuc_test = load_svmlight_files( + (dset_train, dset_test), + n_features=46731, + zero_based=False + ) + elif model_split == 'sent': + # * intra + dset_folder_intra = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC_intrasent' + ) + dset_train_intra = os.path.join(dset_folder_intra, 'TRAINING.relations.sparse') + dset_test_intra = os.path.join(dset_folder_intra, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_nuc_train_intra, y_nuc_train_intra, X_nuc_test_intra, y_nuc_test_intra = load_svmlight_files( + (dset_train_intra, dset_test_intra), + n_features=46731, + zero_based=False + ) + # * inter + dset_folder_inter = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC_intersent' + ) + dset_train_inter = os.path.join(dset_folder_inter, 'TRAINING.relations.sparse') + dset_test_inter = os.path.join(dset_folder_inter, 'TEST.relations.sparse') + # FIXME read n_features from .vocab + X_nuc_train_inter, y_nuc_train_inter, X_nuc_test_inter, y_nuc_test_inter = load_svmlight_files( + (dset_train_inter, dset_test_inter), + n_features=46731, + zero_based=False + ) + # put together intra and inter + X_nuc_train = (X_nuc_train_intra, X_nuc_train_inter) + y_nuc_train = (y_nuc_train_intra, y_nuc_train_inter) + # TODO the same for {X,y}_nuc_test ? 
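+            # NOTE: as for the relation datasets above, split TEST sets
+            # are not needed for prediction here:
+            # RightBinaryNuclearityClassifier.predict() fetches per-edge
+            # feature vectors from the unsplit TEST set, so intra/inter
+            # TEST sets would only serve to score the two models
+            # separately (as done in train_nuc_classifier's __main__).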
+ else: + raise ValueError("what model_split?") bin_clf = LogisticRegressionCV(Cs=10, # defaults to 10 penalty='l1', solver='liblinear', n_jobs=3) - nuc_clf = RightBinaryNuclearityClassifier(bin_clf=bin_clf) + nuc_clf = RightBinaryNuclearityClassifier(bin_clf=bin_clf, + model_split=model_split) nuc_clf = nuc_clf.fit(X_nuc_train, y_nuc_train) # end WIP nuc_clf # rank clf @@ -248,7 +366,7 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', strategy=rnk_strategy, prioritize_same_unit=rnk_prioritize_same_unit, order=order) rnk_clf.fit(X_train, y_rnk_train) - return nuc_clf, rnk_clf + return nuc_clf, rnk_clf, rel_clf # FIXME: @@ -336,8 +454,8 @@ def main(): # ones with nuclearity # * tie the order with the encoding for n-ary nodes order = 'weak' if nary_enc_pred == 'tree' else 'strict' - nuc_clf, rnk_clf = setup_dtree_postprocessor(nary_enc=nary_enc_pred, - order=order) + nuc_clf, rnk_clf, rel_clf = setup_dtree_postprocessor( + nary_enc=nary_enc_pred, order=order) # the eval compares parses for the test section of the RST corpus reader_test = RstReader(CD_TEST) @@ -531,11 +649,11 @@ def main(): if author_pred == 'ours-chain': # Eisner, predicted syntax, chain dtree_pred = load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - nuc_clf, rnk_clf) + rel_clf, nuc_clf, rnk_clf) c_preds.append( - ('ours-chain', load_attelo_ctrees( - EISNER_OUT_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf, - dtree_pred=dtree_pred)) + ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, + rel_clf, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( ('ours-chain', dtree_pred) @@ -543,12 +661,12 @@ def main(): if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit - dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, - EDUS_FILE, nuc_clf, rnk_clf) + dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, + rel_clf, nuc_clf, rnk_clf) c_preds.append( - ('ours-tree', load_attelo_ctrees( - EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, nuc_clf, rnk_clf, - dtree_pred=dtree_pred)) + ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, + rel_clf, nuc_clf, rnk_clf, + dtree_pred=dtree_pred)) ) d_preds.append( ('ours-tree', dtree_pred) diff --git a/evals/train_nuc_classifier.py b/evals/train_nuc_classifier.py index 784e944..3126882 100644 --- a/evals/train_nuc_classifier.py +++ b/evals/train_nuc_classifier.py @@ -10,44 +10,21 @@ import argparse import codecs from collections import defaultdict +import copy import itertools import os +import sys from sklearn.datasets import load_svmlight_file, load_svmlight_files -from sklearn.model_selection import cross_val_score from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV +from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelEncoder -import matplotlib.pyplot as plt from educe.rst_dt.annotation import NUC_N, NUC_S -if False: - # import the nuclearity TRAIN and TEST sets - dset_folder = os.path.join( - os.path.expanduser('~'), - 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' - ) - dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') - dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') - - X_train, y_train, X_test, y_test = load_svmlight_files( - (dset_train, dset_test), - zero_based=False - ) - nuc_clf = LogisticRegressionCV(penalty='l1', solver='liblinear', - n_jobs=2) - # train nuclearity classifier, cross-validate performance on train - scores = cross_val_score(nuc_clf, X_train, y_train, cv=10) - print(scores) - 
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) - # fit a - nuc_clf = nuc_clf.fit(X_train, y_train) - print(nuc_clf.score(X_test, y_test)) - - # 2017-12-06 non-dummy nuc_clf -# DIRTY load the feature vector for all candidate edges in the TEST +# DIRTY load the feature vectors of all candidate edges in the TEST # set feat_vecs = dict() dset_folder = os.path.join( @@ -90,24 +67,62 @@ class RightBinaryNuclearityClassifier(object): ---------- bin_clf : sklearn classifier Binary classifier for right dependencies: NN vs NS. + model_split : str, one of {'none', 'sent', 'sent-para'} + Distinct models for subsets of instances. """ - def __init__(self, bin_clf=LogisticRegression(penalty='l1', solver='liblinear', n_jobs=2)): + def __init__(self, bin_clf=LogisticRegression(penalty='l1', solver='liblinear', n_jobs=2), model_split='none'): """Init""" - self.bin_clf = bin_clf + self.model_split = model_split + if model_split == 'none': + self.bin_clf = bin_clf + elif model_split == 'sent': + self.bin_clf_intra = copy.deepcopy(bin_clf) + self.bin_clf_inter = copy.deepcopy(bin_clf) + else: + raise ValueError("model_split?") def fit(self, X, y): - """Fit""" - self.bin_clf = self.bin_clf.fit(X, y) - if True: # verbose - scores = cross_val_score(self.bin_clf, X, y, cv=10) - print(scores) - print("Accuracy: %0.2f (+/- %0.2f)" % ( - scores.mean(), scores.std() * 2)) + """Fit. + + FIXME X is currently expected to be a (flat) list of candidate + edges instead of a list of RstDepTrees. + """ + if self.model_split == 'none': + self.bin_clf = self.bin_clf.fit(X, y) + if True: # verbose + scores = cross_val_score(self.bin_clf, X, y, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + elif self.model_split == 'sent': + assert len(X) == 2 # intra, inter + assert len(y) == 2 # intra, inter + # * intra + self.bin_clf_intra = self.bin_clf_intra.fit(X[0], y[0]) + if True: # verbose + scores = cross_val_score(self.bin_clf_intra, X[0], y[0], cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + # * inter + self.bin_clf_inter = self.bin_clf_inter.fit(X[1], y[1]) + if True: # verbose + scores = cross_val_score(self.bin_clf_inter, X[1], y[1], cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + return self def predict(self, X): """Predict nuclearity of edges in RstDepTrees X from the TEST set. + + Parameters + ---------- + X : list of RstDepTree + D-trees ; the feature vectors of all edges are already + available from the global context. 
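+
+        Returns
+        -------
+        y : list of lists
+            Predicted nuclearity values, one list per d-tree in X.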
""" y = [] for dtree in X: @@ -131,11 +146,30 @@ def predict(self, X): line_idx = pair_map[doc_name][head][i] # X_test[line_idx,:] is a matrix with 1 row Xi = X_test[line_idx,:] - try: - y_pred = self.bin_clf.predict(Xi) - except ValueError: - print(Xi) - raise + if self.model_split == 'none': + try: + y_pred = self.bin_clf.predict(Xi) + except ValueError: + print(Xi) + raise + elif self.model_split == 'sent': + # same_sentence_intra_{right,left}: 269, 303 + # our vocab is 1-based but sklearn converts it to + # 0-based ; + # check it's not a left dep + assert Xi[0, 302] == 0 + # + if Xi[0, 268] == 1: + sel_clf = self.bin_clf_intra + else: + sel_clf = self.bin_clf_inter + # + try: + y_pred = sel_clf.predict(Xi) + except ValueError: + print(Xi) + raise + # append prediction if y_pred == 1: yi.append(NUC_N) elif y_pred == 2: @@ -146,3 +180,83 @@ def predict(self, X): y.append(yi) return y + + +if __name__ == "__main__": + model_split = 'sent' # {'none', 'sent'} + # eval on intra- and inter-sent + # * intra + dset_folder_intra = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC_intrasent' + ) + dset_train_intra = os.path.join(dset_folder_intra, 'TRAINING.relations.sparse') + dset_test_intra = os.path.join(dset_folder_intra, 'TEST.relations.sparse') + X_train_intra, y_train_intra, X_test_intra, y_test_intra = load_svmlight_files( + (dset_train_intra, dset_test_intra), + n_features=46731, + zero_based=False + ) + # * inter + dset_folder_inter = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC_intersent' + ) + dset_train_inter = os.path.join(dset_folder_inter, 'TRAINING.relations.sparse') + dset_test_inter = os.path.join(dset_folder_inter, 'TEST.relations.sparse') + X_train_inter, y_train_inter, X_test_inter, y_test_inter = load_svmlight_files( + (dset_train_inter, dset_test_inter), + n_features=46731, + zero_based=False + ) + # + if model_split == 'none': + # import the nuclearity TRAIN and TEST sets + dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_NUC' + ) + dset_train = os.path.join(dset_folder, 'TRAINING.relations.sparse') + dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') + + X_train, y_train, X_test, y_test = load_svmlight_files( + (dset_train, dset_test), + n_features=46731, + zero_based=False + ) + nuc_clf = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=3) + # train nuclearity classifier, cross-validate performance on train + scores = cross_val_score(nuc_clf, X_train, y_train, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) + # fit a + nuc_clf = nuc_clf.fit(X_train, y_train) + print(nuc_clf.score(X_test, y_test)) + print('separate eval on intra then inter') + print(nuc_clf.score(X_test_intra, y_test_intra)) + print(nuc_clf.score(X_test_inter, y_test_inter)) + elif model_split == 'sent': + # fit distinct classifiers for intra- and inter-sentential + # * intra: train nuclearity classifier, cross-validate performance on train + nuc_clf_intra = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=3) + scores_intra = cross_val_score(nuc_clf_intra, X_train_intra, y_train_intra, + cv=10) + print(scores_intra) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores_intra.mean(), scores_intra.std() * 2)) + # + nuc_clf_intra = nuc_clf_intra.fit(X_train_intra, y_train_intra) + print(nuc_clf_intra.score(X_test_intra, y_test_intra)) + # * inter: train nuclearity classifier, 
cross-validate performance on train + nuc_clf_inter = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=3) + scores_inter = cross_val_score(nuc_clf_inter, X_train_inter, y_train_inter, + cv=10) + print(scores_inter) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores_inter.mean(), scores_inter.std() * 2)) + # + nuc_clf_inter = nuc_clf_inter.fit(X_train_inter, y_train_inter) + print(nuc_clf_inter.score(X_test_inter, y_test_inter)) diff --git a/evals/train_rel_relabeller.py b/evals/train_rel_relabeller.py new file mode 100644 index 0000000..4ccf661 --- /dev/null +++ b/evals/train_rel_relabeller.py @@ -0,0 +1,201 @@ +"""This utility script trains a (re)labeller for RST edges. + +Given the path to a relation labelling dataset, it trains a classifier +and evaluates it. +""" + +from __future__ import absolute_import, print_function + +import argparse +import codecs +from collections import defaultdict +import copy +import os + +from sklearn.datasets import load_svmlight_file, load_svmlight_files +from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV +from sklearn.model_selection import cross_val_score + +from educe.rst_dt.deptree import _ROOT_HEAD, _ROOT_LABEL + + +# build mapping from int to label (reverse label encoding) +dset_rel_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse_REL' +) +dset_rel_train = os.path.join(dset_rel_folder, 'TRAINING.relations.sparse') +dset_rel_test = os.path.join(dset_rel_folder, 'TEST.relations.sparse') + +with codecs.open(dset_rel_train, mode='rb', encoding='utf-8') as f_train: + header = f_train.readline() + header_prefix = '# labels: ' + assert header.startswith(header_prefix) + # DEBUG? explicit cast from unicode to str + labels = [str(lbl) for lbl in header[len(header_prefix):].split()] + int2lbl = dict(enumerate(labels, start=1)) + lbl2int = {lbl: i for i, lbl in int2lbl.items()} + # unrelated = lbl2int["UNRELATED"] + # root = lbl2int["ROOT"] + +# 2017-12-14 relation (re)labeller +# DIRTY load the feature vector for all *candidate* edges in the TEST +# set (for predict()) +feat_vecs = dict() +dset_folder = os.path.join( + os.path.expanduser('~'), + 'melodi/rst/irit-rst-dt/TMP/syn_pred_coarse' +) +dset_test = os.path.join(dset_folder, 'TEST.relations.sparse') +# we use the original svmlight files whose label is the relation +# class (which we actually don't need here) +# FIXME read n_features from .vocab +X_test, y_lbl_test = load_svmlight_file(dset_test, n_features=46731, + zero_based=False) +# build mapping from doc_name, src_idx, tgt_idx to line number +# in X_test +pairs = dset_test + '.pairings' +pair_map = defaultdict(lambda: defaultdict(dict)) +with codecs.open(pairs, mode='rb', encoding='utf-8') as f_pairs: + for i, line in enumerate(f_pairs): + src_id, tgt_id = line.strip().split('\t') + src_idx = (0 if src_id == 'ROOT' + else int(src_id.rsplit('_', 1)[1])) + doc_name, tgt_idx = tgt_id.rsplit('_', 1) + tgt_idx = int(tgt_idx) + # print(line) + # print(doc_name, src_idx, tgt_idx) + pair_map[doc_name][src_idx][tgt_idx] = i +# end DIRTY + + +if False: + # load the relation TRAIN and TEST sets + X_rel_train, y_rel_train, X_rel_test, y_rel_test = load_svmlight_files( + (dset_rel_train, dset_rel_test), + zero_based=False + ) + rel_clf = LogisticRegressionCV(penalty='l1', solver='liblinear', + n_jobs=3) + # train relation classifier, cross-validate performance on train + scores = cross_val_score(rel_clf, X_rel_train, y_rel_train, cv=10) + print(scores) + print("Accuracy: 
%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) + # fit a + rel_clf = rel_clf.fit(X_rel_train, y_rel_train) + print(rel_clf.score(X_rel_test, y_rel_test)) + + +class RelationRelabeller(object): + """Predict the coarse-grained RST relation of dependencies. + + Dependencies headed by the fake root node are labelled "ROOT" by + convention. + + Parameters + ---------- + mul_clf : sklearn classifier + Multi-class classifier for RST (coarse-grained) relations. + """ + + def __init__(self, mul_clf=LogisticRegression(penalty='l1', solver='liblinear', n_jobs=3), model_split='none'): + """Init""" + self.model_split = model_split + if model_split == 'none': + self.mul_clf = mul_clf + elif model_split == 'sent': + self.mul_clf_intra = copy.deepcopy(mul_clf) + self.mul_clf_inter = copy.deepcopy(mul_clf) + else: + raise ValueError("model_split?") + + def fit(self, X, y): + """Fit. + + FIXME X is currently expected to be a (flat) list of candidate + edges instead of a list of RstDepTrees. + """ + if self.model_split == 'none': + self.mul_clf = self.mul_clf.fit(X, y) + if True: # verbose + scores = cross_val_score(self.mul_clf, X, y, cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + elif self.model_split == 'sent': + assert len(X) == 2 # intra, inter + assert len(y) == 2 # intra, inter + # * intra + self.mul_clf_intra = self.mul_clf_intra.fit(X[0], y[0]) + if True: # verbose + scores = cross_val_score(self.mul_clf_intra, X[0], y[0], cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + # * inter + self.mul_clf_inter = self.mul_clf_inter.fit(X[1], y[1]) + if True: # verbose + scores = cross_val_score(self.mul_clf_inter, X[1], y[1], cv=10) + print(scores) + print("Accuracy: %0.2f (+/- %0.2f)" % ( + scores.mean(), scores.std() * 2)) + + return self + + def predict(self, X): + """Predict relation of edges in RstDepTrees X from the TEST set. + """ + y = [] + for dtree in X: + doc_name = dtree.origin.doc + yi = [] + for i, (head, rel) in enumerate(zip(dtree.heads, dtree.labels)): + if i == 0: + # fake root !? 
maybe we shouldn't write anything + # here ; + # FIXME check how to be consistent throughout educe and + # eval code + # yi.append(_ROOT_LABEL) + yi.append(None) + elif head == 0: + # TODO check the expected value (consistency) + yi.append(_ROOT_LABEL) + else: + # regular edge + line_idx = pair_map[doc_name][head][i] + # X_test[line_idx,:] is a matrix with 1 row + Xi = X_test[line_idx,:] + if self.model_split == 'none': + try: + y_pred = self.mul_clf.predict(Xi) + except ValueError: + print(Xi) + raise + elif self.model_split == 'sent': + # same_sentence_intra_{right,left}: 269, 303 + # our vocab is 1-based but sklearn converts it to + # 0-based ; + # same_para_* : 103, 158, 234, 314 + if ((Xi[0, 268] == 1 or Xi[0, 302] == 1) and + (Xi[0, 102] == 1 or Xi[0, 157] == 1 or + Xi[0, 233] == 1 or Xi[0, 313] == 1)): + sel_clf = self.mul_clf_intra + else: + sel_clf = self.mul_clf_inter + # + try: + y_pred = sel_clf.predict(Xi) + except ValueError: + print(Xi) + raise + # append prediction + try: + yi.append(int2lbl[int(y_pred[0])]) + if False and rel != int2lbl[int(y_pred[0])]: + print(doc_name, head, i, + rel, int2lbl[int(y_pred[0])]) # DEBUG + except KeyError: + raise ValueError("Weird prediction: {}".format( + y_pred)) + y.append(yi) + return y From 4fb6eaa3935ab6145d30ff8c617624540c925c12 Mon Sep 17 00:00:00 2001 From: moreymat Date: Tue, 19 Dec 2017 22:50:35 +0100 Subject: [PATCH 73/74] FIX pass doc_edus to loaders --- evals/hayashi_deps.py | 36 ++++++++++++++++----------------- evals/ji.py | 31 +++++++++++++---------------- evals/ours.py | 27 +++++++++++++++++++++---- evals/showdown.py | 46 ++++++++++++++++--------------------------- 4 files changed, 72 insertions(+), 68 deletions(-) diff --git a/evals/hayashi_deps.py b/evals/hayashi_deps.py index c5fd6b3..b6f40d2 100644 --- a/evals/hayashi_deps.py +++ b/evals/hayashi_deps.py @@ -9,20 +9,10 @@ from glob import glob from educe.learning.edu_input_format import load_edu_input_file -from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree, RstDtException from educe.rst_dt.dep2con import deptree_to_rst_tree -# load true ctrees, from the TEST section of the RST-DT, to get gold EDUs -RST_DT_DIR = '/home/mmorey/corpora/rst-dt/rst_discourse_treebank/data' -RST_TEST_DIR = os.path.join(RST_DT_DIR, 'RSTtrees-WSJ-main-1.0/TEST') -if not os.path.exists(RST_TEST_DIR): - raise ValueError('Unable to find RST test files at ', RST_TEST_DIR) -RST_TEST_READER = Reader(RST_TEST_DIR) -RST_TEST_CTREES_TRUE = {k.doc: v for k, v in RST_TEST_READER.slurp().items()} - - def _load_hayashi_dep_file(f, edus): """Do load. @@ -67,24 +57,27 @@ def load_hayashi_dep_file(fname, edus): return _load_hayashi_dep_file(f, edus) -def load_hayashi_dep_files(out_dir): +def load_hayashi_dep_files(out_dir, doc_edus): """Load dep files output by one of Hayashi et al.'s parser. Parameters ---------- out_dir: str Path to the folder containing the .dis files. + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). 
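+
+    Returns
+    -------
+    dtrees : dict(str, RstDepTree)
+        Predicted d-tree for each document, indexed by doc_name.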
""" dtrees = dict() for fname in glob(os.path.join(out_dir, '*.dis')): doc_name = os.path.splitext(os.path.basename(fname))[0] - edus = RST_TEST_CTREES_TRUE[doc_name].leaves() + edus = doc_edus[doc_name] dtrees[doc_name] = load_hayashi_dep_file(fname, edus) return dtrees -def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, - rnk_clf): +def load_hayashi_dep_dtrees(out_dir, rel_conv, doc_edus, edus_file_pat, + nuc_clf, rnk_clf): """Load the dtrees output by one of Hayashi et al.'s dep parsers. Parameters @@ -94,6 +87,9 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). edus_file_pat : str Pattern for the .edu_input files. nuc_clf : NuclearityClassifier @@ -108,7 +104,7 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, """ dtree_pred = dict() - dtrees = load_hayashi_dep_files(out_dir) + dtrees = load_hayashi_dep_files(out_dir, doc_edus) for doc_name, dt_pred in dtrees.items(): if rel_conv is not None: dt_pred = rel_conv(dt_pred) @@ -130,8 +126,8 @@ def load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, nuc_clf, return dtree_pred -def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, - rnk_clf, dtree_pred=None): +def load_hayashi_dep_ctrees(out_dir, rel_conv, doc_edus, edus_file_pat, + nuc_clf, rnk_clf, dtree_pred=None): """Load the ctrees for the dtrees output by one of Hayashi et al.'s dep parsers. @@ -142,6 +138,9 @@ def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, rel_conv : RstRelationConverter Converter for relation labels (fine- to coarse-grained, plus normalization). + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). edus_file_pat : str Pattern for the .edu_input files. nuc_clf : NuclearityClassifier @@ -159,7 +158,8 @@ def load_hayashi_dep_ctrees(out_dir, rel_conv, edus_file_pat, nuc_clf, """ ctree_pred = dict() if dtree_pred is None: - dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, edus_file_pat, + dtree_pred = load_hayashi_dep_dtrees(out_dir, rel_conv, doc_edus, + edus_file_pat, nuc_clf, rnk_clf) for doc_name, dt_pred in dtree_pred.items(): try: diff --git a/evals/ji.py b/evals/ji.py index c6ab6e8..1b1808c 100644 --- a/evals/ji.py +++ b/evals/ji.py @@ -11,35 +11,28 @@ from educe.annotation import Span from educe.corpus import FileId from educe.rst_dt.annotation import Node, RSTTree -from educe.rst_dt.corpus import Reader from educe.rst_dt.deptree import RstDepTree -from educe.rst_dt.rst_wsj_corpus import TEST_FOLDER -# original RST corpus -RST_CORPUS = os.path.join('/home/mmorey/corpora/rst-dt/rst_discourse_treebank/data') -RST_MAIN_TEST = os.path.join(RST_CORPUS, TEST_FOLDER) - -def load_ji_ctrees(ji_out_dir, rel_conv): +def load_ji_ctrees(ji_out_dir, rel_conv, doc_edus): """Load the ctrees output by DPLP as .brackets files. Parameters ---------- - ji_out_dir: str + ji_out_dir : str Path to the base directory containing the output files. + rel_conv : RstRelationConverter? + Relation converter. + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). Returns ------- ctree_pred: dict(str, RSTTree) RST ctree for each document. 
""" - # * load the text of the EDUs - # FIXME get the text of EDUs from the .merge files - corpus_dir = RST_MAIN_TEST - reader_true = Reader(corpus_dir) - ctree_true = reader_true.slurp() - doc_edus = {k.doc: ct_true.leaves() for k, ct_true - in ctree_true.items()} + # FIXME? get the text of EDUs from the .merge files? # * for each doc, load the predicted spans from the .brackets ctree_pred = dict() files_pred = os.path.join(ji_out_dir, '*.brackets') @@ -149,7 +142,8 @@ def load_ji_ctrees(ji_out_dir, rel_conv): return ctree_pred -def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain', ctree_pred=None): +def load_ji_dtrees(ji_out_dir, rel_conv, doc_edus, nary_enc='chain', + ctree_pred=None): """Get the dtrees that correspond to the ctrees output by DPLP. Parameters @@ -160,6 +154,9 @@ def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain', ctree_pred=None): Relation converter, from fine- to coarse-grained labels. nary_enc: one of {'chain', 'tree'} Encoding for n-ary nodes. + doc_edus : dict(str, list(EDU)) + Mapping from doc_name to the list of its EDUs (read from the + corpus). ctree_pred : dict(str, RSTTree), optional RST c-trees, indexed by doc_name. If c-trees are provided this way, `out_dir` is ignored. @@ -171,7 +168,7 @@ def load_ji_dtrees(ji_out_dir, rel_conv, nary_enc='chain', ctree_pred=None): """ dtree_pred = dict() if ctree_pred is None: - ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv) + ctree_pred = load_ji_ctrees(ji_out_dir, rel_conv, doc_edus) for doc_name, ct_pred in ctree_pred.items(): dtree_pred[doc_name] = RstDepTree.from_rst_tree( ct_pred, nary_enc=nary_enc) diff --git a/evals/ours.py b/evals/ours.py index 6d651b4..5a8f210 100644 --- a/evals/ours.py +++ b/evals/ours.py @@ -57,7 +57,8 @@ def load_attelo_output_file(output_file): return edges_pred -def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): +def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, + doc_edus=None): """Load RST dtrees from attelo output files. Parameters @@ -66,6 +67,13 @@ def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): Path to the file that contains attelo's output edus_file: string Path to the file that describes EDUs. + doc_edus : dict(str, list(EDU)), optional + Mapping from doc_name to the list of its EDUs (read from the + corpus). If None, each EDU is re-created using information in + the `.edu_input` file, otherwise EDUs are created but their text + is taken from `doc_edus`. + FIXME avoid creating "new" EDUs altogether if `doc_edus` is not + None. Returns ------- @@ -85,7 +93,10 @@ def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): # EDU info edu_num = int(att_edu.id.rsplit('_', 1)[1]) edu_span = EduceSpan(att_edu.start, att_edu.end) - edu_text = att_edu.text + if doc_edus is not None: + edu_text = doc_edus[doc_name][edu_num - 1].raw_text + else: + edu_text = att_edu.text educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text)) # map global id of EDU to num of EDU inside doc gid2num[att_edu.id] = edu_num @@ -134,7 +145,7 @@ def load_attelo_dtrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf): def load_attelo_ctrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, - dtree_pred=None): + doc_edus=None, dtree_pred=None): """Load RST ctrees from attelo output files. 
Parameters @@ -147,6 +158,13 @@ def load_attelo_ctrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, Classifier to predict nuclearity rnk_clf: RankClassifier Classifier to predict attachment ranking + doc_edus : dict(str, list(EDU)), optional + Mapping from doc_name to the list of its EDUs (read from the + corpus). If None, each EDU is re-created using information in + the `.edu_input` file, otherwise EDUs are created but their text + is taken from `doc_edus`. + FIXME avoid creating "new" EDUs altogether if `doc_edus` is not + None. dtree_pred : dict(str, RstDepTree), optional RST d-trees, indexed by doc_name. If d-trees are provided this way, `out_dir` is ignored. @@ -158,7 +176,8 @@ def load_attelo_ctrees(output_file, edus_file, rel_clf, nuc_clf, rnk_clf, if dtree_pred is None: # load RST dtrees, with heuristics for nuc and rank dtree_pred = load_attelo_dtrees(output_file, edus_file, - rel_clf, nuc_clf, rnk_clf) + rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus) # convert to RST ctrees ctree_pred = dict() for doc_name, dt_pred in dtree_pred.items(): diff --git a/evals/showdown.py b/evals/showdown.py index af6f117..f6a7587 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -7,7 +7,6 @@ import argparse import codecs -from collections import defaultdict import itertools import os @@ -206,25 +205,6 @@ def setup_dtree_postprocessor(nary_enc='chain', order='strict', # flavours of dtree dt_true = RstDepTree.from_rst_tree(ct_true, nary_enc=nary_enc) dtree_true[doc_name] = dt_true - # 2017-12-18 WIP print spiders in d-trees, see if some could be - # solved with para_idx - rnk_deps = defaultdict(list) # gov -> list of (rnk, dep) - for i, (gov, rnk, nuc, lbl) in enumerate( - zip(dt_true.heads[1:], dt_true.ranks[1:], dt_true.nucs[1:], - dt_true.labels[1:]), - start=1): - rnk_deps[gov].append((rnk, i)) - ordered_deps = {k: sorted(v) for k, v in rnk_deps.items()} - for gov, ord_deps in sorted(ordered_deps.items()): - if ((any(x[1] < gov for x in ord_deps) and - any(x[1] > gov for x in ord_deps))): - if doc_name.startswith('wsj_06'): - print(doc_name, gov, ord_deps) - elif doc_name.startswith('file'): - pass - else: - raise ValueError("spider!") - # end 2017-12-18 WIP spiders # fit classifiers for nuclearity and rank (DIRTY) # NB: both are (dummily) fit on weakly ordered dtrees X_train = [] @@ -460,6 +440,8 @@ def main(): # the eval compares parses for the test section of the RST corpus reader_test = RstReader(CD_TEST) corpus_test = reader_test.slurp() + doc_edus_test = {k.doc: ct_true.leaves() for k, ct_true + in corpus_test.items()} # reference: author_true can be any of the authors_pred (defaults to gold) ctree_true = dict() # ctrees @@ -533,12 +515,12 @@ def main(): if author_pred == 'HHN16_MST': dtree_pred = load_hayashi_dep_dtrees( - HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf) + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, doc_edus_test, + EDUS_FILE_PAT, nuc_clf, rnk_clf) c_preds.append( ('HHN16_MST', load_hayashi_dep_ctrees( - HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf, dtree_pred=dtree_pred)) + HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, doc_edus_test, + EDUS_FILE_PAT, nuc_clf, rnk_clf, dtree_pred=dtree_pred)) ) d_preds.append( ('HHN16_MST', dtree_pred) @@ -607,12 +589,12 @@ def main(): if author_pred == 'JE14': # DPLP outputs RST ctrees in the form of lists of spans; # load_ji_dtrees maps them to RST dtrees - ctree_pred = load_ji_ctrees(JI_OUT_DIR, REL_CONV) + ctree_pred = load_ji_ctrees(JI_OUT_DIR, REL_CONV, doc_edus_test) c_preds.append( 
('JE14', ctree_pred) ) d_preds.append( - ('JE14', load_ji_dtrees(JI_OUT_DIR, REL_CONV, + ('JE14', load_ji_dtrees(JI_OUT_DIR, REL_CONV, doc_edus_test, nary_enc='chain', ctree_pred=ctree_pred)) ) @@ -649,10 +631,12 @@ def main(): if author_pred == 'ours-chain': # Eisner, predicted syntax, chain dtree_pred = load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf) + rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus_test) c_preds.append( ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus_test, dtree_pred=dtree_pred)) ) d_preds.append( @@ -662,10 +646,12 @@ def main(): if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf) + rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus_test) c_preds.append( ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, rel_clf, nuc_clf, rnk_clf, + doc_edus=doc_edus_test, dtree_pred=dtree_pred)) ) d_preds.append( @@ -674,10 +660,12 @@ def main(): if author_pred == 'ours-tree-su': # Eisner, predicted syntax, tree + same-unit dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, nuc_clf, rnk_clf) + EDUS_FILE, nuc_clf, rnk_clf, + doc_edus=doc_edus_test) c_preds.append( ('ours-tree-su', load_attelo_ctrees( EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, nuc_clf, rnk_clf, + doc_edus=doc_edus_test, dtree_pred=dtree_pred)) ) d_preds.append( From 9f068655250fb0ec47740a9855a50662690588c8 Mon Sep 17 00:00:00 2001 From: moreymat Date: Wed, 20 Dec 2017 10:48:09 +0100 Subject: [PATCH 74/74] FIX nary_enc_pred tied to each set of predictions, no longer CLI arg --- evals/showdown.py | 88 +++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/evals/showdown.py b/evals/showdown.py index f6a7587..1fe101a 100644 --- a/evals/showdown.py +++ b/evals/showdown.py @@ -361,9 +361,6 @@ def main(): parser.add_argument('authors_pred', nargs='+', choices=AUTHORS, help="Author(s) of the predictions") - parser.add_argument('--nary_enc_pred', default='tree', - choices=['tree', 'chain'], - help="Encoding of n-ary nodes for the predictions") # reference parser.add_argument('--author_true', default='gold', choices=AUTHORS + ['each'], # NEW generate sim matrix @@ -398,7 +395,6 @@ def main(): args = parser.parse_args() author_true = args.author_true authors_pred = args.authors_pred - nary_enc_pred = args.nary_enc_pred binarize_true = args.binarize_true simple_rsttree = args.simple_rsttree # display @@ -419,7 +415,8 @@ def main(): # heuristically determined values for _pred but also _true, and adds # three trivial spans eval_li_dep = args.eval_li_dep - + # nary_enc_true is used ; order_true currently is not (implicit in + # nary_enc_true) if binarize_true in ('right', 'right_mixed'): nary_enc_true = 'chain' order_true = 'strict' @@ -433,9 +430,14 @@ def main(): # 0. 
setup the postprocessors to flesh out unordered dtrees into ordered # ones with nuclearity # * tie the order with the encoding for n-ary nodes - order = 'weak' if nary_enc_pred == 'tree' else 'strict' - nuc_clf, rnk_clf, rel_clf = setup_dtree_postprocessor( - nary_enc=nary_enc_pred, order=order) + nuc_clf_chain, rnk_clf_chain, rel_clf_chain = setup_dtree_postprocessor( + nary_enc='chain', order='strict') + # FIXME explicit differenciation between (heuristic) classifiers for + # the "chain" vs "tree" transforms (2 parameters: nary_enc, order) ; + # nuc_clf, rnk_clf, rel_clf might contain implicit assumptions + # tied to the "chain" transform, might not be optimal for "tree" + nuc_clf_tree, rnk_clf_tree, rel_clf_tree = setup_dtree_postprocessor( + nary_enc='tree', order='weak') # the eval compares parses for the test section of the RST corpus reader_test = RstReader(CD_TEST) @@ -514,13 +516,15 @@ def main(): ) if author_pred == 'HHN16_MST': + # paper: {nary_enc_pred='chain', order='strict'} dtree_pred = load_hayashi_dep_dtrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, doc_edus_test, - EDUS_FILE_PAT, nuc_clf, rnk_clf) + EDUS_FILE_PAT, nuc_clf_chain, rnk_clf_chain) c_preds.append( ('HHN16_MST', load_hayashi_dep_ctrees( HAYASHI_MST_OUT_DIR, REL_CONV_DTREE, doc_edus_test, - EDUS_FILE_PAT, nuc_clf, rnk_clf, dtree_pred=dtree_pred)) + EDUS_FILE_PAT, nuc_clf_chain, rnk_clf_chain, + dtree_pred=dtree_pred)) ) d_preds.append( ('HHN16_MST', dtree_pred) @@ -538,16 +542,18 @@ def main(): ) if author_pred == 'li_sujian': - # FIXME load d-trees once, pass dtree_pred to the c-loader + # FIXME load d-trees once, pass dtree_pred to the c-loader ; + # paper says 'chain' transform, but it might be worth + # checking c_preds.append( ('li_sujian', load_li_sujian_dep_ctrees( LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf)) + nuc_clf_chain, rnk_clf_chain)) ) d_preds.append( ('li_sujian', load_li_sujian_dep_dtrees( LI_SUJIAN_OUT_FILE, REL_CONV_DTREE, EDUS_FILE_PAT, - nuc_clf, rnk_clf)) + nuc_clf_chain, rnk_clf_chain)) ) if author_pred == 'FH14_gSVM': @@ -630,14 +636,16 @@ def main(): if author_pred == 'ours-chain': # Eisner, predicted syntax, chain - dtree_pred = load_attelo_dtrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf, - doc_edus=doc_edus_test) + dtree_pred = load_attelo_dtrees( + EISNER_OUT_SYN_PRED, EDUS_FILE, + rel_clf_chain, nuc_clf_chain, rnk_clf_chain, + doc_edus=doc_edus_test) c_preds.append( - ('ours-chain', load_attelo_ctrees(EISNER_OUT_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf, - doc_edus=doc_edus_test, - dtree_pred=dtree_pred)) + ('ours-chain', load_attelo_ctrees( + EISNER_OUT_SYN_PRED, EDUS_FILE, + rel_clf_chain, nuc_clf_chain, rnk_clf_chain, + doc_edus=doc_edus_test, + dtree_pred=dtree_pred)) ) d_preds.append( ('ours-chain', dtree_pred) @@ -645,26 +653,30 @@ def main(): if author_pred == 'ours-tree': # Eisner, predicted syntax, tree + same-unit - dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf, - doc_edus=doc_edus_test) + dtree_pred = load_attelo_dtrees( + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, + rel_clf_tree, nuc_clf_tree, rnk_clf_tree, + doc_edus=doc_edus_test) c_preds.append( - ('ours-tree', load_attelo_ctrees(EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, - rel_clf, nuc_clf, rnk_clf, - doc_edus=doc_edus_test, - dtree_pred=dtree_pred)) + ('ours-tree', load_attelo_ctrees( + EISNER_OUT_TREE_SYN_PRED, EDUS_FILE, + rel_clf_tree, nuc_clf_tree, rnk_clf_tree, + doc_edus=doc_edus_test, + dtree_pred=dtree_pred)) ) 
d_preds.append( ('ours-tree', dtree_pred) ) if author_pred == 'ours-tree-su': # Eisner, predicted syntax, tree + same-unit - dtree_pred = load_attelo_dtrees(EISNER_OUT_TREE_SYN_PRED_SU, - EDUS_FILE, nuc_clf, rnk_clf, - doc_edus=doc_edus_test) + dtree_pred = load_attelo_dtrees( + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + rel_clf_tree, nuc_clf_tree, rnk_clf_tree, + doc_edus=doc_edus_test) c_preds.append( ('ours-tree-su', load_attelo_ctrees( - EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, nuc_clf, rnk_clf, + EISNER_OUT_TREE_SYN_PRED_SU, EDUS_FILE, + rel_clf_tree, nuc_clf_tree, rnk_clf_tree, doc_edus=doc_edus_test, dtree_pred=dtree_pred)) ) @@ -682,15 +694,17 @@ def main(): if False: # FIXME repair (or forget) these print('Eisner, predicted syntax + same-unit') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_PRED_SU, EDUS_FILE, - nuc_clf, rnk_clf) + load_deptrees_from_attelo_output( + ctree_true, dtree_true, + EISNER_OUT_SYN_PRED_SU, EDUS_FILE, + rel_clf_chain, nuc_clf_chain, rnk_clf_chain) print('======================') print('Eisner, gold syntax') - load_deptrees_from_attelo_output(ctree_true, dtree_true, - EISNER_OUT_SYN_GOLD, EDUS_FILE, - nuc_clf, rnk_clf) + load_deptrees_from_attelo_output( + ctree_true, dtree_true, + EISNER_OUT_SYN_GOLD, EDUS_FILE, + rel_clf_chain, nuc_clf_chain, rnk_clf_chain) print('======================') # dependency eval