From 8fad11431d7e5518a4b3a147145d587ad9ef8a5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Fl=C3=BCckiger?=
Date: Mon, 1 Jun 2020 12:51:11 +0200
Subject: [PATCH] improve logging

---
 ner_evaluation/ner_eval.py | 39 +++++++++++++++++++++++++++++++++++----------
 ner_evaluation/utils.py    |  6 +++---
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/ner_evaluation/ner_eval.py b/ner_evaluation/ner_eval.py
index c9b9da5..2a98f36 100644
--- a/ner_evaluation/ner_eval.py
+++ b/ner_evaluation/ner_eval.py
@@ -18,7 +18,6 @@
     get_all_tags,
     column_selector,
     check_tag_selection,
-    check_spurious_tags,
 )
 
 
@@ -44,6 +43,9 @@ def __init__(self, f_true, f_pred, glueing_cols=None):
 
         :return: Evaluator object.
         """
+
+        logging.info(f"Reading system response file '{f_pred}' and gold standard '{f_true}'.")
+
         self.f_true = f_true
         self.f_pred = f_pred
 
@@ -105,7 +107,8 @@ def __init__(self, f_true, f_pred, glueing_cols=None):
         }
 
     def check_segment_mismatch(self):
-        """Assert the alignment between gold standard and the system response.
+        """
+        Assert the alignment between gold standard and the system response.
         """
 
         logging.info("Datasets imported (Gold/Predictions).")
@@ -180,11 +183,10 @@ def evaluate(
         if isinstance(columns, str):
             columns = [columns]
 
-        tags = self.set_evaluation_tags(columns, tags, eval_type)
+        logging.info(f"Evaluating column {columns} in the system response file '{self.f_pred}'")
 
-        logging.info(
-            f"Evaluating system response '{self.f_pred}' on {columns} for the following tags: {tags}"
-        )
+        tags = self.set_evaluation_tags(columns, tags, eval_type)
+        logging.info(f"Evaluating the following tags: {tags}")
 
         # Create an accumulator to store overall results
         results = deepcopy(self.metric_schema)
@@ -338,7 +340,6 @@ def compute_metrics(self, true_named_entities: list, pred_named_entities: list,
 
         # only allow alternatives in prediction file, not in gold standard
         true_named_entities = [ent[0] for ent in true_named_entities if ent[0].e_type in tags]
-        # pred_named_entities = [ent for ent in pred_named_entities if [ent[0]].e_type in tags]
         pred_named_entities = [
             ent for ent in pred_named_entities if any([e.e_type in tags for e in ent])
         ]
@@ -542,7 +543,7 @@ def set_evaluation_tags(self, columns, tags, eval_type):
             for col in columns:
                 y_pred += [column_selector(doc, col) for doc in self.pred]
         except AttributeError:
-            msg = f"The provided annotation columns {columns} are not available in both the gold standard and the system response '{self.f_pred}'."
+            msg = f"Missing columns {columns} in the system response file '{self.f_pred}' or the gold standard."
             logging.error(msg)
             raise AssertionError(msg)
 
@@ -555,10 +556,10 @@ def set_evaluation_tags(self, columns, tags, eval_type):
         elif eval_type == "nerc":
             # For NERC, only tags which are covered by the gold standard are considered
             tags = true_tags
-            check_spurious_tags(y_true, y_pred)
+            self.check_spurious_tags(y_true, y_pred, columns)
 
             if not pred_tags:
-                msg = f"There are no tags in the system response file '{self.f_pred}' for the column: {columns}"
+                msg = f"No tags found in column {columns} of the system response file '{self.f_pred}'."
                 logging.error(msg)
 
         elif eval_type == "nel":
@@ -567,6 +568,24 @@ def set_evaluation_tags(self, columns, tags, eval_type):
 
         return tags
 
+    def check_spurious_tags(self, y_true: list, y_pred: list, columns: list):
+        """Log any tags of the system response which are not in the gold standard.
+
+        :param list y_true: a nested list of gold labels with the structure "[docs [sents [tokens]]]".
+        :param list y_pred: a nested list of system labels with the structure "[docs [sents [tokens]]]".
+        :return: None.
+        :rtype: None
+
+        """
+
+        tags_true = get_all_tags(y_true)
+        tags_pred = get_all_tags(y_pred)
+
+        for pred in tags_pred:
+            if pred not in tags_true:
+                msg = f"Spurious entity label '{pred}' in column {columns} of the system response file '{self.f_pred}'. As the tag is not part of the gold standard, it is ignored in the evaluation."
+                logging.error(msg)
+
 
 def find_overlap(true_range, pred_range):
     """Find the overlap between two ranges
diff --git a/ner_evaluation/utils.py b/ner_evaluation/utils.py
index 70b56df..300bff4 100644
--- a/ner_evaluation/utils.py
+++ b/ner_evaluation/utils.py
@@ -23,7 +23,7 @@ def __init__(self, properties: dict):
             try:
                 v = v.upper()
             except AttributeError:
-                msg = f"There are empty values in column '{k}'. They get replaced by an underscore."
+                msg = f"Empty values in column '{k}'. They are replaced by an underscore."
                 logging.warning(msg)
                 v = "_"
 
@@ -88,7 +88,7 @@ def check_tag_selection(y_cand: list, tags_ref: list):
     return clean_tags
 
 
-def check_spurious_tags(y_true: list, y_pred: list):
+def check_spurious_tags(y_true: list, y_pred: list, columns: list):
     """Log any tags of the system response which are not in the gold standard.
 
     :param list y_true: a nested list of gold labels with the structure "[docs [sents [tokens]]]".
@@ -103,7 +103,7 @@ def check_spurious_tags(y_true: list, y_pred: list):
 
     for pred in tags_pred:
        if pred not in tags_true:
-            msg = f"Spurious entity label '{pred}' in predictions. Tag is not part of the gold standard and ignored in the evaluation."
+            msg = f"Spurious entity label '{pred}' in column {columns} of the system response. As the tag is not part of the gold standard, it is ignored in the evaluation."
             logging.error(msg)
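
Illustration only, not part of the patch: a minimal, self-contained sketch of the per-column spurious-tag warning introduced above. It flattens the nested "[docs [sents [tokens]]]" label structure itself instead of calling the repository's get_all_tags helper, and the column name used in the toy call is made up, so treat it as an approximation of the patched check rather than the scorer's actual code.

    import logging

    logging.basicConfig(level=logging.INFO)


    def collect_tags(nested_labels):
        # Simplified stand-in for get_all_tags: flatten [docs [sents [tokens]]]
        # and keep the entity type of each IOB label (e.g. "B-pers" -> "pers").
        tags = set()
        for doc in nested_labels:
            for sent in doc:
                for label in sent:
                    if label not in ("O", "_"):
                        tags.add(label.split("-", 1)[-1])
        return tags


    def check_spurious_tags(y_true, y_pred, columns):
        # Warn about entity types the system predicts but the gold standard never uses.
        tags_true = collect_tags(y_true)
        for pred in sorted(collect_tags(y_pred)):
            if pred not in tags_true:
                logging.error(
                    f"Spurious entity label '{pred}' in column {columns} of the system response. "
                    "As the tag is not part of the gold standard, it is ignored in the evaluation."
                )


    # Toy example: the response predicts a 'prod' entity that the gold standard never contains.
    y_true = [[["B-pers", "I-pers", "O"], ["B-loc", "O"]]]
    y_pred = [[["B-pers", "I-pers", "O"], ["B-prod", "O"]]]
    check_spurious_tags(y_true, y_pred, ["NE-COARSE-LIT"])

Run as a script, this logs a single error for the unexpected 'prod' label and stays silent for the tags that the gold standard covers.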