evaluate_extraction.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Evaluate PIE extraction performance against an exhaustively PIE-annotated corpus; output precision, recall and F1-score.
'''
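# Usage (as implied by the argument parser below):
#     python evaluate_extraction.py extracted_idioms.csv annotated_idioms.json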
import json, argparse, csv, random
from collections import Counter
# Read in arguments
parser = argparse.ArgumentParser(description = 'Parameters for PIE detection evaluation')
parser.add_argument('extracted', metavar = 'extracted_idioms.csv', type = str, help = "Specify the file containing the extracted PIEs.")
parser.add_argument('annotated', metavar = 'annotated_idioms.json', type = str, help = "Specify the file containing the annotated PIEs")
args = parser.parse_args()
# Read input data
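# The extracted-PIE file is read as a tab-separated file; the column indices
# used below assume the order: idiom, start offset, end offset, context,
# document id, sentence number.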
extracted_idioms = []
with open(args.extracted, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter = '\t', quoting=csv.QUOTE_MINIMAL, quotechar = '"')
    for csvrow in csvreader:
        extracted_idioms.append({'document_id': csvrow[4], 'sentence_number': csvrow[5], 'idiom': csvrow[0], 'context': unicode(csvrow[3], 'utf-8'), 'start': csvrow[1], 'end': csvrow[2]})
annotated_idioms = json.load(open(args.annotated, 'r'))
# Check that the extracted idioms only come from documents present in the annotated data
assert set([idiom['document_id'] for idiom in extracted_idioms]) <= set([idiom['document_id'] for idiom in annotated_idioms])
# Select only the PIEs from the set of annotated PIE candidates
annotated_idioms = [annotated_idiom for annotated_idiom in annotated_idioms if annotated_idiom['PIE_label'] == 'y']
# Keep track of false negatives
for annotated_idiom in annotated_idioms:
    annotated_idiom['evaluation'] = 'fn'
# Count true/false positives/negatives
# We do not have true negatives
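# (every extracted PIE ends up labelled either 'tp' or 'fp'; annotated PIEs
#  that are never matched keep the 'fn' label assigned above)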
tp = 0.
fp = 0.
fn = 0.
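# An extracted PIE counts as a true positive if it matches an annotated PIE
# on document id, sentence number and (case-insensitive) idiom type.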
for extracted_idiom in extracted_idioms:
    for annotated_idiom in annotated_idioms:
        # Lower-case PIEs for comparison, as they are annotated in lower case, but not necessarily extracted that way
        if extracted_idiom['document_id'] == annotated_idiom['document_id'] and extracted_idiom['sentence_number'] == annotated_idiom['sentence_number'] and extracted_idiom['idiom'].lower() == annotated_idiom['idiom'].lower():
            tp += 1
            extracted_idiom['evaluation'] = 'tp'
            annotated_idiom['evaluation'] = 'tp'
            break
    else: # No break
        fp += 1
        extracted_idiom['evaluation'] = 'fp'
fn = len(annotated_idioms) - tp # False negatives = all missed PIEs = # annotated PIEs - # correctly found PIEs
# Get precision, recall, F1-score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)
# Print results
print '### RESULTS ###'
print 'Total number of annotated PIEs: {0}'.format(len(annotated_idioms))
print 'Total number of extracted PIEs: {0}\n'.format(len(extracted_idioms))
print 'True Positives: {0}\nFalse Positives: {1}\nFalse Negatives: {2}\n'.format(tp, fp, fn)
print 'Precision: {0}%'.format(precision*100)
print 'Recall: {0}%'.format(recall*100)
print 'F1-score: {0}%'.format(f1*100)
# Print examples of classifications
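# show_examples pages through matching examples ten at a time, highlighting the
# PIE span in red using ANSI terminal escape codes.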
def show_examples(idioms, evaluation):
    # Define colours
    stop = '\x1b[0m'
    red = '\x1b[1;31;1m'
    # Count number of examples shown
    count = 0
    for idiom in idioms:
        if idiom['evaluation'] == evaluation:
            # Highlight idiom in context
            try:
                context = idiom['context']
                start = int(idiom['start'])
                end = int(idiom['end'])
            except KeyError:
                context = idiom['sentence']
                start = idiom['offsets'][0][0]
                end = idiom['offsets'][-1][-1]
            highlighted_context = context[:start]
            highlighted_context += red
            highlighted_context += context[start:end]
            highlighted_context += stop
            highlighted_context += context[end:]
            print highlighted_context,
            print '({2} - doc. {0} - sent. {1})'.format(idiom['document_id'], idiom['sentence_number'], idiom['idiom'])
            count += 1
            if count % 10 == 0:
                user_input = unicode(raw_input("Show 10 more examples? (y/n): "), 'utf-8')
                if user_input.lower() != 'y':
                    break
    else: # No break
        print 'No more examples!'
# Prompt and show examples for different classes
# Shuffle idiom lists to avoid seeing same examples again and again
random.shuffle(extracted_idioms)
random.shuffle(annotated_idioms)
user_input = unicode(raw_input("Show examples of classifications? (y/n): "), 'utf-8')
if user_input.lower() == 'y':
    user_input = unicode(raw_input("Show examples of true positives? (y/n): "), 'utf-8')
    if user_input.lower() == 'y':
        show_examples(extracted_idioms, 'tp')
    user_input = unicode(raw_input("Show examples of false positives? (y/n): "), 'utf-8')
    if user_input.lower() == 'y':
        show_examples(extracted_idioms, 'fp')
    user_input = unicode(raw_input("Show examples of false negatives? (y/n): "), 'utf-8')
    if user_input.lower() == 'y':
        show_examples(annotated_idioms, 'fn')
# Split performance for most frequent PIE types in corpus
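# performance_per_type reports precision, recall and F1-score separately for the
# n most frequent PIE types in the annotated data, re-using the per-idiom
# evaluation labels assigned above.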
def performance_per_type(annotated_idioms, extracted_idioms, n):
    most_frequent_types = Counter([x['idiom'] for x in annotated_idioms]).most_common()
    print 'PIE Type' + 17*' ' + 'Count\tPrecision\tRecall\tF1-score'
    for pie_type in most_frequent_types[:n]:
        pie = pie_type[0]
        count = pie_type[1]
        # Lower-case extracted PIEs here as well, to match the case-insensitive comparison used above
        tp = float(len([x['evaluation'] for x in extracted_idioms if x['idiom'].lower() == pie and x['evaluation'] == 'tp']))
        fp = float(len([x['evaluation'] for x in extracted_idioms if x['idiom'].lower() == pie and x['evaluation'] == 'fp']))
        fn = float(len([x['evaluation'] for x in annotated_idioms if x['idiom'] == pie and x['evaluation'] == 'fn']))
        try:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = 2 * (precision * recall) / (precision + recall)
        except ZeroDivisionError:
            precision = 0.
            recall = 0.
            f1 = 0.
        pie += (25 - len(pie)) * ' '
        print '{0}{1}\t{2:.2f}\t\t{3:.2f}\t{4:.2f}'.format(pie, count, precision * 100, recall * 100, f1 * 100)
user_input = unicode(raw_input("Show performance for 25 most frequent PIE types? (y/n): "), 'utf-8')
if user_input.lower() == 'y':
    performance_per_type(annotated_idioms, extracted_idioms, 25)