-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsumcmp.py
executable file
·133 lines (108 loc) · 4.72 KB
/
sumcmp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
import argparse
from collections import Counter
import logging
import math
import sys
from respdiff import cli
from respdiff.database import LMDB
from respdiff.dataformat import DiffReport
from respdiff.query import (
convert_queries,
get_printable_queries_format,
get_query_iterator,
)
# Relative change in the number of usable answers (a fraction, so 0.05 is 5 %)
# at or above which check_usable_answers() logs a warning.
ANSWERS_DIFFERENCE_THRESHOLD_WARNING = 0.05
def check_report_summary(report: DiffReport):
    """Terminate the program when *report* carries no diff summary.

    If ``report.summary`` is ``None`` a critical message is logged and the
    process exits with status 1; otherwise the function simply returns.
    """
    has_summary = report.summary is not None
    if has_summary:
        return

    logging.critical("report doesn't contain diff summary")
    sys.exit(1)
def check_usable_answers(report: DiffReport, ref_report: DiffReport):
    """Warn when the usable-answer count moved notably versus the reference.

    The change is measured relative to the reference report:
    ``|new - ref| / ref``. A warning is logged when that ratio is at or above
    ``ANSWERS_DIFFERENCE_THRESHOLD_WARNING``.

    A reference with zero usable answers has no meaningful relative change
    (and would otherwise divide by zero); in that case a warning is logged
    only if the new report does have usable answers.

    Raises:
        RuntimeError: if either report has no summary.
    """
    if report.summary is None or ref_report.summary is None:
        raise RuntimeError("Report doesn't contain necessary data!")

    usable = report.summary.usable_answers
    ref_usable = ref_report.summary.usable_answers

    if ref_usable == 0:
        # No baseline to divide by: any non-zero count is a notable change,
        # while 0 -> 0 is no change at all.
        if usable != 0:
            logging.warning(
                "Number of usable answers changed from 0 to %s!", usable
            )
        return

    answers_difference = math.fabs(usable - ref_usable) / ref_usable
    if answers_difference >= ANSWERS_DIFFERENCE_THRESHOLD_WARNING:
        logging.warning(
            "Number of usable answers changed by {:.1f} %!".format(
                answers_difference * 100.0
            )
        )
def main():
    """Compare two diff summaries and print how the new report differs.

    Steps, in order:

    1. Parse the command line (config, old/new datafile, LMDB envdir, limit).
    2. Load both reports from JSON and validate them with
       ``check_report_summary`` / ``check_usable_answers``.
    3. Print global and differences statistics through the ``cli`` helpers.
    4. If either summary is non-empty, print the per-field overview, the
       per-field mismatch statistics, and finally the queries behind every
       mismatch, read from the LMDB queries database.

    Fields and mismatches that exist only in the reference report are added
    with empty/zero counts so that "disappeared" entries are still printed.
    """
    cli.setup_logging()
    parser = argparse.ArgumentParser(description="compare two diff summaries")
    cli.add_arg_config(parser)
    parser.add_argument("old_datafile", type=str, help="report to compare against")
    parser.add_argument("new_datafile", type=str, help="report to evaluate")
    # TODO remove when we no longer need to read queries from lmdb
    cli.add_arg_envdir(parser)
    cli.add_arg_limit(parser)

    args = parser.parse_args()
    report = DiffReport.from_json(cli.get_datafile(args, key="new_datafile"))
    field_weights = args.cfg["report"]["field_weights"]
    ref_report = DiffReport.from_json(cli.get_datafile(args, key="old_datafile"))

    check_report_summary(report)
    check_report_summary(ref_report)
    check_usable_answers(report, ref_report)

    cli.print_global_stats(report, ref_report)
    cli.print_differences_stats(report, ref_report)

    if not report.summary and not ref_report.summary:
        # neither report has any differences to break down
        return

    field_counters = report.summary.get_field_counters()
    ref_field_counters = ref_report.summary.get_field_counters()

    # make sure "disappeared" fields show up as well
    for field in ref_field_counters:
        if field not in field_counters:
            field_counters[field] = Counter()

    cli.print_fields_overview(
        field_counters, len(report.summary), ref_field_counters
    )

    # field_weights (from the config) determines which fields are printed
    # and in what order
    for field in field_weights:
        if field not in field_counters:
            continue
        counter = field_counters[field]
        ref_counter = ref_field_counters.get(field, Counter())

        # make sure "disappeared" mismatches show up as well
        for mismatch in ref_counter:
            if mismatch not in counter:
                counter[mismatch] = 0

        cli.print_field_mismatch_stats(
            field, counter, len(report.summary), ref_counter
        )

    # query details
    with LMDB(args.envdir, readonly=True) as lmdb:
        lmdb.open_db(LMDB.QUERIES)

        # queries for every key in each summary, computed once and reused
        # for each mismatch printed below
        queries_all = convert_queries(
            get_query_iterator(lmdb, report.summary.keys())
        )
        ref_queries_all = convert_queries(
            get_query_iterator(lmdb, ref_report.summary.keys())
        )

        for field in field_weights:
            if field not in field_counters:
                continue

            # ensure "disappeared" mismatches are shown: take the union of
            # the mismatches seen in the new and in the reference report
            field_mismatches = dict(report.summary.get_field_mismatches(field))
            ref_field_mismatches = dict(
                ref_report.summary.get_field_mismatches(field)
            )
            mismatches = set(field_mismatches.keys())
            mismatches.update(ref_field_mismatches.keys())

            for mismatch in mismatches:
                qids = field_mismatches.get(mismatch, set())
                queries = convert_queries(get_query_iterator(lmdb, qids))
                ref_queries = convert_queries(
                    get_query_iterator(
                        lmdb, ref_field_mismatches.get(mismatch, set())
                    )
                )
                cli.print_mismatch_queries(
                    field,
                    mismatch,
                    get_printable_queries_format(
                        queries, queries_all, ref_queries, ref_queries_all
                    ),
                    args.limit,
                )
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()