-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path22_Cluster_Stats.py
executable file
·59 lines (54 loc) · 2.4 KB
/
22_Cluster_Stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/python3
import os
import logging
from collections import Counter
from amr_coref.utils.logging import setup_logging
from amr_coref.coref.coref_mention_data import CorefMentionData
from amr_coref.utils.data_utils import load_json
from amr_coref.coref.vocab_embeddings import load_word_set
def percent(part, whole):
    """Return ``part`` as a percentage of ``whole``.

    Returns 0.0 when ``whole`` is zero so that an empty data set produces a
    report instead of a ZeroDivisionError.
    """
    return 100 * part / whole if whole else 0.0


def count_mention_tokens(mentions_by_doc):
    """Tally mention tokens, split by cluster membership.

    Args:
        mentions_by_doc: mapping whose values are iterables of mention
            objects exposing ``token`` and ``cluster_id`` attributes.

    Returns:
        A ``(token_in_ctr, token_no_ctr)`` pair of Counters keyed by token.
        The first counts mentions whose ``cluster_id`` is not None, the
        second counts mentions whose ``cluster_id`` is None.
    """
    token_in_ctr = Counter()
    token_no_ctr = Counter()
    for mentions in mentions_by_doc.values():
        for mention in mentions:
            # Identity test: a cluster id of 0 still counts as "in a cluster".
            if mention.cluster_id is not None:
                token_in_ctr[mention.token] += 1
            else:
                token_no_ctr[mention.token] += 1
    return token_in_ctr, token_no_ctr


def summary_lines(token_in_ctr, token_no_ctr, men_set_fn):
    """Build the three summary lines of the report.

    Args:
        token_in_ctr: Counter of tokens for mentions that are in a cluster.
        token_no_ctr: Counter of tokens for mentions that are not.
        men_set_fn: value shown in the header line (the mention-token file).

    Returns:
        A list of three strings: the header, the mention-level statistics
        and the unique-token statistics.
    """
    num_men_in = sum(token_in_ctr.values())
    num_mentions = num_men_in + sum(token_no_ctr.values())
    num_tok_in = len(token_in_ctr)
    num_tokens = len(token_in_ctr.keys() | token_no_ctr.keys())
    return [
        'Stats for CorefMentionData(men_set_fn=%s)' % str(men_set_fn),
        'There are {:,} mentions and {:,} are in a cluster = {:.1f}%'.format(
            num_mentions, num_men_in, percent(num_men_in, num_mentions)),
        'There are {:,} unique tokens and {:,} appear in clusters = {:.1f}%'.format(
            num_tokens, num_tok_in, percent(num_tok_in, num_tokens)),
    ]


def cluster_token_rows(token_in_ctr, token_no_ctr):
    """Return per-token rows for every token that appears in a cluster.

    Each row is ``(token, in_count, no_count, pct)`` where ``pct`` is the
    percentage of that token's mentions that are in a cluster.  Rows are
    ordered by ``in_count`` descending; ``Counter.most_common()`` already
    yields that order, so no further sort is needed.
    """
    rows = []
    for token, in_count in token_in_ctr.most_common():
        no_count = token_no_ctr[token]  # Counter gives 0 for unseen tokens
        rows.append((token, in_count, no_count,
                     percent(in_count, in_count + no_count)))
    return rows


def main():
    """Load the coreference data and print cluster statistics to stdout."""
    setup_logging(level=logging.ERROR)
    # Use 'data/tdata/test.json.gz' instead to report on the test file.
    coref_fpath = 'data/tdata/train.json.gz'
    men_set_fn = 'data/tdata/mention_tokens.txt'
    num_print = 10  # number of table rows to show

    # Load data and get the token counts
    print('Loading', coref_fpath)
    mention_set = load_word_set(men_set_fn)
    cr_data = load_json(coref_fpath)
    mdata = CorefMentionData(cr_data, mention_set)
    token_in_ctr, token_no_ctr = count_mention_tokens(mdata.mentions)
    print()

    # Stats for mention data
    for line in summary_lines(token_in_ctr, token_no_ctr, men_set_fn):
        print(line)
    print()

    # Print counts / percentages of tokens in clusters
    rows = cluster_token_rows(token_in_ctr, token_no_ctr)
    print('%-24s %12s %12s' % ('Token', '#Cluster', '#No-Cluster'))
    print('-'*60)
    for token, in_count, no_count, pct in rows[:num_print]:
        print('%-24s %12d %12d %6.1f%%' % (token, in_count, no_count, pct))
    print()


if __name__ == '__main__':
    main()