forked from michaelmilleryoder/fanfiction-nlp
chapter_merge.py
import csv, json, sys
def internal_merge(chap_json):
    '''Merge clusters that share the same character name within a single chapter JSON.'''
    with open(chap_json, 'r') as f:
        chap_dict = json.load(f)
    char_dict = chap_dict['clusters']
    all_unique_names = list(set(char['name'] for char in char_dict))
    # One output cluster per unique character name.
    merged_dict = {'document': chap_dict['document'],
                   'clusters': [{'name': '', 'mentions': []} for _ in range(len(all_unique_names))]}
    for clu in char_dict:
        name = clu['name']
        idx = all_unique_names.index(name)
        merged_dict['clusters'][idx]['name'] = name
        merged_dict['clusters'][idx]['mentions'] += clu['mentions']
    #print([clu['name'] for clu in merged_dict['clusters']])  # sanity check
    with open(chap_json[:-5] + '_merged.json', 'w') as f2:
        json.dump(merged_dict, f2)
    return merged_dict
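
# Example of the cluster schema this file operates on (illustrative values, inferred
# from the fields used above: each cluster has a 'name' and a list of 'mentions'
# with [start, end] token positions and the mention text):
#   input  clusters: [{'name': 'Harry', 'mentions': [{'position': [0, 0], 'text': 'Harry'}]},
#                     {'name': 'Harry', 'mentions': [{'position': [5, 5], 'text': 'he'}]}]
#   output clusters: [{'name': 'Harry', 'mentions': [{'position': [0, 0], 'text': 'Harry'},
#                                                    {'position': [5, 5], 'text': 'he'}]}]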

def merge_two_chapters_clusters(chapt1_json, chapt2_json):
    '''INPUT: chapter 1 and chapter 2 JSON files
    OUTPUT: merged JSON with the same headers
    NOTE: this only merges characters that overlap between the two chapters, which filters
    out many minor characters that appear in only one chapter; that may be problematic later.'''
    # Collapse duplicate-name clusters within each chapter before merging across chapters.
    chapt1_dict = internal_merge(chapt1_json)
    chapt2_dict = internal_merge(chapt2_json)
    # Chapter 2 mention positions shift by the length of chapter 1's document.
    last_idx_chap1 = len(chapt1_dict['document'])
    new_dict = {'document': chapt1_dict['document'] + chapt2_dict['document'], 'clusters': []}
    chapt1_clusters = chapt1_dict['clusters']
    chapt2_clusters = chapt2_dict['clusters']
    for clu1 in chapt1_clusters:
        for clu2 in chapt2_clusters:
            if clu1['name'] == clu2['name']:
                # Shift chapter 2 mention positions, then concatenate the two mention lists.
                shifted_mentions = [{'position': [ment['position'][0] + last_idx_chap1,
                                                  ment['position'][1] + last_idx_chap1],
                                     'text': ment['text']} for ment in clu2['mentions']]
                merged_clu = {'name': clu1['name'],
                              'mentions': clu1['mentions'] + shifted_mentions}
                new_dict['clusters'].append(merged_clu)
    print([clu['name'] for clu in new_dict['clusters']])  # sanity check
    with open('merged_chapters.json', 'w') as f:
        json.dump(new_dict, f)
    return new_dict

def merge_all_chapters(dir, outfile_name):
    '''INPUT: a directory of the JSON outputs for all chapters of a book
    OUTPUT: one large JSON file with everything, exported as JSON'''
    pass
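
# A minimal sketch (not part of the original file) of how merge_all_chapters could be
# implemented, assuming chapter JSONs sort correctly by filename and follow the same
# {'document': [...], 'clusters': [...]} schema used above. The name
# _merge_all_chapters_sketch is hypothetical.
def _merge_all_chapters_sketch(dir_path, outfile_name):
    import glob, os
    chapter_files = sorted(glob.glob(os.path.join(dir_path, '*.json')))
    merged = {'document': [], 'clusters': []}
    name_to_idx = {}  # character name -> index into merged['clusters']
    for chap_file in chapter_files:
        # Collapse duplicate-name clusters within the chapter first.
        chap = internal_merge(chap_file)
        # Mention positions shift by the running length of the merged document.
        offset = len(merged['document'])
        merged['document'] += chap['document']
        for clu in chap['clusters']:
            shifted = [{'position': [m['position'][0] + offset, m['position'][1] + offset],
                        'text': m['text']} for m in clu['mentions']]
            if clu['name'] not in name_to_idx:
                name_to_idx[clu['name']] = len(merged['clusters'])
                merged['clusters'].append({'name': clu['name'], 'mentions': []})
            merged['clusters'][name_to_idx[clu['name']]]['mentions'] += shifted
    with open(outfile_name, 'w') as f:
        json.dump(merged, f)
    return merged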

def get_char_stats(char_json):
    '''INPUT: JSON file of characters
    OUTPUT: readable stats report (maybe CSV?) on character stats'''
    pass
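
# A minimal sketch (not part of the original file) of how get_char_stats could write a
# CSV report, assuming char_json follows the cluster schema above and that per-character
# mention counts and position ranges are the statistics of interest. The names
# _get_char_stats_sketch and out_csv are hypothetical.
def _get_char_stats_sketch(char_json, out_csv='char_stats.csv'):
    with open(char_json, 'r') as f:
        data = json.load(f)
    with open(out_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'num_mentions', 'first_position', 'last_position'])
        for clu in data['clusters']:
            positions = [m['position'][0] for m in clu['mentions']]
            writer.writerow([clu['name'], len(clu['mentions']),
                             min(positions) if positions else '',
                             max(positions) if positions else ''])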
if __name__=="__main__":
chap1 = sys.argv[1]
chap2 = sys.argv[2]
#internal_merge(chap1)
merge_two_chapters_clusters(chap1,chap2)