forked from guyfe/LongSumm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiscourse_analysis.py
97 lines (68 loc) · 3.4 KB
/
discourse_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
'''
# Use ROUGE and BERTScore at sentence level to compare each sentence
# in each section with each sentence in the summary, then average
# the scores per section.
'''
import os, json, re
import pandas as pd
import numpy as np
from bert_score import BERTScorer
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
# Load the LongSumm papers and flatten every usable one into a DataFrame row.
longsumm_fp = "./abstractive_summaries/all_longsumm_abs_full_science_parse.json"

# The context manager closes the handle (the previous bare open() never did);
# JSON text is UTF-8 by specification, so do not rely on the platform default.
with open(longsumm_fp, "r", encoding="utf-8") as longsumm_file:
    longsumm_json = json.load(longsumm_file)

df_columns = ['ls_id', 'title', 'summary', 'sections', 'abstract', 'full_text', "year"]
paper_rows = []
for id_str, content in longsumm_json.items():
    # A usable entry has both the parsed paper ("X") and its summary ("Y") ...
    if "X" not in content or "Y" not in content:
        continue
    paper = content["X"]
    # ... and the paper must carry a non-"empty" id and a list of sections.
    if "id" not in paper or paper["id"] == "empty" or "sections" not in paper:
        continue

    sections = paper["sections"]
    paper_rows.append({
        'ls_id': id_str,
        'title': paper.get("title", "N/A"),
        'summary': " ".join(content["Y"]["summary"]),
        'sections': sections,
        'abstract': paper.get("abstractText", "N/A"),
        # A section without "text" contributes an empty string, matching how
        # the scoring loop later treats such sections, instead of raising.
        'full_text': " ".join(sec.get("text", "") for sec in sections),
        "year": paper.get("year", "N/A"),
    })

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call. Passing `columns` also keeps the schema when
# no paper qualifies. Rows now get a unique 0..n-1 index (the append-based
# version labelled every row 0).
df = pd.DataFrame(paper_rows, columns=df_columns)
print(len(df))
# Scorers shared by the rest of the script: ROUGE-1/2/L with stemming enabled
# and an English BERTScore model.
metrics = ['rouge1', 'rouge2', 'rougeL']
rs = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
bs = BERTScorer(lang="en")

# Smoke test: score one sentence pair with each scorer and print the raw output.
example_1, example_2 = "This is an example sentence.", "An example sentence, this is."
example_rouge = rs.score(example_1.strip(), example_2.strip())
print(example_rouge)
# BERTScore returns precision, recall, f-score.
example_bert = bs.score([example_1.strip()], [example_2.strip()])
print(example_bert)
# For every paper, compare each sentence of each section with each sentence of
# the summary and store the per-section average of every metric.
paper_sec_scores = []
n_papers = len(df)
score_keys = ('BERTScore', 'rouge1', 'rouge2', 'rougeL')

for i, (_, row) in enumerate(df.iterrows(), start=1):
    sections = row['sections']
    # Strip the summary sentences once per paper rather than once per pair.
    summary_sents = [sent.strip() for sent in sent_tokenize(row['summary'])]
    section_scores = {}
    print(f'{i}/{n_papers}')

    for section in sections:
        heading = section.get("heading", "[N/A]")
        text_sents = [sent.strip() for sent in sent_tokenize(section.get("text", ""))]
        pairs = [(text_sent, summary_sent)
                 for text_sent in text_sents
                 for summary_sent in summary_sents]

        if pairs:
            # Score all pairs of the section in one BERTScore call instead of
            # one model invocation per pair; [2] selects the F-scores.
            candidates = [text_sent for text_sent, _ in pairs]
            references = [summary_sent for _, summary_sent in pairs]
            bert_f_scores = bs.score(candidates, references)[2].tolist()

            rouge_by_metric = {}
            for text_sent, summary_sent in pairs:
                for name, result in rs.score(text_sent, summary_sent).items():
                    # NOTE(review): this averages F-measure with recall (not
                    # precision with recall) -- kept as is; confirm intended.
                    rouge_by_metric.setdefault(name, []).append(
                        np.average([result.fmeasure, result.recall]))

            averages = {
                'BERTScore': np.average(bert_f_scores),
                'rouge1': np.average(rouge_by_metric['rouge1']),
                'rouge2': np.average(rouge_by_metric['rouge2']),
                'rougeL': np.average(rouge_by_metric['rougeL']),
            }
        else:
            # No sentence pairs (empty section text or empty summary): keep the
            # NaN that averaging an empty list used to produce, without the
            # RuntimeWarning and without calling the scorers on empty input.
            averages = dict.fromkeys(score_keys, float("nan"))

        # NOTE(review): scores are keyed by heading, so sections that share a
        # heading (including several falling back to "[N/A]") overwrite each
        # other and only the last one is kept.
        section_scores[heading] = averages

    paper_sec_scores.append(section_scores)
    # Rewrite the record after every paper so partial results survive a crash.
    with open("./sec_scores_record.txt", "w") as output_file:
        json.dump(paper_sec_scores, output_file)

df['section_scores'] = paper_sec_scores
df.to_csv("section_analysis_ds.csv", sep="\t")