"""
python liu_process.py -n liuetal
download the input file first using liu_download.sh into ./data
Uses the data set published in this paper
Liu, X., Eshghi, A., Swietojanski, P. and Rieser, V., 2019.
Benchmarking natural language understanding services for building conversational agents.
arXiv preprint arXiv:1903.05566."
https://arxiv.org/pdf/1903.05566.pdf
https://github.com/xliuhw/NLU-Evaluation-Data
"""
# *********************************************************************************************************************
# standard imports
import datetime
import copy
import re

# third party imports
import pandas
import numpy
import click
import humanfirst

# regex matching annotations of the form [entity_name : entity value]
# "-" is placed last in each character class so it is treated as a literal rather than a range
re_deannotate = re.compile(r"\[\s*([A-Za-z0-9_-]+)\s*:\s*([A-Za-z0-9@_’'. -]+)\]")
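# for illustration, on an utterance in the dataset's annotation style:
#   "wake me up at [time : five am]" -> group(1) = "time", group(2) = "five am"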


@click.command()
@click.option('-n', '--name', type=str, default='liuetal', help='Name used for all files')
@click.option('-s', '--sample', type=int, default=0, help='n conversations to sample from dataset')
@click.option('-w', '--workspace', type=str, default='', help='name of a workspace to reload intents from')
def main(name: str, sample: int, workspace: str):
    """Main Function"""

    # TODO - test workspace function
    # read file
    dtypes = {
        'userid': str,
        'answerid': str,
        'scenario': str,
        'status': str,
        'intent': str,
        'answer_annotation': str,
        'notes': str,
        'suggested_entities': str,
        'answer_normalised': str,
        'answer': str,
        'question': str
    }
    df = pandas.read_csv(f'./data/{name}.csv', encoding='utf8', sep=';', dtype=dtypes, keep_default_na=False)
    assert isinstance(df, pandas.DataFrame)
    print(f'Read file of df.shape: {df.shape}')
    print(df.columns)
    print("user id summary")
    print(df[["userid", "answerid"]].groupby("userid").count())

    # drop from the original set the items the paper decided were irrelevant/wrong
    drop_statuses = ['IRR', 'IRR_XL', 'IRR_XR']
    # full list of statuses kept: ADD (addition by annotator),
    # MOD (see notes for modification),
    # MOD_XL (see notes for modification), null, blank
    df = df[~df['status'].isin(drop_statuses)]
    print(f'After dropping {drop_statuses} shape is {df.shape}')

    # answerid has many duplicates, so build a unique id by hashing the annotated answer text
    df["uid"] = df["answer_annotation"].apply(humanfirst.objects.hash_string)

    # "music" appears as both an intent and a scenario, and an intent named "query" appears in many scenarios.
    # TODO: look at humanfirst.py and decide whether the taxonomy handling needs updating.
    # In the meantime the workaround is making the intent scenario_intent under parent scenario.
    df['scenario_intent'] = df['scenario'] + '_' + df['intent']
    print(df['scenario_intent'].unique())
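    # e.g. scenario "alarm" + intent "query" gives "alarm_query"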

    # allow sampling for a smaller subset
    if sample > 0:
        df = df.sample(sample)
        print(f'Sampled down to {sample} conversations')
    else:
        print('No sampling down, full set')

    # create workspaces
    labelled_workspace = humanfirst.objects.HFWorkspace()
    unlabelled_workspace = humanfirst.objects.HFWorkspace()

    # if we have already created intents, reload them from source control.
    # we read from CSV to represent a cleansed source-control dataset without tags etc.,
    # i.e. like a user might have before starting to use humanfirst
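    # hypothetical row in the intents CSV (no header): utterance, then slash-separated hierarchy
    #   wake me up at five am,alarm/set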
    if workspace:
        df_intents = pandas.read_csv(f'./workspaces/{workspace}/{workspace}-intents.csv',
                                     delimiter=',',
                                     names=['utterance', 'slash_sep_hier'])
        assert isinstance(df_intents, pandas.DataFrame)
        df_intents.apply(build_intents_from_file, args=[labelled_workspace], axis=1)

    # the Liu set has no datetimes on the data.
    # shard the data across 30 days of an arbitrary month for performance in HumanFirst studio
    df['day'] = numpy.random.randint(0, 30, df.shape[0])
    start_datetime = datetime.datetime(2022, 5, 1, 0, 0, 0)
    df["created_at"] = df["day"].apply(add_datetimes, args=[start_datetime])
    print("Distribution by day is")
    print(df[['created_at', 'uid']].groupby('created_at').count())

    # create deannotated text
    df['deannotated_text'] = df['answer_annotation'].apply(deannotate)

    # check for duplicates when lowercased
    before = df.shape[0]
    print(f'Before deduplication {before}')
    df["deannotated_lower"] = df['deannotated_text'].str.lower()
    df.drop_duplicates(subset="deannotated_lower", inplace=True)
    after = df.shape[0]
    print(f'After deduplication {after}')
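    # note: drop_duplicates keeps the first occurrence by default, so the first casing variant seen survives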

    # create an example for each row
    df = df.apply(create_example, axis=1, args=[unlabelled_workspace, labelled_workspace])

    # write unlabelled
    with open(f'./data/{name}_unlabelled.json', 'w', encoding='utf8') as file_out:
        unlabelled_workspace.write_json(file_out)

    # write labelled
    with open(f'./data/{name}_labelled.json', 'w', encoding='utf8') as file_out:
        labelled_workspace.write_json(file_out)


def create_example(row: pandas.Series,
                   unlabelled_workspace: humanfirst.objects.HFWorkspace,
                   labelled_workspace: humanfirst.objects.HFWorkspace) -> pandas.Series:
    '''Parse a single utterance to an example'''

    # HFMetadata is just dict[str, Any]
    # extract these keys into the metadata dict
    keys_to_extract = ['userid', 'answerid', 'scenario', 'intent',
                       'status', 'notes', 'suggested_entities', 'question']
    convo_metadata_dict = {}
    for key in keys_to_extract:
        if row[key]:
            convo_metadata_dict[key] = str(row[key])
        else:
            convo_metadata_dict[key] = ''

    # Tags
    tags = []
    tags_to_label = ['scenario']  # 'scenario_subflow' removed as too noisy with speaker role and seq
    for tag in tags_to_label:
        tags.append(unlabelled_workspace.tag(row[tag]))
        labelled_workspace.tag(row[tag])

    # Context with the conversation_id linking utterances (here none, just role: client)
    context = humanfirst.objects.HFContext(f'convo-{row["answerid"]}', 'conversation', 'client')
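    # (the positional args above appear to be: document id, context type, speaker role)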

    # Create the unlabelled example without intents
    example = humanfirst.objects.HFExample(
        text=row['deannotated_text'],
        id=f'example-unlabelled-{row["uid"]}',
        created_at=row['created_at'],
        intents=[],
        tags=tags,
        metadata=copy.deepcopy(convo_metadata_dict),
        context=context
    )

    # add to the unlabelled_workspace
    unlabelled_workspace.add_example(example)

    # get or create the intent in the labelled_workspace using parent as scenario and child as intent,
    # i.e. alarm/query, alarm/remove, with no metadata
    intents = [labelled_workspace.intent(name_or_hier=[row['scenario_intent']])]
    labelled_example = humanfirst.objects.HFExample(
        text=row["deannotated_text"],
        id=f'example-labelled-{row["uid"]}',
        created_at=row['created_at'],
        intents=intents,
        tags=[],
        metadata={},
        context={}
    )
    labelled_workspace.add_example(labelled_example)

    # return the row so the df.apply call in main keeps a DataFrame rather than a Series of None
    return row


def deannotate(text: str) -> str:
    """Deannotates a given string, replacing each [entity : value] annotation with just the value"""
    findalls = re_deannotate.findall(text)
    output_text = ''
    for _ in findalls:
        matches = re_deannotate.search(text)
        assert isinstance(matches, re.Match)
        output_text = output_text + text[0:matches.start()] + matches.group(2)
        text = text[matches.end():]
    output_text = output_text + text
    return output_text
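
# a quick illustration of deannotate, on a made-up utterance in the dataset's annotation style:
#   deannotate("wake me up at [time : five am] [date : this week]")
#   returns "wake me up at five am this week"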


# TODO: move into humanfirst
def build_intents_from_file(row: pandas.Series, labelled_workspace: humanfirst.objects.HFWorkspace):
    """Build an intent from a slash-separated hierarchy and attach the utterance as an example"""
    hierarchy = str(row['slash_sep_hier']).split('/')
    labelled_workspace.example(row['utterance'], intents=[labelled_workspace.intent(hierarchy)])


def add_datetimes(days: int, start_datetime: datetime.datetime) -> datetime.datetime:
    """Creates a timestamp n days after the start_datetime"""
    return start_datetime + datetime.timedelta(days=days)


if __name__ == '__main__':
    main()  # pylint: disable=no-value-for-parameter