forked from Sotwi-zz/NL-Augmenter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransformation.py
86 lines (75 loc) · 3.38 KB
/
transformation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import itertools
import random
from typing import List, Tuple
from interfaces.TaggingOperation import TaggingOperation
from tasks.TaskTypes import TaskType
"""
A tagging implementation for NER systems.
John Smith cooked a curry in the evening. --> John D. Smith cooked a curry in the evening.
"""
class LongerNamesNer(TaggingOperation):
    """Lengthen the first person name found in a tagged sentence.

    Random upper-case letters are inserted directly after the first token of
    the first person entity, and every inserted letter is tagged as a
    continuation (``I-``) of that entity, e.g. ``John Smith`` may become
    ``John D K Smith``.

    Two tag flavours are recognised: ``PERSON`` is used when a ``B-PERSON``
    tag occurs in the input, ``PER`` otherwise.
    """

    tasks = [TaskType.TEXT_TAGGING]
    languages = "All"
    no_of_repeats = 2  # values should not be larger than 3-4

    def __init__(self, seed=0, no_of_repeats=2, max_outputs=1):
        """Configure the transformation.

        Args:
            seed: Forwarded to ``TaggingOperation.__init__``; ``generate``
                later reads it back as ``self.seed`` (presumably stored by
                the base class -- not visible from this file).
            no_of_repeats: How many random letters to insert into the name.
            max_outputs: Forwarded to ``TaggingOperation.__init__``;
                ``generate`` reads it back as ``self.max_outputs``.
        """
        super().__init__(seed, max_outputs=max_outputs)
        self.no_of_repeats = no_of_repeats

    def generate(
        self, token_sequence: List[str], tag_sequence: List[str]
    ) -> List[Tuple[List[str], List[str]]]:
        """Return perturbed copies of a tagged sentence.

        The inputs are never modified; every output is built from a fresh
        copy.

        Args:
            token_sequence: The tokens of the sentence.
            tag_sequence: One tag per token, aligned with ``token_sequence``.

        Returns:
            A list of ``(tokens, tags)`` pairs:

            * empty when the sentence has no ``B-PER`` / ``B-PERSON`` tag;
            * otherwise ``self.max_outputs`` pairs. When the first person
              entity spans more than one token, each pair has
              ``self.no_of_repeats`` random letters inserted after the
              entity's first token; when it is a single token, each pair is
              an unmodified copy of the input.

        Raises:
            AssertionError: If the two sequences differ in length.
        """
        if len(token_sequence) != len(tag_sequence):
            # Raised explicitly instead of via `assert` so that the check is
            # not stripped under `python -O`; the exception type is kept so
            # existing callers see the same error as before.
            raise AssertionError(
                "Lengths of `token_seq` and `tag_seq` should be the same"
            )

        # Seeds the module-level generator, so repeated calls on the same
        # instance produce the same letters.
        random.seed(self.seed)

        entity = "PERSON" if "B-PERSON" in tag_sequence else "PER"
        begin_tag = "B-" + entity
        inside_tag = "I-" + entity

        # TODO: Currently perturb only the first name in the sentence. Needs to handle for other names.
        if begin_tag not in tag_sequence:
            return []

        insert_at = tag_sequence.index(begin_tag) + 1
        # Only names of two or more tokens are lengthened; this does not
        # depend on the loop below, so it is evaluated once.
        spans_several_tokens = (
            insert_at < len(tag_sequence)
            and tag_sequence[insert_at] == inside_tag
        )

        perturbed_sentences = []
        for _ in range(self.max_outputs):
            tokens = list(token_sequence)
            tags = list(tag_sequence)
            if spans_several_tokens:
                letters = [
                    chr(random.randint(ord("A"), ord("Z")))
                    for _ in range(self.no_of_repeats)
                ]
                # Historically each letter was inserted at the same index,
                # which leaves the letters in reverse order of generation.
                # Keep that order so seeded outputs stay unchanged.
                letters.reverse()
                tokens[insert_at:insert_at] = letters
                tags[insert_at:insert_at] = [inside_tag] * len(letters)
            perturbed_sentences.append((tokens, tags))
        return perturbed_sentences
"""
# Sample code to demonstrate usage. Can also assist in adding test cases.
if __name__ == '__main__':
import json
from TestRunner import convert_to_snake_case
tf = LongerNamesNer(max_outputs=3)
test_cases = []
src = ["Manmohan Singh served as the PM of India .",
"Neil Alden Armstrong was an American astronaut",
"Katheryn Elizabeth Hudson is an American singer",
"The owner of the mall is Anthony Gonsalves.",
"Roger Michael Humphrey Binny ( born 19 July 1955 ) is an Indian former cricketer ."]
tgt = ["B-PER I-PER O O O O O B-COUNTRY O",
"B-PER I-PER I-PER O O B-COUNTRY O",
"B-PER I-PER I-PER O O B-COUNTRY O",
"O O O O O O B-PER I-PER",
"B-PER I-PER I-PER I-PER O O B-DATE I-DATE I-DATE O O O O O O O"
]
for idx, (token_sequence, tag_sequence) in enumerate(zip(src, tgt)):
sentences = tf.generate(token_sequence.split(" "), tag_sequence.split(" "))
test_cases.append({
"class": tf.name(),
"inputs": {"token_sequence": token_sequence, "tag_sequence": tag_sequence},
"outputs": []}
)
for sentence, target in sentences:
test_cases[idx]["outputs"].append({"token_sequence": " ".join(sentence), "tag_sequence": " ".join(target)})
json_file = {"type": convert_to_snake_case(tf.name()), "test_cases": test_cases}
print(json.dumps(json_file))
"""