-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathML.py
70 lines (57 loc) · 2.73 KB
/
ML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path
nlp = spacy.load('en_core_web_sm')
TRAIN_DATA = [
("Kinder Capital has raised £81 million in its second seed fund", {"entities": [(0, 14, "ORG")]}),
("Airmeet has raised $12 million in Series A funding led by Sequoia Capital", {"entities": [(0, 7, "ORG")]}),
("veem raised $31M from Truist Ventures", {"entities": [(0, 4, "ORG")]}),
("AppDirect raised $185 million in growth funding", {"entities": [(0, 9, "ORG")]}),
("ApplyBoard has raised C$70M (US$55M) in Series C extension funding", {"entities": [(0, 10, "ORG")]}),
("eFileCabinet closed an $11.5m Series C funding", {"entities": [(0, 12, "ORG")]}),
("AnalytIQ Sports Technologies has raised angel funding of an undisclosed amount", {"entities": [(0, 27, "ORGT")]}),
("WM Motor raised 10 billion yuan ($1.47 billion)", {"entities": [(0, 9, "ORG")]}),
("KRE8.TV $1.1 million Seed round led by Mellanox co-founder and President Eyal Waldman",
{"entities": [(0, 7, "ORG")]}),
("Phytolon secured $4.1m", {"entities": [(0, 8, "ORG")]}),
("Fourth Partner Energy raised $16 million in mezzanine funding from investment funds",
{"entities": [(0, 21, "ORG")]}),
]
ner = nlp.get_pipe("ner")
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*unaffected_pipes):
for iteration in range(30):
# shuufling examples before every iteration
random.shuffle(TRAIN_DATA)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(
texts, # batch of texts
annotations, # batch of annotations
drop=0.5, # dropout - make it harder to memorise data
losses=losses,
)
doc = nlp("daksh raised $1 from daksh in seed funding")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
# Save the model to directory
from pathlib import Path
output_dir = Path('/Desktop/')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
# Save the model to directory
output_dir = Path('/content/')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])