-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsklearn_crf.py
32 lines (28 loc) · 811 Bytes
/
sklearn_crf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import sklearn_crfsuite
import re,os
def tokenit(path1):
    """Read a tab-separated token/tag file and return two parallel lists.

    The file text is split on every tab and newline, so for a file made of
    ``token<TAB>tag`` lines the even-indexed fields are the tokens and the
    odd-indexed fields are the tags.

    Args:
        path1: Path of the text file to read.

    Returns:
        A ``(x_train, y_train)`` tuple: the list of tokens and the list of
        tags.  The lists have equal length when every line holds exactly one
        token/tag pair; a length mismatch indicates a malformed file (for
        example a line without a tab) and is left for the caller to detect.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default
    # applies -- confirm the dataset's encoding before pinning one.
    with open(path1, 'r') as rf:
        strr = rf.read()

    # Remove only the final line terminator.  Splitting text that ends in a
    # newline yields a trailing empty field; dropping the terminator first
    # (instead of unconditionally discarding the last token) keeps the last
    # pair when the file has no trailing newline.
    if strr.endswith('\n'):
        strr = strr[:-1]
    if not strr:
        return [], []

    list_x_y = re.split(r'[\n\t]', strr)
    x_train = list_x_y[0::2]
    y_train = list_x_y[1::2]
    return x_train, y_train
# Directory holding the tokenized token/tag files used for training.
DATA_DIR = '/home/mm/FDDC_datasets_dir/tokenized_datasets_for_anago/chongzu/'

# Flat, parallel lists of tokens and tags accumulated over the accepted files.
x_train = []
y_train = []

# os.listdir() returns names in arbitrary order, so sort before taking the
# first three entries; otherwise the selected subset can differ between runs.
for i in sorted(os.listdir(DATA_DIR))[0:3]:
    x, y = tokenit(os.path.join(DATA_DIR, i))
    if len(x) == len(y):
        # Only files that parsed into matching token/tag lists are added, so
        # x_train and y_train always stay the same length.
        x_train += x
        y_train += y
        print("It is OK {}".format(i))
    else:
        # The file did not split into token/tag pairs; skip it and report.
        print("baaaaad")
# Hyper-parameters for the CRF: L-BFGS training with c1/c2 regularization
# coefficients of 0.1 each, capped at 100 iterations, and with all possible
# label transitions enabled.
_crf_options = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**_crf_options)

# NOTE(review): x_train / y_train are built above as flat lists of strings,
# whereas sklearn_crfsuite documents fit(X, y) as taking a list of sequences
# (each a list of feature dicts) and a list of label sequences -- confirm the
# input shape is what is intended here.
crf.fit(x_train, y_train)