-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathMEMM_2.py
203 lines (184 loc) · 6.74 KB
/
MEMM_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import nltk
from nltk.stem.porter import *
from nltk.classify import MaxentClassifier
import pickle
import os,sys
from io import open
reload(sys)
sys.setdefaultencoding('utf8')
f = open('my_classifier.pickle', 'rb')
training_file = open("train.np", "rb")
testing_file = open("dev.np", "rb")
output_file = open("boi_output.txt", "wb")
change_of_sentence_flag = 0 #a marker for the end of sentence
BOI_list = ['B-NP', 'I-NP', 'O']
boi_full_list = [] #store all the boi tags that occur in the training set
boi_end_list = [] #store boi tags that are at the end of the sentence
wordStartList = [] #store words that are begining of the sentence
labeled_features = []#store features from the training set
maxent_classifier = pickle.load(f)
change_of_sentence_flag = 0 #a marker for the end of sentence
wordStartList = [] #store words that are begining of the sentence
f.close()
#****************************************************************building input features part.1
previous_BOI = "start"
input_file = training_file
for line in input_file:
s = re.match(r'^\s*$', line) #find empty line
if s:
change_of_sentence_flag = 1
previous_BOI = "start"
else:
sentenceList = line.split()
word = sentenceList[0]
tag = sentenceList[1]
boi = sentenceList[2]
#store words that are begining of the sentence
if change_of_sentence_flag == 1:
wordStartList.append(word)
boi_end_list.append(boi_full_list[-1])
change_of_sentence_flag = 0
boi_full_list.append(boi)
item = word, tag, boi, previous_BOI
labeled_features.append(item)
previous_BOI = boi
input_file.close()
#****************************************************************calculate End Transition
#calculate the End prior
dicE = {} #temporarry dic
countTag = 0
countEnd = 0
#calculate the prior (End|state) = C(state, End)/C(state)
for i in BOI_list:
for j in range(len(boi_end_list)):
for f in boi_full_list:
if j == 0:
if i == f:
countTag = countTag + 1
if i == boi_end_list[j]:
countEnd = countEnd + 1
ProbE = format(countEnd/(countTag*1.0), '.5f')
dicE.update({i: {"END":ProbE}})
countEnd = 0
countTag = 0
#****************************************************************building input features part.2
def MEMM_features(word, tag, previous_BOI):
stemmer = PorterStemmer()
features = {}
features['current_word'] = word
features['current_tag'] = tag
puc = '-'.decode("utf-8")
features['capitalization'] = word[0].isupper()
features['start_of_sentence'] = word in wordStartList
features['cap_start'] = word not in wordStartList and word[0].isupper()
features['previous_NC'] = previous_BOI
return features
#********************************************************************Viterbi
def MEMM(wordList,tagList):
BOI_list = ['B-NP', 'I-NP', 'O']
w1 = wordList[0] #the first word of the sentence
t1 = tagList[0]
tRange = len(BOI_list)
wRange = len(wordList)
viterbi = [[0 for x in range(300)] for x in range(300)]
backpointer = [['' for x in range(300)] for x in range(300)]
#intialization
for t in range(tRange):#t = 0,1,2
probability = maxent_classifier.prob_classify(MEMM_features(w1,t1, "start" ))
posterior = float(probability.prob(BOI_list[t]))
#print ("boi: " + BOI_list[t] + ' posterior (start)' + str(posterior))
#score transition 0(start) -> q given w1
viterbi[t][1] = posterior
backpointer[t][1] = 0 #stand for q0 (start point)
#for word w from 2 to T
maxViterbi = 0
maxPreviousState = 0
maxPreTerminalProb = 0
for w in range (1, wRange):
for t in range (tRange):
word = wordList[w]
tag = tagList[w]
probability = maxent_classifier.prob_classify(MEMM_features(word,tag,BOI_list[0] ))
posterior = float(probability.prob(BOI_list[t]))
maxViterbi = float(viterbi[0][w]) * posterior
maxPreviousState = 0
for i in range (1, tRange):
word = wordList[w]
tag = tagList[w]
probability = maxent_classifier.prob_classify(MEMM_features(word,tag,BOI_list[i] ))
posterior = float(probability.prob(BOI_list[t]))
if float(viterbi[i][w]) * posterior > maxViterbi:
maxViterbi = float(viterbi[i][w]) * posterior
maxPreviousState = i #content BOI_List[i]
viterbi[t][w+1] = maxViterbi
backpointer[t][w+1] = BOI_list[maxPreviousState] #points to the matrix x axis (max previous)
maxViterbi = 0
maxPreviousState = 0
maxPreTerminalProb = 0
#termination step
#viterbi[qF, T] = max (viterbi[s,T] *as,qF)
maxPreTerminalProb = float(viterbi[0][wRange] )* float(dicE[BOI_list[0]]["END"])
maxPreviousState = 0
for i in range (1, tRange):
if float(viterbi[i][wRange]) * float(dicE[BOI_list[i]]["END"]) > maxPreTerminalProb:
maxPreTerminalProb = float(viterbi[i][wRange]) * float(dicE[BOI_list[i]]["END"])
maxPreviousState = i
viterbi[tRange][wRange+1] = maxPreTerminalProb
#store the state that returns the maxPreTerminalProbability
backpointer[tRange][wRange+1] = BOI_list[maxPreviousState]
#return POS tag path
pathReverse = [BOI_list[maxPreviousState]]
maxPreviousTag = BOI_list[maxPreviousState]
i = 0
while i < (wRange -1):
pathReverse.append(backpointer[BOI_list.index(maxPreviousTag)][wRange - i])
maxPreviousTag = backpointer[BOI_list.index(maxPreviousTag)][wRange - i]
i = i + 1
#reverse the path to make it correct
index = len(pathReverse)
path = []
while index >= 1 :
path.append(pathReverse[index - 1])
index = index -1
return path
#*******************************************************************MaxEnt+Viterbi = MEMM
#main()
wordList = [] #store words in a sentence
tagList = [] #store part-of-speech tag in a sentence
boiList = [] #store boi tags in a sentence
#prob_table = {} #stpre the posterior
previous_BOI = "start"
BOI_list = ['B-NP', 'I-NP', 'O']
input_file = testing_file
for line in input_file:
if line.strip() != '': #if not empty do following
sentenceList = line.split()
word = sentenceList[0]
print (word)
tag = sentenceList[1]
boi = sentenceList[2]
wordList.append(word)
tagList.append(tag)
boiList.append(boi)
#store words that are begining of the sentence
#store tags that are the begining of the sentence
#store the end of sentence tag in tagEndList
if change_of_sentence_flag == 1:
wordStartList.append(word)
change_of_sentence_flag = 0
#{'B-NP':{'December':0.45}}
s = re.match(r'^\s*$', line) #find empty line
if s:
#print (wordList)
change_of_sentence_flag = 1
previous_BOI = "start"
path = MEMM(wordList, tagList) #list of BOI_tags returned by HMM function call
for i in range(len(wordList)): #part_of_speech_tag(tagList) and token_list(wordList) has the same length
output_file.write(wordList[i]+" "+ tagList[i]+ " " + boiList[i] + " " + path[i] + "\n")
output_file.write("\n")
wordList = [] # refresh word list
tagList = []
boiList = []
#prob_table = {}#refresh prob_table
input_file.close()
output_file.close()