-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtagger.py
97 lines (73 loc) · 3.58 KB
/
tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
source: https://github.com/rickardlofberg/HMM-PoS-Tagger
"""
"""This code consists of a tagger class which takes the dictionaries from the probCalc.py probabilities function.
After the object has been created, the tagSentence() method takes a list of tokens as its parameter and returns a list of the same length
containing (token, tag) tuples, where tag is the suggested tag in upper case."""
class Tagger():
    """Hidden Markov Model PoS tagger with trigram -> bigram -> unigram back-off.

    The tagger is built from four probability dictionaries and then used
    through tagSentence(), which searches for the most probable tag sequence
    for a list of tokens.
    """

    def __init__(self, uniProb, biProb, triProb, wordProb):
        """Used for HMM tagging of sentences.

        uniProb  -- dict mapping a tag to its unigram probability
        biProb   -- dict mapping 'prevTag tag' (space separated) to a probability
        triProb  -- dict mapping 'prevPrevTag prevTag tag' to a probability
        wordProb -- dict mapping a word to a dict of {tag: emission probability}
        """
        self.uniProb = uniProb
        self.biProb = biProb
        self.triProb = triProb
        self.wordProb = wordProb
        self.endTag = {'end': 1.0}  # Filler end tag

        # Dictionary for when word not found: every known tag is equally likely.
        # 1.0 (not 1) keeps this a true division on every Python version, and
        # the division only happens per tag, so an empty uniProb is harmless.
        tagCount = len(self.uniProb)
        self.posTags = {tag: 1.0 / tagCount for tag in self.uniProb}

    def _transitionProb(self, prevTags, tag):
        """Return the probability of moving to tag, backing off trigram -> bigram -> unigram.

        prevTags is the list of tags chosen so far (it always holds at least the
        two 'start' fillers). Raises KeyError if neither n-gram is known and
        tag is missing from uniProb.
        """
        trigram = ' '.join(prevTags[-2:] + [tag])
        if trigram in self.triProb:
            return self.triProb[trigram]
        bigram = ' '.join(prevTags[-1:] + [tag])
        if bigram in self.biProb:
            return self.biProb[bigram]
        return self.uniProb[tag]

    def tagSentence(self, sentToTag):
        """Takes a sentence (sequenced list) and returns a list of same length with PoS-tags.

        Each element of the returned list is a (token, TAG) tuple, where TAG is
        the chosen tag in upper case. An empty list gives an empty list back.
        If sentToTag is not a list, the string
        "Sentence not submitted in the correct format" is returned instead.

        Raises KeyError when a tag has no matching trigram or bigram and is also
        missing from uniProb.
        """
        # Make sure it's a sentence
        if not isinstance(sentToTag, list):
            return "Sentence not submitted in the correct format"
        # Nothing to tag; also avoids needing any transition into the end filler
        if not sentToTag:
            return []

        # Add extra value to list to be able to use end tag
        sentence = sentToTag + [None]

        # Each entry is (score, tags so far); every path begins with two
        # 'start' fillers so a trigram can be formed for the first word.
        currentPaths = [(1, ['start', 'start'])]

        for word in sentence:
            if word is None:
                # Filler ending
                possibleTags = self.endTag
            else:
                # If there are tags for the word use them, otherwise check against all tags
                possibleTags = self.wordProb.get(word, self.posTags)

            newPaths = []
            for tag, emission in possibleTags.items():
                # Go from all possible previous paths to current tag (node)
                candidates = [
                    (score * emission * self._transitionProb(tags, tag), tags + [tag])
                    for score, tags in currentPaths
                ]
                # Keep the best path into this node; equal scores are decided
                # by comparing the tag lists themselves.
                newPaths.append(max(candidates))

            # Rescale so the best path has score 1.0. Only the ranking of the
            # paths matters, and without this the products underflow to 0.0 on
            # long sentences, which makes every path look equally (im)probable.
            best = max(score for score, _ in newPaths) if newPaths else 0
            if best > 0:
                newPaths = [(score / best, tags) for score, tags in newPaths]

            # Update current paths
            currentPaths = newPaths

        # Best path without the two starting tags and the end filler
        bestTags = max(currentPaths)[1][2:-1]
        return [(token, tag.upper()) for token, tag in zip(sentToTag, bestTags)]
# Import-only module: executing it directly deliberately does nothing.
if __name__ == '__main__':
    """This is here by intention, this code is not meant to be run as main."""