-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMyNgram.py
88 lines (68 loc) · 3.37 KB
/
MyNgram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# This file overrides NLTK's NgramModel class.
from itertools import chain
from math import log
from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
SimpleGoodTuringProbDist)
from nltk.util import ingrams
from nltk.model.api import ModelI
from nltk.model import NgramModel
def _estimator(fdist, bins):
    """
    Build the default probability distribution for one context.

    Wraps ``fdist`` in a SimpleGoodTuringProbDist and returns it.
    ``bins`` is accepted so the function matches the estimator calling
    convention, but it is not used.
    """
    # Kept at module level on purpose: it can't be an instance method of
    # NgramModel, because instance methods can't be pickled.
    smoothed = SimpleGoodTuringProbDist(fdist)
    return smoothed
class MyNgramModel(NgramModel):
    """
    A processing interface for assigning a probability to the next word.

    Subclass of NgramModel that, while training, precomputes a backoff
    weight for every context seen in the training data.  The weights are
    kept in ``self._backoff_alphas`` and returned by ``_alpha``; the
    ``_beta`` helper is not used by this class.
    """

    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Train an order-``n`` model and, recursively, its lower-order
        backoff models.

        :param n: order of the model (length of the n-grams counted).
        :param train: training data; either a list of sentences (each a
            sequence of tokens) or a single list of strings, which is
            wrapped into a one-sentence list.
        :param pad_left: if true, each sentence is prefixed with ``n - 1``
            empty-string tokens.  Must be a bool.
        :param pad_right: if true, each sentence is suffixed with ``n - 1``
            empty-string tokens.  Must be a bool.
        :param estimator: callable that turns a frequency distribution into
            a probability distribution; defaults to the module-level
            ``_estimator``.
        :param estimator_args: extra positional arguments for the estimator.
            When neither these nor ``estimator_kwargs`` are given, the
            number of observed contexts is passed instead.
        :param estimator_kwargs: extra keyword arguments for the estimator.
        :raises AssertionError: if ``pad_left``/``pad_right`` are not bools,
            or if the probability mass summed for a context falls outside
            ``(0, 1]``.
        """
        # NOTE(review): every attribute assigned below overwrites whatever
        # the parent constructor stored under the same name, so the parent's
        # training pass looks redundant -- confirm against NgramModel before
        # removing this call.
        super(MyNgramModel, self).__init__(n, train, pad_left, pad_right,
                                           estimator, *estimator_args,
                                           **estimator_kwargs)

        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        self._cfd = ConditionalFreqDist()
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create
        # the enclosing list.
        if (train is not None) and isinstance(train[0], basestring):
            train = [train]

        # Count, for every context (the first n-1 tokens of an n-gram),
        # how often each final token follows it.
        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                self._cfd[context].inc(token)

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(self._cfd, estimator,
                                              len(self._cfd))
        else:
            self._model = ConditionalProbDist(self._cfd, estimator,
                                              *estimator_args,
                                              **estimator_kwargs)

        self._backoff = None
        # Always present, so _alpha() also works on an order-1 model
        # (previously the attribute was missing there and _alpha raised
        # AttributeError).
        self._backoff_alphas = dict()

        if n > 1:
            # Recursively construct the lower-order models.
            self._backoff = MyNgramModel(n - 1, train, pad_left, pad_right,
                                         estimator, *estimator_args,
                                         **estimator_kwargs)

            # For each condition (or context) compute its backoff weight.
            for ctxt in self._cfd.conditions():
                pd = self._model[ctxt]  # prob dist for this context
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0
                total_observed_pr = 0

                # Only the words that were OBSERVED after this context.
                for word in self._cfd[ctxt].keys():
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                    total_observed_pr += pd.prob(word)

                assert total_observed_pr <= 1 and total_observed_pr > 0
                assert backoff_total_pr <= 1 and backoff_total_pr > 0

                # Mass the lower-order model assigns to words NOT observed
                # after this context.
                backoff_leftover = 1.0 - backoff_total_pr
                if backoff_leftover > 0:
                    alpha_ctxt = (1.0 - total_observed_pr) / backoff_leftover
                else:
                    # The lower-order model spends all of its mass on words
                    # already observed here, so the ratio is undefined;
                    # use 0.0 instead of raising ZeroDivisionError.
                    alpha_ctxt = 0.0

                self._backoff_alphas[ctxt] = alpha_ctxt

    # Updated _alpha function, discarded the _beta function
    def _alpha(self, tokens):
        """
        Return the precomputed backoff weight for the context ``tokens``.

        Contexts that were not seen during training get a weight of 1.
        """
        return self._backoff_alphas.get(tokens, 1)