-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpos_tagger.py
48 lines (38 loc) · 1.89 KB
/
pos_tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/25 16:14
# @Author : Ting
# 定义 词性标注模型
from hmm import *
from nltk import ConditionalFreqDist, ConditionalProbDist, MLEProbDist
from ngram import ngram
class Tagger(HMM):
    """Part-of-speech tagger expressed as a hidden Markov model.

    The three HMM components are estimated from a tagged corpus:

    1. transition - an n-gram model over tag sequences,
    2. emission - maximum-likelihood estimates of P(word | tag),
    3. initial distribution - all mass on the ``'START'`` state.
    """

    def __init__(self, corpus, n):
        """Estimate the tagger's parameters from ``corpus``.

        Args:
            corpus: tagged sentences used for training, in the form
                ``[[('Hello', 'NNP'), ('world', 'NN'), ('!', '.')], [...], ...]``.
            n: the ``n`` of the n-gram model used for tag transitions.
        """
        # Flatten the corpus into (tag, word) pairs, wrapping every sentence
        # in one START and one END sentinel pair.
        tag_word_pairs = []
        for sent in corpus:
            tag_word_pairs.append(('START', 'START'))
            # Tags are truncated to their first two characters here.
            tag_word_pairs.extend([(tag[:2], word) for word, tag in sent])
            tag_word_pairs.append(('END', 'END'))

        # Emission: count words per tag, then turn the counts into
        # maximum-likelihood conditional probabilities P(word | tag).
        cfd_tagwords = ConditionalFreqDist(tag_word_pairs)
        cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)
        emission = {
            tag: {
                word: cpd_tagwords[tag].prob(word)
                for word in cfd_tagwords[tag]
            }
            for tag in cpd_tagwords
        }

        # Transition: an n-gram model trained on the per-sentence tag lists.
        # NOTE(review): these are the full, untruncated tags, whereas the
        # emission table above is keyed by tag[:2]. For corpora whose tags
        # are longer than two characters the two state sets differ -
        # confirm against HMM/Transition whether this is intended.
        tags = [[tag for _, tag in sent] for sent in corpus]
        transition = Transition(ngram(tags, n))

        # Initial distribution: every sequence starts in START with
        # probability 1.
        initial_distribution = {('START',): 1.0}

        # Hand the estimated components over to the HMM base class.
        HMM.__init__(self, initial_distribution, transition, emission, n)

    def tag(self, sentence):
        """Tag ``sentence`` by running Viterbi decoding over it.

        Args:
            sentence: iterable of word tokens to tag.

        Returns:
            Whatever ``self.viterbi`` returns for the padded token list.
        """
        # Pad the tokens with START/END sentinels on both sides.
        # self.length is presumably the n-gram order stored by HMM.__init__
        # - TODO confirm.
        padding = self.length - 1
        # The original built the padding only and dropped the tokens, so
        # Viterbi never saw the input; keep the tokens between the sentinels.
        padded = ['START'] * padding + list(sentence) + ['END'] * padding
        return self.viterbi(padded)