-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathseg.py
executable file
·102 lines (82 loc) · 2.69 KB
/
seg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python3
# Copyright (C) 2016.
# Author: Jesús Manuel Mager Hois
# e-mail: <[email protected]>
# Project website: http://turing.iimas.unam.mx/wix/
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
import codecs
from .wmorph import Verb
class Word:
    """Lookup table of known words loaded from the file ``data/dic``.

    Every non-blank line of the file is split on whitespace.  The first
    token is used as the key and the complete token list of that line is
    stored as the value in ``self.dic``.

    ``self.symbols`` is a string of punctuation characters that are
    treated as known even though they are not in the dictionary file.
    """

    def __init__(self):
        """Read ``data/dic`` (UTF-8, relative to the working directory).

        Raises whatever ``codecs.open`` raises when the file is missing
        or unreadable.
        """
        self.dic = {}
        self.symbols = '!¡"¿?,.'
        # The context manager guarantees the handle is closed, also when
        # decoding a line fails half-way through the file.
        with codecs.open("data/dic", mode="r", encoding="utf-8") as F:
            for line in F:
                fields = line.split()
                # Blank / whitespace-only lines produce no tokens.
                if fields:
                    self.dic[fields[0]] = fields

    def checkdic(self, word):
        """Tell whether *word* is a dictionary entry or punctuation.

        Returns a ``(word, flag)`` tuple where ``flag`` is ``1`` when
        *word* is a key of ``self.dic`` or occurs in ``self.symbols``,
        and ``0`` otherwise.

        The symbol test is a substring test on ``self.symbols``, so a
        run of adjacent symbols (e.g. ``"?,"``) is accepted as well.
        """
        if word in self.dic or word in self.symbols:
            return (word, 1)
        return (word, 0)
def segment(line, joinm=1, s=0):
    """Segment each whitespace-separated token of *line*.

    A token that ``Word.checkdic`` reports as known (dictionary entry or
    punctuation) is copied to the output unchanged.  For any other token
    the shortest entry of ``Verb(token).paths`` is expanded, one output
    item per element of that path.  When there are no paths, or the
    shortest one is empty, the token itself is kept.

    Args:
        line: text to segment.
        joinm: when truthy (default) the result is a single string with
            the items joined by spaces; otherwise the list of items.
        s: when falsy (default) every path element ``affix`` contributes
            ``str(affix[1]) + str(affix[0])``; when truthy only
            ``str(affix[1])``.  (Presumably index 1 is the surface form
            and index 0 a tag/marker -- confirm against ``Verb``.)

    Returns:
        ``str`` if *joinm* is truthy, else ``list`` of ``str``.

    Side effects:
        Builds a new ``Word()`` on every call and prints a ``VERB:``
        line to stdout for every token that is not known to it.
    """
    debug = 0
    # Word() is the only reader of the dictionary file; no separate
    # handle needs to be opened (and left dangling) here.
    w = Word()
    words = []
    for token in line.split():
        # Verb is built before the dictionary check, as in the original,
        # in case its construction has effects callers rely on.
        word = Verb(token, debug=debug)
        typ = w.checkdic(token)
        if debug:
            print(typ)
        if typ[1] == 1:
            if debug:
                print("NOT VERB:", token)
            words.append(token)
            continue

        # NOTE(review): this print is not guarded by ``debug`` unlike the
        # others; kept as-is so stdout output stays backward-compatible.
        print("VERB:", token, str(word.paths))

        # Shortest analysis wins; on ties the first one is taken.
        chpath = min(word.paths, key=len, default=None)
        if not chpath:
            words.append(token)
            continue
        for affix in chpath:
            if not s:
                words.append(str(affix[1]) + str(affix[0]))
            else:
                words.append(str(affix[1]))
    if joinm:
        return " ".join(words)
    return words
def segtext(text, s=0):
    """Segment a multi-line text line by line.

    *text* is split on ``"\\n"``, every piece is passed to ``segment``
    (forwarding *s*), and the results are joined back with ``"\\n"``.
    """
    segmented = [segment(row, s=s) for row in text.split("\n")]
    return "\n".join(segmented)
if __name__ == "__main__":
    # Script entry point: segment one sample token and show the result.
    print(segment("nanait+a"))