-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtext.py
187 lines (156 loc) · 6.49 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import regex
from structure import Singleton
import logging
import util
class TextHandler(object):
    """Text helpers that score a piece of content by its stop words.

    The instance holds the compiled patterns used for cleaning text plus two
    thresholds, ``cutoff`` and ``minboostable``, exposed through
    ``getcutoff()`` and ``getminboostable()``.
    """

    def __init__(self):
        super(TextHandler, self).__init__()
        # Matches every character that is NOT a letter (Ll/Lu/Lt/Lo), a
        # decimal digit (Nd), connector punctuation (Pc) or whitespace.
        self.punctuation = regex.compile(r'[^\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}\p{Pc}\s]')
        self.spacesplit = regex.compile(r'\s+')
        # Stop-word source object; the words themselves come from getall().
        self.swlist = StopWordSource.instance()
        self.tabnspaces = regex.compile(r"(\t|^\s+|\s+$|\s{2,})")
        self.linebreaks = regex.compile(r"\n")
        self.cutoff = 2
        self.minboostable = 5
        # Lazily built set of stop words, see _stopwords().
        self._stopwordset = None

    def removepunctuation(self, text):
        """Return ``text`` with every character matched by ``punctuation`` deleted."""
        return self.punctuation.sub('', text)

    def removetabnspace(self, text):
        """Replace each tab, leading/trailing whitespace run, or run of two or
        more whitespace characters with a single space."""
        return self.tabnspaces.sub(' ', text)

    def widenlinebreak(self, text):
        """Return ``text`` with every newline removed.

        Despite the name, nothing is inserted in place of the newline.
        """
        return self.linebreaks.sub('', text)

    def splittext(self, text):
        """Split ``text`` on whitespace runs.

        Leading or trailing whitespace produces empty strings at the ends of
        the returned list.
        """
        return self.spacesplit.split(text)

    def _words(self, text):
        """Return the non-empty tokens of ``text`` after punctuation removal."""
        stripped = self.removepunctuation(text)
        # Splitting on whitespace yields '' for leading/trailing whitespace
        # and for empty text; those are not words and must not be counted.
        return [token for token in self.splittext(stripped) if token]

    def _stopwords(self):
        """Return the stop words as a set, building it on first use."""
        cached = getattr(self, '_stopwordset', None)
        if cached is None:
            cached = frozenset(StopWordSource.instance().getall())
            self._stopwordset = cached
        return cached

    def getstopwordscount(self, content):
        """Count the words and stop words in ``content``.

        Returns a ``WordStat``. For ``None`` content the freshly created,
        untouched ``WordStat`` is returned. Otherwise ``wordcount`` is the
        number of non-empty tokens, ``stopwords`` is the list of lower-cased
        tokens found in the stop-word source (duplicates kept, input order),
        and ``stopwordcount`` is the length of that list.
        """
        ws = WordStat()
        if content is None:
            return ws
        words = self._words(content)
        known = self._stopwords()
        found = [lowered for lowered in (word.lower() for word in words)
                 if lowered in known]
        ws.wordcount = len(words)
        ws.stopwordcount = len(found)
        ws.stopwords = found
        return ws

    def gettextscore(self, content):
        """Return the score of ``content``; here, the ``WordStat`` produced by
        ``getstopwordscount``."""
        return self.getstopwordscount(content)

    def getcutoff(self):
        """Return the ``cutoff`` threshold."""
        return self.cutoff

    def getminboostable(self):
        """Return the ``minboostable`` threshold."""
        return self.minboostable
class LengthbsdTextHandler(TextHandler):
    """A ``TextHandler`` whose text score is a plain word count.

    Uses higher thresholds than the base class: ``cutoff`` is 25 and
    ``minboostable`` is 50.
    """

    def __init__(self):
        super(LengthbsdTextHandler, self).__init__()
        self.cutoff = 25
        self.minboostable = 50

    def gettextscore(self, content):
        """Return the number of words in ``content`` (see ``getwordscount``)."""
        return self.getwordscount(content)

    def getwordscount(self, content):
        """Return how many words remain in ``content`` once punctuation is removed.

        ``None`` content counts as 0 words, in line with the base class's
        handling of ``None`` in ``getstopwordscount``.
        """
        if content is None:
            return 0
        stripped = self.removepunctuation(content)
        # Whitespace splitting yields '' for leading/trailing whitespace and
        # for empty text; skip those so only real tokens are counted.
        return sum(1 for token in self.splittext(stripped) if token)
class WordStat(object):
    """Word statistics for one piece of text.

    Attributes start out as: ``wordcount`` 0, ``stopwordcount`` 0 and
    ``stopwords`` None.
    """

    def __init__(self):
        # The stop-word list starts as None rather than an empty list.
        self.stopwords = None
        self.wordcount = self.stopwordcount = 0
@Singleton
class StopWordSource(object):
    """Holds the stop words read from ``stopwords-en.txt``.

    The file is read on the first ``getall()`` call and the result is kept
    in ``swbuffer`` for later calls.
    """

    def __init__(self):
        # NOTE(review): relative path, so it is resolved against the current
        # working directory -- confirm this is what callers expect.
        self.infile = "stopwords-en.txt"
        self.swbuffer = None

    def getall(self):
        """Return the list of stop words, one per line of the input file.

        Each line is stripped of surrounding whitespace. If the file cannot
        be opened the error propagates and nothing is cached, so a later
        call tries again instead of returning an empty list.
        """
        if self.swbuffer is None:
            # The context manager closes the file even if reading fails.
            with open(self.infile) as source:
                words = [line.strip() for line in source]
            # Assign only after a successful read so a failure is not cached.
            self.swbuffer = words
            logging.info("Source contains %d stop words " % len(self.swbuffer))
        return self.swbuffer
class Formatter(object):
    """Format the resulting article text.

    The text handler is obtained by calling ``config.texthandler()``; the
    names of non-block tags are read from ``config.nonblktags``.
    The node API used here (``cssselect``, ``iterdescendants``,
    ``getparent``) is assumed to be lxml's.
    """

    def __init__(self, config):
        self.config = config
        self.texthandler = config.texthandler()

    def getformattedtext(self, topnode):
        """Clean ``topnode`` in place and return its text.

        Runs, in order: removal of low-scored nodes, conversion of links to
        text, conversion of non-block tags to text, removal of tags with too
        little text, and finally ``totext``. Returns whatever ``totext``
        returns (a string, or None when nothing is left).
        """
        self.remove_negscorenodes(topnode)
        self.linkstotext(topnode)
        self.tagstotext(topnode)
        self.removetagswithfewwords(topnode)
        return self.totext(topnode)

    @staticmethod
    def _detach(item):
        """Remove ``item`` from its parent; do nothing if it has no parent."""
        parent = item.getparent()
        if parent is not None:
            # NOTE(review): lxml's remove() discards the element's tail text
            # too -- confirm that losing it is acceptable here.
            parent.remove(item)

    @staticmethod
    def _hasmedia(item):
        """Return True when ``item`` has an <object> or an <embed> descendant."""
        for tag in ('object', 'embed'):
            if next(item.iterdescendants(tag), None) is not None:
                return True
        return False

    def _scoreof(self, text):
        """Return the handler's score for ``text`` as a number.

        ``TextHandler.gettextscore`` returns a ``WordStat`` while
        ``LengthbsdTextHandler.gettextscore`` returns an int; for the former
        the stop-word count is the value compared against the cutoff.
        """
        score = self.texthandler.gettextscore(text)
        return getattr(score, 'stopwordcount', score)

    def remove_negscorenodes(self, topnode):
        """Remove every node whose ``score`` attribute is below 1.

        An empty ``score`` value counts as 0. A matching node without a
        parent is left in place.
        """
        for item in topnode.cssselect("*[score]"):
            raw = item.get('score')
            score = float(raw) if raw else 0
            if score < 1:
                self._detach(item)

    def linkstotext(self, topnode):
        """Replace every <a> descendant that contains no <img> with text."""
        # Snapshot first: replacing nodes while a live iterator walks the
        # tree can make it skip elements.
        for link in list(topnode.iterdescendants('a')):
            if next(link.iterdescendants('img'), None) is None:
                util.replacewithtext(link)

    def tagstotext(self, topnode):
        """Replace common non-block tags (<b>, <i>, <strong>, ...) with text.

        <a> is skipped here because ``linkstotext`` deals with it.
        """
        for tag in self.config.nonblktags:
            if tag == "a":
                continue
            # Snapshot for the same reason as in linkstotext.
            for item in list(topnode.iterdescendants(tag)):
                util.replacewithtext(item)

    def removetagswithfewwords(self, topnode):
        """Remove descendants whose text scores below the handler's cutoff.

        Tags with less text than the threshold could be noise. A node is
        kept when it has no text (``None``), or when it contains an
        <object> or <embed> descendant.
        """
        cutoff = self.texthandler.getcutoff()
        # Snapshot the descendants: nodes are removed inside the loop.
        for item in list(topnode.iterdescendants()):
            itemtext = util.getinnertext(item, True)
            if itemtext is None or self._scoreof(itemtext) >= cutoff:
                continue
            if self._hasmedia(item):
                continue
            logging.debug("remove fewwordpara %s: %s ", item.tag, item.text)
            self._detach(item)

    def totext(self, topnode):
        """Join the inner text of each direct child with blank lines.

        Children whose text is empty or None are skipped. Returns None when
        no child contributes any text.
        """
        buff = []
        for child in topnode.iterchildren():
            logging.debug("\n ==== tag = %s id = %s class = %s ",
                          child.tag, child.get('id'), child.get('class'))
            content = util.getinnertext(child, True)
            logging.debug(content)
            if content:
                buff.append(content)
        if not buff:
            return None
        return "\n\n".join(buff)
class LengthbsdFormatter(Formatter):
    """A Formatter that evaluates text using word counts.

    Relies on the text handler providing ``getwordscount`` and
    ``getcutoff`` (``LengthbsdTextHandler`` provides both, with a cutoff
    of 25).
    """

    def __init__(self, config):
        super(LengthbsdFormatter, self).__init__(config)

    def removetagswithfewwords(self, topnode):
        """Remove descendants whose word count is below the handler's cutoff.

        A node is kept when it has no text (``None``), or when it contains
        an <object> or <embed> descendant.
        """
        cutoff = self.texthandler.getcutoff()
        # Snapshot the descendants: nodes are removed inside the loop, and
        # mutating the tree under a live iterator can skip elements.
        for item in list(topnode.iterdescendants()):
            itemtext = util.getinnertext(item, True)
            if itemtext is None:
                continue
            if self.texthandler.getwordscount(itemtext) >= cutoff:
                continue
            # Either kind of embedded media is enough to keep the node.
            hasobject = next(item.iterdescendants('object'), None) is not None
            hasembed = next(item.iterdescendants('embed'), None) is not None
            if hasobject or hasembed:
                continue
            parent = item.getparent()
            if parent is not None:
                parent.remove(item)