-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtestImport.py
75 lines (58 loc) · 2.38 KB
/
testImport.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from time import clock, time
import sys
import os
import nltk
import wikipydia
from optArgs import optParse, options, arguments
from wpTextExtractor import wiki2sentences
# Supported languages as [short code, long name] pairs.
# The long name selects the NLTK punkt tokenizer pickle in main(); the short
# code is what main() passes to wikipydia as the wiki language.
# NOTE(review): 'cz' and 'sw' are not the usual ISO 639-1 / Wikipedia codes
# for Czech ('cs') and Swedish ('sv'); they are left unchanged here because
# they are accepted command-line values -- confirm before changing.
languages = [
    ['en', 'english'],
    ['cz', 'czech'],
    ['da', 'danish'],
    ['nl', 'dutch'],
    ['et', 'estonian'],
    ['fi', 'finnish'],
    ['fr', 'french'],
    ['de', 'german'],
    ['el', 'greek'],
    ['it', 'italian'],
    ['no', 'norwegian'],
    ['pt', 'portuguese'],
    ['sl', 'slovene'],
    ['es', 'spanish'],
    ['sw', 'swedish'],
    ['tr', 'turkish'],
]
def lang2long(lang):
    """Return the long name (e.g. 'english') of the language *lang*.

    *lang* may be given either as the short code or as the long name of an
    entry in ``languages``.  Returns None when no entry matches.
    """
    for short_code, long_name in languages:
        if lang == short_code or lang == long_name:
            return long_name
    return None
def lang2short(lang):
    """Return the short code (e.g. 'en') of the language *lang*.

    *lang* may be given either as the short code or as the long name of an
    entry in ``languages``.  Returns None when no entry matches.
    """
    for short_code, long_name in languages:
        if lang == short_code or lang == long_name:
            return short_code
    return None
def main():
optParse(
trace__T=None,
language__L='|'.join(l for p in languages for l in p),
fromDump__D='',
showType__S=None,
withTags__W=None
)
sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize
if options.fromDump:
if options.fromDump.endswith('.gz'):
source = os.popen('zcat %s' % options.fromDump)
else:
source = open(options.fromDump)
currentLines = []
for line in source:
line = line.strip()
if line.startswith('<title>'):
print line
elif line.startswith('<text'):
currentLines.append(line.split('>',1)[1])
elif currentLines:
if line.endswith('</text>'):
currentLines.append(line.rsplit('<',1)[0])
print '\n'.join(wiki2sentences('\n'.join(currentLines),
sent_detector,False))
currentLines = []
else:
currentLines.append(line)
else:
for title in arguments:
if title == 'Barack Obama' and options.language=='en':
text = open('obama.src').read().decode('utf-8')
else:
text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
if options.withTags:
for s,t in zip(*wiki2sentences(text,sent_detector,True)):
print t[:4],s.encode('utf-8')
else:
print '\n'.join(wiki2sentences(text,sent_detector,False)).encode('utf-8')
if __name__ == "__main__":
    # Take clock() and time() readings before the run; the elapsed values are
    # reported on stderr even when main() raises.
    clock_start = clock()
    time_start = time()
    try:
        main()
    finally:
        clock_used = clock() - clock_start
        time_used = time() - time_start
        sys.stderr.write("%.3f/%.3f seconds overall" % (clock_used, time_used) + "\n")