diff --git a/testImport.py b/testImport.py
index 5d67d9d..846c0ca 100644
--- a/testImport.py
+++ b/testImport.py
@@ -1,13 +1,75 @@
-
-from wpTextExtractor import wiki2sentences
-import wikipydia
+from time import clock, time
+import sys
+import os
import nltk
+import wikipydia
+from optArgs import optParse, options, arguments
+from wpTextExtractor import wiki2sentences
+
+
+
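+# pairs of (short, long) language codes: the short code picks the Wikipedia
+# subdomain, the long name picks the matching NLTK punkt model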
+languages = [p.split(':') for p in
+             '''en:english cs:czech da:danish nl:dutch et:estonian
+                fi:finnish fr:french de:german el:greek it:italian
+                no:norwegian pt:portuguese sl:slovene es:spanish
+                sv:swedish tr:turkish'''.split()]
+
+def lang2long(lang):
+ for p in languages:
+ if lang in p: return p[1]
+
+def lang2short(lang):
+ for p in languages:
+ if lang in p: return p[0]
+
+
+def main():
+ optParse(
+ trace__T=None,
+ language__L='|'.join(l for p in languages for l in p),
+ fromDump__D='',
+ showType__S=None,
+ withTags__W=None
+ )
+
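+    # load the punkt sentence splitter for the requested language and keep
+    # only its tokenize method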
+ sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize
+
+
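+    # --fromDump: stream a (possibly gzipped) MediaWiki XML dump and pull
+    # out the wiki markup between <text ...> and </text>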
+ if options.fromDump:
+ if options.fromDump.endswith('.gz'):
+ source = os.popen('zcat %s' % options.fromDump)
+ else:
+ source = open(options.fromDump)
+ currentLines = []
+ for line in source:
+ line = line.strip()
+            if line.startswith('<title>'):
+                print line
+            elif line.startswith('<text'):
+                currentLines.append(line.split('>',1)[1])
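+            # inside an article body: accumulate lines until the closing </text>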
+ elif currentLines:
+                if line.endswith('</text>'):
+ currentLines.append(line.rsplit('<',1)[0])
+ print '\n'.join(wiki2sentences('\n'.join(currentLines),
+ sent_detector,False))
+ currentLines = []
+ else:
+ currentLines.append(line)
+
+
+ else:
+ for title in arguments:
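+            # serve this one article from the local file obama.src instead
+            # of querying Wikipedia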
+ if title == 'Barack Obama' and options.language=='en':
+ text = open('obama.src').read().decode('utf-8')
+ else:
+ text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
+ if options.withTags:
+ for s,t in zip(*wiki2sentences(text,sent_detector,True)):
+ print t[:4],s.encode('utf-8')
+ else:
+ print '\n'.join(wiki2sentences(text,sent_detector,False)).encode('utf-8')
+
-sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
-t=wikipydia.query_text_raw('Barack Obama')['text']
-sents= wiki2sentences(t,sent_detector)
-for s,t in zip(*sents):
- print t,s
+if __name__ == "__main__":
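+    # report CPU and wall-clock time on stderr, even if main() raises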
+ tc,tt=clock(),time()
+ try: main()
+ finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)
diff --git a/wpTextExtractor.py b/wpTextExtractor.py
index d907e25..fe5f56a 100755
--- a/wpTextExtractor.py
+++ b/wpTextExtractor.py
@@ -1,30 +1,21 @@
-#! /home/cl-home/eisele/lns-root-07/bin/python2.6
+#! /usr/bin/env python2.6
# -*- coding: utf-8 -*-
__author__ = 'Andreas Eisele '
__created__ = "Tue Jan 26 21:41:40 2010"
-__id__ = '$Id: trymwlib.py 53 2010-01-27 13:53:08Z anei00 $'.strip("$")
+'''
+extract clear text from wikipedia articles
+'''
-# purpose of file:
-# extract clear text from wikipedia articles
-
-from time import clock, time
-import sys
-import os
-import mwlib
-import nltk
-import wikipydia
import re
-from optArgs import optParse, options, arguments
import mwlib
from mwlib.refine.compat import parse_txt
from mwlib.refine import core
from mwlib.parser import nodes
-
# map all node types to the empty string
nodeTypes = [getattr(nodes,d) for d in dir(nodes)]
nodeTypes = [x for x in nodeTypes if type(x)==type]
@@ -33,17 +24,45 @@
 node2markup[nodes.Section]='<s>'
 node2markup[nodes.Item]='<i>'
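+
+# wiki2sentences turns raw wiki markup into a list of sentences; with
+# withTags=True it also returns a parallel list of tags per sentence
+# ('Section', 'Item', 'Sentence', 'LastSentence')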
+def wiki2sentences(wiki, sent_detector,withTags=True):
+ # get rid of (nested) template calls
+ oldLen = 1E10
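+    # strip innermost {{...}} templates repeatedly until the text stops
+    # shrinking, removing nested templates inside-out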
+    while len(wiki) < oldLen:
+        oldLen = len(wiki)
+        wiki = re.sub('{{[^{}]*}}','',wiki)
+
+    tree = parse_txt(wiki)
+    text = tree2string(tree)
+
+    sentences = []
+    tags = []
+    for line in text.split('\n'):
+        line = line.strip()
+        if line.startswith('<s>'):
+ sentences.append(line[3:].strip())
+ tags.append('Section')
+        elif line.startswith('<i>'):
+ sentences.append(line[3:].strip())
+ tags.append('Item')
+ else:
+ newSentences = sent_detector(line.strip())
+ sentences += newSentences
+ tags += ['Sentence']*(len(newSentences)-1)
+ tags.append('LastSentence')
+ if withTags:
+ return sentences,tags
+ else:
+ return sentences
-def tree2string(tree):
+def tree2string(tree,trace=False):
snippets = []
- _tree2string(tree,snippets)
+ _tree2string(tree,snippets,trace)
return ''.join(snippets)
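+# _tree2string walks the mwlib parse tree depth-first, appending each node's
+# markup from node2markup plus any literal node text to snippets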
-def _tree2string(tree,snippets,level=0):
+def _tree2string(tree,snippets,trace,level=0):
snippets.append(node2markup[type(tree)])
- if options.trace: print ' '*level,type(tree)
+ if trace: print ' '*level,type(tree)
try:
if type(tree)==nodes.ArticleLink:
if not tree.children:
@@ -51,7 +70,7 @@ def _tree2string(tree,snippets,level=0):
snippets.append(tree.text)
else:
snippets.append(tree.target)
- if options.trace:
+ if trace:
print ' '*level,'ArticleLink: children:',len(tree.children)
print ' '*level,'target',tree.target.encode('utf-8')
print ' '*level,'text:',tree.text.encode('utf-8')
@@ -59,21 +78,15 @@ def _tree2string(tree,snippets,level=0):
elif type(tree)==nodes.TagNode:
return
elif tree.text:
- if options.trace: print ' '*level,'text:',tree.text.encode('utf-8')
+ if trace: print ' '*level,'text:',tree.text.encode('utf-8')
snippets.append(tree.text)
except AttributeError: pass
try:
for node in tree.children:
- _tree2string(node,snippets,level+1)
+ _tree2string(node,snippets,trace,level+1)
except AttributeError: pass
-def cleanup(text):
-    # get rid of (nested) template calls
-    oldLen = 1E10
-    while len(text) < oldLen:
-        oldLen = len(text)
-        text = re.sub('{{[^{}]*}}','',text)
-    return text
-
-
-def wiki2sentences(wiki, sent_detector, withTags=True):
-    wiki = cleanup(wiki)
-    tree = parse_txt(wiki)
-    text = tree2string(tree)
-    sentences = []
-    tags = []
-    for line in text.split('\n'):
-        line = line.strip()
-        if line.startswith('<s>'):
-            sentences.append(line[3:].strip())
-            tags.append('Section')
-        elif line.startswith('<i>'):
-            sentences.append(line[3:].strip())
-            tags.append('Item')
-        else:
-            newSentences = sent_detector.tokenize(line.strip())
-            sentences += newSentences
-            tags += ['Sentence']*(len(newSentences)-1)
-            tags.append('LastSentence')
-    if withTags:
-        return sentences,tags
-    else:
-        return sentences
-
-
-def main():
- optParse(
- trace__T=None,
- language__L='|'.join(l for p in languages for l in p),
- fromDump__D='',
- showType__S=None
- )
-
- sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language))
-
-
- if options.fromDump:
- if options.fromDump.endswith('.gz'):
- source = os.popen('zcat %s' % options.fromDump)
- else:
- source = open(options.fromDump)
- currentLines = []
- for line in source:
- line = line.strip()
-        if line.startswith('<title>'):
-            print line
-        elif line.startswith('<text'):
-            currentLines.append(line.split('>',1)[1])
-        elif currentLines:
-            if line.endswith('</text>'):
-                currentLines.append(line.rsplit('<',1)[0])
-                print '\n'.join(wiki2sentences('\n'.join(currentLines)),
-                                sent_detector)
- currentLines = []
- else:
- currentLines.append(line)
-
-
- else:
- for title in arguments:
- if title == 'Barack Obama' and options.language=='en':
- text = open('obama.src').read().decode('utf-8')
- else:
- text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
- print '\n'.join(wiki2sentences(text),sent_detector).encode('utf-8')
-
-
-
-
-if __name__ == "__main__":
- tc,tt=clock(),time()
- try: main()
- finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)
-
-else:
- options.trace=False