refactored to have a minimal core for use in wikitrans
Andreas Eisele committed Jan 29, 2010
1 parent d57ab13 commit 3a32438
Showing 2 changed files with 109 additions and 119 deletions.
78 changes: 70 additions & 8 deletions testImport.py
@@ -1,13 +1,75 @@


from wpTextExtractor import wiki2sentences
import wikipydia
from time import clock, time
import sys
import os
import nltk
import wikipydia
from optArgs import optParse, options, arguments
from wpTextExtractor import wiki2sentences



languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]

def lang2long(lang):
    for p in languages:
        if lang in p: return p[1]

def lang2short(lang):
    for p in languages:
        if lang in p: return p[0]


def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None,
        withTags__W=None
        )

    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize


    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>',1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<',1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines),
                                                   sent_detector,False))
                    currentLines = []
                else:
                    currentLines.append(line)


    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language=='en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            if options.withTags:
                for s,t in zip(*wiki2sentences(text,sent_detector,True)):
                    print t[:4],s.encode('utf-8')
            else:
                print '\n'.join(wiki2sentences(text,sent_detector,False)).encode('utf-8')


sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

t=wikipydia.query_text_raw('Barack Obama')['text']
sents= wiki2sentences(t,sent_detector)

for s,t in zip(*sents):
    print t,s
if __name__ == "__main__":
    tc,tt=clock(),time()
    try: main()
    finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)
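
A note on the refactored interface: wiki2sentences now receives the tokenizer callable itself, which is why the punkt model above is loaded with a trailing .tokenize, and the new third argument switches the parallel tag list on or off. A minimal sketch of driving the core directly (the markup string and the hard-coded English model are illustrative, not part of the commit):

import nltk
from wpTextExtractor import wiki2sentences

# pass the bound tokenize method, as the refactored testImport.py does
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle').tokenize

markup = "'''Example''' is a [[test]] article. It has two sentences."
# withTags=True returns (sentences, tags); each tag is one of
# 'Section', 'Item', 'Sentence' or 'LastSentence'
for s, t in zip(*wiki2sentences(markup, sent_detector, True)):
    print t[:4], s.encode('utf-8')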
150 changes: 39 additions & 111 deletions wpTextExtractor.py
@@ -1,30 +1,21 @@
#! /home/cl-home/eisele/lns-root-07/bin/python2.6
#! /usr/bin/env python2.6
# -*- coding: utf-8 -*-

__author__ = 'Andreas Eisele <[email protected]>'
__created__ = "Tue Jan 26 21:41:40 2010"
__id__ = '$Id: trymwlib.py 53 2010-01-27 13:53:08Z anei00 $'.strip("$")

'''
extract clear text from wikipedia articles
'''

# purpose of file:
# extract clear text from wikipedia articles


from time import clock, time
import sys
import os
import mwlib
import nltk
import wikipydia
import re
from optArgs import optParse, options, arguments
import mwlib
from mwlib.refine.compat import parse_txt
from mwlib.refine import core
from mwlib.parser import nodes



# map all node types to the empty string
nodeTypes = [getattr(nodes,d) for d in dir(nodes)]
nodeTypes = [x for x in nodeTypes if type(x)==type]
@@ -33,47 +24,69 @@
node2markup[nodes.Section]='<s>'
node2markup[nodes.Item]='<i>'

def wiki2sentences(wiki, sent_detector,withTags=True):
    # get rid of (nested) template calls
    oldLen = 1E10
    while len(wiki)<oldLen:
        oldLen = len(wiki)
        wiki = re.sub('{[^{}]*}',' ',wiki)

    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            sentences.append(line[3:].strip())
            tags.append('Item')
        else:
            newSentences = sent_detector(line.strip())
            sentences += newSentences
            tags += ['Sentence']*(len(newSentences)-1)
            tags.append('LastSentence')
    if withTags:
        return sentences,tags
    else:
        return sentences



def tree2string(tree):
def tree2string(tree,trace=False):
    snippets = []
    _tree2string(tree,snippets)
    _tree2string(tree,snippets,trace)
    return ''.join(snippets)

def _tree2string(tree,snippets,level=0):
def _tree2string(tree,snippets,trace,level=0):
    snippets.append(node2markup[type(tree)])
    if options.trace: print ' '*level,type(tree)
    if trace: print ' '*level,type(tree)
    try:
        if type(tree)==nodes.ArticleLink:
            if not tree.children:
                if tree.text:
                    snippets.append(tree.text)
                else:
                    snippets.append(tree.target)
            if options.trace:
            if trace:
                print ' '*level,'ArticleLink: children:',len(tree.children)
                print ' '*level,'target',tree.target.encode('utf-8')
                print ' '*level,'text:',tree.text.encode('utf-8')
            return
        elif type(tree)==nodes.TagNode:
            return
        elif tree.text:
            if options.trace: print ' '*level,'text:',tree.text.encode('utf-8')
            if trace: print ' '*level,'text:',tree.text.encode('utf-8')
            snippets.append(tree.text)
    except AttributeError: pass
    try:
        for node in tree.children:
            _tree2string(node,snippets,level+1)
            _tree2string(node,snippets,trace,level+1)
    except AttributeError: pass

def cleanup(text):
    # get rid of (nested) template calls
    oldLen = 1E10
    while len(text)<oldLen:
        oldLen = len(text)
        text = re.sub('{[^{}]*}',' ',text)

    # little hack to change the order of
    text = text.replace('."','".')

@@ -85,88 +98,3 @@ def cleanup(text):
    return text


languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]

def lang2long(lang):
    for p in languages:
        if lang in p: return p[1]

def lang2short(lang):
    for p in languages:
        if lang in p: return p[0]

def wiki2sentences(wiki, sent_detector,withTags=True):
    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            sentences.append(line[3:].strip())
            tags.append('Item')
        else:
            newSentences = sent_detector.tokenize(line.strip())
            sentences += newSentences
            tags += ['Sentence']*(len(newSentences)-1)
            tags.append('LastSentence')
    if withTags:
        return sentences,tags
    else:
        return sentences


def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None
        )

    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language))


    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>',1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<',1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines)),
                                    sent_detector)
                    currentLines = []
                else:
                    currentLines.append(line)


    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language=='en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            print '\n'.join(wiki2sentences(text),sent_detector).encode('utf-8')




if __name__ == "__main__":
    tc,tt=clock(),time()
    try: main()
    finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)

else:
    options.trace=False
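
A closing note on the template handling that this commit moves from cleanup() into wiki2sentences(): the loop strips possibly nested {{...}} template calls by repeatedly deleting innermost brace groups until a pass no longer shrinks the string. A standalone sketch of that fixed-point idea, with an invented input (strip_templates is a hypothetical name, not from the commit):

import re

def strip_templates(wiki):
    # each re.sub pass removes brace groups that contain no inner braces,
    # so nested templates are peeled away from the inside out
    oldLen = 1E10
    while len(wiki) < oldLen:
        oldLen = len(wiki)
        wiki = re.sub('{[^{}]*}', ' ', wiki)
    return wiki

print strip_templates('Text {{cite|{{nested}}}} more text.')
# all brace levels disappear, leaving only the surrounding text
# (plus the whitespace that replaced each group)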
