refactored to have a minimal core for use in wikitrans
Andreas Eisele committed Jan 29, 2010
1 parent d57ab13 commit 3a32438
Showing 2 changed files with 109 additions and 119 deletions.
78 changes: 70 additions & 8 deletions testImport.py
@@ -1,13 +1,75 @@


from wpTextExtractor import wiki2sentences
import wikipydia
from time import clock, time
import sys
import os
import nltk
import wikipydia
from optArgs import optParse, options, arguments
from wpTextExtractor import wiki2sentences



languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]

def lang2long(lang):
    for p in languages:
        if lang in p: return p[1]

def lang2short(lang):
    for p in languages:
        if lang in p: return p[0]


def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None,
        withTags__W=None
        )

    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize


    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>',1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<',1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines),
                                                   sent_detector,False))
                    currentLines = []
                else:
                    currentLines.append(line)


    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language=='en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            if options.withTags:
                for s,t in zip(*wiki2sentences(text,sent_detector,True)):
                    print t[:4],s.encode('utf-8')
            else:
                print '\n'.join(wiki2sentences(text,sent_detector,False)).encode('utf-8')


sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

t=wikipydia.query_text_raw('Barack Obama')['text']
sents= wiki2sentences(t,sent_detector)

for s,t in zip(*sents):
    print t,s
if __name__ == "__main__":
    tc,tt=clock(),time()
    try: main()
    finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)
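
A note on the refactored interface: wiki2sentences now receives the tokenizer callable itself, which is why the punkt model above is loaded with a trailing .tokenize, and the new third argument switches the parallel tag list on or off. A minimal sketch of driving the core directly (the markup string and the hard-coded English model are illustrative, not part of the commit):

import nltk
from wpTextExtractor import wiki2sentences

# pass the bound tokenize method, as the refactored testImport.py does
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle').tokenize

markup = "'''Example''' is a [[test]] article. It has two sentences."
# withTags=True returns (sentences, tags); each tag is one of
# 'Section', 'Item', 'Sentence' or 'LastSentence'
for s, t in zip(*wiki2sentences(markup, sent_detector, True)):
    print t[:4], s.encode('utf-8')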
150 changes: 39 additions & 111 deletions wpTextExtractor.py
@@ -1,30 +1,21 @@
#! /home/cl-home/eisele/lns-root-07/bin/python2.6
#! /usr/bin/env python2.6
# -*- coding: utf-8 -*-

__author__ = 'Andreas Eisele <[email protected]>'
__created__ = "Tue Jan 26 21:41:40 2010"
__id__ = '$Id: trymwlib.py 53 2010-01-27 13:53:08Z anei00 $'.strip("$")

'''
extract clear text from wikipedia articles
'''

# purpose of file:
# extract clear text from wikipedia articles


from time import clock, time
import sys
import os
import mwlib
import nltk
import wikipydia
import re
from optArgs import optParse, options, arguments
import mwlib
from mwlib.refine.compat import parse_txt
from mwlib.refine import core
from mwlib.parser import nodes



# map all node types to the empty string
nodeTypes = [getattr(nodes,d) for d in dir(nodes)]
nodeTypes = [x for x in nodeTypes if type(x)==type]
@@ -33,47 +24,69 @@
node2markup[nodes.Section]='<s>'
node2markup[nodes.Item]='<i>'

def wiki2sentences(wiki, sent_detector,withTags=True):
    # get rid of (nested) template calls
    oldLen = 1E10
    while len(wiki)<oldLen:
        oldLen = len(wiki)
        wiki = re.sub('{[^{}]*}',' ',wiki)

    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            sentences.append(line[3:].strip())
            tags.append('Item')
        else:
            newSentences = sent_detector(line.strip())
            sentences += newSentences
            tags += ['Sentence']*(len(newSentences)-1)
            tags.append('LastSentence')
    if withTags:
        return sentences,tags
    else:
        return sentences



def tree2string(tree):
def tree2string(tree,trace=False):
    snippets = []
    _tree2string(tree,snippets)
    _tree2string(tree,snippets,trace)
    return ''.join(snippets)

def _tree2string(tree,snippets,level=0):
def _tree2string(tree,snippets,trace,level=0):
    snippets.append(node2markup[type(tree)])
    if options.trace: print ' '*level,type(tree)
    if trace: print ' '*level,type(tree)
    try:
        if type(tree)==nodes.ArticleLink:
            if not tree.children:
                if tree.text:
                    snippets.append(tree.text)
                else:
                    snippets.append(tree.target)
            if options.trace:
            if trace:
                print ' '*level,'ArticleLink: children:',len(tree.children)
                print ' '*level,'target',tree.target.encode('utf-8')
                print ' '*level,'text:',tree.text.encode('utf-8')
            return
        elif type(tree)==nodes.TagNode:
            return
        elif tree.text:
            if options.trace: print ' '*level,'text:',tree.text.encode('utf-8')
            if trace: print ' '*level,'text:',tree.text.encode('utf-8')
            snippets.append(tree.text)
    except AttributeError: pass
    try:
        for node in tree.children:
            _tree2string(node,snippets,level+1)
            _tree2string(node,snippets,trace,level+1)
    except AttributeError: pass

def cleanup(text):
    # get rid of (nested) template calls
    oldLen = 1E10
    while len(text)<oldLen:
        oldLen = len(text)
        text = re.sub('{[^{}]*}',' ',text)

    # little hack to change the order of
    text = text.replace('."','".')

@@ -85,88 +98,3 @@ def cleanup(text):
    return text


languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]

def lang2long(lang):
    for p in languages:
        if lang in p: return p[1]

def lang2short(lang):
    for p in languages:
        if lang in p: return p[0]

def wiki2sentences(wiki, sent_detector,withTags=True):
    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            sentences.append(line[3:].strip())
            tags.append('Item')
        else:
            newSentences = sent_detector.tokenize(line.strip())
            sentences += newSentences
            tags += ['Sentence']*(len(newSentences)-1)
            tags.append('LastSentence')
    if withTags:
        return sentences,tags
    else:
        return sentences


def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None
        )

    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language))


    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>',1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<',1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines)),
                                    sent_detector)
                    currentLines = []
                else:
                    currentLines.append(line)


    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language=='en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            print '\n'.join(wiki2sentences(text),sent_detector).encode('utf-8')




if __name__ == "__main__":
    tc,tt=clock(),time()
    try: main()
    finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)

else:
    options.trace=False
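
A closing note on the template handling that this commit moves from cleanup() into wiki2sentences(): the loop strips possibly nested {{...}} template calls by repeatedly deleting innermost brace groups until a pass no longer shrinks the string. A standalone sketch of that fixed-point idea, with an invented input (strip_templates is a hypothetical name, not from the commit):

import re

def strip_templates(wiki):
    # each re.sub pass removes brace groups that contain no inner braces,
    # so nested templates are peeled away from the inside out
    oldLen = 1E10
    while len(wiki) < oldLen:
        oldLen = len(wiki)
        wiki = re.sub('{[^{}]*}', ' ', wiki)
    return wiki

print strip_templates('Text {{cite|{{nested}}}} more text.')
# all brace levels disappear, leaving only the surrounding text
# (plus the whitespace that replaced each group)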
