Commit
refactored to have a minimal core for use in wikitrans
Andreas Eisele committed Jan 29, 2010
1 parent d57ab13 · commit 3a32438
Showing 2 changed files with 109 additions and 119 deletions.
@@ -1,13 +1,75 @@

from wpTextExtractor import wiki2sentences
import wikipydia
from time import clock, time
import sys
import os
import nltk
import wikipydia
from optArgs import optParse, options, arguments
from wpTextExtractor import wiki2sentences


languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]

def lang2long(lang):
    for p in languages:
        if lang in p: return p[1]

def lang2short(lang):
    for p in languages:
        if lang in p: return p[0]


def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None,
        withTags__W=None
        )

    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize

    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>',1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<',1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines),
                                                   sent_detector,False))
                    currentLines = []
                else:
                    currentLines.append(line)

    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language=='en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            if options.withTags:
                for s,t in zip(*wiki2sentences(text,sent_detector,True)):
                    print t[:4],s.encode('utf-8')
            else:
                print '\n'.join(wiki2sentences(text,sent_detector,False)).encode('utf-8')

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

t=wikipydia.query_text_raw('Barack Obama')['text']
sents= wiki2sentences(t,sent_detector)

for s,t in zip(*sents):
    print t,s

if __name__ == "__main__":
    tc,tt=clock(),time()
    try: main()
    finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)
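With the definitions above in scope, the language table works symmetrically: `lang in p` checks both the short code and the long name, which is presumably why the language option declared via language__L lists both forms, and why the punkt model path can always be built from the long name. A quick illustration, not part of the commit:

# not part of the commit: a quick check of the lookup helpers defined above
print lang2long('de'), lang2long('german')      # -> german german (either form matches)
print lang2short('german'), lang2short('de')    # -> de de
print 'tokenizers/punkt/%s.pickle' % lang2long('en')   # path handed to nltk.data.load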
@@ -1,30 +1,21 @@
#! /home/cl-home/eisele/lns-root-07/bin/python2.6
#! /usr/bin/env python2.6
# -*- coding: utf-8 -*-

__author__ = 'Andreas Eisele <[email protected]>'
__created__ = "Tue Jan 26 21:41:40 2010"
__id__ = '$Id: trymwlib.py 53 2010-01-27 13:53:08Z anei00 $'.strip("$")

'''
extract clear text from wikipedia articles
'''

# purpose of file:
# extract clear text from wikipedia articles


from time import clock, time
import sys
import os
import mwlib
import nltk
import wikipydia
import re
from optArgs import optParse, options, arguments
import mwlib
from mwlib.refine.compat import parse_txt
from mwlib.refine import core
from mwlib.parser import nodes


# map all node types to the empty string
nodeTypes = [getattr(nodes,d) for d in dir(nodes)]
nodeTypes = [x for x in nodeTypes if type(x)==type]

@@ -33,47 +24,69 @@
node2markup[nodes.Section]='<s>'
node2markup[nodes.Item]='<i>'

def wiki2sentences(wiki, sent_detector,withTags=True):
    # get rid of (nested) template calls
    oldLen = 1E10
    while len(wiki)<oldLen:
        oldLen = len(wiki)
        wiki = re.sub('{[^{}]*}',' ',wiki)

    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            sentences.append(line[3:].strip())
            tags.append('Item')
        else:
            newSentences = sent_detector(line.strip())
            sentences += newSentences
            tags += ['Sentence']*(len(newSentences)-1)
            tags.append('LastSentence')
    if withTags:
        return sentences,tags
    else:
        return sentences
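The while loop at the top of wiki2sentences strips template markup from the inside out: each re.sub pass removes only brace groups that contain no further braces, so nested templates disappear over several passes, and the loop stops once a pass no longer shortens the text. A self-contained illustration with made-up wikitext (plain re, not part of the commit):

import re

wiki = 'Born {{birth date|1961|8|4}} in {{nowrap|{{flag|Hawaii}}}}.'
oldLen = 1E10
while len(wiki) < oldLen:                    # stop when a pass removes nothing
    oldLen = len(wiki)
    wiki = re.sub('{[^{}]*}', ' ', wiki)     # drop innermost {...} groups
    print repr(wiki)                         # watch the nesting collapse pass by pass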

def tree2string(tree):
def tree2string(tree,trace=False):
    snippets = []
    _tree2string(tree,snippets)
    _tree2string(tree,snippets,trace)
    return ''.join(snippets)

def _tree2string(tree,snippets,level=0):
def _tree2string(tree,snippets,trace,level=0):
    snippets.append(node2markup[type(tree)])
    if options.trace: print ' '*level,type(tree)
    if trace: print ' '*level,type(tree)
    try:
        if type(tree)==nodes.ArticleLink:
            if not tree.children:
                if tree.text:
                    snippets.append(tree.text)
                else:
                    snippets.append(tree.target)
            if options.trace:
            if trace:
                print ' '*level,'ArticleLink: children:',len(tree.children)
                print ' '*level,'target',tree.target.encode('utf-8')
                print ' '*level,'text:',tree.text.encode('utf-8')
            return
        elif type(tree)==nodes.TagNode:
            return
        elif tree.text:
            if options.trace: print ' '*level,'text:',tree.text.encode('utf-8')
            if trace: print ' '*level,'text:',tree.text.encode('utf-8')
            snippets.append(tree.text)
    except AttributeError: pass
    try:
        for node in tree.children:
            _tree2string(node,snippets,level+1)
            _tree2string(node,snippets,trace,level+1)
    except AttributeError: pass
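tree2string flattens the mwlib parse tree depth-first: every Section node contributes a '<s>' prefix, every Item node an '<i>' prefix, all other node types map to the empty string, and plain text is appended as it is encountered. Those prefixes are exactly what wiki2sentences keys on when tagging lines. A rough sketch for inspecting this, assuming mwlib is installed and the definitions above are in scope; the wikitext is made up:

# not part of the commit: look at the flattened form of a tiny article
snippet = u"== History ==\nThe town was founded in 1850.\n* first point\n* second point\n"
flat = tree2string(parse_txt(snippet), trace=True)   # trace=True prints node types per level
for line in flat.split('\n'):
    print repr(line)    # lines starting with '<s>' / '<i>' become Section / Item entries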

def cleanup(text):
    # get rid of (nested) template calls
    oldLen = 1E10
    while len(text)<oldLen:
        oldLen = len(text)
        text = re.sub('{[^{}]*}',' ',text)

    # little hack to change the order of a period and a following closing quote
    text = text.replace('."','".')

@@ -85,88 +98,3 @@ def cleanup(text):
    return text


languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]

def lang2long(lang):
    for p in languages:
        if lang in p: return p[1]

def lang2short(lang):
    for p in languages:
        if lang in p: return p[0]

def wiki2sentences(wiki, sent_detector,withTags=True):
    tree = parse_txt(wiki)
    text = tree2string(tree)
    lines = cleanup(text).split('\n')
    sentences = []
    tags = []
    for line in lines:
        if line.startswith('<s>'):
            sentences.append(line[3:].strip())
            tags.append('Section')
        elif line.startswith('<i>'):
            sentences.append(line[3:].strip())
            tags.append('Item')
        else:
            newSentences = sent_detector.tokenize(line.strip())
            sentences += newSentences
            tags += ['Sentence']*(len(newSentences)-1)
            tags.append('LastSentence')
    if withTags:
        return sentences,tags
    else:
        return sentences


def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None
        )

    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language))

    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>',1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<',1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines)),
                                    sent_detector)
                    currentLines = []
                else:
                    currentLines.append(line)

    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language=='en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            print '\n'.join(wiki2sentences(text),sent_detector).encode('utf-8')


if __name__ == "__main__":
    tc,tt=clock(),time()
    try: main()
    finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)

else:
    options.trace=False
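Taken together, the refactoring turns the second file into the minimal core named in the commit message: all command-line handling stays in the driver, and the core only needs a wikitext string plus a sentence-splitting callable. Note that the new wiki2sentences calls sent_detector(line) directly, so callers now pass the punkt tokenizer's .tokenize method rather than the tokenizer object. A hedged usage sketch, assuming the second file is the wpTextExtractor module named in the driver's import and that the NLTK punkt data is installed:

# not part of the commit: using the refactored core as a library
import nltk
from wpTextExtractor import wiki2sentences

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
wikitext = u"'''Example''' is a [[term]]. It has two sentences."    # placeholder input
sentences, tags = wiki2sentences(wikitext, sent_detector, True)
for s, t in zip(sentences, tags):
    print t[:4], s.encode('utf-8')   # 'Sect', 'Item', 'Sent' or 'Last' plus the sentence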