diff --git a/testImport.py b/testImport.py
index 5d67d9d..846c0ca 100644
--- a/testImport.py
+++ b/testImport.py
@@ -1,13 +1,75 @@
-
-from wpTextExtractor import wiki2sentences
-import wikipydia
+from time import clock, time
+import sys
+import os
 import nltk
+import wikipydia
+from optArgs import optParse, options, arguments
+from wpTextExtractor import wiki2sentences
+
+
+
+languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]
+
+def lang2long(lang):
+    for p in languages:
+        if lang in p: return p[1]
+
+def lang2short(lang):
+    for p in languages:
+        if lang in p: return p[0]
+
+
+def main():
+    optParse(
+        trace__T=None,
+        language__L='|'.join(l for p in languages for l in p),
+        fromDump__D='',
+        showType__S=None,
+        withTags__W=None
+        )
+
+    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize
+
+
+    if options.fromDump:
+        if options.fromDump.endswith('.gz'):
+            source = os.popen('zcat %s' % options.fromDump)
+        else:
+            source = open(options.fromDump)
+        currentLines = []
+        for line in source:
+            line = line.strip()
+            if line.startswith('<title>'):
+                print line
+            elif line.startswith('<text'):
+                currentLines.append(line.split('>',1)[1])
+            elif currentLines:
+                if line.endswith('</text>'):
+                    currentLines.append(line.rsplit('<',1)[0])
+                    print '\n'.join(wiki2sentences('\n'.join(currentLines),
+                                                   sent_detector,False))
+                    currentLines = []
+                else:
+                    currentLines.append(line)
+
+
+    else:
+        for title in arguments:
+            if title == 'Barack Obama' and options.language=='en':
+                text = open('obama.src').read().decode('utf-8')
+            else:
+                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
+            if options.withTags:
+                for s,t in zip(*wiki2sentences(text,sent_detector,True)):
+                    print t[:4],s.encode('utf-8')
+            else:
+                print '\n'.join(wiki2sentences(text,sent_detector,False)).encode('utf-8')
+
-sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
-t=wikipydia.query_text_raw('Barack Obama')['text']
-sents= wiki2sentences(t,sent_detector)
-for s,t in zip(*sents):
-    print t,s
+if __name__ == "__main__":
+    tc,tt=clock(),time()
+    try: main()
+    finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)
diff --git a/wpTextExtractor.py b/wpTextExtractor.py
index d907e25..fe5f56a 100755
--- a/wpTextExtractor.py
+++ b/wpTextExtractor.py
@@ -1,30 +1,21 @@
-#! /home/cl-home/eisele/lns-root-07/bin/python2.6
+#! /usr/bin/env python2.6
 # -*- coding: utf-8 -*-
 __author__ = 'Andreas Eisele <eisele@dfki.de>'
 __created__ = "Tue Jan 26 21:41:40 2010"
-__id__ = '$Id: trymwlib.py 53 2010-01-27 13:53:08Z anei00 $'.strip("$")
+'''
+extract clear text from wikipedia articles
+'''
 
-# purpose of file:
-# extract clear text from wikipedia articles
-
-from time import clock, time
-import sys
-import os
-import mwlib
-import nltk
-import wikipydia
 import re
-from optArgs import optParse, options, arguments
 import mwlib
 from mwlib.refine.compat import parse_txt
 from mwlib.refine import core
 from mwlib.parser import nodes
 
-
 # map all node types to the empty string
 nodeTypes = [getattr(nodes,d) for d in dir(nodes)]
 nodeTypes = [x for x in nodeTypes if type(x)==type]
@@ -33,17 +24,45 @@ node2markup[nodes.Section]='<s>'
 node2markup[nodes.Item]='<i>'
 
+def wiki2sentences(wiki, sent_detector,withTags=True):
+    # get rid of (nested) template calls
+    oldLen = 1E10
+    while len(wiki)<oldLen:
+        oldLen = len(wiki)
+        wiki = re.sub('{[^{}]*}',' ',wiki)
+
+    tree = parse_txt(wiki)
+    text = tree2string(tree)
+    lines = cleanup(text).split('\n')
+    sentences = []
+    tags = []
+    for line in lines:
+        if line.startswith('<s>'):
+            sentences.append(line[3:].strip())
+            tags.append('Section')
+        elif line.startswith('<i>'):
+            sentences.append(line[3:].strip())
+            tags.append('Item')
+        else:
+            newSentences = sent_detector(line.strip())
+            sentences += newSentences
+            tags += ['Sentence']*(len(newSentences)-1)
+            tags.append('LastSentence')
+    if withTags:
+        return sentences,tags
+    else:
+        return sentences
 
-def tree2string(tree):
+def tree2string(tree,trace=False):
     snippets = []
-    _tree2string(tree,snippets)
+    _tree2string(tree,snippets,trace)
     return ''.join(snippets)
 
-def _tree2string(tree,snippets,level=0):
+def _tree2string(tree,snippets,trace,level=0):
     snippets.append(node2markup[type(tree)])
-    if options.trace: print ' '*level,type(tree)
+    if trace: print ' '*level,type(tree)
     try:
         if type(tree)==nodes.ArticleLink:
             if not tree.children:
@@ -51,7 +70,7 @@ def _tree2string(tree,snippets,level=0):
                 snippets.append(tree.text)
             else:
                 snippets.append(tree.target)
-            if options.trace:
+            if trace:
                 print ' '*level,'ArticleLink: children:',len(tree.children)
                 print ' '*level,'target',tree.target.encode('utf-8')
                 print ' '*level,'text:',tree.text.encode('utf-8')
@@ -59,21 +78,15 @@ def _tree2string(tree,snippets,level=0):
         elif type(tree)==nodes.TagNode:
             return
         elif tree.text:
-            if options.trace: print ' '*level,'text:',tree.text.encode('utf-8')
+            if trace: print ' '*level,'text:',tree.text.encode('utf-8')
             snippets.append(tree.text)
     except AttributeError:
         pass
     try:
         for node in tree.children:
-            _tree2string(node,snippets,level+1)
+            _tree2string(node,snippets,trace,level+1)
     except AttributeError:
         pass
 
 def cleanup(text):
-    # get rid of (nested) template calls
-    oldLen = 1E10
-    while len(text)<oldLen:
-        oldLen = len(text)
-        text = re.sub('{[^{}]*}',' ',text)
-
     # little hack to change the order of
     text = text.replace('."','".')
@@ -85,88 +98,3 @@ def cleanup(text):
     return text
 
-languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]
-
-def lang2long(lang):
-    for p in languages:
-        if lang in p: return p[1]
-
-def lang2short(lang):
-    for p in languages:
-        if lang in p: return p[0]
-
-def wiki2sentences(wiki, sent_detector,withTags=True):
-    tree = parse_txt(wiki)
-    text = tree2string(tree)
-    lines = cleanup(text).split('\n')
-    sentences = []
-    tags = []
-    for line in lines:
-        if line.startswith('<s>'):
-            sentences.append(line[3:].strip())
-            tags.append('Section')
-        elif line.startswith('<i>'):
-            sentences.append(line[3:].strip())
-            tags.append('Item')
-        else:
-            newSentences = sent_detector.tokenize(line.strip())
-            sentences += newSentences
-            tags += ['Sentence']*(len(newSentences)-1)
-            tags.append('LastSentence')
-    if withTags:
-        return sentences,tags
-    else:
-        return sentences
-
-
-def main():
-    optParse(
-        trace__T=None,
-        language__L='|'.join(l for p in languages for l in p),
-        fromDump__D='',
-        showType__S=None
-        )
-
-    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language))
-
-
-    if options.fromDump:
-        if options.fromDump.endswith('.gz'):
-            source = os.popen('zcat %s' % options.fromDump)
-        else:
-            source = open(options.fromDump)
-        currentLines = []
-        for line in source:
-            line = line.strip()
-            if line.startswith('<title>'):
-                print line
-            elif line.startswith('<text'):
-                currentLines.append(line.split('>',1)[1])
-            elif currentLines:
-                if line.endswith('</text>'):
-                    currentLines.append(line.rsplit('<',1)[0])
-                    print '\n'.join(wiki2sentences('\n'.join(currentLines)),
-                                    sent_detector)
-                    currentLines = []
-                else:
-                    currentLines.append(line)
-
-
-    else:
-        for title in arguments:
-            if title == 'Barack Obama' and options.language=='en':
-                text = open('obama.src').read().decode('utf-8')
-            else:
-                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
-            print '\n'.join(wiki2sentences(text),sent_detector).encode('utf-8')
-
-
-
-
-if __name__ == "__main__":
-    tc,tt=clock(),time()
-    try: main()
-    finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)
-
-else:
-    options.trace=False
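
Usage sketch (not part of the patch): a minimal example of how the moved wiki2sentences could be called after this change, assuming mwlib and the NLTK punkt models are installed. The wikitext string below is a made-up placeholder; the new signature takes a callable sentence splitter (e.g. the punkt tokenizer's tokenize method) and a withTags flag, returning (sentences, tags) when withTags is true.

    import nltk
    from wpTextExtractor import wiki2sentences

    # hypothetical article text, only to illustrate the call
    wikitext = u"'''Example''' is a [[test]] article. It has two sentences."
    # pass the tokenize method itself, as the new testImport.py does
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
    for s, t in zip(*wiki2sentences(wikitext, sent_detector, True)):
        print t, s.encode('utf-8')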