-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Andreas Eisele
committed
Jan 27, 2010
0 parents
commit 21d801c
Showing
2 changed files
with
279 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#! /usr/bin/env python2.3 | ||
|
||
|
||
__author__ = 'Andreas Eisele <[email protected]>' | ||
__created__ = "Fri Feb 6 12:41:04 2004" | ||
__date__ = ('$Date: 2004/02/09 08:23:13 $').strip("$") | ||
__version__ = '$Revision: 1.2 $'.strip("$") | ||
|
||
|
||
# purpose of file: | ||
|
||
# a convenience wrapper around optparse | ||
|
||
|
||
# todo: | ||
# provide more informative help text | ||
# support for boolean values | ||
|
||
|
||
|
||
# completed | ||
|
||
from optparse import OptionParser, Values | ||
|
||
|
||
# Module-level state meant to be imported by client code
# (e.g. ``from optArgs import optParse, options, arguments``).
# optParse() updates `options` and `arguments` in place.
global options, arguments, optionsParsed  # has no effect at module scope
optionsParsed = False   # set to True by the first optParse() call
arguments = list()      # positional command-line arguments
options = Values()      # parsed option values, one attribute per option
|
||
def key2opt(name):
    """Return the command-line spelling of an option name.

    A name of exactly one character becomes a short flag ("-x"); every
    other name becomes a long flag ("--name").
    """
    if len(name) != 1:
        return "--" + name
    return "-" + name
|
||
|
||
def optParse(usage=None, version=None, **kw):
    """Declare options from keyword arguments and parse the command line.

    Each keyword describes one option.  The keyword name lists the option's
    spellings separated by "__" (``iOpt__i`` yields ``--iOpt`` and ``-i``).
    The keyword value selects the kind of option and its default:

    * int or float  -- option of type "int" / "float" with that default
    * str with "|"  -- choice option; the "|"-separated parts are the
                       alternatives and the first one is the default
    * None          -- flag (action "store_true", no explicit default)
    * anything else -- option of type "string" with that default

    `usage` and `version` are passed to OptionParser unchanged.

    The parsed values are stored in the module-level `options` object and
    the positional arguments are appended to the module-level `arguments`
    list.  Both are updated in place, so names imported from this module
    before the call see the results.  Nothing is returned.

    Raises RuntimeError if called more than once.
    """
    global options, arguments, optionsParsed

    if optionsParsed:
        # Must be a real exception class: raising a plain string is itself
        # a TypeError on Python >= 2.6 and would hide this message.
        raise RuntimeError("optParse cannot be called twice")
    optionsParsed = True

    op = OptionParser(usage=usage, version=version)
    for key, val in kw.items():
        keys = [key2opt(k) for k in key.split("__")]

        # Exact type checks on purpose: subclasses (e.g. bool, a subclass
        # of int) fall through to the generic "string" branch.
        if type(val) is int:
            otype = "int"
        elif type(val) is float:
            otype = "float"
        elif type(val) is str and "|" in val:
            choices = val.split("|")
            defVal = choices[0]
            op.add_option(default=defVal, choices=choices,
                          help="one of: %s, [%s]" % (" ".join(choices), defVal),
                          *keys)
            continue
        elif val is None:
            op.add_option(action="store_true", *keys)
            continue
        else:
            otype = "string"
        op.add_option(default=val, type=otype, help="[%s]" % val, *keys)

    (opts, args) = op.parse_args()

    # Mutate the shared objects instead of rebinding them, so that
    # ``from optArgs import options, arguments`` in other modules stays valid.
    arguments.extend(args)
    options._update_loose(opts.__dict__)
|
||
|
||
|
||
|
||
|
||
def main():
    """Demo: declare an int option and a string option, then echo the result."""
    optParse(iOpt__i=1, sOpt__s="a")

    report = (
        ("iOpt=", options.iOpt),
        ("sOpt=", options.sOpt),
        ("arguments=", arguments),
    )
    for label, value in report:
        # single parenthesised argument: same output as ``print label, value``
        print("%s %s" % (label, value))


if __name__ == "__main__":
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
#! /home/cl-home/eisele/lns-root-07/bin/python2.6 | ||
# -*- coding: utf-8 -*- | ||
|
||
__author__ = 'Andreas Eisele <[email protected]>' | ||
__created__ = "Tue Jan 26 21:41:40 2010" | ||
__id__ = '$Id: trymwlib.py 53 2010-01-27 13:53:08Z anei00 $'.strip("$") | ||
|
||
|
||
# purpose of file: | ||
# extract clear text from wikipedia articles | ||
|
||
|
||
from time import clock, time | ||
import sys | ||
import os | ||
import mwlib | ||
import nltk | ||
import wikipydia | ||
import re | ||
from optArgs import optParse, options, arguments | ||
|
||
|
||
#from pyparsing import nestedExpr | ||
|
||
|
||
|
||
#from mwlib.uparser import simpleparse | ||
#from mwlib.uparser import parseString, simpleparse | ||
import mwlib | ||
from mwlib.refine.compat import parse_txt | ||
from mwlib.refine import core | ||
from mwlib.parser import nodes | ||
|
||
|
||
def splitToSections(parse,lead=None,sections=None):
    """Split a parse tree into a lead part followed by its Section nodes.

    Children of `parse` that come before the first Section node are gathered
    under a synthetic `lead` node (the first child of parsing '== ==', with
    its children cleared).  Section nodes are appended to `sections`.  Once
    at least one section has been seen, any other child is searched
    recursively for further Section nodes.

    Returns ``[lead] + sections``.
    NOTE(review): `lead` is still None when nothing precedes the first
    section -- confirm that callers cope with a None first element.
    """
    if not sections:
        sections = []
    for node in parse.children:
        if type(node) == nodes.Section:
            sections.append(node)
            continue
        if sections:
            # past the first section: look for nested sections in this subtree
            splitToSections(node, lead, sections)
            continue
        # still in front of the first section: this node belongs to the lead
        if not lead:
            lead = parse_txt(raw='== ==').children[0]
            lead.children = []
        lead.children.append(node)
    return [lead] + sections
|
||
|
||
def extractText(simpleParse,collectedText=None):
    """Collect the text fragments below a parse node, depth first.

    Fragments are appended to `collectedText` (a new list when omitted) in
    child order, and the same list is returned.  When options.trace is set,
    every collected fragment is also echoed to stdout as UTF-8.

    NOTE(review): the nesting below was reconstructed from a copy of this
    file that had lost its indentation -- confirm the placement of the
    trace prints and of the attribute dump loop against the original.
    """
    if collectedText==None: collectedText=[]
    for child in simpleParse.children:
        #print type(child),len(child.children)
        if type(child)==nodes.TagNode:
            # tag nodes are skipped together with everything below them
            pass
        elif len(child.children)==0:
            # leaf node: take its text, link target or math source
            if child.text:
                collectedText.append(child.text)
                if options.trace: print child.text.encode('utf-8'),
            elif child.target:
                # a target is only kept for article links; other leaves
                # that have a target contribute nothing
                if type(child)==nodes.ArticleLink:
                    collectedText.append(child.target)
                    if options.trace: print child.target.encode('utf-8'),
            elif hasattr(child,'math') and child.math:
                collectedText.append(child.math)
                if options.trace: print child.math.encode('utf-8'),
            #elif type(child)==nodes.Node: pass
            else:
                # leaf with no text, target or math: report its type on stderr
                print >> sys.stderr,'#######cannot handle',type(child),
                if options.trace:
                    # dump every attribute of the node to help debugging
                    print >> sys.stderr, dir(child)
                    for a in dir(child):
                        print >> sys.stderr, a, getattr(child,a)
        else:
            # inner node: descend, sharing the same accumulator list
            extractText(child,collectedText)
    return collectedText
|
||
|
||
def processArticle(text):
    """Parse wiki markup and return its sentences, one list per section.

    The markup is parsed with parse_txt and the tree handed to processTree.
    When options.trace is set, the parse tree is shown via core.show,
    framed by two marker lines on stdout.
    """
    parsed = parse_txt(text)
    tracing = options.trace
    if tracing:
        print('############# parse ###########')
        core.show(parsed)
        print('############# sentences ###########')
    sentences = processTree(parsed)
    return sentences
|
||
def processTree(tree):
    """Return the result of processSection for each entry of splitToSections(tree)."""
    perSection = []
    for part in splitToSections(tree):
        perSection.append(processSection(part))
    return perSection
|
||
|
||
|
||
|
||
def processSection(section,splitAtNL=True):
    """Return the list of non-empty sentences found in one section.

    The text fragments from extractText(section) are concatenated, brace
    groups ({...}, innermost first) are replaced by a space, and the result
    is split into sentences with the module-level `sent_detector`, which
    main() must have loaded beforehand.

    section   -- parse node whose text is to be processed
    splitAtNL -- when true (default) the text is first split at newlines and
                 every line is tokenized on its own; otherwise the whole
                 text is tokenized in one go
    """
    extractedText = ''.join(extractText(section))

    # (An earlier variant used pyparsing's nestedExpr('{{', '}}') to drop
    # nested template markup; the regex loop below replaced it.)

    # Remove {...} groups from the inside out.  Stop as soon as a pass
    # removes nothing: merely testing that both brace characters are present
    # would loop forever on unbalanced text such as "} {".
    while True:
        extractedText, removed = re.subn('{[^{}]*}', ' ', extractedText)
        if not removed:
            break

    # little hack to swap a period and the double quote that follows it
    # (."  ->  ".), presumably so the sentence splitter sees the period last
    extractedText = extractedText.replace('."','".')

    if splitAtNL:
        lines = extractedText.split('\n')
    else:
        lines = [extractedText]

    result = []
    for text in lines:
        result.extend(s for s in sent_detector.tokenize(text.strip()) if s)
    return result
|
||
|
||
|
||
def processLines(lines):
    """Process the article text contained in a list of lines.

    Does nothing for an empty list.  Otherwise the lines are joined with
    newlines, everything up to and including the first '>' and everything
    from the last '<' onwards is cut off, and the remainder is passed to
    processArticle.  The result of processArticle is discarded.
    """
    if lines:
        joined = '\n'.join(lines)
        afterFirstTag = joined.split('>', 1)[1]
        body = afterFirstTag.rsplit('<', 1)[0]
        processArticle(body)
|
||
|
||
# Supported languages as [short code, long name] pairs.
# NOTE(review): 'cz' and 'sw' differ from the ISO 639-1 codes for Czech
# ('cs') and Swedish ('sv'); confirm what the Wikipedia query expects.
languages = [
    p.split(':')
    for p in (
        'en:english', 'cz:czech', 'da:danish', 'nl:dutch',
        'et:estonian', 'fi:finnish', 'fr:french', 'de:german',
        'el:greek', 'it:italian', 'no:norwegian', 'pt:portuguese',
        'sl:slovene', 'es:spanish', 'sw:swedish', 'tr:turkish',
    )
]
|
||
def lang2long(lang):
    """Return the long name for `lang`, given as short code or long name.

    Returns None when `lang` is not in the `languages` table.
    """
    longNames = [pair[1] for pair in languages if lang in pair]
    if longNames:
        return longNames[0]
    return None
|
||
def lang2short(lang):
    """Return the short code for `lang`, given as short code or long name.

    Returns None when `lang` is not in the `languages` table.
    """
    shortCodes = [pair[0] for pair in languages if lang in pair]
    if shortCodes:
        return shortCodes[0]
    return None
|
||
|
||
def main(): | ||
global sent_detector | ||
optParse( | ||
trace__T=None, | ||
language__L='|'.join(l for p in languages for l in p) | ||
) | ||
|
||
sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)) | ||
|
||
|
||
# print mwlib.__file__ | ||
|
||
''' | ||
source = os.popen("zcat /share/emplus/corpora/wikipedia-201001/dewiki-20100117-pages-articles.xml.gz | gawk '/<title>/{print} /<text/,/<[/]text/'") | ||
lines = [] | ||
for line in source: | ||
if line.strip().startswith('<title>'): | ||
processLines(lines) | ||
lines=[] | ||
print line | ||
else: | ||
lines.append(line) | ||
''' | ||
|
||
|
||
for title in arguments: | ||
if title == 'Barack Obama' and options.language=='en': | ||
text = open('obama.src').read().decode('utf-8') | ||
else: | ||
text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text'] | ||
|
||
if options.trace: | ||
print '############# ',title,', source ###########' | ||
print text.encode('utf-8') | ||
sections = processArticle(text) | ||
print '\n'.join(x for section in sections for x in section).encode('utf-8') | ||
|
||
|
||
|
||
|
||
|
||
|
||
if __name__ == "__main__":
    # Run main() and always report CPU time / wall-clock time on stderr,
    # even when main() raises.
    cpuStart = clock()
    wallStart = time()
    try:
        main()
    finally:
        print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-cpuStart, time()-wallStart)
|
||
|