Skip to content

Commit

Permalink
first version of text extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Eisele committed Jan 27, 2010
0 parents commit 21d801c
Show file tree
Hide file tree
Showing 2 changed files with 279 additions and 0 deletions.
84 changes: 84 additions & 0 deletions optArgs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#! /usr/bin/env python2.3


__author__ = 'Andreas Eisele <[email protected]>'
__created__ = "Fri Feb 6 12:41:04 2004"
__date__ = ('$Date: 2004/02/09 08:23:13 $').strip("$")
__version__ = '$Revision: 1.2 $'.strip("$")


# purpose of file:

# a convenience wrapper around optparse


# todo:
# provide more informative help text
# support for boolean values



# completed

from optparse import OptionParser, Values


# provide names that can be imported
global options, arguments, optionsParsed
options=Values()
arguments=[]
optionsParsed=False

def key2opt(name):
if len(name)==1: return "-"+name
return "--"+name


def optParse(usage=None, version=None, **kw):
global options, arguments, optionsParsed

if optionsParsed:
raise "optParse cannot be called twice"
else:
optionsParsed=True
op=OptionParser(usage=usage, version=version)
for key,val in kw.items():
keys=[key2opt(k) for k in key.split("__")]
if type(val) == type(1):
otype="int"
elif type(val) == type(1.0):
otype="float"
elif type(val)==type("") and "|" in val:
choices=val.split("|")
defVal=choices[0]
op.add_option(default=defVal, choices=choices, help="one of: %s, [%s]"%(" ".join(choices),defVal), *keys)
continue
elif val==None:
op.add_option(action="store_true", *keys)
continue
else:
otype="string"
op.add_option(default=val, type=otype, help="[%s]"%val,*keys)

(opts, args)=op.parse_args()

for a in args: arguments.append(a)
options._update_loose(opts.__dict__)





def main():
optParse(iOpt__i=1, sOpt__s="a")
global options, arguments

print "iOpt=",options.iOpt
print "sOpt=",options.sOpt

print "arguments=", arguments


if __name__ == "__main__":
main()

195 changes: 195 additions & 0 deletions trymwlib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#! /home/cl-home/eisele/lns-root-07/bin/python2.6
# -*- coding: utf-8 -*-

__author__ = 'Andreas Eisele <[email protected]>'
__created__ = "Tue Jan 26 21:41:40 2010"
__id__ = '$Id: trymwlib.py 53 2010-01-27 13:53:08Z anei00 $'.strip("$")


# purpose of file:
# extract clear text from wikipedia articles


from time import clock, time
import sys
import os
import mwlib
import nltk
import wikipydia
import re
from optArgs import optParse, options, arguments


#from pyparsing import nestedExpr



#from mwlib.uparser import simpleparse
#from mwlib.uparser import parseString, simpleparse
import mwlib
from mwlib.refine.compat import parse_txt
from mwlib.refine import core
from mwlib.parser import nodes


def splitToSections(parse,lead=None,sections=None):
if not sections: sections=[]
# returns a list of subtrees of type section
for child in parse.children:
if type(child) == nodes.Section:
sections.append(child)
elif not sections:
if not lead:
lead = parse_txt(raw='== ==').children[0]
lead.children=[]
lead.children.append(child)
else:
splitToSections(child,lead,sections)
return [lead]+sections


def extractText(simpleParse,collectedText=None):
if collectedText==None: collectedText=[]
for child in simpleParse.children:
#print type(child),len(child.children)
if type(child)==nodes.TagNode:
pass
elif len(child.children)==0:
if child.text:
collectedText.append(child.text)
if options.trace: print child.text.encode('utf-8'),
elif child.target:
if type(child)==nodes.ArticleLink:
collectedText.append(child.target)
if options.trace: print child.target.encode('utf-8'),
elif hasattr(child,'math') and child.math:
collectedText.append(child.math)
if options.trace: print child.math.encode('utf-8'),
#elif type(child)==nodes.Node: pass
else:
print >> sys.stderr,'#######cannot handle',type(child),
if options.trace:
print >> sys.stderr, dir(child)
for a in dir(child):
print >> sys.stderr, a, getattr(child,a)
else:
extractText(child,collectedText)
return collectedText


def processArticle(text):
tree = parse_txt(text)
if options.trace:
print '############# parse ###########'
core.show(tree)
print '############# sentences ###########'

return processTree(tree)

def processTree(tree):
return [processSection(section) for section in splitToSections(tree)]




def processSection(section,splitAtNL=True):
result = []
extractedText = ''.join(extractText(section))


'''
expr = nestedExpr('{{','}}').leaveWhitespace()
bracketedItems = expr.parseString('{{'+extractedText+'}}').asList()[0]
res = []
for item in bracketedItems:
if not isinstance(item, list):
res.append(item)
extractedText = ' '.join(res)
'''

while len(set(extractedText) & set('{}'))==2:
extractedText = re.sub('{[^{}]*}',' ',extractedText)

# little hack to change the order of
extractedText = extractedText.replace('."','".')

if splitAtNL: lines = extractedText.split('\n')
else: lines = [extractedText]

for text in lines:
for sentence in sent_detector.tokenize(text.strip()):
if sentence:
result.append(sentence)
return result



def processLines(lines):
if not lines: return
data='\n'.join(lines)
data=data.split('>',1)[1]
data=data.rsplit('<',1)[0]
processArticle(data)


languages = [p.split(':') for p in '''en:english cz:czech da:danish nl:dutch et:estonian fi:finnish fr:french de:german el:greek it:italian no:norwegian pt:portuguese sl:slovene es:spanish sw:swedish tr:turkish'''.split()]

def lang2long(lang):
for p in languages:
if lang in p: return p[1]

def lang2short(lang):
for p in languages:
if lang in p: return p[0]


def main():
global sent_detector
optParse(
trace__T=None,
language__L='|'.join(l for p in languages for l in p)
)

sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language))


# print mwlib.__file__

'''
source = os.popen("zcat /share/emplus/corpora/wikipedia-201001/dewiki-20100117-pages-articles.xml.gz | gawk '/<title>/{print} /<text/,/<[/]text/'")
lines = []
for line in source:
if line.strip().startswith('<title>'):
processLines(lines)
lines=[]
print line
else:
lines.append(line)
'''


for title in arguments:
if title == 'Barack Obama' and options.language=='en':
text = open('obama.src').read().decode('utf-8')
else:
text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']

if options.trace:
print '############# ',title,', source ###########'
print text.encode('utf-8')
sections = processArticle(text)
print '\n'.join(x for section in sections for x in section).encode('utf-8')






if __name__ == "__main__":
tc,tt=clock(),time()
try: main()
finally: print >> sys.stderr, "%.3f/%.3f seconds overall" % (clock()-tc, time()-tt)


0 comments on commit 21d801c

Please sign in to comment.