-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworker.py
67 lines (63 loc) · 2.78 KB
/
worker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
# AKA curriculum-insular resolver/worker
import beanstalkc
from pattern.web import URL, plaintext, URLError, MIMETYPE_WEBPAGE, MIMETYPE_PLAINTEXT, HTTPError
from pattern.en import parsetree
from pymongo import MongoClient
import datetime
# Connections shared by every iteration of the worker loop below.
beanstalk = beanstalkc.Connection(host='localhost', port=14711)
client = MongoClient(tz_aware=True)
db = client.curriculum

# Worker loop.
#
# Each job body is a '|'-separated payload: the first field is the groupID
# and the last field is the URL to resolve. The URL is downloaded, reduced
# to plain text, parsed into chunks, and every chunk is upserted into the
# 'keywords' collection under the same groupID. The job is deleted after the
# attempt whether or not resolution succeeded, so a bad URL is never retried.
while True:
    print('starting worker outer loop')
    job = beanstalk.reserve()  # blocking: waits until there is a job on the stalk
    pay = job.body.split('|')
    groupID = pay[0]
    url = URL(pay[-1])
    print('new url, we think %s' % url)
    timestamp = datetime.datetime.utcnow()
    try:
        s = url.download(cached=True)
        mimetype = url.mimetype
        print(mimetype)
        if (mimetype in MIMETYPE_WEBPAGE) or (mimetype in MIMETYPE_PLAINTEXT):
            s = plaintext(s)
            # parsetree() options, kept here for reference:
            #   tokenize  = True     split punctuation marks from words?
            #   tags      = True     parse part-of-speech tags? (NN, JJ, ...)
            #   chunks    = True     parse chunks? (NP, VP, PNP, ...)
            #   relations = False    parse chunk relations? (-SBJ, -OBJ, ...)
            #   lemmata   = False    parse lemmata? (ate => eat)
            #   encoding  = 'utf-8'  input string encoding
            #   tagset    = None     Penn Treebank II (default) or UNIVERSAL
            parsed = parsetree(s, chunks=True)
            url_string = url.string  # loop-invariant, looked up once
            for sentence in parsed:
                # Every chunk type is stored for now. NP chunks predominate,
                # with ADJP, ADVP, PP and VP making up most of the rest;
                # filtering on chunk.type == 'NP' is a possible refinement.
                for chunk in sentence.chunks:
                    # One document per (keyword, type, groupID); each sighting
                    # appends its timestamp and URL and refreshes 'latest'.
                    db.keywords.update(
                        {'keyword': chunk.string, 'type': chunk.type, 'groupID': groupID},
                        {'$push': {'timestamp': timestamp, 'url': url_string}, '$set': {'latest': timestamp}},
                        upsert=True
                    )
        else:
            # The original had this message as a bare string expression,
            # so it was never actually printed.
            print('we failed the mimetype test again wtf')
    except HTTPError as e:
        print(url)
        print(e)
    except Exception as e:
        # Best-effort: keep the worker alive on any ordinary failure, but
        # report what went wrong. KeyboardInterrupt and SystemExit are not
        # caught here, so the worker can still be stopped.
        print(url)
        print('AKA unhandled exception that we will try to just destroy without halting and catching fire')
        print(e)
    job.delete()
    print('job deleted, we think this was all successful - loop over')