-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcraig.py
80 lines (64 loc) · 2.29 KB
/
craig.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from flask import Flask, render_template, request
from processor import TextProcessor, Document
import re
from bs4 import BeautifulSoup
import urllib3
import numpy as np
from flask.ext.pymongo import PyMongo
import os
import sys
# Flask application; the MongoDB connection string is read from the
# MONGOHQ_URL environment variable.
app = Flask(__name__)
app.config.update(MONGO_URI=os.environ.get('MONGOHQ_URL'))
mongo = PyMongo(app)

# Load every stored posting into the processor and build its document
# matrix once, at import time, so requests only have to transform the query.
processor = TextProcessor()
with app.app_context():
    postings = mongo.db.postings.find()
    processor.map_data(postings)
    processor.build_doc_matrix()
@app.route('/', methods=['GET', 'POST'])
def main():
    """Serve the search form (GET) and the similar-postings result (POST).

    On POST the submitted ``url`` form field is fetched and parsed into a
    query document, vectorized with the shared ``processor``, and compared
    against the pre-built document matrix. The ten highest-scoring postings
    are rendered, minus the leading one when it is the queried posting itself.

    Returns:
        The rendered ``index.html`` template, with ``error`` set when the URL
        is rejected or the page cannot be parsed, or with ``qry`` and
        ``matches`` set on success.
    """
    if request.method == 'GET':
        return render_template('index.html')

    url = request.form['url'].strip()
    domain = 'newyork.craigslist.org/'
    # NOTE(review): this is a substring test, so a URL on another host that
    # merely contains the domain text (e.g. in its query string) passes and
    # is then fetched server-side. Consider validating the parsed hostname.
    if domain not in url:
        return render_template('index.html', error='Please enter a valid URL')

    # process unseen document
    try:
        qry_doc = _get_qry_doc(url)
    except CraigParseError as e:
        return render_template('index.html', error=e.msg)

    # transform() returns a sparse vector (csr_matrix per the original note)
    vect = processor.vectorizer.transform([' '.join(qry_doc.processed)])

    # build similarity matrix and extract top matches
    sim_vect = processor.doc_mat * vect.T
    # the sparse result has to be densified and flattened before ranking
    top_sim_ind = processor.get_top_ind(sim_vect.A.flatten(), 10)
    matches = [processor.doc_collection[i] for i in top_sim_ind]

    # exclude exact match; guard against an empty result list
    if matches and matches[0].link == qry_doc.link:
        del matches[0]
    return render_template('index.html', qry=qry_doc, matches=matches)
def _get_qry_doc(url):
    """Download a posting page and build the query ``Document`` from it.

    The title is taken from the page's first ``<h2>`` element and the
    description from the element with id ``postingbody``.

    Args:
        url: address of the posting to fetch.

    Returns:
        A ``Document`` built from the url, the stripped title, the stripped
        description, and the processed form of "title description".

    Raises:
        CraigParseError: if the page cannot be retrieved, the posting was
            deleted by its author, the address has no page, or the title or
            description is missing or empty.
    """
    http = urllib3.PoolManager()
    try:
        page = http.request('GET', url).data
    except urllib3.exceptions.HTTPError:
        # Transport failures and malformed URLs would otherwise escape as an
        # unhandled server error; report them like any other parse failure.
        raise CraigParseError('The queried page could not be retrieved')

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
    # is installed; pin one if results must be reproducible across hosts.
    data = BeautifulSoup(page)
    title, desc = data.h2, data.find(id='postingbody')
    expired_msg = 'This posting has been deleted by its author'
    not_found_msg = 'No web page for this address'

    if title:
        title = title.text.strip()
        if expired_msg in title:
            raise CraigParseError(expired_msg)
    if desc:
        desc = desc.text.strip()
        if not_found_msg in desc:
            raise CraigParseError(not_found_msg)

    # Covers both a missing element and one whose text is empty once stripped.
    if not title or not desc:
        raise CraigParseError('The content of the queried page cannot be processed')

    return Document(url, title, desc,
                    processor.process_doc((title + ' ' + desc).encode('utf-8')))
class CraigParseError(Exception):
    """Raised when a queried posting page cannot be turned into a document.

    Attributes:
        msg: human-readable reason for the failure.
    """

    def __init__(self, msg):
        # Pass the message to Exception so str(err), err.args and tracebacks
        # carry it, rather than only the custom ``msg`` attribute.
        super(CraigParseError, self).__init__(msg)
        self.msg = msg
if __name__ == '__main__':
    # Run the development server on every interface; the port comes from the
    # PORT environment variable and falls back to 7000.
    # NOTE(review): debug=True on 0.0.0.0 exposes the interactive debugger to
    # the network; confirm this entry point is never used in production.
    run_options = {
        'host': '0.0.0.0',
        'port': int(os.getenv('PORT', 7000)),
        'debug': True,
    }
    app.run(**run_options)