-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcooccurrence.py
135 lines (110 loc) · 5.29 KB
/
cooccurrence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/python
import re
import string
import urllib2
from bs4 import BeautifulSoup
from numpy import average, std
class CooccurrenceFinder():
    """
    Purpose: Builds a text corpus for a word from Wikipedia search results and
             finds the words that co-occur with it significantly often
    State:   self.counts maps a co-occurring word to the number of times it was
             tallied; find_relateds() resets it on every call
    """

    def __init__(self):
        # Start with an empty tally so tally_occurrences() can be used on its
        # own, without a prior find_relateds() call.
        self.counts = {}

    def corpus_scraper(self, word, numdocs, redo=False):
        """
        Purpose: Scrapes Wikipedia search results for a word and compiles all text to a single text file
        Inputs:  Word to be searched for, number of documents to be scraped (the more the better the results),
                 whether the corpus for the word should be re-scraped even if it already exists
                 (due to changes in parameters, etc)
        Outputs: The file path of the written (or already existing) corpus file
        Notes:   Relies on the module-level urllib2 and BeautifulSoup imports. The 'corpuses/'
                 directory is not created here, so it must already exist.
        """
        corpus_path = 'corpuses/' + word + '_corpuses.txt'

        # Probe for a previously generated corpus. The handle is closed right
        # away; only the success or failure of open() matters.
        try:
            with open(corpus_path):
                corpus_missing = False
        except IOError:
            corpus_missing = True

        if not (corpus_missing or redo):
            return corpus_path

        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15'}
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print('Generating corpus for ' + word)

        req = urllib2.Request(url = 'http://en.wikipedia.org/w/index.php?title=Special:Search&search=' + str(word) + '&fulltext=Search&profile=advanced&redirs=1', headers = headers)
        site = urllib2.urlopen(req)
        try:
            results = BeautifulSoup(site)
        finally:
            # Release the connection even if parsing fails.
            site.close()

        # Collect the hrefs of the first `numdocs` anchors in the result list.
        result_links = results.find('ul', {'class':'mw-search-results'}).find_all('a')[0 : numdocs]
        anchors = [link.get('href') for link in result_links]

        # `with` guarantees the corpus file is closed if a download or parse
        # raises part-way through.
        with open(corpus_path, 'w') as output:
            for anchor in anchors:
                req = urllib2.Request(url = 'http://en.wikipedia.org' + str(anchor), headers = headers)
                site = urllib2.urlopen(req)
                try:
                    page = BeautifulSoup(site)
                finally:
                    site.close()
                # Articles are separated by three newlines in the corpus file.
                output.write(page.find('div', {'class':'mw-body'}).get_text().encode('utf8') + '\n\n\n')
        return corpus_path

    #TODO: make this algorithm weighted by distance
    def find_relateds(self, corpus, word, distance, extra_stops, stdevs):
        """
        Purpose: To divine a list of words associated with a provided word as determined by a threshold of co-occurrence
        Inputs:  Path of the corpus file to be analyzed, the word to be analyzed for, the max distance that would
                 satisfy a co-occurrence, extra stopwords to ignore in addition to those in 'stopword.txt',
                 number of standard deviations above the mean count required for significance
        Outputs: Significant co-occurrences
        """
        #TODO: make it so that a newline wipes
        with open(corpus, 'r') as corpus_file:
            # The corpus is lower-cased, so `word` has to be lower case to match.
            text = corpus_file.read().lower()
        with open('stopword.txt', 'r') as stop_file:
            stopwords = stop_file.read().split()
        #since this is being used for colors for now, I don't want, for example, 'blue' to be in 'red's' associate list
        stopwords.extend(extra_stops)
        # A set makes the per-word stopword test constant time.
        stopwords = set(stopwords)

        self.counts = {}
        pairs = self.find_close_words(distance, text, word)
        for i in range(distance):
            self.tally_occurrences(word, pairs[i], stopwords)
        return self.find_significant_cooccurrences(self.counts, stdevs)

    #I realize that this could have a much better runtime by running a single regex over the whole text.
    #I like having the distances on hand. See the TODO on 'tally_occurrences()'
    def find_close_words(self, dist, text, word):
        """
        Purpose: Finds all the words in a corpus within a specified distance of a target word
        Inputs:  Desired distance, text corpus, target word (used as a regex fragment, with an optional trailing 's')
        Outputs: Dictionary mapping each gap i in range(dist) to the list of words found with exactly i words
                 between them and the target: first the words after the target, then the words before it
        """
        target = word + 's?'
        close_words = {}
        for i in range(dist):
            # i filler words sit between the target and the captured word.
            filler = [r'\w+'] * i
            after_pattern = ' '.join([target] + filler + [r'(\w+)'])
            before_pattern = ' '.join([r'(\w+)'] + filler + [target])
            # Keep BOTH orientations; previously the second findall overwrote
            # the first, so half of the matches were silently dropped.
            close_words[i] = re.findall(after_pattern, text) + re.findall(before_pattern, text)
        return close_words

    #TODO: make it so that closer words carry more significance
    def tally_occurrences(self, word, pair_set, stopwords):
        """
        Purpose: Add to the frequency distribution in self.counts for the words found near the target word,
                 not including stop words or derivatives of the target
        Inputs:  Target word, iterable of words found near the target, collection of stopwords
        Outputs: Nothing explicitly; self.counts is updated in place
        """
        for targ in pair_set:
            # Skip stopwords and anything containing the target (e.g. plurals).
            if targ in stopwords or re.search(word, targ):
                continue
            self.counts[targ] = self.counts.get(targ, 0) + 1

    def find_significant_cooccurrences(self, counts, SDs):
        """
        Purpose: To find the significant co-occurrences from a frequency distribution
        Inputs:  Frequency distribution (word -> count), number of standard deviations for significance
        Outputs: List of words whose count lies more than SDs standard deviations above the mean count;
                 empty when there are no counts or all counts are equal
        """
        if not counts:
            return []
        all_counts = list(counts.values())
        av = average(all_counts)
        stdv = std(all_counts)
        # With zero spread no word stands out, and dividing would yield nan.
        if stdv == 0:
            return []
        return [coll for coll in counts if (counts[coll] - av) / stdv > SDs]