-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathoxford.py
215 lines (209 loc) · 11 KB
/
oxford.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Get idioms from the online Oxford Dictionary of English Idioms, by scraping the pages at www.oxfordreference.com.
Refines the idioms by removing duplicates, expanding variants given in parentheses, and handling special cases individually.
'''
import re, requests, itertools
from bs4 import BeautifulSoup
def get_idioms(url, landing_url, use_socks_proxy=False):
    '''
    Scrapes idioms from the ODEI website: reads the pagination links on the
    landing page, then cycles through the listing pages, follows every entry
    link on each of them and collects the idioms found on the entry pages.

    url: prefix that is prepended to every relative link (href) found in the pages
    landing_url: full address of the first listing page
    use_socks_proxy: if True, all requests are sent with a SOCKS5 proxy setting
        for http at 127.0.0.1:8080 (requires pysocks to be installed)

    Returns a sorted list of the unique idioms that contain at least one space.
    Raises ValueError if the text of the last pager link is not a number.
    '''
    # Set proxy, if applicable, requires pysocks to be installed
    proxies = {'http': "socks5://127.0.0.1:8080"} if use_socks_proxy else {}

    # Get and parse first page
    page = requests.get(landing_url, proxies=proxies)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Scrape pagination information. Every pager link overwrites the previous
    # one, so the values of the last pager link in the document are kept.
    # Without any pager link only the landing page itself is scraped.
    last_page = '1'  # Number of pages to cycle through, as found in the page
    url_template = None
    for link in soup.find_all('a'):
        if link.parent.name != 'div':
            continue
        try:
            if link.parent['class'][0] == 't-data-grid-pager':
                last_page = link.text
                url_template = link['href']
        except KeyError:
            pass  # Sometimes parent has no class
    number_of_pages = int(last_page)

    # Cycle through pages, get actual idioms
    idioms = set()
    for i in range(1, number_of_pages + 1):
        # Very slow, so give progress updates
        print('Scraping page {0} of {1}'.format(i, last_page))

        # Find links to pages containing idioms
        for link in soup.find_all('a'):
            if link.parent.name != 'h2':
                continue
            try:
                if link.parent['class'][0] == 'itemTitle':
                    # Get page with idiom entries
                    entry_page = requests.get(url + link['href'], proxies=proxies)
                    entry_soup = BeautifulSoup(entry_page.content, 'html.parser')
                    # Extract idiom
                    for idiom in entry_soup.find_all('em'):
                        try:
                            if idiom.parent.parent['class'][0] == 'div1':
                                if ' ' in idiom.text:  # Filter out single word 'idioms'
                                    idioms.add(idiom.text)  # Store the actual idiom
                        except KeyError:
                            pass  # Sometimes grandparent has no class
            except KeyError:
                pass  # Sometimes parent has no class

        # Get and parse next page; there is nothing left to fetch after the last one
        if i < number_of_pages:
            # The pager link points at the last page, so swap in the next page number
            next_page = url + re.sub('gridpager/{0}'.format(last_page),
                                     'gridpager/{0}'.format(i + 1), url_template)
            page = requests.get(next_page, proxies=proxies)
            soup = BeautifulSoup(page.content, 'html.parser')

    return sorted(idioms)
# Entries that were cut off during scraping, with the text needed to complete them
_TRUNCATED_IDIOM_SUFFIXES = {
    'like (or as if) it is going out of fashion (or style': ')',
    'cog in the wheel (or machine': ')',
    'get you (him, her': ', etc.)!',
}
_ANY_PARENTHESES = re.compile(r'\(.*\)')
_PAIR_OF_PARENTHESES = re.compile(r'\(.*?\)')
_MULTIPLE_SPACES = re.compile(r' +')
_OUTER_SPACE = re.compile(r'(^ )|( $)')


def _replace_trailing_words(idiom_part_before, replacement, num_words=1):
    '''
    Returns idiom_part_before with its last num_words words replaced by replacement.
    A leading space of idiom_part_before is kept, so that the variant does not get
    glued to whatever precedes it in the idiom.
    '''
    words = idiom_part_before.split()
    if num_words > 0:  # words[:-0] would drop every word instead of none
        words = words[:-num_words]
    variant = ' '.join(words) + ' ' + replacement
    if idiom_part_before.startswith(' '):
        variant = ' ' + variant
    return variant


def refine_idioms(idioms):
    '''
    Oxford scraping output is messy. Removes duplicates (entries ending in ':').
    Expands optionals in parentheses. Deals with some exceptional cases individually.

    idioms: iterable of idiom strings, as returned by the scraper
    Returns a list of refined idioms. Entries without parentheses are kept as they
    are, entries with parentheses are replaced by all of their variations, e.g.
    '(all) at sea' becomes 'at sea' and 'all at sea'. Empty entries are skipped,
    and so are variations that consist of a single word.
    '''
    refined_idioms = []
    for idiom in idioms:
        if not idiom:
            continue  # Nothing to refine, and there is no last character to inspect
        # Fix scraping errors
        idiom += _TRUNCATED_IDIOM_SUFFIXES.get(idiom, '')
        if idiom.endswith(':'):
            continue  # All duplicates end in ':'
        if not _ANY_PARENTHESES.search(idiom):
            refined_idioms.append(idiom)
            continue

        # Cycle through pairs of parentheses, collect parts of idiom, and their expansions/variations.
        # Each element of idiom_parts is a list of alternatives for one stretch of the idiom.
        idiom_parts = []
        previous_end = 0
        end = 0
        for pair_of_parentheses in _PAIR_OF_PARENTHESES.finditer(idiom):
            start, end = pair_of_parentheses.span()
            # Examine content between parentheses - set conditions
            content = pair_of_parentheses.group(0)[1:-1]  # Get content without ()
            starts_with_also = re.match(r'also\b', content) is not None  # e.g. (also sure as fate)
            starts_with_or = re.match(r'or\b', content) is not None  # e.g. (or get something off the ground)
            or_in_middle = re.search(r'.\bor\b', content) is not None  # e.g. (final or last)
            contains_etc = re.search(r'etc\.', content) is not None  # e.g. (me, him, etc.)

            # Add the non-parenthesized bit before the current pair of parentheses (if it exists)
            idiom_part_before = idiom[previous_end:start]
            if idiom_part_before:
                idiom_parts.append([idiom_part_before])

            # Deal with the case with the '/', which occurs in exactly 1 idiom entry
            if content == 'or get your fingers burned/burnt':
                content = 'or get your fingers burned or get your fingers burnt'
                or_in_middle = True

            if '(' in content:
                # Nested parentheses are especially difficult, deal with them individually.
                # Setting end to the idiom length makes the remainder after the loop empty.
                if content == 'or bring someone back (down':
                    refined_idioms.append(u'bring someone back to earth')
                    refined_idioms.append(u'bring someone back down to earth')
                    end = len(idiom)
                if content == 'or give someone pause (for thought':
                    refined_idioms.append(u'give someone pause')
                    refined_idioms.append(u'give someone pause for thought')
                    end = len(idiom)
                if content == 'or herein (or therein':
                    idiom_parts[-1].append(u'herein lies')
                    idiom_parts[-1].append(u'therein lies')
                    idiom_parts.append([u'a tale'])
                    end = len(idiom)
            elif contains_etc:
                # Cases with etc. are rare, and require individual treatment
                if content in ['me, him, etc.', 'him, her, etc.']:
                    idiom_parts.append(['me', 'you', 'him', 'her', 'us', 'them', 'it'])
                elif content == 'or tell, etc.':
                    idiom_parts[-1].append('tell')
                elif content == 'or herself, etc.':
                    for variant in ['myself', 'yourself', 'herself', 'itself',
                                    'ourselves', 'yourselves', 'themselves']:
                        idiom_parts[-1].append(_replace_trailing_words(idiom_part_before, variant))
                elif content == 'or bore etc.':
                    idiom_parts[-1].append('bore')
                elif content == 'or your etc.':
                    for variant in ['my', 'your', 'his', 'her', 'its', 'our', 'your', 'their']:
                        idiom_parts[-1].append(_replace_trailing_words(idiom_part_before, variant))
                elif content in ['or you or him, etc.', 'or her, him, etc.']:
                    for variant in ['you', 'him', 'her', 'us', 'them', 'it']:
                        idiom_parts[-1].append(_replace_trailing_words(idiom_part_before, variant))
                elif content == 'or forty-something, etc.':
                    idiom_parts = []  # Single-word idiom, ignore
            elif starts_with_also:
                # 'also' signals full replacement of the preceding part, only two cases, one also with 'or'
                # 1. sure as eggs is eggs (also sure as fate) 2. left, right, and centre (also left and right or right and left)
                replacement = content[5:]  # Remove 'also '
                if or_in_middle:
                    idiom_parts[-1] += replacement.split(' or ')
                else:
                    idiom_parts[-1].append(replacement)
            elif starts_with_or and or_in_middle:
                # Replace the last n words before the parentheses by each of the alternatives of n words
                # EXAMPLE: a bad (or bitter or nasty) taste -> a bad taste, a bitter taste, a nasty taste
                for content_part in content[3:].split(' or '):  # Strip initial 'or ' and split in parts
                    idiom_parts[-1].append(_replace_trailing_words(
                        idiom_part_before, content_part, len(content_part.split(' '))))
            elif starts_with_or:
                # Replace the last n words before the parentheses by the n words in the parentheses
                # EXAMPLE: I should cocoa (or coco) -> I should cocoa, I should coco
                content_words = content.split(' ')[1:]  # Drop the 'or'
                idiom_parts[-1].append(_replace_trailing_words(
                    idiom_part_before, ' '.join(content_words), len(content_words)))
            elif or_in_middle:
                # Generate idioms for each part separated by 'or'
                # EXAMPLE: a (final or last) turn of the screw -> a final turn of the screw, a last turn of the screw
                idiom_parts.append(content.split(' or '))
            else:
                # Simplest case, the content is optional
                # EXAMPLE: (all) at sea -> all at sea, at sea
                idiom_parts.append(['', content])
            previous_end = end

        # Add remaining part of idiom after final pair of parentheses
        idiom_parts.append([idiom[end:]])
        # From the collected idiom parts and variations, generate all idiom variations and add them to the list
        for combination in itertools.product(*idiom_parts):
            refined_idiom = ''.join(combination)
            refined_idiom = _MULTIPLE_SPACES.sub(' ', refined_idiom)  # Remove double spaces
            refined_idiom = _OUTER_SPACE.sub('', refined_idiom)  # Remove initial spaces and final spaces
            if len(refined_idiom.split(' ')) > 1:  # Remove single-word idioms, e.g. 'forty-something' (or thirty-something')
                refined_idioms.append(refined_idiom)
    return refined_idioms