pyn.py
from soupy import Soupy
import urllib.request
import urllib.error
import codecs
import os
def safe_chars(string):
    '''
    given a string, returns a copy containing only characters from a
    conservative whitelist that is safe in a windows filepath.
    '''
    valid_chars = "-_().abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    return ''.join(c for c in string if c in valid_chars)
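# For example (hypothetical input):
#     safe_chars('Chapter 1: "The End?"')  # -> 'Chapter1TheEnd'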
def download(url, retry = 5):
    '''
    downloads a file given a url. Tries `retry` times before raising an error
    '''
    attempts = 0
    while attempts < retry:
        try:
            return urllib.request.urlopen(url)
        except urllib.error.URLError:
            attempts += 1
    raise NameError('Download Failed')
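# A minimal usage sketch (the URL is hypothetical):
#     response = download('https://example.com/index.html')
#     raw_bytes = response.read()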
def make_dir(directory):
'''
given a filepath, check if the directory exists and if
not, create it.
'''
if not os.path.exists(directory):
os.makedirs(directory)
def format_xhtml(body, title = "Lorem Ipsum"):
    '''
    given a body of html, returns it wrapped in a well-formed xhtml
    document, with the title in both <title> and a heading
    '''
    return """<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>{0}</title>
</head>
<body>
<h3 align="right">{0}</h3>
<hr />
{1}
</body>
</html>
""".format(title, body)
def write_img(image, filepath = "temp.jpg"):
'''
given an image and a filepath, writes to disk
'''
with open(filepath, 'wb') as f:
f.write(image.read())
def write_html(html, filepath = "temp.html"):
'''
given html and a filepath, writes to disk
'''
with codecs.open(filepath, 'w', 'UTF-8') as f:
f.write(html)
def has_string(tag):
    '''
    tests if a tag has a string or not. Is used for ugly sites which don't
    put text into <p> tags like normal people
    '''
    if tag.name.val() in ('script', 'a', 'li', 'style'):
        return False
    # 12 chars is an arbitrary cutoff to skip labels, dates, etc.
    return len(tag.text.val()) > 12
def is_junk(tag):
    '''
    detects if the tag is 'junk' or not. Junk tags include social media
    buttons, comments, random javascript, etc. Not perfect, but kills most things
    '''
    name = tag.name.val()
    if 'table' in name or 'th' in name:
        return True
    junk_markers = ('comment', 'reply', 'google', 'share',
                    'twitter', 'facebook', 'social')
    for key, value in tag.attrs.val().items():
        # multi-valued attributes (e.g. class) come back as lists,
        # the rest as plain strings
        values = value if isinstance(value, list) else [value]
        for meta in values:
            metadata = meta.lower()
            if any(marker in metadata for marker in junk_markers):
                return True
    return False
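# For example, a hypothetical <div class="social-share-buttons"> is junked
# because 'social' appears in its class values; a plain <p> passes through.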
def grab_from_start_tag(start_tag):
    '''
    a generator that yields every sibling tag after a starting tag
    '''
    for sibling in start_tag.next_siblings:
        yield sibling
def is_not_null(tag):
    '''
    Soupy does weird things sometimes. .notnull() does not work. Like,
    at all. But this does: True when the tag actually has a name.
    '''
    return bool(tag.name.val())
def find_start_tag(soup):
    '''
    companion function for grab_from_start_tag. Tries to find the tag
    where 'content' (i.e. text) first starts by detecting long,
    consecutive runs of sibling tags which contain text.
    '''
    for tag in soup.find_all():
        correct = 0
        fail = 0
        for sibling in tag.next_siblings.filter(is_not_null):
            if has_string(sibling) and not is_junk(sibling):
                correct += 1
            else:
                fail += 1
            if correct > 4:
                return tag
            elif fail > 2:
                break
    raise NameError('Cannot find content')
def correct_content_generator(soup):
    '''
    generator that uses find_start_tag and grab_from_start_tag to
    (hopefully) yield only relevant content
    '''
    yield from grab_from_start_tag(find_start_tag(soup))
def only_p_generator(soup):
'''
    generator that simply yields all <p> and <img> tags
'''
for p in soup.find_all(['p', 'img']):
yield p
def everything_generator(soup):
'''
generator that yields everything with text
'''
for tag in soup.find_all().filter(has_string):
yield tag
def get_img_url(img):
    '''
    given an img tag, returns the src
    '''
    for key, value in img.attrs.val().items():
        if key == 'src':
            return value
    raise NameError('Could not find src')
def detect_img_tag(tag):
'''
    detects if a tag is an <img>, or contains one
'''
if tag.name.val() == 'img':
return True
for img in tag.find_all('img'):
return True
return False
def img_tag_generator(tag):
'''
generator that yields all img tags in a tag
because people stick <img>'s in <div>'s and stuff
'''
if tag.name.val() == 'img':
yield tag
else:
for img in tag.find_all('img'):
yield img
def page_from_list_generator(link_list):
    '''
    given a list of <a> tags, yields a Page for each link
    '''
    for link in link_list:
        yield Page(return_href_from_a(link))
def baka_page_from_list_generator(link_list):
    '''
    given a list of <a> tags, yields a BakaPage for each link
    '''
    for link in link_list:
        yield BakaPage(return_href_from_a(link))
def download_pages(page_generator):
'''
    given a generator of pages, downloads the images and writes each
    page's tags into an xhtml file. A 'temp' directory is created because
I don't want to mess with the actual library to make
an actual temp directory.
Yeah, everything is saved as a .jpg. Oh well.
'''
make_dir('temp')
htmlfiles = []
imgfiles = []
chapter_titles = []
for page in page_generator:
chapter_titles.append(page.title)
for i, image in enumerate(page.img_url):
img = download(image)
imgfilepath = 'temp/' + page.safe_title + str(i) + '.jpg'
write_img(img, imgfilepath)
imgfiles.append(imgfilepath)
content = format_xhtml(page.return_html_as_string(), page.title)
filepath = 'temp/' + page.safe_title + '.xhtml'
write_html(content, filepath)
htmlfiles.append(filepath)
return htmlfiles, imgfiles, chapter_titles
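# A minimal usage sketch of the whole pipeline (the index URL is
# hypothetical):
#     links = return_all_links('https://example.com/table-of-contents')
#     htmlfiles, imgfiles, titles = download_pages(page_from_list_generator(links))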
def return_href_from_a(tag):
    '''
    given an <a> tag, returns its href (or None if there isn't one)
    '''
    for key, value in tag.attrs.val().items():
        if 'href' in key:
            return value
def is_tag_not_anchor(tag):
    '''
    True if the <a> tag is a real link (has an href) rather than a bare anchor
    '''
    for key, value in tag.attrs.val().items():
        if 'href' in key:
            return True
    return False
def return_all_links(url):
    '''
    downloads a page and returns every <a> tag that is an actual link
    '''
    soup = Soupy(download(url))
    return [tag for tag in soup.find_all('a') if is_tag_not_anchor(tag)]
class Page:
'''
class that represents a page of the site/epub/whatever
contains a list of all the tags and a list of links to
images used to later download them
'''
    def __init__(self, url, generator = correct_content_generator):
        self.url = url
        self.tags = []
        self.img_url = []
        soup = Soupy(download(url))
        self.title = soup.find('title').text.orelse('').val() or 'Lorem Ipsum'
        self.safe_title = safe_chars(self.title)
        try:
            find_start_tag(soup)
        except NameError:
            # no obvious content block; fall back to grabbing every <p>/<img>
            generator = only_p_generator
        for tag in generator(soup):
            self.retrieve_file(tag)
def return_html_as_string(self):
return '\n'.join([str(tag.val()) for tag in self.tags])
    def retrieve_file(self, tag):
        if detect_img_tag(tag):
            for img in img_tag_generator(tag):
                # remember the remote src before rewriting it to the
                # epub-relative path the image will be saved under
                filepath = '../Images/' + self.safe_title + str(len(self.img_url)) + '.jpg'
                self.img_url.append(get_img_url(img))
                img['src'] = filepath
                self.tags.append(img)
        elif has_string(tag) and not is_junk(tag):
            self.tags.append(tag)
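# A minimal usage sketch for Page (the URL is hypothetical):
#     page = Page('https://example.com/chapter-1')
#     xhtml = format_xhtml(page.return_html_as_string(), page.title)
#     write_html(xhtml, page.safe_title + '.xhtml')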
class BakaPage(Page):
    '''
    class that is made for the quirks of Bakatsuki pages.
    Inherits from Page
    '''
    def __init__(self, url):
        super().__init__(self.fix_baka_link(url))
    def convert_baka_thumbnail_to_full(self, url):
        '''
        wiki thumbnail urls append a sized copy of the file name after the
        original one; dropping '/thumb' and cutting the url off just past
        the first occurrence of the file extension gives the full-size url
        '''
        full_url = url.replace('/thumb', '')
        extension = url[-4:]
        end = full_url.find(extension)
        return full_url[:end + 4]
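    # For example (hypothetical file name), a thumbnail url like
    #     /project/images/thumb/a/ab/Cover.jpg/400px-Cover.jpg
    # comes back as
    #     /project/images/a/ab/Cover.jpg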
    def fix_baka_link(self, url):
        '''
        because some bakatsuki links are relative and some
        are absolute. whhhhhyyyyy
        converts all urls to absolute
        '''
        if not url.startswith('http'):
            return 'https://www.baka-tsuki.org' + url
        return url
    def retrieve_file(self, tag):
        if detect_img_tag(tag):
            for img in img_tag_generator(tag):
                try:
                    # clear the thumbnail dimensions so the full image isn't squished
                    img['width'] = ""
                    img['height'] = ""
                except Exception:
                    # yeah, yeah, antipattern, just a little laziness
                    pass
                filepath = '../Images/' + self.safe_title + str(len(self.img_url)) + '.jpg'
                fixed_url = self.fix_baka_link(self.convert_baka_thumbnail_to_full(get_img_url(img)))
                self.img_url.append(fixed_url)
                img['src'] = filepath
                self.tags.append(img)
        elif has_string(tag) and not is_junk(tag):
            self.tags.append(tag)
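# A minimal end-to-end sketch, guarded so importing this module stays
# side-effect free. The table-of-contents URL is hypothetical.
if __name__ == '__main__':
    toc_url = 'https://www.baka-tsuki.org/project/index.php?title=Example'  # hypothetical
    links = return_all_links(toc_url)
    pages = baka_page_from_list_generator(links)
    htmlfiles, imgfiles, titles = download_pages(pages)
    print('wrote', len(htmlfiles), 'chapters and', len(imgfiles), 'images to temp/')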