#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        linksorter
# Purpose:     Sorting urls from a file into lists for each site
#
# Author:      new
#
# Created:     20/03/2013
# Copyright:   (c) new 2013
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import re
import urlparse
import os
import fnmatch
import logging

def uniquify(seq, idfun=None):
    # List uniquifier from
    # http://www.peterbe.com/plog/uniqifiers-benchmark
    # order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker in seen: continue
        seen[marker] = 1
        result.append(item)
    return result
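# Usage sketch for uniquify, with hypothetical values, showing the
# order-preserving behaviour:
#   uniquify([3, 1, 3, 2, 1])               ->  [3, 1, 2]
#   uniquify(['a', 'A'], idfun=str.lower)   ->  ['a']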

def extractlinks(html):
    # Copied from:
    # http://stackoverflow.com/questions/520031/whats-the-cleanest-way-to-extract-urls-from-a-string-using-python
    # old regex http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+~]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    links = re.findall(url_regex, html, re.DOTALL)
    return links
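# Usage sketch for extractlinks (hypothetical input text); URLs end at the
# first character outside the regex's allowed set, such as a space:
#   extractlinks('see http://example.com/a and https://example.org/b?x=1')
#   ->  ['http://example.com/a', 'https://example.org/b?x=1']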

def load_textfile(filepath):
    # Return data in specified file; if no file is found, create it.
    new_file_text = 'Put text containing URLs here'
    if os.path.exists(filepath):
        f = open(filepath, 'rU')
        file_data = f.read()
        f.close()
        print 'File loaded'
        return file_data
    else:
        f = open(filepath, 'w')
        f.write(new_file_text)
        f.close()
        return ''  # Newly created placeholder file holds no URLs yet

def save_text(filepath, data):
    #print 'save_text:filepath', filepath
    save_dir = os.path.dirname(filepath)
    #print 'save_text:save_dir', save_dir
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)
    f = open(filepath, 'w')
    f.write(data)
    f.close()
    #print 'save_text:Saved data to file', filepath

def extract_domain(url):
    # Return the domain from the given url
    # print 'extract_domain:url', url
    full_domain = urlparse.urlparse(url).netloc
    # print 'extract_domain:full_domain', full_domain
    domain = re.sub('^www\.', '', full_domain)
    domain = sanitize_dict_name(domain)
    # Handle known problem cases
    # DeviantArt.com
    if 'deviantart.com' in domain:
        short_domain = re.sub('.+deviantart\.com', 'deviantart.com', domain)
        return short_domain
    # Tumblr.com
    elif '.tumblr.com' in domain:
        short_domain = re.sub('.+tumblr\.com', 'tumblr.com', domain)
        return short_domain
    # Too long
    elif len(domain) > 200:
        return "TOO_LONG"
    else:
        return domain
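# Usage sketch for extract_domain (hypothetical URLs), showing the
# per-user-subdomain collapsing for DeviantArt:
#   extract_domain('http://www.example.com/some/page')       ->  'example.com'
#   extract_domain('http://ssenarrya.deviantart.com/art/x')  ->  'deviantart.com'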

def sanitize_filename(filename):
    # Sanitize a filename (not a path)
    sanitized_filename = re.sub('[^\./a-zA-Z0-9_-]+', '', filename)
    return sanitized_filename

def sanitize_dict_name(dict_name):
    # Sanitize a domain string for use as a dict key
    sanitized_dict_name = re.sub("[^\.a-zA-Z0-9_-]", "", dict_name)
    return sanitized_dict_name

def build_link_dict(unsorted_data):
    # Turn a string with urls in it into a dict using format {'DomainName.com':['url1','url2']}
    url_list = extractlinks(unsorted_data)
    #print 'url_list', url_list
    sorting_dict = {}  # {'DomainOne.com':['url1','url2']}
    for url in url_list:
        # print 'url', url
        url_domain = extract_domain(url)
        # print 'url_domain', url_domain
        if url_domain not in sorting_dict:
            sorting_dict[url_domain] = []
        sorting_dict[url_domain].append(url)
    return sorting_dict
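# Usage sketch for build_link_dict (hypothetical input text):
#   build_link_dict('a http://example.com/1 b http://example.com/2')
#   ->  {'example.com': ['http://example.com/1', 'http://example.com/2']}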

def export_urls_from_file(input_file_path='paste_here.txt'):
    # Read the specified text file and output a list of links for each domain
    unsorted_data = load_textfile(input_file_path)
    #print 'unsorted_data', unsorted_data
    link_dict = build_link_dict(unsorted_data)
    export_urls_from_dict(link_dict)

def export_urls_from_dict(link_dict):
    for domain_key in link_dict:
        #print 'domain_key', domain_key
        output_filename = sanitize_filename(domain_key) + '.txt'
        output_data = ''
        for output_url in link_dict[domain_key]:
            output_data += (output_url + '\n')
        #print 'output_data', output_data
        output_path = 'output/' + output_filename
        save_text(output_path, output_data)

# Converter functions; these take URLs and return usernames for that site
def deviantart_convert(url):
    """Turn a DeviantArt URL into a DeviantArt username."""
    # Submission, user, and gallery pages
    # http://ssenarrya.deviantart.com/
    # https://nawa88.deviantart.com/art/Pinkie-Pie-s-after-party-at-night-rule-34-313639046
    extractor_1_pattern = r'https?://(.+?)\.deviantart\.com'
    extractor_1_username_search = re.search(extractor_1_pattern, url, re.DOTALL | re.IGNORECASE)
    if extractor_1_username_search:
        username = extractor_1_username_search.group(1)
        return username
    # Image direct links
    # http://fc08.deviantart.net/fs70/f/2014/004/8/1/mlp_fim___athewm_70_by_fadri-d70udbu.jpg
    # fadri
    extractor_2_pattern = r'_by_(.+)-[a-zA-Z0-9]+\.[a-zA-Z0-9]{,5}$'
    extractor_2_username_search = re.search(extractor_2_pattern, url, re.DOTALL | re.IGNORECASE)
    if extractor_2_username_search:
        username = extractor_2_username_search.group(1)
        return username
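# Usage sketch for deviantart_convert (URLs from the comments above):
#   deviantart_convert('http://ssenarrya.deviantart.com/')  ->  'ssenarrya'
#   deviantart_convert('http://fc08.deviantart.net/fs70/f/2014/004/8/1/'
#                      'mlp_fim___athewm_70_by_fadri-d70udbu.jpg')  ->  'fadri'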

def furaffinity_convert(url):
    # Turn a furaffinity URL into a furaffinity username.
    # Valid URL examples:
    # http://www.furaffinity.net/user/scorpdk/
    #print 'furaffinity_convert: url:', url
    pattern = r'furaffinity\.net/user/(.+)/?'
    username_search = re.search(pattern, url, re.DOTALL | re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        if username[-1] == '/':  # Strip trailing slash
            username = username[0:-1]
        #print 'furaffinity_convert: username:', username
        return username
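# Usage sketch for furaffinity_convert (URL from the comments above):
#   furaffinity_convert('http://www.furaffinity.net/user/scorpdk/')  ->  'scorpdk'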

def inkbunny_convert(url):
    # Turn an InkBunny URL into an InkBunny username.
    # Valid URL examples:
    # https://inkbunny.net/nargleflex
    # Watch out for submission pages when calling this
    #
    # Check for things we can't sort
    bad_strings = [
        '.php',
        'wiki.inkbunny'
    ]
    for bad_string in bad_strings:
        if bad_string in url:
            print 'inkbunny_convert: Cannot parse url:', url
            return None
    pattern = r'inkbunny\.net/([^/]+$)'
    username_search = re.search(pattern, url, re.DOTALL | re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        return username
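# Usage sketch for inkbunny_convert (URL from the comments above):
#   inkbunny_convert('https://inkbunny.net/nargleflex')  ->  'nargleflex'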

def tumblr_convert(url):
    # Turn a Tumblr URL into a Tumblr username.
    # Valid URL examples:
    # http://peanutbtter.tumblr.com/
    # Some Tumblr blogs use their own domain instead of tumblr.com; those will
    # not match, so the input url is returned unchanged.
    pattern = r'https?://(?:www\.)?(.+?)\.tumblr\.com/'
    username_search = re.search(pattern, url, re.DOTALL | re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        return username
    # If no match, just return the input url
    return url
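# Usage sketch for tumblr_convert (URL from the comments above):
#   tumblr_convert('http://peanutbtter.tumblr.com/')  ->  'peanutbtter'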

def pixiv_convert(url):
    # Turn a Pixiv URL into a Pixiv UserID.
    # Valid URL examples:
    # http://www.pixiv.net/member.php?id=312468
    # http://www.pixiv.net/bookmark.php?id=293363&rest=show&p=3
    # http://www.pixiv.net/member_illust.php?id=2947383
    # http://www.pixiv.com/users/5097
    patterns = [
        r'pixiv\.net/member\.php\?id=(\d+)',
        r'pixiv\.net/bookmark\.php\?id=(\d+)',
        r'pixiv\.net/member_illust\.php\?id=(\d+)',
        r'pixiv\.com/users/(\d+)'
    ]
    for pattern in patterns:
        username_search = re.search(pattern, url, re.DOTALL | re.IGNORECASE)
        if username_search:
            username = username_search.group(1)
            return username
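# Usage sketch for pixiv_convert (URLs from the comments above):
#   pixiv_convert('http://www.pixiv.net/member.php?id=312468')  ->  '312468'
#   pixiv_convert('http://www.pixiv.com/users/5097')            ->  '5097'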

def aryion_convert(url):
    # Turn an Ekas Portal URL into an Ekas Portal username.
    # Valid URL examples:
    # http://aryion.com/g4/user/GTSdev
    pattern = r'aryion\.com/g4/user/(.+)'
    username_search = re.search(pattern, url, re.DOTALL | re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        return username
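# Usage sketch for aryion_convert (URL from the comments above):
#   aryion_convert('http://aryion.com/g4/user/GTSdev')  ->  'GTSdev'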

def hentaifoundry_convert(link):
    """Convert a url into a username if possible"""
    # New style links (1-12-2013 onwards?)
    # Profile page
    # http://www.hentai-foundry.com/user/NinjaKitty/profile
    # http://www.hentai-foundry.com/user/NinjaKitty/faves/stories
    # http://www.hentai-foundry.com/user/NinjaKitty/faves/pictures
    profile_page_search_regex = r'hentai\-foundry\.com/user/([^/]+)/?'
    profile_page_search = re.search(profile_page_search_regex, link)
    if profile_page_search:
        return profile_page_search.group(1)
    # Picture gallery page, also picture submission page
    # http://www.hentai-foundry.com/pictures/user/NinjaKitty
    # http://www.hentai-foundry.com/pictures/user/NinjaKitty/242338/Raffle-Winner-AnimeFlux---123-Square
    picture_gallery_page_search_regex = r'hentai\-foundry\.com/pictures/user/([^/]+)'
    picture_gallery_page_search = re.search(picture_gallery_page_search_regex, link)
    if picture_gallery_page_search:
        return picture_gallery_page_search.group(1)
    # Stories gallery page
    # http://www.hentai-foundry.com/stories/user/NinjaKitty
    stories_gallery_page_search_regex = r'hentai\-foundry\.com/stories/user/([^/]+)'
    stories_gallery_page_search = re.search(stories_gallery_page_search_regex, link)
    if stories_gallery_page_search:
        return stories_gallery_page_search.group(1)
    # Favorite pictures page
    # http://www.hentai-foundry.com/users/Faves?username=NinjaKitty
    favorite_pictures_page_search_regex = r'hentai\-foundry\.com/users/Faves\?username=(\w+)'
    favorite_pictures_page_search = re.search(favorite_pictures_page_search_regex, link)
    if favorite_pictures_page_search:
        return favorite_pictures_page_search.group(1)
    # Favorite users page
    # http://www.hentai-foundry.com/users/Faves?username=NinjaKitty
    favorite_users_page_search_regex = r'hentai\-foundry\.com/users/Faves\?username=(.+)$'
    favorite_users_page_search = re.search(favorite_users_page_search_regex, link)
    if favorite_users_page_search:
        return favorite_users_page_search.group(1)
    # Favorite users recent pictures/stories page
    # http://www.hentai-foundry.com/users/FaveUsersRecentPictures?username=NinjaKitty
    # http://www.hentai-foundry.com/users/FaveUsersRecentStories?username=NinjaKitty
    favorite_users_recent_pictures_page_search_regex = r'hentai\-foundry\.com/users/FaveUsersRecent(?:Pictures|Stories)\?username=(\w+)'
    favorite_users_recent_pictures_page_search = re.search(favorite_users_recent_pictures_page_search_regex, link)
    if favorite_users_recent_pictures_page_search:
        return favorite_users_recent_pictures_page_search.group(1)
    #
    # Old style links (before 1-12-2013)
    #
    # Old profile page
    # http://www.hentai-foundry.com/profile-hizzacked.php
    old_profile_page_search_regex = r'hentai\-foundry\.com/profile-(.+?)\.php'
    old_profile_page_search = re.search(old_profile_page_search_regex, link)
    if old_profile_page_search:
        return old_profile_page_search.group(1)
    # Old user page
    # http://www.hentai-foundry.com/user-Dboy.php
    old_user_page_search_regex = r'hentai\-foundry\.com/user-(.+?)\.php'
    old_user_page_search = re.search(old_user_page_search_regex, link)
    if old_user_page_search:
        return old_user_page_search.group(1)
    # Old user gallery
    # http://www.hentai-foundry.com/user_pictures-fab3716.page-1.php
    # http://www.hentai-foundry.com/user_stories-nihaotomita.page-1.php
    old_user_gallery_search_regex = r'hentai\-foundry\.com/user_(?:pictures|stories)-([^\.]+)(?:\.page-\d+)?\.php'
    old_user_gallery_search = re.search(old_user_gallery_search_regex, link)
    if old_user_gallery_search:
        return old_user_gallery_search.group(1)
    # Old user stories gallery
    # http://www.hentai-foundry.com/stories-fab3716.php
    old_user_stories_search_regex = r'hentai\-foundry\.com/stories-([^\.]+)(?:\.page-\d+)?\.php'
    old_user_stories_search = re.search(old_user_stories_search_regex, link)
    if old_user_stories_search:
        return old_user_stories_search.group(1)
    # Old user bookmarks
    # http://www.hentai-foundry.com/favorite_pictures-DarkDP.php
    old_user_favs_search_regex = r'hentai\-foundry\.com/favorite_(?:pictures|stories)-([^\.]+)(?:\.page-\d+)?\.php'
    old_user_favs_search = re.search(old_user_favs_search_regex, link)
    if old_user_favs_search:
        return old_user_favs_search.group(1)
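# Usage sketch for hentaifoundry_convert (URLs from the comments above):
#   hentaifoundry_convert('http://www.hentai-foundry.com/user/NinjaKitty/profile')  ->  'NinjaKitty'
#   hentaifoundry_convert('http://www.hentai-foundry.com/profile-hizzacked.php')    ->  'hizzacked'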

def weasyl_convert(link):
    """Extract username from weasyl urls"""
    # Userpages
    # https://www.weasyl.com/~graphitedisk
    userpage_regex = r'https?://(?:www\.)?weasyl\.com/~([^/]+)'
    userpage_search = re.search(userpage_regex, link)
    if userpage_search:
        userpage_username = userpage_search.group(1)
        return userpage_username
    # Submissions gallery
    # https://www.weasyl.com/submissions/graphitedisk
    # https://www.weasyl.com/submissions/gollygeewhiz?nextid=65058
    submissions_gallery_regex = r'https?://(?:www\.)?weasyl\.com/submissions/([^/?]+)'
    submissions_gallery_search = re.search(submissions_gallery_regex, link)
    if submissions_gallery_search:
        submissions_gallery_username = submissions_gallery_search.group(1)
        return submissions_gallery_username
    # Collections gallery
    # https://www.weasyl.com/collections/gollygeewhiz
    collections_gallery_regex = r'https?://(?:www\.)?weasyl\.com/collections/([^/?]+)'
    collections_gallery_search = re.search(collections_gallery_regex, link)
    if collections_gallery_search:
        collections_gallery_username = collections_gallery_search.group(1)
        return collections_gallery_username
    # Characters gallery
    # https://www.weasyl.com/characters/gollygeewhiz
    characters_gallery_regex = r'https?://(?:www\.)?weasyl\.com/characters/([^/?]+)'
    characters_gallery_search = re.search(characters_gallery_regex, link)
    if characters_gallery_search:
        characters_gallery_username = characters_gallery_search.group(1)
        return characters_gallery_username
    # Shouts listing
    shouts_gallery_regex = r'https?://(?:www\.)?weasyl\.com/shouts/([^/?]+)'
    shouts_gallery_search = re.search(shouts_gallery_regex, link)
    if shouts_gallery_search:
        shouts_gallery_username = shouts_gallery_search.group(1)
        return shouts_gallery_username
    # Journals listing
    journals_gallery_regex = r'https?://(?:www\.)?weasyl\.com/journals/([^/?]+)'
    journals_gallery_search = re.search(journals_gallery_regex, link)
    if journals_gallery_search:
        journals_gallery_username = journals_gallery_search.group(1)
        return journals_gallery_username
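# Usage sketch for weasyl_convert (URLs from the comments above):
#   weasyl_convert('https://www.weasyl.com/~graphitedisk')  ->  'graphitedisk'
#   weasyl_convert('https://www.weasyl.com/submissions/gollygeewhiz?nextid=65058')  ->  'gollygeewhiz'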

def derpibooru_convert(link):
    # Submission page
    submission_page_regex = r'(?:https?://)?(?:www)?\.?derpibooru\.org/(\d+)'
    submission_page_search = re.search(submission_page_regex, link)
    if submission_page_search:
        submission_page = submission_page_search.group(1)
        return submission_page
    # Tag search
    tag_search_regex = r'https?://(?:www\.)?derpibooru\.org/tags/([^/]+)'
    tag_search_search = re.search(tag_search_regex, link)
    if tag_search_search:
        tag_search = tag_search_search.group(1)
        return tag_search
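# Usage sketch for derpibooru_convert; the submission ID and tag below are
# hypothetical:
#   derpibooru_convert('https://derpibooru.org/100000')     ->  '100000'
#   derpibooru_convert('https://derpibooru.org/tags/safe')  ->  'safe'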

def vocaroo_convert(link):
    # Turn a Vocaroo URL into a clip ID (everything after the final slash).
    clip_id = link.split("/")[-1]
    return clip_id
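# Usage sketch for vocaroo_convert; the clip URL below is hypothetical,
# assuming the ID is the last path segment:
#   vocaroo_convert('http://vocaroo.com/i/s0abc123')  ->  's0abc123'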
# End converter functions

def export_usernames_from_file(input_file_path='paste_here.txt'):
    # Read the specified text file and output a list of usernames for each recognized domain
    unsorted_data = load_textfile(input_file_path)
    #print 'unsorted_data', unsorted_data
    link_dict = build_link_dict(unsorted_data)
    export_usernames_from_dict(link_dict)

def export_usernames_from_dict(link_dict):
    for domain_key in link_dict:
        print 'domain_key', domain_key
        output_filename = sanitize_filename(domain_key) + '.txt'
        domain_lines = []
        pixiv_domains = ["pixiv.net", "www.pixiv.net", "pixiv.com", "www.pixiv.com"]
        for output_url in link_dict[domain_key]:
            #print 'output_url', output_url
            # Handle DeviantArt (both deviantart.com pages and deviantart.net image servers)
            if 'deviantart.com' in domain_key or 'deviantart.net' in domain_key:
                domain_lines.append(deviantart_convert(output_url))
            # Handle Furaffinity
            elif domain_key == 'furaffinity.net':
                domain_lines.append(furaffinity_convert(output_url))
            # Handle Inkbunny
            elif domain_key == 'inkbunny.net':
                if '.php' not in output_url:
                    domain_lines.append(inkbunny_convert(output_url))
            # Handle Pixiv
            elif domain_key in pixiv_domains:
                domain_lines.append(pixiv_convert(output_url))
            # Handle Ekas Portal
            elif domain_key == 'aryion.com':
                domain_lines.append(aryion_convert(output_url))
            # Handle HentaiFoundry
            elif domain_key == 'hentai-foundry.com':
                domain_lines.append(hentaifoundry_convert(output_url))
            # Handle Weasyl
            elif domain_key == 'weasyl.com':
                domain_lines.append(weasyl_convert(output_url))
            # Handle Derpibooru
            elif 'derpibooru' in domain_key:
                domain_lines.append(derpibooru_convert(output_url))
            # Handle Vocaroo
            elif 'vocaroo' in domain_key:
                domain_lines.append(vocaroo_convert(output_url))
        # print 'domain_lines', domain_lines
        if len(domain_lines) > 0:
            unique_domain_lines = uniquify(domain_lines)
            output_string = ''
            # Assemble output string from the username strings
            for line in unique_domain_lines:
                if line is None:
                    continue
                else:
                    output_string += str(line) + '\n'
            # print 'output_string', output_string
            output_path = 'parsed_output/' + output_filename
            save_text(output_path, output_string)

def export_from_file(input_file_path='paste_here.txt'):
    """Parse data from a single file"""
    unsorted_data = load_textfile(input_file_path)
    link_dict = build_link_dict(unsorted_data)
    export_usernames_from_dict(link_dict)
    export_urls_from_dict(link_dict)

def walk_for_files(start_path, pattern_list):
    """Use os.walk to collect a list of paths to files matching input parameters.
    Takes in a starting path and a list of patterns to check against filenames.
    Patterns follow fnmatch conventions."""
    assert isinstance(start_path, str)
    assert isinstance(pattern_list, list)
    matches = []
    for root, dirs, files in os.walk(start_path):
        for pattern in pattern_list:
            assert isinstance(pattern, str)
            for filename in fnmatch.filter(files, pattern):
                match = os.path.join(root, filename)
                matches.append(match)
    return matches
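# Usage sketch for walk_for_files; the folder contents are hypothetical:
#   walk_for_files('to_scan', ['*.txt', '*.html'])
#   might return ['to_scan/links.txt', 'to_scan/saved/page.html']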

def import_folder(folder_path="to_scan"):
    """Scan through a folder, read any text files, concatenate the text inside,
    and return the found text as a string"""
    # Scan folder
    filename_patterns = [
        "*.txt",
        "*.htm",
        "*.html"
    ]
    input_file_paths = walk_for_files(folder_path, filename_patterns)
    # Read files
    joined_input_data = ""
    for input_file_path in input_file_paths:
        file_data = load_textfile(input_file_path)
        joined_input_data += ("\n\n\nFILE_SEPARATOR\n\n\n" + file_data)  # Add separator for debugging
    return joined_input_data

def export_from_folder(folder_path="to_scan"):
    """Read data from every file in a folder, parse the found data, and export what is extracted."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        return  # Folder was just created, so there is nothing to scan yet
    data_to_scan = import_folder(folder_path)
    # Parse files into link dicts
    link_dict = build_link_dict(data_to_scan)
    # Export link dicts
    export_usernames_from_dict(link_dict)
    export_urls_from_dict(link_dict)

def main():
    #export_from_file()
    export_from_folder()
    print 'Job done.'
    raw_input('Press Return to exit.')

if __name__ == '__main__':
    main()