-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrape.py
155 lines (122 loc) · 4.21 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
import datetime
import json
import os
import argparse
import youtube_dl
import slugify
# Language code stored in the "language" field of every generated record.
DEFAULT_LANGUAGE = "rus"

# Keyword arguments passed to json.dump when a record is written:
# indented, key-sorted output with non-ASCII characters kept as-is.
JSON_FORMAT_KWARGS = dict(
    indent=2,
    separators=(",", ": "),
    sort_keys=True,
    ensure_ascii=False,
)
def main():
    """Write one JSON metadata file per video listed in the URL list file.

    Command-line options are obtained from ``setup_interface``. Every
    non-blank line of the URL list file is treated as a URL, resolved to
    its video entries with ``get_entries``, and each non-empty entry is
    dumped as JSON into the output folder under the name produced by
    ``generate_filename``.
    """
    namespace = setup_interface()
    # The parser stores the ``-f/--file`` option under the ``file``
    # attribute; reading any other attribute name raises AttributeError.
    list_filename = namespace.file
    folder_name = namespace.FOLDER_NAME

    # Read the list inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(list_filename) as list_file:
        stripped_lines = (line.strip() for line in list_file)
        urls = [url for url in stripped_lines if url]

    for url in urls:
        for meta in get_entries(url):
            if not meta:
                # Entries that could not be extracted are skipped.
                continue
            prepared_meta = get_prepared_meta(meta)
            target_path = os.path.join(folder_name, generate_filename(meta))
            with open(target_path, "w", encoding="utf8") as json_file:
                json.dump(prepared_meta, json_file, **JSON_FORMAT_KWARGS)
def get_entries(url):
    """Return the list of metadata dicts that ``url`` resolves to.

    Metadata is fetched with youtube_dl without downloading any media.
    When the result carries a non-empty ``"entries"`` value (as a playlist
    does) those entries are returned; otherwise the result itself is
    returned as a single-element list.

    Returns an empty list when extraction produced no result at all.
    Individual entries may still be falsy; callers are expected to skip
    those.
    """
    youtube = youtube_dl.YoutubeDL({"ignoreerrors": True})
    data = youtube.extract_info(url, download=False)
    if data is None:
        # With ``ignoreerrors`` enabled a failed extraction is reported and
        # None is returned instead of raising, so there is nothing to read.
        return []
    # Always hand back a real list so callers can concatenate results.
    return list(data.get("entries") or [data])
def get_prepared_meta(data):
    """Build the JSON-serialisable record describing one video.

    ``data`` is the metadata dict of a single video. It must provide the
    ``"license"``, ``"duration"`` and ``"webpage_url"`` keys, plus whatever
    the ``extract_*`` helpers read from it.

    Returns a new dict; ``data`` is not modified here.
    """
    meta = {}
    meta["copyright_text"] = data["license"]
    meta["description"] = extract_decsription(data)
    meta["duration"] = data["duration"]
    meta["language"] = DEFAULT_LANGUAGE
    meta["recorded"] = extract_date_recorded(data)
    # Fixed link attached to every record.
    meta["related_urls"] = [
        {"label": "GitHub", "url": "https://github.com/minskpython"},
    ]
    meta["speakers"] = list(extract_speakers(data))
    # Fixed tags attached to every record.
    meta["tags"] = ["minsk", "belarus"]
    meta["thumbnail_url"] = extract_thumbnail_url(data)
    meta["title"] = extract_title(data)
    meta["videos"] = [{"type": "youtube", "url": data["webpage_url"]}]
    return meta
def generate_filename(data):
    """Return the JSON file name for a video's metadata dict.

    The name has the form ``<recorded date>-<title slug>-<speakers slug>.json``,
    where both slugs are produced by ``slugify.slugify``.
    """
    recorded = extract_date_recorded(data)
    title_slug = slugify.slugify(extract_title(data))
    # Speaker names are joined with "-" before being slugified as one string.
    speakers_slug = slugify.slugify("-".join(extract_speakers(data)))
    return "{}-{}-{}.json".format(recorded, title_slug, speakers_slug)
def extract_decsription(data):
    """Return the sanitized video description without its trailing footer.

    Only the text before the first occurrence of the separator phrase is
    kept; when the phrase is absent the whole description is used.
    """
    # Russian for "Join us!"; everything from this phrase on is dropped.
    separator = "Присоединяйся к нам!"
    leading_text, _, _ = data["description"].partition(separator)
    return sanitize(leading_text)
def extract_date_recorded(data):
    """Return the date found in the video title as an ISO ``YYYY-MM-DD`` string.

    The date text is whatever lies between the last "Python Meetup" marker
    (or the start of the title when the marker is missing) and the next
    "]", with surrounding whitespace removed. It is read positionally:
    the first two characters are the day, characters 3-4 the month and the
    last four the year.

    Raises ValueError when those slices are not valid date components.
    """
    # date detection logic needs improvement
    marker = "Python Meetup"
    after_marker = data["title"].rpartition(marker)[2]
    date_str = after_marker.partition("]")[0].strip()
    recorded = datetime.date(
        int(date_str[-4:]),   # year
        int(date_str[3:5]),   # month
        int(date_str[:2]),    # day
    )
    return recorded.isoformat()
def extract_title(data):
    """Return the sanitized talk title taken from the video title.

    The talk title is the text before the first "/"; a video title with no
    "/" is used in full.
    """
    # TODO: use smart regex here
    talk_title, _, _ = data["title"].partition("/")
    return sanitize(talk_title)
def extract_speakers(data):
    """Return a list holding the speaker name parsed from the video title.

    The name is the sanitized second "/"-separated part of the title. The
    list is empty when the title contains no "/".
    """
    # TODO: use smart regex here
    pieces = data["title"].split("/")
    if len(pieces) < 2:
        return []
    return [sanitize(pieces[1])]
def extract_thumbnail_url(data):
    """Return the thumbnail URL to store for a video.

    ``data["thumbnail"]`` is used as-is unless it is an "hqdefault" image
    without the "?sqp" modifier; in that case the URL of the last item in
    ``data["thumbnails"]`` is returned instead.
    """
    url = data["thumbnail"]
    # hqdefault image without '?sqp' modifier isn't so good
    plain_hqdefault = "hqdefault" in url and "?sqp" not in url
    if not plain_hqdefault:
        return url
    # trying to get more suitable thumbnail...
    return data["thumbnails"][-1]["url"]
def sanitize(title_substring):
    """Return the text without zero-width spaces and surrounding whitespace."""
    zero_width_space = "\u200b"
    # Splitting on the character and re-joining removes every occurrence.
    without_zwsp = "".join(title_substring.split(zero_width_space))
    return without_zwsp.strip()
def create_interface():
    """Parse the command-line arguments of the scraper.

    Options:
        -h, --help       print the option summary and exit
        -f, --file       file with the URLs to process (default: urls.list)
        -d, --directory  directory for the JSON files (default: .)

    Returns the ``argparse.Namespace`` with ``file`` and ``directory``
    attributes. Arguments are read from ``sys.argv``.
    """
    # Register the help option explicitly so its text can be customised
    # through the public API rather than by patching ``parser._actions``.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        '-h', '--help',
        action='help',
        default=argparse.SUPPRESS,
        help='Show parameters',
    )
    parser.add_argument(
        '-f', '--file',
        help='Use your own file with urls',
        action='store',
        default='urls.list',
        metavar='file',
    )
    parser.add_argument(
        '-d', '--directory',
        help='directory name for JSON files',
        action='store',
        default='.',
        metavar='dir',
    )
    return parser.parse_args()
def setup_interface():
    """Parse the command line and prepare the output directory.

    Creates the directory given by ``-d/--directory`` (including missing
    parent directories) when it does not exist yet.

    Returns the parsed namespace extended with:
        FOLDER_NAME          the output directory with a trailing path
                             separator, ready to be prefixed to a file name
        URLS_LIST_FILENAME   the URL list file given by ``-f/--file``

    Raises OSError when the directory cannot be created, e.g. because the
    path already exists as a regular file.
    """
    namespace = create_interface()
    folder_name = namespace.directory
    # makedirs(exist_ok=True) avoids the race between an existence check
    # and the creation, and also handles nested paths such as "a/b".
    os.makedirs(folder_name, exist_ok=True)
    namespace.FOLDER_NAME = folder_name + os.sep
    # Publish the list file under the upper-case name as well, alongside
    # FOLDER_NAME, so both settings can be read the same way.
    namespace.URLS_LIST_FILENAME = namespace.file
    return namespace
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()