twarc_videos.py
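
"""
Download the videos referenced in a file of Twitter API v2 tweets (JSON, one
response or tweet per line) along with the metadata youtube_dl collects, and
record a url -> filename mapping in <download-dir>/mapping.tsv.

Usage sketch (this assumes the module is registered as a twarc2 plugin command
named "videos"; the packaging/entry-point that would do so is not shown in
this file):

    twarc2 search cats tweets.jsonl
    twarc2 videos --ignore-livestreams --max-filesize 50000000 tweets.jsonl
"""
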
import os
import sys
import json
import time
import click
import logging
import youtube_dl
import multiprocessing as mp
from urllib.parse import urlparse
from twarc import ensure_flattened
from datetime import datetime, timedelta
from youtube_dl.utils import match_filter_func


@click.command()
@click.option('--max-downloads', type=int, help='max downloads per URL')
@click.option('--max-filesize', type=int, help='max filesize to download (bytes)')
@click.option('--ignore-livestreams', is_flag=True, default=False, help='ignore livestreams')
@click.option('--download-dir', type=str, default='videos', help='directory to download to')
@click.option('--block', multiple=True, help='hostname(s) to block (repeatable)')
@click.option('--timeout', type=int, default=120, help='seconds to wait for a video download to finish')
@click.option('--quiet', is_flag=True, default=False, help='silence terminal output')
@click.argument('infile', type=click.File('r'), default='-')
def videos(max_downloads, max_filesize, ignore_livestreams, download_dir, block, timeout, infile, quiet):
    """
    Download videos referenced in tweets and their metadata.
    """
    # make download directory
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    # setup logger
    log_file = "{}/twarc-videos.log".format(download_dir)
    logging.basicConfig(filename=log_file, level=logging.INFO)
    log = logging.getLogger()

    # setup youtube_dl config
    ydl_opts = {
        "format": "best",
        "logger": log,
        "restrictfilenames": True,
        "ignoreerrors": True,
        "nooverwrites": True,
        "writedescription": True,
        "writeinfojson": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
        "download_archive": "{}/archive.txt".format(download_dir)
    }

    if ignore_livestreams:
        ydl_opts["match_filter"] = match_filter_func("!is_live")
    if max_downloads:
        ydl_opts['max_downloads'] = max_downloads
    if max_filesize:
        ydl_opts['max_filesize'] = max_filesize

    # keep track of domains to block
    blocklist = []
    if block:
        blocklist = block

    # read in the existing mapping file to know which urls we can ignore
    seen = set()
    mapping_file = os.path.join(download_dir, 'mapping.tsv')
    if os.path.isfile(mapping_file):
        for line in open(mapping_file):
            url, path = line.split('\t')
            log.info('found %s in %s', url, mapping_file)
            seen.add(url)

    # loop through the tweets
    results = open(mapping_file, 'a')
    for line in infile:
        for tweet in ensure_flattened(json.loads(line)):
            log.info('analyzing %s', tweet['id'])
            for url in video_urls(tweet):

                if url in seen:
                    log.info('already processed %s', url)
                    continue
                seen.add(url)

                # check for blocked hostnames
                uri = urlparse(url)
                if uri.netloc in blocklist:
                    log.warning("%s in block list", url)
                    continue

                log.info('processing %s', url)

                # run the download in a separate process so it can be
                # terminated if it exceeds the timeout
                manager = mp.Manager()
                return_list = manager.list()
                p = mp.Process(target=download, args=(url, ydl_opts, log, max_downloads, return_list))
                p.start()
                started = datetime.now()

                while True:
                    # if we've exceeded the timeout terminate the process
                    if timeout and datetime.now() - started > timedelta(seconds=timeout):
                        log.warning('reached timeout %s', timeout)
                        p.terminate()
                        break
                    # if the process is done we can stop
                    elif not p.is_alive():
                        break
                    # otherwise sleep and then check again
                    time.sleep(1)
                p.join()

                # if the shared list is empty there either wasn't a download or it timed out
                filename = return_list.pop() if len(return_list) > 0 else None

                if not quiet and filename:
                    click.echo(f'downloaded {click.style(url, fg="blue")} as {click.style(filename, fg="green")}')

                # write the result to the mapping file
                results.write("{}\t{}\n".format(url, filename))


def video_urls(t):
    """Generate urls in a tweet that might yield a video download."""
    if 'entities' not in t:
        return

    # if a video is attached to the tweet yield the tweet url itself
    attachments = t['entities'].get('attachments', {})
    for media in attachments.get('media', []):
        if media['type'] == 'video':
            yield f"https://twitter.com/{t['author']['username']}/status/{t['id']}"

    # yield every other url in the tweet so youtube_dl can try to extract a video
    for url in t['entities'].get('urls', []):
        yield url['expanded_url']


def download(url, ydl_opts, log, max_downloads, return_list):
    """Download a url with youtube_dl and append the resulting filename to return_list."""
    try:
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        info = ydl.extract_info(url)
        if info:
            filename = ydl.prepare_filename(info)
            if os.path.isfile(filename):
                return_list.append(filename)
                log.info('downloaded %s as %s', url, filename)
            else:
                log.warning("%s doesn't look like a video", url)
    except youtube_dl.utils.MaxDownloadsReached:
        log.warning('only %s downloads per url allowed', max_downloads)
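

# Allow the sketch to be run directly as a script (e.g. `python twarc_videos.py
# tweets.jsonl`); if the command is installed through a twarc2 plugin entry
# point instead (assumed, not shown in this file), this guard is never reached.
if __name__ == "__main__":
    videos()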