db-search-old.py
#!/usr/bin/env python3
import csv, datetime, sys, time
import requests
import soundcloud
import config
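
# config.py (not shown here) is expected to define client_id, the SoundCloud
# API client ID used to construct the client in __main__.

# Licenses to search for; the commented-out entries are skipped.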
licenses = [
    ##"no-rights-reserved",
    #####"all-rights-reserved",
    "cc-by",
    #"cc-by-nc",
    #"cc-by-nd",
    #"cc-by-sa",
    #"cc-by-nc-nd",
    #"cc-by-nc-sa"
]

page_size = 100

# https://blog.soundcloud.com/2008/10/17/cc/
date_start = datetime.datetime(2016, 2, 6)
date_today = datetime.date.today()
date_stride = datetime.timedelta(days=1)
date_to_offset = datetime.timedelta(hours=23, minutes=59, seconds=59)
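
# Build the created_at filter covering one day: from `date` to the same day
# at 23:59:59.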
def created_at_range (date):
    date_to = date + date_to_offset
    print(date.strftime("%Y-%m-%d %H:%M:%S"))
    print(date_to.strftime("%Y-%m-%d %H:%M:%S"))
    return {"from": date.strftime("%Y-%m-%d %H:%M:%S"),
            "to": date_to.strftime("%Y-%m-%d %H:%M:%S")}
def escape (field):
    if field:
        field = field.replace("\n", "\\n")
        field = field.replace("\r", "\\r")
        field = field.replace("\t", "\\t")
    return field
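
# Write one track's metadata as a single TSV row.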
def print_track (writer, track):
    writer.writerow((track.download_url,
                     track.license,
                     track.uri,
                     escape(track.title),
                     escape(track.description),
                     track.created_at,
                     escape(track.genre),
                     escape(track.tag_list),
                     escape(track.track_type),
                     escape(track.user['username']),
                     escape(track.label_name)))
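
# Write every track in the current page of results.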
def print_tracks (writer, tracks):
    for track in tracks.collection:
        print_track(writer, track)
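
# Return the URL of the next page of results, or False if there is no next page.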
def get_next_href (tracks):
    try:
        next_href = tracks.next_href
    except Exception as e:
        next_href = False
    return next_href

#TODO: check for urlsfile and use last url if appropriate
#      use the offset from the last url if that's invalid
#      or just restart if not present/catastrophic failure

# Fetch the first page (up to page_size tracks) for one license; returns the
# URL of the next page, or False if there is none or the request failed.
def initial_fetch (client, csvwriter, license_to_find):
    try:
        tracks = client.get('/tracks', license=license_to_find,
                            created_at=created_at_range(date_start),
                            filter='public', order='created_at',
                            limit=page_size, linked_partitioning=1)
    except requests.HTTPError as e:
        print(e)
        return False
    except Exception as e:
        print(e)
        exit(1)
    print_tracks(csvwriter, tracks)
    return get_next_href(tracks)
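
# Follow next_href page by page, logging each page URL to urlsfile and
# appending every page of tracks to the TSV, until there are no more pages.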
def subsequent_fetches (client, csvwriter, urlsfile, next_href):
    while next_href:
        print(".")
        print(next_href, file=urlsfile)
        # Make sure it's written immediately
        urlsfile.flush()
        time.sleep(1)
        try:
            tracks = client.get(next_href)
        except requests.HTTPError as e:
            print(e)
            return
        except Exception as e:
            print(e)
            # Without returning here, the stale `tracks` from the previous
            # iteration would be re-written and the loop would never advance.
            return
        print_tracks(csvwriter, tracks)
        next_href = get_next_href(tracks)
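
# For each license, write <license>.tsv and log page URLs to <license>-urls.txt.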
def fetch_all_licenses_sequentially (client, licenses):
    for license_to_find in licenses:
        with open(license_to_find + '.tsv', mode='w',
                  encoding='utf-8') as outfile, \
             open(license_to_find + '-urls.txt', 'w+') as urlsfile:
            csvwriter = csv.writer(outfile, delimiter="\t")
            next_href = initial_fetch(client, csvwriter, license_to_find)
            subsequent_fetches (client, csvwriter, urlsfile, next_href)
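
# Build the SoundCloud client from the configured client_id and run the search.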
if __name__ == "__main__":
client = soundcloud.Client(client_id=config.client_id)
fetch_all_licenses_sequentially(client, licenses)