#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
"""Build a JSON index of Data Packages from a list of dataset names or URLs."""
from __future__ import (unicode_literals, absolute_import,
                        division, print_function)

import json
import sys
import urllib2
from HTMLParser import HTMLParser

try:
    from markdown import markdown
except ImportError:
    # fall back to passing the readme text through unchanged
    print('WARNING: failed to import markdown')
    markdown = lambda x: x


def load_dataset(datapackage_url):
    print('Processing: %s' % datapackage_url)
    # drop the trailing 'datapackage.json' to get the dataset's base URL
    base = datapackage_url[:-len('datapackage.json')]
    # TODO: deal with 404s gracefully
    try:
        datapackage = json.load(urllib2.urlopen(datapackage_url))
    except Exception:
        print('Failed to load %s' % datapackage_url)
        return None
    # ensure certain fields exist
    if 'description' not in datapackage:
        datapackage['description'] = ''
    # get the readme
    readme_url = base + 'README.md'
    try:
        readmefo = urllib2.urlopen(readme_url)
        datapackage['readme'] = readmefo.read().replace('\r\n', '\n')
    except Exception:
        datapackage['readme'] = datapackage['description']
    # set description to the first paragraph of the readme if we have no description
    if not datapackage['description'] and datapackage.get('readme'):
        # first extract plain text ...
        html = markdown(unicode(datapackage['readme'], 'utf8'))
        plain = strip_tags(html).split('\n\n')[0].replace(' \n', '').replace('\n', ' ')
        datapackage['description'] = plain.encode('utf8')
    # resources listed only by relative path get an absolute url
    for info in datapackage['resources']:
        if not info.get('url') and info.get('path'):
            info['url'] = base + info.get('path')
    return datapackage

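
# Illustrative only (the URL below is hypothetical): load_dataset() returns the
# parsed datapackage dict with 'description', 'readme' and per-resource 'url'
# filled in, or None if the fetch fails, e.g.
#   dp = load_dataset('https://raw.github.com/opendatamali/datasets/master/example/datapackage.json')
#   if dp:
#       print('%s: %s' % (dp['name'], dp['description']))
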

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []

    def handle_endtag(self, tag):
        if tag == 'p':
            self.fed.append('\n\n')

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)


def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()


def load(dataset_names):
    out = [load_dataset(name) for name in dataset_names if name]
    # failed loads return None, so drop them
    out = [x for x in out if x]
    out = dict([(x['name'], x) for x in out])
    return out


def build_index(dataset_list_path, outpath='datapackage-index.json'):
    # the list file is a local path with one dataset name or URL per line
    dataset_list = open(dataset_list_path).read().split('\n')
    print(dataset_list)
    # strip out blank lines or similar which can creep in
    dataset_list = [_to_dp_url(ds) for ds in dataset_list if ds]
    index = load(dataset_list)
    with open(outpath, 'w') as dest:
        json.dump(index, dest, indent=2, sort_keys=True)


def _to_dp_url(nameOrUrl):
    if '/' not in nameOrUrl:
        url = 'https://raw.github.com/opendatamali/datasets/master/' + nameOrUrl + '/'
    else:
        url = nameOrUrl
    if not url.endswith('datapackage.json'):
        url = url.rstrip('/')
        url += '/datapackage.json'
    return url

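
# Illustrative conversions (example inputs, not from the repo):
#   _to_dp_url('example-dataset')
#     -> 'https://raw.github.com/opendatamali/datasets/master/example-dataset/datapackage.json'
#   _to_dp_url('https://example.org/data/')
#     -> 'https://example.org/data/datapackage.json'
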

if __name__ == '__main__':
    if len(sys.argv) > 1:
        listpath = sys.argv[1]
    else:
        listpath = 'datapackage-list.txt'
    if len(sys.argv) > 2:
        outpath = sys.argv[2]
    else:
        outpath = 'datapackage-index.json'
    build_index(listpath, outpath)
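
# Usage sketch (file names are the script defaults; dataset names illustrative):
#
#   $ cat datapackage-list.txt
#   example-dataset
#   https://example.org/some-dataset/
#
#   $ python build.py datapackage-list.txt datapackage-index.json
#
# Each non-blank line is turned into a datapackage.json URL by _to_dp_url()
# and the combined index is written to the output path as JSON.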