-
Notifications
You must be signed in to change notification settings - Fork 71
/
Copy pathcrx_stats.py
59 lines (46 loc) · 1.35 KB
/
crx_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Collect statistics (file name, size, timestamp) about the crawled CRX archives of each extension.
import os, time
from tqdm import tqdm
from extstats.CONSTS import CRX_DIRECTORY as DIR
from distutils.version import LooseVersion
#DIR = 'crawled/crx4chrome/'
def sort_semverfiles(files):
    """Return *files* sorted by the version number embedded in each file name.

    Every occurrence of ``'.zip'`` is removed from a name and the remainder is
    split into numeric and textual components, the same way
    ``distutils.version.LooseVersion`` tokenizes a version string. Numeric
    components compare as integers, so ``'1.10.zip'`` sorts after
    ``'1.2.zip'``.

    Unlike a plain ``LooseVersion`` key, a numeric component may meet a
    textual one at the same position: numbers sort first instead of raising
    ``TypeError`` (e.g. ``'1.0.zip'`` sorts before ``'1.a.zip'``).

    Args:
        files: iterable of file names such as ``'1.2.3.zip'``.

    Returns:
        A new list with the names in ascending version order.
    """
    # Tokenize locally instead of going through distutils, which PEP 632
    # removed from the standard library in Python 3.12.
    import re

    # Same tokenizer as LooseVersion: runs of digits, runs of lowercase
    # letters, or a single dot; anything in between is kept as its own part.
    component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE)

    def keyfunc(filename):
        version = filename.replace('.zip', '')
        key = []
        for part in component_re.split(version):
            if not part or part == '.':
                continue
            try:
                # Rank 0: numeric parts, compared as integers.
                key.append((0, int(part)))
            except ValueError:
                # Rank 1: textual parts, compared as strings and always
                # ordered after numeric parts at the same position.
                key.append((1, part))
        return key

    return sorted(files, key=keyfunc)
# Per-extension list of file details, keyed by the extension's directory name;
# this is what gets dumped to data/crx_stats.json at the end.
ext_obj = {}
# One {'ext', 'files'} record per extension, used for the "most files" report.
exts = []
# Shell commands suggested for deleting near-empty files; they are only printed.
TO_RM = []

for ext in tqdm(os.listdir(DIR)):
    # os.path.join works whether or not DIR ends with a path separator.
    ext_dir = os.path.join(DIR, ext)
    if not os.path.isdir(ext_dir):
        # A stray regular file in DIR is not an extension directory and
        # would make os.listdir() fail, so skip it.
        continue
    files = os.listdir(ext_dir)
    files_details = []
    try:
        for file in sort_semverfiles(files):
            fullpath = os.path.join(ext_dir, file)
            size = os.path.getsize(fullpath)
            if size < 10:
                # Files under 10 bytes are reported and queued for removal.
                print(ext, file, 'IS 0000000', size)
                TO_RM.append('rm '+fullpath)
            files_details.append({
                'name': file,
                'size': size,
                # NOTE(review): per the os.path.getctime docs this is the
                # metadata-change time on Unix and the creation time on
                # Windows - confirm that is what 'created' should mean.
                'created': time.ctime(os.path.getctime(fullpath))
            })
    except TypeError:
        # Report which extension had file names that could not be ordered,
        # then re-raise with the original traceback.
        print('error with ', ext, files)
        raise
    exts.append({
        'ext': ext,
        'files': files,
    })
    ext_obj[ext] = files_details

# Print the suggested cleanup commands (nothing is printed when there are none).
for RM in TO_RM:
    print(RM)

# Show the ten extensions with the most files; reverse=True keeps the original
# relative order of extensions that have the same number of files.
exts.sort(key=lambda x: len(x['files']), reverse=True)
for ext in exts[:10]:
    print(ext['ext'], len(ext['files']))
    print(*ext['files'])

import json
# Create the output directory up front so a missing 'data/' does not discard
# the results of the whole scan.
os.makedirs('data', exist_ok=True)
with open('data/crx_stats.json','w') as f:
    json.dump(ext_obj, f, indent=2)