-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmeasure-metadata.py
executable file
·103 lines (91 loc) · 3.57 KB
/
measure-metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/python3
# measure-metadata.py - raw RPM header parsing and data measurements
# Copyright (C) 2018 Red Hat, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Author: Will Woods <[email protected]>
import json
import gzip
from rpmtoys.tags import Tag, BIN_TAGS
from rpmtoys.repo import iter_repo_rpms
from rpmtoys.hdr import rpmhdr
from rpmtoys.progress import progress
from collections import Counter, defaultdict
def rpm_basename(rpmfn):
    """Return the file name of *rpmfn* without its directory or final extension.

    The directory part (everything up to and including the last '/') is
    removed first, and only then is the last '.'-suffix dropped.  Doing it in
    that order means a dot in a directory name is never mistaken for the
    extension separator, and a name with no dot at all is returned unchanged
    instead of losing its last character.
    """
    filename = rpmfn.rpartition('/')[2]
    stem, dot, _ext = filename.rpartition('.')
    # rpartition() yields an empty separator when there is no '.' in the name.
    return stem if dot else filename
def dump_sizedata(repo_paths, outfile="sizedata.json.gz"):
    """Gather per-package size data and per-tag value counts, then save them.

    Every RPM yielded by iter_repo_rpms(repo_paths) is parsed with rpmhdr().
    Two structures are built:

    * sizedata: maps each package's ``envra`` to
      ``[[sig size, hdr size, payloadsize], [(tag, offset, size, realsize), ...]]``
      with one tuple per entry in the header's ``tagent``.
    * valcount: maps a tag number to a Counter of the values seen for that
      tag, for tags >= 1000 that are not in BIN_TAGS.

    The result is written to *outfile* as gzip-compressed JSON and also
    returned as ``[sizedata, [(tag, Counter.most_common()), ...]]``.
    """
    sizedata = dict()
    valcount = defaultdict(Counter)
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=rpm_basename):
        r = rpmhdr(rpmfn)
        sizedata[r.envra] = [
            [r.sig.size, r.hdr.size, r.payloadsize],
            [(te.tag, te.offset, te.size, te.realsize)
             for te in r.hdr.tagent.values()]
        ]
        for t in r.hdr.tagval:
            if t >= 1000 and t not in BIN_TAGS:
                v = r.hdr.jsonval(t)
                # A tuple value is counted element by element; anything else
                # is counted as one single value.
                valcount[t].update(v if isinstance(v, tuple) else [v])
    print("\ndumping to {}...".format(outfile))
    outdata = [sizedata,
               [(t, vc.most_common()) for t, vc in valcount.items()]]
    # Close the gzip stream explicitly so the compressed data is fully
    # flushed to disk before "done!" is reported.
    with gzip.open(outfile, 'wt') as outf:
        json.dump(outdata, outf)
    print("done!")
    return outdata
def load_sizedata(infile):
    """Read a gzip-compressed JSON file in the layout dump_sizedata() writes.

    The file must hold a two-item list ``[sizedata, valcount_list]`` where
    valcount_list is a list of ``[tag, [[value, count], ...]]`` pairs.

    Returns ``(sizedata, valcount)``: sizedata exactly as stored, and
    valcount as a dict mapping each tag to a Counter of its values.
    """
    # The context manager closes the gzip handle even if parsing fails.
    with gzip.open(infile) as inf:
        sizedata, valcount_list = json.load(inf)
    valcount = dict()
    # Entries are popped off the end, so the list shrinks as the dict grows
    # and the dict ends up in reverse file order.
    while valcount_list:
        t, v = valcount_list.pop()
        valcount[t] = Counter(dict(v))
    return sizedata, valcount
def analyze_sizedata(sizedata):
    """Total up tag sizes and tag occurrences across all packages.

    *sizedata* maps a package key to a pair whose second item is a list of
    ``(tag, offset, size, realsize)`` entries.

    Returns ``(tagsizes, tagcounts)``: two Counters keyed by Tag, holding the
    summed ``realsize`` per tag and the number of packages the tag was seen in.
    """
    size_totals = Counter()
    package_hits = Counter()
    for _pkg_sizes, entries in sizedata.values():
        # One slot per tag within a package; a repeated tag keeps the
        # realsize of its last entry.
        per_package = Counter()
        for tagnum, _offset, _size, realsize in entries:
            per_package[Tag(tagnum)] = realsize
        size_totals.update(per_package)
        package_hits.update(per_package.keys())
    return size_totals, package_hits
# THIS IS A ROUGH HACK, MY FRIENDS.
if __name__ == '__main__':
    import sys
    import os

    # Command-line driver.  The results (sizedata, valcount, tagsizes,
    # tagcounts) are bound at module level rather than inside a function;
    # the "interactive" command only loads and analyzes, printing no report.
    # NOTE(review): presumably meant to be run under `python -i` - confirm.
    prog = os.path.basename(sys.argv[0])
    usage = """
usage: {0} generate SIZEFILE REPODIR [REPODIR...]
{0} analyze SIZEFILE
{0} interactive SIZEFILE""".strip().format(prog)

    # Every command needs at least a SIZEFILE argument after it.
    command = sys.argv[1] if len(sys.argv) > 2 else None

    if command is None:
        print(usage)
    elif command == "generate":
        sizedata, valcount = dump_sizedata(sys.argv[3:], sys.argv[2])
    elif command == "analyze":
        sizedata, valcount = load_sizedata(sys.argv[2])
        # this could be nicer..
        tagsizes, tagcounts = analyze_sizedata(sizedata)
        # Largest total size first.
        for tag, size in tagsizes.most_common():
            count = tagcounts[tag]
            print("{:26}: {:5} times, {} bytes".format(tag.shortname, count, size))
    elif command == "interactive":
        print("loading (sizedata, valcount) from {}...".format(sys.argv[2]))
        sizedata, valcount = load_sizedata(sys.argv[2])
        print("generating tagsizes, tagcounts...")
        tagsizes, tagcounts = analyze_sizedata(sizedata)
    else:
        print("error: unknown command '{}'".format(command))
        print(usage)
        sys.exit(2)