Skip to content

Commit

Permalink
update dnf-count-rpm-words.py and README
Browse files Browse the repository at this point in the history
  • Loading branch information
wgwoods committed Mar 29, 2018
1 parent 97738f0 commit 8f700d6
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 17 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ parsing and RPM tag metadata! Fun!
A script to examine actual RPM headers and determine the amount of space used
by each individual tag.

## `dnf-count-rpm-words.py`

Use DNF to fetch a list of all the RPM names in 'fedora' and 'updates', split
them up into meaningful "words", and dump some CSV data about those words.

This isn't particularly useful, but it's part of a blog post I'm writing and
it's also a nice simple example of using DNF directly in Python.

## `LICENSE`

Check the individual files for their licenses. If any file is somehow missing
Expand Down
35 changes: 18 additions & 17 deletions dnf-count-rpm-words.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/python3

import dnf
import json
from collections import Counter, defaultdict


Expand Down Expand Up @@ -159,6 +158,8 @@ def wordsplit(rpmname):
namewords = list()
wordnames = defaultdict(set)
for pkg in sack.query():
if pkg.name in names:
continue
name = pkg.name
src = pkg.source_name
words = wordsplit(name)
Expand All @@ -180,11 +181,10 @@ def wordsplit(rpmname):
# How many different source packages - i.e. spec files - use each word?
# (this reduces noise from things like texlive...)
srcwords = Counter({w:len(set(names[n] for n in wordnames[w])) for w in words})
prefixes = Counter({w:len(set(names[n] for n in wordnames[w] if n.startswith(w+'-'))) for w in words})
suffixes = Counter({w:len(set(names[n] for n in wordnames[w] if n.endswith('-'+w))) for w in words})

commonwords = Counter({w:srcwords[w] for cat in (srcwords, prefixes, suffixes)
for w,c in cat.most_common(100)})
# A wordlist that includes every word in the top 100 of each category
commonwords = set(w for cat in (words, srcwords, prefixes, suffixes)
for w,c in cat.most_common(100))

# bonus stuff for finding words like "coin-or"
def iterpairs(iterable):
Expand All @@ -195,21 +195,22 @@ def iterpairs(iterable):
left = right
pairs = Counter(p for nw in namewords for p in iterpairs(nw))

def dumpcsv(fobj, wordlist, key=srcwords.get):
    """Write one CSV row per word in *wordlist* to the open text file *fobj*.

    Columns written (matching the header line below): the word itself,
    its count in ``srcwords``, its count in ``words``, its counts in
    ``prefixes`` and ``suffixes``, and its entry in ``all_words``
    (empty string when the word has no recorded meaning).

    Rows are sorted descending by *key* (default: ``srcwords.get``).

    NOTE(review): relies on the module-level mappings ``srcwords``,
    ``words``, ``prefixes``, ``suffixes`` and ``all_words`` built earlier
    in the script. If a word in *wordlist* is absent from ``srcwords``,
    the default *key* returns ``None`` and ``sorted()`` would raise
    ``TypeError`` on Python 3 — presumably every *wordlist* passed in is
    a subset of ``srcwords``' keys; confirm at the call sites.
    """
    fobj.write("word,specfiles,rpms,as prefix,as suffix,meaning\n")
    for word in sorted(wordlist, key=key, reverse=True):
        # Missing counts default to 0; a missing "meaning" becomes ''.
        fobj.write("{},{},{},{},{},{}\n".format(word,
                                                srcwords.get(word, 0),
                                                words.get(word, 0),
                                                prefixes.get(word, 0),
                                                suffixes.get(word, 0),
                                                all_words.get(word, '')))


# run with "ipython3 -i dnf-count-rpm-words.py" to do interactive exploration!
if __name__ == '__main__':
with open("rpm-name-word-counts.csv", 'wt') as fobj:
print("writing {}...".format(fobj.name))
fobj.write("word,pkgs,prefix,suffix,meaning\n")
for word, count in srcwords.most_common():
fobj.write("{},{},{},{},{}\n".format(word, count,
prefixes.get(word, 0),
suffixes.get(word, 0),
all_words.get(word, '')))
dumpcsv(fobj, words)
with open("rpm-name-word-counts-common.csv", 'wt') as fobj:
print("writing {}...".format(fobj.name))
fobj.write("word,pkgs,prefix,suffix,meaning\n")
for word,count in commonwords.most_common():
fobj.write("{},{},{},{},{}\n".format(word, count,
prefixes.get(word, 0),
suffixes.get(word, 0),
all_words.get(word, '')))
dumpcsv(fobj, commonwords)

0 comments on commit 8f700d6

Please sign in to comment.