dino: index updates, xz compression support, varint fixes
* Index sections now support 64-bit size/offsets, varint-encoding,
  storing uncompressed data sizes, and omitting fanout table
* dino.compression can now handle xz
* varint fixes (see the varint sketch below)
* renamed dino.struct to dino.dstruct to avoid collisions with 'struct'
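
For reference, here is a minimal sketch of a LEB128-style varint codec, the usual technique behind variable-length index fields like these. It illustrates the general scheme only, not necessarily DINO's exact wire format:

```python
# Hypothetical LEB128-style varint codec: 7 payload bits per byte, with the
# high bit set on every byte except the last. Small sizes/offsets (the common
# case) take one byte; full 64-bit values take at most ten.

def encode_varint(n):
    if n < 0:
        raise ValueError("varints here are unsigned")
    out = bytearray()
    while True:
        byte = n & 0x7f
        n >>= 7
        if n:
            out.append(byte | 0x80)  # continuation bit: more bytes follow
        else:
            out.append(byte)         # final byte: high bit clear
            return bytes(out)

def decode_varint(buf, pos=0):
    """Return (value, position just past the varint)."""
    result = shift = 0
    while True:
        byte = buf[pos]
        pos += 1
        result |= (byte & 0x7f) << shift
        if not (byte & 0x80):
            return result, pos
        shift += 7

assert encode_varint(127) == b'\x7f'
assert decode_varint(encode_varint(300)) == (300, 2)
```
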
wgwoods committed Oct 2, 2019
1 parent ca002cb commit f09401a
Showing 7 changed files with 347 additions and 62 deletions.
18 changes: 18 additions & 0 deletions README.md
@@ -18,6 +18,24 @@ Should work anywhere with POSIX `sh`, POSIX.2 `od`, and either a `sh`-builtin
Python module used by scripts here. Includes low-level pure-Python RPM header
parsing and RPM tag metadata! Fun!

## `dino/`

A Python module (that uses `rpmtoys`) I'm using for prototyping the
work-in-progress [DINO] package repo/packfile format.

## `mkdino.py`

A simple CLI to build [DINO] packfiles out of sets of RPMs, extract RPMs from
packfiles, examine packfile contents, etc.

Requirements:

* [python-libarchive-c]: `dnf install python3-libarchive-c` or `pip-3 install python-libarchive-c`
* [zstandard]: `pip-3 install zstandard`

[python-libarchive-c]: https://github.com/Changaco/python-libarchive-c
[DINO]: https://github.com/wgwoods/libdino

## `measure-metadata.py`

A script to examine actual RPM headers and determine the amount of space used
8 changes: 5 additions & 3 deletions dino/__init__.py
@@ -32,7 +32,7 @@

from .const import *
from .section import *
-from .struct import Dhdrp, Shdrp, StringTable
+from .dstruct import Dhdrp, Shdrp, StringTable
from .compression import get_compressor, get_decompressor

# This only exports the public-facing stuff: enums and classes.
@@ -178,12 +178,14 @@ def section_index(self, section):
    # TODO: this needs a progress callback or something...
    def write_to(self, fobj):
        wrote = fobj.write(self.pack_hdrs())
-        for n,(name,sec) in enumerate(self.sections()):
-            # FIXME: pass through the compressor?
+        for sec in self.sectab:
+            # FIXME: pass through the compressor if that flag is set
+            # compr = self.get_compressor()
            wrote += sec.write_to(fobj)
        return wrote

    def get_compressor(self, level=None):
        # TODO: compression_opts!
        return get_compressor(self.compression_id, level=level)

    def get_decompressor(self):
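
The FIXME above leaves the compressed write path unwired. A rough sketch of what it might look like, assuming each section's `COMPRESSED` flag is checked and its raw payload is exposed as a file-like object (`sec.flags`, `sec.fobj`, and `sec.size` are hypothetical names, not part of this commit):

```python
# Hypothetical wiring for the FIXME: route a section's payload through the
# packfile-wide compressor when its COMPRESSED flag is set.
def write_to(self, fobj):
    wrote = fobj.write(self.pack_hdrs())
    compr = self.get_compressor()
    for sec in self.sectab:
        if sec.flags & SectionFlags.COMPRESSED:
            # copy_stream() returns (bytes_read, bytes_written)
            _, n = compr.copy_stream(sec.fobj, fobj, size=sec.size)
            wrote += n
        else:
            wrote += sec.write_to(fobj)
    return wrote
```
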
106 changes: 92 additions & 14 deletions dino/compression.py
@@ -1,36 +1,109 @@
# dino.compression - compression/decompression helpers

import logging as log

from .const import CompressionID

-# TODO: Define a CompressionOpts structure that we can store in the header
+# TODO: Define a CompressionOpts structure that we can store in the header,
+# like squashfs does...

available_compressors = {"zstd", "xz"}

DEFAULT_COMPRESSION_LEVEL = {
    CompressionID.XZ: 2,    # Fedora default (ca. F30)
    CompressionID.ZSTD: 10, # Diminishing returns above here...
}

DEFAULT_CHUNK_SIZE = 4*1024



class CompressionStreamWriter(object):
    def __init__(self, cobj, fobj):
        self._cobj = cobj
        self._fobj = fobj

    def write(self, data):
        return self._fobj.write(self._cobj.compress(data))

    def flush(self):
        r = self._fobj.write(self._cobj.flush())
        self._cobj = None
        return r

class MultiCompressor(object):
    def __init__(self, make_compress_obj, **kwargs):
        if not callable(make_compress_obj):
            raise ValueError(f'{make_compress_obj} is not callable')
        self._mkcobj = make_compress_obj
        self.args = kwargs
        log.debug("MultiCompressor(%s, kwargs=%s)", make_compress_obj, kwargs)

    def copy_stream(self, inf, outf, size=0, read_size=None, write_size=None):
        if read_size is None:
            read_size = DEFAULT_CHUNK_SIZE
        if write_size is None:
            write_size = DEFAULT_CHUNK_SIZE
        read = 0
        wrote = 0
        to_read = size or -1
        cobj = self._mkcobj(**self.args)
        while to_read and (read < to_read):
            chunk = inf.read(min(read_size, to_read))
            if not chunk:
                break
            read += len(chunk)
            wrote += outf.write(cobj.compress(chunk))
        wrote += outf.write(cobj.flush())
        return read, wrote

class CopyStreamMultiCompressor(MultiCompressor):
    def __init__(self, cctx):
        self._cctx = cctx

    def copy_stream(self, inf, outf, size=0, read_size=None, write_size=None):
        kwargs = dict()
        if size:
            kwargs['size'] = size
        if read_size:
            kwargs['read_size'] = read_size
        if write_size:
            kwargs['write_size'] = write_size
        return self._cctx.copy_stream(inf, outf, **kwargs)


# Utility function to get CompressionID by id or name (or None)
cidmap = {n.lower():cid for n,cid in CompressionID.__members__.items()}
cidmap['gzip'] = cidmap['zlib']
cidmap['gz'] = cidmap['gzip']

def get_compressid(which):
    if isinstance(which, int):
        return CompressionID(which)
    if which is None:
        return CompressionID.NONE
    if not isinstance(which, str):
        which = str(which, 'ascii', 'ignore')
    return cidmap.get(which.lower())

# We don't import the compression modules at the toplevel because I want this
# to work even if you don't have Every Compression Library installed.
# As long as you have the ones you actually use, we should be fine.

def get_compressor(which, level=None):
-    which = CompressionID(which)
+    which = get_compressid(which)
    if level is None or level < 0:
        level = DEFAULT_COMPRESSION_LEVEL.get(which)
    if which == CompressionID.ZSTD:
        import zstandard as zstd
        if level and level < 0:
            level = zstd.MAX_COMPRESSION_LEVEL
        cctx = zstd.ZstdCompressor(write_content_size=True, level=level)
-        return cctx
+        return CopyStreamMultiCompressor(cctx)
    elif which == CompressionID.XZ:
        import lzma
        if level and level < 0:
            level = 9
        # TODO: this doesn't support zstd's copy_stream function.
        # Might need a wrapper object to make the different compressors
        # all play nice, while still making sure they can flush and
        # start a new compression frame when needed...
        cctx = lzma.LZMACompressor(preset=level)
-        return cctx
+        return MultiCompressor(lzma.LZMACompressor, preset=level)
    else:
        raise NotImplementedError(f"{which.name} not implemented!")

def get_decompressor(which):
-    which = CompressionID(which)
+    which = get_compressid(which)
    if which == CompressionID.ZSTD:
        import zstandard as zstd
        return zstd.ZstdDecompressor()
@@ -39,3 +112,8 @@ def get_decompressor(which):
        return lzma.LZMADecompressor()
    else:
        raise NotImplementedError(f"{which.name} not implemented!")

# FIXME: need tests to confirm that each chunk of output from the compressor
# can be individually decompressed...
# FIXME: also need some benchmarks to compare performance of algorithms
# (compression ratio, decompression speed/mem use)
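
The first FIXME is easy to sketch as a round-trip check. A minimal version, assuming the module is importable as `dino.compression`, might look like:

```python
# Sketch of a compress/decompress round-trip test for both algorithms.
# zstd frames carry the content size (write_content_size=True above), so a
# one-shot .decompress() works; lzma's decompressor autodetects .xz input.
import io
from dino.compression import get_compressor, get_decompressor, get_compressid

def roundtrip(name, payload=b"hello dino " * 4096):
    cid = get_compressid(name)
    src, dst = io.BytesIO(payload), io.BytesIO()
    read, wrote = get_compressor(cid).copy_stream(src, dst, size=len(payload))
    assert read == len(payload) and wrote == len(dst.getvalue())
    assert get_decompressor(cid).decompress(dst.getvalue()) == payload

for name in ("zstd", "xz"):
    roundtrip(name)
```
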
19 changes: 15 additions & 4 deletions dino/const.py
@@ -53,12 +53,22 @@ class Arch(IntEnum):
    RISCV = 243

class HeaderEncoding(IntFlag):
-    LE = 0b00000000     # Little-endian is the default; no bit set
-    BE = 0b00000001     # Big-endian
-    OFF64 = 0b00000010  # TODO: 64-bit sizes/offsets
+    # Endianness could be a single bit - 0=LE, 1=BE - but I'd prefer that an
+    # empty value be invalid, to reduce the probability that garbage data will
+    # be interpreted as valid. So instead we'll use the low two bits to store
+    # one of two valid ELF EI_DATA values - LSB=1, MSB=2.
+    # Anything else is invalid and should be rejected.
+    INVALID = 0b00000000  # No bits set! Invalid!
+    LE = 0b00000001       # Little-endian (our default)
+    BE = 0b00000010       # Big-endian
+    SEC64 = 0b00000100    # 64-bit sizes/offsets in sectab

    def byteorder(self):
-        return self & 0b1
+        bo = self & 0b11
+        if bo == 0b11:
+            return self.INVALID
+        else:
+            return bo

    def endian(self):
        return '>' if self.byteorder() == self.BE else '<'
@@ -121,3 +131,4 @@ class SectionType(IntEnum):
class SectionFlags(IntFlag):
    NONE = 0b00000000
    COMPRESSED = 0b00000001
+    VARINT = 0b00000010
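
As a quick illustration of the rejection rule the `HeaderEncoding` comment describes, a header parser might validate the encoding bits like this (a sketch that assumes only the enum shown above):

```python
# Sketch: both 0b00 (no endian bit) and 0b11 (both bits) map to INVALID via
# byteorder(), so a single check rejects garbage encoding bytes.
from dino.const import HeaderEncoding

def check_encoding(bits):
    enc = HeaderEncoding(bits)
    if enc.byteorder() == HeaderEncoding.INVALID:
        raise ValueError(f"bad header encoding: {bits:#04x}")
    return enc

assert check_encoding(0b001).byteorder() == HeaderEncoding.LE
assert check_encoding(0b110).byteorder() == HeaderEncoding.BE  # SEC64|BE
```
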
dino/struct.py → dino/dstruct.py: file renamed without changes.