dino: index updates, xz compression support, varint fixes
* Index sections now support 64-bit size/offsets, varint-encoding,
  storing uncompressed data sizes, and omitting fanout table
* dino.compression can now handle xz
* varint fixes (see the varint sketch below)
* renamed dino.struct to dino.dstruct to avoid collisions with 'struct'
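
For reference, here is a minimal sketch of a LEB128-style varint codec, the usual technique behind variable-length index fields like these. It illustrates the general scheme only, not necessarily DINO's exact wire format:

```python
# Hypothetical LEB128-style varint codec: 7 payload bits per byte, with the
# high bit set on every byte except the last. Small sizes/offsets (the common
# case) take one byte; full 64-bit values take at most ten.

def encode_varint(n):
    if n < 0:
        raise ValueError("varints here are unsigned")
    out = bytearray()
    while True:
        byte = n & 0x7f
        n >>= 7
        if n:
            out.append(byte | 0x80)  # continuation bit: more bytes follow
        else:
            out.append(byte)         # final byte: high bit clear
            return bytes(out)

def decode_varint(buf, pos=0):
    """Return (value, position just past the varint)."""
    result = shift = 0
    while True:
        byte = buf[pos]
        pos += 1
        result |= (byte & 0x7f) << shift
        if not (byte & 0x80):
            return result, pos
        shift += 7

assert encode_varint(127) == b'\x7f'
assert decode_varint(encode_varint(300)) == (300, 2)
```
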
wgwoods committed Oct 2, 2019
1 parent ca002cb commit f09401a
Showing 7 changed files with 347 additions and 62 deletions.
18 changes: 18 additions & 0 deletions README.md
@@ -18,6 +18,24 @@ Should work anywhere with POSIX `sh`, POSIX.2 `od`, and either a `sh`-builtin
Python module used by scripts here. Includes low-level pure-Python RPM header
parsing and RPM tag metadata! Fun!

## `dino/`

A Python module (that uses `rpmtoys`) I'm using for prototyping the
work-in-progress [DINO] package repo/packfile format.

## `mkdino.py`

A simple CLI to build [DINO] packfiles out of sets of RPMs, extract RPMs from
packfiles, examine packfile contents, etc.

Requirements:

* [python-libarchive-c]: `dnf install python3-libarchive-c` or `pip-3 install python-libarchive-c`
* [zstandard]: `pip-3 install zstandard`

[python-libarchive-c]: https://github.com/Changaco/python-libarchive-c
[DINO]: https://github.com/wgwoods/libdino

## `measure-metadata.py`

A script to examine actual RPM headers and determine the amount of space used
8 changes: 5 additions & 3 deletions dino/__init__.py
@@ -32,7 +32,7 @@

from .const import *
from .section import *
-from .struct import Dhdrp, Shdrp, StringTable
+from .dstruct import Dhdrp, Shdrp, StringTable
from .compression import get_compressor, get_decompressor

# This only exports the public-facing stuff: enums and classes.
@@ -178,12 +178,14 @@ def section_index(self, section):
    # TODO: this needs a progress callback or something...
    def write_to(self, fobj):
        wrote = fobj.write(self.pack_hdrs())
-        for n,(name,sec) in enumerate(self.sections()):
-            # FIXME: pass through the compressor?
+        for sec in self.sectab:
+            # FIXME: pass through the compressor if that flag is set
+            # compr = self.get_compressor()
            wrote += sec.write_to(fobj)
        return wrote

    def get_compressor(self, level=None):
        # TODO: compression_opts!
        return get_compressor(self.compression_id, level=level)

    def get_decompressor(self):
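
The FIXME above leaves the compressed write path unwired. A rough sketch of what it might look like, assuming each section's `COMPRESSED` flag is checked and its raw payload is exposed as a file-like object (`sec.flags`, `sec.fobj`, and `sec.size` are hypothetical names, not part of this commit):

```python
# Hypothetical wiring for the FIXME: route a section's payload through the
# packfile-wide compressor when its COMPRESSED flag is set.
def write_to(self, fobj):
    wrote = fobj.write(self.pack_hdrs())
    compr = self.get_compressor()
    for sec in self.sectab:
        if sec.flags & SectionFlags.COMPRESSED:
            # copy_stream() returns (bytes_read, bytes_written)
            _, n = compr.copy_stream(sec.fobj, fobj, size=sec.size)
            wrote += n
        else:
            wrote += sec.write_to(fobj)
    return wrote
```
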
106 changes: 92 additions & 14 deletions dino/compression.py
@@ -1,36 +1,109 @@
# dino.compression - compression/decompression helpers

import logging as log

from .const import CompressionID

-# TODO: Define a CompressionOpts structure that we can store in the header
+# TODO: Define a CompressionOpts structure that we can store in the header,
+# like squashfs does...

available_compressors = {"zstd", "xz"}

DEFAULT_COMPRESSION_LEVEL = {
    CompressionID.XZ: 2,    # Fedora default (ca. F30)
    CompressionID.ZSTD: 10, # Diminishing returns above here...
}

DEFAULT_CHUNK_SIZE = 4*1024



class CompressionStreamWriter(object):
    def __init__(self, cobj, fobj):
        self._cobj = cobj
        self._fobj = fobj

    def write(self, data):
        return self._fobj.write(self._cobj.compress(data))

    def flush(self):
        r = self._fobj.write(self._cobj.flush())
        self._cobj = None
        return r

class MultiCompressor(object):
    def __init__(self, make_compress_obj, **kwargs):
        if not callable(make_compress_obj):
            raise ValueError(f'{make_compress_obj} is not callable')
        self._mkcobj = make_compress_obj
        self.args = kwargs
        log.debug("MultiCompressor(%s, kwargs=%s)", make_compress_obj, kwargs)

    def copy_stream(self, inf, outf, size=0, read_size=None, write_size=None):
        if read_size is None:
            read_size = DEFAULT_CHUNK_SIZE
        if write_size is None:
            write_size = DEFAULT_CHUNK_SIZE
        read = 0
        wrote = 0
        to_read = size or -1
        cobj = self._mkcobj(**self.args)
        while to_read and (read < to_read):
            chunk = inf.read(min(read_size, to_read))
            if not chunk:
                break
            read += len(chunk)
            wrote += outf.write(cobj.compress(chunk))
        wrote += outf.write(cobj.flush())
        return read, wrote

class CopyStreamMultiCompressor(MultiCompressor):
    def __init__(self, cctx):
        self._cctx = cctx

    def copy_stream(self, inf, outf, size=0, read_size=None, write_size=None):
        kwargs = dict()
        if size:
            kwargs['size'] = size
        if read_size:
            kwargs['read_size'] = read_size
        if write_size:
            kwargs['write_size'] = write_size
        return self._cctx.copy_stream(inf, outf, **kwargs)


# Utility function to get CompressionID by id or name (or None)
cidmap = {n.lower():cid for n,cid in CompressionID.__members__.items()}
cidmap['gzip'] = cidmap['zlib']
cidmap['gz'] = cidmap['gzip']

def get_compressid(which):
    if isinstance(which, int):
        return CompressionID(which)
    if which is None:
        return CompressionID.NONE
    if not isinstance(which, str):
        which = str(which, 'ascii', 'ignore')
    return cidmap.get(which.lower())

# We don't import the compression modules at the toplevel because I want this
# to work even if you don't have Every Compression Library installed.
# As long as you have the ones you actually use, we should be fine.

def get_compressor(which, level=None):
-    which = CompressionID(which)
+    which = get_compressid(which)
    if level is None or level < 0:
        level = DEFAULT_COMPRESSION_LEVEL.get(which)
    if which == CompressionID.ZSTD:
        import zstandard as zstd
        if level and level < 0:
            level = zstd.MAX_COMPRESSION_LEVEL
        cctx = zstd.ZstdCompressor(write_content_size=True, level=level)
-        return cctx
+        return CopyStreamMultiCompressor(cctx)
    elif which == CompressionID.XZ:
        import lzma
        if level and level < 0:
            level = 9
        # TODO: this doesn't support zstd's copy_stream function.
        # Might need a wrapper object to make the different compressors
        # all play nice, while still making sure they can flush and
        # start a new compression frame when needed...
        cctx = lzma.LZMACompressor(preset=level)
-        return cctx
+        return MultiCompressor(lzma.LZMACompressor, preset=level)
    else:
        raise NotImplementedError(f"{which.name} not implemented!")

def get_decompressor(which):
-    which = CompressionID(which)
+    which = get_compressid(which)
    if which == CompressionID.ZSTD:
        import zstandard as zstd
        return zstd.ZstdDecompressor()
@@ -39,3 +112,8 @@ def get_decompressor(which):
        return lzma.LZMADecompressor()
    else:
        raise NotImplementedError(f"{which.name} not implemented!")

# FIXME: need tests to confirm that each chunk of output from the compressor
# can be individually decompressed...
# FIXME: also need some benchmarks to compare performance of algorithms
# (compression ratio, decompression speed/mem use)
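
The first FIXME is easy to sketch as a round-trip check. A minimal version, assuming the module is importable as `dino.compression`, might look like:

```python
# Sketch of a compress/decompress round-trip test for both algorithms.
# zstd frames carry the content size (write_content_size=True above), so a
# one-shot .decompress() works; lzma's decompressor autodetects .xz input.
import io
from dino.compression import get_compressor, get_decompressor, get_compressid

def roundtrip(name, payload=b"hello dino " * 4096):
    cid = get_compressid(name)
    src, dst = io.BytesIO(payload), io.BytesIO()
    read, wrote = get_compressor(cid).copy_stream(src, dst, size=len(payload))
    assert read == len(payload) and wrote == len(dst.getvalue())
    assert get_decompressor(cid).decompress(dst.getvalue()) == payload

for name in ("zstd", "xz"):
    roundtrip(name)
```
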
19 changes: 15 additions & 4 deletions dino/const.py
@@ -53,12 +53,22 @@ class Arch(IntEnum):
    RISCV = 243

class HeaderEncoding(IntFlag):
-    LE = 0b00000000     # Little-endian is the default; no bit set
-    BE = 0b00000001     # Big-endian
-    OFF64 = 0b00000010  # TODO: 64-bit sizes/offsets
+    # Endianness could be a single bit - 0=LE, 1=BE - but I'd prefer that an
+    # empty value be invalid, to reduce the probability that garbage data will
+    # be interpreted as valid. So instead we'll use the low two bits to store
+    # one of two valid ELF EI_DATA values - LSB=1, MSB=2.
+    # Anything else is invalid and should be rejected.
+    INVALID = 0b00000000  # No bits set! Invalid!
+    LE = 0b00000001       # Little-endian (our default)
+    BE = 0b00000010       # Big-endian
+    SEC64 = 0b00000100    # 64-bit sizes/offsets in sectab

    def byteorder(self):
-        return self & 0b1
+        bo = self & 0b11
+        if bo == 0b11:
+            return self.INVALID
+        else:
+            return bo

    def endian(self):
        return '>' if self.byteorder() == self.BE else '<'
@@ -121,3 +131,4 @@ class SectionType(IntEnum):
class SectionFlags(IntFlag):
    NONE = 0b00000000
    COMPRESSED = 0b00000001
+    VARINT = 0b00000010
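
As a quick illustration of the rejection rule the `HeaderEncoding` comment describes, a header parser might validate the encoding bits like this (a sketch that assumes only the enum shown above):

```python
# Sketch: both 0b00 (no endian bit) and 0b11 (both bits) map to INVALID via
# byteorder(), so a single check rejects garbage encoding bytes.
from dino.const import HeaderEncoding

def check_encoding(bits):
    enc = HeaderEncoding(bits)
    if enc.byteorder() == HeaderEncoding.INVALID:
        raise ValueError(f"bad header encoding: {bits:#04x}")
    return enc

assert check_encoding(0b001).byteorder() == HeaderEncoding.LE
assert check_encoding(0b110).byteorder() == HeaderEncoding.BE  # SEC64|BE
```
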
dino/struct.py → dino/dstruct.py: file renamed without changes.