Skip to content

Commit

Permalink
feat(metadata): report unknown chunks
Browse files Browse the repository at this point in the history
  • Loading branch information
e3krisztian committed May 4, 2022
1 parent db181f2 commit d1a2bff
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 57 deletions.
35 changes: 9 additions & 26 deletions tests/test_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,22 @@
import pytest

from unblob.extractor import (
carve_unknown_chunks,
carve_unknown_chunk,
fix_extracted_directory,
fix_permission,
fix_symlink,
)
from unblob.models import File, TaskResult, UnknownChunk


class TestCarveUnknownChunks:
    """Exercise ``carve_unknown_chunks`` with zero, one and several chunks."""

    def test_no_chunks(self, tmp_path: Path):
        # An empty chunk list must leave the output directory empty.
        source = File.from_bytes(b"some file")
        carve_unknown_chunks(tmp_path, source, [])
        assert list(tmp_path.iterdir()) == []

    def test_one_chunk(self, tmp_path: Path):
        # A single chunk spanning the whole input yields one file holding
        # exactly the input bytes.
        payload = b"test file"
        source = File.from_bytes(payload)
        whole = UnknownChunk(0, len(payload))
        carve_unknown_chunks(tmp_path, source, [whole])
        expected = tmp_path / "0-9.unknown"
        assert list(tmp_path.iterdir()) == [expected]
        assert expected.read_bytes() == payload

    def test_multiple_chunks(self, tmp_path: Path):
        # Two adjacent chunks yield two files, each holding its own slice.
        payload = b"test file"
        source = File.from_bytes(payload)
        head, tail = UnknownChunk(0, 4), UnknownChunk(4, 9)
        carve_unknown_chunks(tmp_path, source, [head, tail])
        head_path = tmp_path / "0-4.unknown"
        tail_path = tmp_path / "4-9.unknown"
        # Directory listing order is not guaranteed, hence the sort.
        assert sorted(tmp_path.iterdir()) == [head_path, tail_path]
        assert head_path.read_bytes() == payload[:4]
        assert tail_path.read_bytes() == payload[4:]
def test_carve_unknown_chunk(tmp_path: Path):
    """Carve an inner slice of a file and check the one file it produces."""
    payload = b"test file"
    source = File.from_bytes(payload)
    inner = UnknownChunk(1, 8)

    carve_unknown_chunk(tmp_path, source, inner)

    expected = tmp_path / "1-8.unknown"
    produced = list(tmp_path.iterdir())
    assert produced == [expected]
    assert expected.read_bytes() == payload[1:8]


def test_fix_permission(tmpdir: Path):
Expand Down
24 changes: 6 additions & 18 deletions unblob/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"""
import os
from pathlib import Path
from typing import List

from structlog import get_logger

Expand Down Expand Up @@ -95,23 +94,12 @@ def fix_extracted_directory(outdir: Path, task_result: TaskResult):
fix_permission(path)


def carve_unknown_chunks(
    extract_dir: Path, file: File, unknown_chunks: List[UnknownChunk]
) -> List[Path]:
    """Carve every unknown chunk of ``file`` into ``extract_dir``.

    Each chunk goes to a file named ``<start_offset>-<end_offset>.unknown``;
    the actual carving is delegated to ``carve_chunk_to_file``.

    Returns:
        The carved paths, in the order of ``unknown_chunks``. The list is
        empty when there are no chunks.
    """
    carved_paths = []

    if unknown_chunks:
        # One summary warning up front, then one info line per chunk.
        logger.warning("Found unknown Chunks", chunks=unknown_chunks)

        for unknown in unknown_chunks:
            target = extract_dir / f"{unknown.start_offset}-{unknown.end_offset}.unknown"
            logger.info("Extracting unknown chunk", path=target, chunk=unknown)
            carve_chunk_to_file(target, file, unknown)
            carved_paths.append(target)

    return carved_paths
def carve_unknown_chunk(extract_dir: Path, file: File, chunk: UnknownChunk) -> Path:
    """Carve a single unknown chunk of ``file`` into ``extract_dir``.

    The target file is named ``<start_offset>-<end_offset>.unknown``; the
    carving itself is delegated to ``carve_chunk_to_file``.

    Returns:
        The path the chunk was carved to.
    """
    target = extract_dir / f"{chunk.start_offset}-{chunk.end_offset}.unknown"
    logger.info("Extracting unknown chunk", path=target, chunk=chunk)
    carve_chunk_to_file(target, file, chunk)
    return target


def carve_valid_chunk(extract_dir: Path, file: File, chunk: ValidChunk) -> Path:
Expand Down
20 changes: 16 additions & 4 deletions unblob/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .file_utils import Endian, File, InvalidInputFormat, StructParser
from .identifiers import new_id
from .parser import hexstring2regex
from .report import ChunkReport, ErrorReport, Report
from .report import ChunkReport, ErrorReport, Report, UnknownChunkReport

logger = get_logger()

Expand Down Expand Up @@ -114,9 +114,17 @@ class UnknownChunk(Chunk):
entropy, other chunks inside it, metadata, etc.
These are not extracted, just logged for information purposes and further analysis,
like most common bytest (like \x00 and \xFF), ASCII strings, high entropy, etc.
like most common bytes (like \x00 and \xFF), ASCII strings, high entropy, etc.
"""

def as_report(self) -> UnknownChunkReport:
    """Build the report entry that describes this unknown chunk.

    The chunk's ``id``, ``start_offset``, ``end_offset`` and ``size`` are
    copied into the same-named fields of the report.
    """
    report = UnknownChunkReport(
        id=self.id,
        start_offset=self.start_offset,
        end_offset=self.end_offset,
        size=self.size,
    )
    return report


@attr.define
class TaskResult:
Expand All @@ -141,13 +149,17 @@ def errors(self) -> List[ErrorReport]:
reports = itertools.chain.from_iterable(
r.reports for r in self.results.values()
)
interesting_reports = (r for r in reports if isinstance(r, (ErrorReport, ChunkReport)))
interesting_reports = (
r for r in reports if isinstance(r, (ErrorReport, ChunkReport))
)
errors = []
for report in interesting_reports:
if isinstance(report, ErrorReport):
errors.append(report)
else:
errors.extend(r for r in report.extraction_reports if isinstance(r, ErrorReport))
errors.extend(
r for r in report.extraction_reports if isinstance(r, ErrorReport)
)
return errors

def update(self, other: "ProcessResult"):
Expand Down
20 changes: 11 additions & 9 deletions unblob/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from unblob.handlers import BUILTIN_HANDLERS, Handlers

from .extractor import carve_unknown_chunks, carve_valid_chunk, fix_extracted_directory
from .extractor import carve_unknown_chunk, carve_valid_chunk, fix_extracted_directory
from .file_utils import iterate_file, valid_path
from .finder import search_chunks
from .iter_utils import pairwise
Expand Down Expand Up @@ -288,7 +288,7 @@ def process(self):
else:
# we don't consider whole files as unknown chunks, but we still want to
# calculate entropy for whole files which produced no valid chunks
self._calculate_entropies([self.task.path])
self._calculate_entropy(self.task.path)

self._ensure_root_extract_dir()

Expand All @@ -298,10 +298,13 @@ def _process_chunks(
outer_chunks: List[ValidChunk],
unknown_chunks: List[UnknownChunk],
):
carved_unknown_paths = carve_unknown_chunks(
self.carve_dir, file, unknown_chunks
)
self._calculate_entropies(carved_unknown_paths)
if unknown_chunks:
logger.warning("Found unknown Chunks", chunks=unknown_chunks)

for chunk in unknown_chunks:
carved_unknown_path = carve_unknown_chunk(self.carve_dir, file, chunk)
self._calculate_entropy(carved_unknown_path)
self.result.add_report(chunk.as_report())

for chunk in outer_chunks:
self._extract_chunk(file, chunk)
Expand All @@ -311,10 +314,9 @@ def _ensure_root_extract_dir(self):
if self.task.depth == 0:
self.carve_dir.mkdir(parents=True, exist_ok=True)

def _calculate_entropies(self, paths: List[Path]):
def _calculate_entropy(self, path: Path):
if self.task.depth < self.config.entropy_depth:
for path in paths:
calculate_entropy(path, draw_plot=self.config.entropy_plot)
calculate_entropy(path, draw_plot=self.config.entropy_plot)

def _extract_chunk(self, file, chunk: ValidChunk):
is_whole_file_chunk = chunk.start_offset == 0 and chunk.end_offset == self.size
Expand Down
8 changes: 8 additions & 0 deletions unblob/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,11 @@ class ChunkReport(Report):
size: int
is_encrypted: bool
extraction_reports: List[Report]


@attr.define(kw_only=True)
class UnknownChunkReport(Report):
    """Report entry for a chunk of a file that was handled as unknown.

    Every field must be passed by keyword (``kw_only=True``). The fields
    mirror the same-named attributes of the chunk being reported: its
    identifier, its start and end offsets, and its size.
    """

    id: str
    start_offset: int
    end_offset: int
    size: int

0 comments on commit d1a2bff

Please sign in to comment.