Skip to content

Commit

Permalink
Merge pull request scikit-hep#981 from CoffeaTeam/slice_files
Browse files Browse the repository at this point in the history
feat: max_files
  • Loading branch information
lgray authored Jan 8, 2024
2 parents 9e927d1 + 3b8a046 commit d958598
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 2 deletions.
4 changes: 4 additions & 0 deletions src/coffea/dataset_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
get_failed_steps_for_dataset,
get_failed_steps_for_fileset,
max_chunks,
max_files,
slice_chunks,
slice_files,
)
from coffea.dataset_tools.preprocess import preprocess

Expand All @@ -13,6 +15,8 @@
"apply_to_fileset",
"max_chunks",
"slice_chunks",
"max_files",
"slice_files",
"get_failed_steps_for_dataset",
"get_failed_steps_for_fileset",
]
46 changes: 46 additions & 0 deletions src/coffea/dataset_tools/manipulations.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,52 @@ def slice_chunks(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSp
return out


def max_files(fileset: FilesetSpec, maxfiles: int | None = None) -> FilesetSpec:
    """
    Restrict every dataset in a fileset to its first ``maxfiles`` files.

    Parameters
    ----------
    fileset : FilesetSpec
        The set of datasets to reduce to ``maxfiles`` files per dataset.
    maxfiles : int | None, default None
        How many files to keep for each dataset. ``None`` keeps every file.

    Returns
    -------
    out : FilesetSpec
        The reduced fileset with only the first ``maxfiles`` files of each
        dataset left in.
    """
    # "The first N files" is simply the slice [:N]; the actual work is
    # delegated to slice_files.
    leading_files = slice(None, maxfiles)
    return slice_files(fileset, leading_files)


def slice_files(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSpec:
    """
    Keep only the files of each dataset that are selected by a slice.

    The input fileset is left untouched: the result is built from a deep copy,
    so neither the dataset entries nor the per-file information in the returned
    fileset share mutable state with ``fileset``.

    Parameters
    ----------
    fileset : FilesetSpec
        The set of datasets to be sliced.
    theslice : Any, default slice(None)
        How to slice the sequence of files of each dataset. Files are sliced
        in key order, i.e. the iteration order of each dataset's ``"files"``
        mapping. A value that is not a ``slice`` is wrapped as
        ``slice(theslice)``, so an integer ``n`` keeps the first ``n`` files.

    Returns
    -------
    out : FilesetSpec
        The reduced fileset with only the files specified by ``theslice`` left.
    """
    if not isinstance(theslice, slice):
        theslice = slice(theslice)

    out = copy.deepcopy(fileset)
    # Slice the *copied* entries rather than the input ones: rebuilding the
    # "files" mapping from the input's values would make the result alias the
    # caller's per-file dictionaries despite the deep copy above.
    for entry in out.values():
        entry["files"] = dict(list(entry["files"].items())[theslice])

    return out


def get_failed_steps_for_dataset(
dataset: DatasetSpec, report: awkward.Array
) -> DatasetSpec:
Expand Down
48 changes: 46 additions & 2 deletions tests/test_dataset_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
apply_to_fileset,
get_failed_steps_for_fileset,
max_chunks,
max_files,
preprocess,
slice_chunks,
slice_files,
)
from coffea.nanoevents import BaseSchema, NanoAODSchema
from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor
Expand Down Expand Up @@ -226,7 +228,49 @@ def test_preprocess_failed_file():
)


def test_maxchunks():
def test_max_files():
    """Check that ``max_files(..., 1)`` leaves one file per dataset."""
    # Both expected files carry the same chunk boundaries; each entry gets its
    # own list so the expected structure has no shared state.
    chunk_bounds = [[0, 7], [7, 14], [14, 21], [21, 28], [28, 35], [35, 40]]

    expected = {
        "ZJets": {
            "files": {
                "tests/samples/nano_dy.root": {
                    "object_path": "Events",
                    "steps": list(chunk_bounds),
                    "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef",
                }
            }
        },
        "Data": {
            "files": {
                "tests/samples/nano_dimuon.root": {
                    "object_path": "Events",
                    "steps": list(chunk_bounds),
                    "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef",
                }
            }
        },
    }

    maxed_files = max_files(_updated_result, 1)

    assert maxed_files == expected


def test_slice_files():
    """Check ``slice_files`` with a strided slice (every second file from index 1)."""
    every_other_from_second = slice(1, None, 2)

    # The slice is expected to leave "ZJets" with no files and "Data" with only
    # its not-there file, whose steps and uuid are None.
    leftover_data_file = {
        "object_path": "Events",
        "steps": None,
        "uuid": None,
    }
    expected = {
        "ZJets": {"files": {}},
        "Data": {
            "files": {"tests/samples/nano_dimuon_not_there.root": leftover_data_file}
        },
    }

    sliced_files = slice_files(_updated_result, every_other_from_second)

    assert sliced_files == expected


def test_max_chunks():
max_chunked = max_chunks(_runnable_result, 3)

assert max_chunked == {
Expand All @@ -251,7 +295,7 @@ def test_maxchunks():
}


def test_slicechunks():
def test_slice_chunks():
slice_chunked = slice_chunks(_runnable_result, slice(None, None, 2))

assert slice_chunked == {
Expand Down

0 comments on commit d958598

Please sign in to comment.