Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add globbing support in makeReader and CreateDataSource #729

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/podio/DataSource.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,10 @@ class DataSource : public ROOT::RDF::RDataSource {
ROOT::RDataFrame CreateDataFrame(const std::vector<std::string>& filePathList);

///
/// @brief Create RDataFrame from a Podio file.
/// @brief Create RDataFrame from a Podio file or glob pattern matching multiple Podio files.
///
/// @param[in] filePath File path from which the RDataFrame will be created.
/// The file path can include glob patterns to match multiple files.
/// @return RDataFrame created from input file list.
///
ROOT::RDataFrame CreateDataFrame(const std::string& filePath);
Expand Down
5 changes: 3 additions & 2 deletions include/podio/Reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,13 @@ class Reader {
}
};

/// Create a Reader is able to read the file
/// Create a Reader that is able to read the file or files matching a glob pattern
///
/// This will inspect the filename as well as peek at the file contents to
/// instantiate the correct low level reader to open and read the file
///
/// @param filename The (path to the) file to read from
/// @param filename The (path to the) file to read from.
/// The file path can include glob patterns to match multiple files.
///
/// @returns A Reader that has been initialized and that can be used for reading
/// data from the passed file(s)
Expand Down
38 changes: 38 additions & 0 deletions include/podio/utilities/Glob.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#ifndef PODIO_UTILITIES_GLOB_H
#define PODIO_UTILITIES_GLOB_H
#include <string>
#include <vector>

namespace podio::utils {
/**
 * @brief Expands a given glob pattern into a list of matching file paths.
 *
 * This function takes a glob pattern as input and returns a vector of strings
 * containing the paths that match the pattern. It supports standard glob rules
 * extended with tilde expansion and brace expansion. If the pattern doesn't
 * contain any wildcards then it is placed in the returned vector as is. Paths
 * that cannot be accessed are reported on std::cerr, but the expansion process
 * is not aborted.
 *
 * On platforms without <glob.h> no expansion is performed: a pattern without
 * wildcards is returned unchanged as the only element of the vector, while a
 * pattern that contains wildcards (see is_glob_pattern) is rejected with an
 * exception.
 *
 * @param pattern The glob pattern to expand.
 * @return A vector of strings containing the matching file paths.
 *
 * @throws std::runtime_error If no matches are found or if there is an error
 * during glob expansion.
 */
std::vector<std::string> expand_glob(const std::string& pattern);
/**
 * @brief Checks if a given pattern is a glob pattern.
 *
 * This function determines whether the provided pattern contains any standard
 * glob or brace expansion wildcards, i.e. one of the characters `*`, `?`, `[`
 * or `{`. A character that directly follows a backslash is treated as escaped
 * and is not considered a wildcard.
 *
 * @param pattern The pattern to check.
 * @return true if the pattern is a glob pattern, false otherwise.
 */
bool is_glob_pattern(const std::string& pattern);

} // namespace podio::utils

#endif // PODIO_UTILITIES_GLOB_H
30 changes: 20 additions & 10 deletions python/podio/reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,32 +50,42 @@ def _determine_root_format(filename):
return RootFileFormat.RNTUPLE


def get_reader(filename):
"""Get an appropriate reader for the passed file.
def get_reader(filenames):
"""Get an appropriate reader for the passed files.
m-fila marked this conversation as resolved.
Show resolved Hide resolved

The reader is inferred from the first file if multiple are given.
All files are assumed to be of the same I/O format.

Args:
filename (str): The input file
filenames (str or list[str]): The input file(s)

Returns:
root_io.[Legacy]Reader, sio_io.[Legacy]Reader: an initialized reader that
is able to process the input file.
is able to process the input file(s).

Raises:
ValueError: If the file cannot be recognized, or if podio has not been
ValueError: If the files cannot be recognized, or if podio has not been
built with the necessary backend I/O support
IndexError: If filenames is an empty list
"""

if isinstance(filenames, str):
filename = filenames
else:
filename = filenames[0]

if filename.endswith(".sio"):
if _is_frame_sio_file(filename):
return sio_io.Reader(filename)
return sio_io.LegacyReader(filename)
return sio_io.Reader(filenames)
return sio_io.LegacyReader(filenames)

if filename.endswith(".root"):
root_flavor = _determine_root_format(filename)
if root_flavor == RootFileFormat.TTREE:
return root_io.Reader(filename)
return root_io.Reader(filenames)
if root_flavor == RootFileFormat.RNTUPLE:
return root_io.RNTupleReader(filename)
return root_io.RNTupleReader(filenames)
if root_flavor == RootFileFormat.LEGACY:
return root_io.LegacyReader(filename)
return root_io.LegacyReader(filenames)

raise ValueError("file must end on .root or .sio")
29 changes: 29 additions & 0 deletions python/podio/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from collections.abc import Iterable
from pathlib import Path

from ROOT import podio


def convert_to_str_paths(filenames):
"""Converts filenames to string paths, handling both string and pathlib.Path objects and
Expand All @@ -22,3 +24,30 @@ def convert_to_str_paths(filenames):
return [os.fspath(fn) for fn in filenames]

return [os.fspath(filenames)]


def expand_glob(pattern):
    """
    Expands a given glob pattern into a list of matching file paths.

    This function takes a glob pattern as input and returns a list of strings
    containing the paths that match the pattern. It supports standard glob rules
    extended with tilde expansion and brace expansion. If the pattern doesn't
    contain any wildcards, it is placed in the returned list as is. Paths that
    cannot be accessed are displayed on stderr, but the expansion process is not
    aborted.

    Args:
        pattern (str or os.PathLike): The glob pattern to expand. Path-like
            objects (e.g. pathlib.Path) are converted to their string
            representation before being handed to the C++ implementation.

    Returns:
        list of str: A list of strings containing the matching file paths.

    Raises:
        cppyy.gbl.std.runtime_error: If no matches are found or if there is an error during glob
            expansion.
    """
    # The C++ function takes a std::string, so normalise path-like input first
    # (strings pass through os.fspath unchanged)
    return [str(x) for x in podio.utils.expand_glob(os.fspath(pattern))]


# Expose the C++ podio::utils::is_glob_pattern function under the same name in
# this module; it is called with the pattern to check and returns a bool
is_glob_pattern = podio.utils.is_glob_pattern
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ SET(core_sources
CollectionBufferFactory.cc
MurmurHash3.cpp
SchemaEvolution.cc
Glob.cc
)

SET(core_headers
Expand All @@ -69,6 +70,7 @@ SET(core_headers
${PROJECT_SOURCE_DIR}/include/podio/utilities/DatamodelRegistryIOHelpers.h
${PROJECT_SOURCE_DIR}/include/podio/GenericParameters.h
${PROJECT_SOURCE_DIR}/include/podio/LinkCollection.h
${PROJECT_SOURCE_DIR}/include/podio/utilities/Glob.h
)

PODIO_ADD_LIB_AND_DICT(podio "${core_headers}" "${core_sources}" selection.xml)
Expand Down
5 changes: 2 additions & 3 deletions src/DataSource.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "podio/DataSource.h"
#include "podio/Reader.h"
#include "podio/utilities/Glob.h"

// podio
#include <podio/FrameCategories.h>
Expand All @@ -13,9 +14,7 @@
#include <memory>

namespace podio {
DataSource::DataSource(const std::string& filePath, int nEvents) : m_nSlots{1} {
m_filePathList.emplace_back(filePath);
SetupInput(nEvents);
DataSource::DataSource(const std::string& filePath, int nEvents) : DataSource(utils::expand_glob(filePath), nEvents) {
}

DataSource::DataSource(const std::vector<std::string>& filePathList, int nEvents) :
Expand Down
64 changes: 64 additions & 0 deletions src/Glob.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#include "podio/utilities/Glob.h"
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#if __has_include(<glob.h>)
#include <glob.h>
#else
#include <system_error>
#endif
m-fila marked this conversation as resolved.
Show resolved Hide resolved

namespace podio::utils {

/// Check whether the pattern contains at least one unescaped glob or brace
/// expansion wildcard ('*', '?', '[' or '{'). A backslash escapes the character
/// that directly follows it, so that character is never treated as a wildcard.
bool is_glob_pattern(const std::string& pattern) {
  for (std::string::size_type pos = 0; pos < pattern.size(); ++pos) {
    switch (pattern[pos]) {
    case '\\':
      // Step over the escaped character (harmless if the backslash is the last one)
      ++pos;
      break;
    case '*':
    case '?':
    case '[':
    case '{':
      return true;
    default:
      break;
    }
  }
  return false;
}

#if __has_include(<glob.h>)

/// Error callback passed to glob(): report the path that could not be accessed
/// together with the error code on std::cerr and return 0, so that the
/// expansion process is not aborted.
int glob_err_handler(const char* epath, int eerrno) {
  auto& err = std::cerr;
  err << "Glob expansion error accessing path: " << epath;
  err << " (error code: " << eerrno << ")\n";
  return 0;
}

/// Expand the pattern with glob() from <glob.h>, with tilde and brace expansion
/// enabled. A pattern without wildcards is returned as is (GLOB_NOMAGIC). Paths
/// that cannot be accessed are reported via glob_err_handler without aborting
/// the expansion.
///
/// @param pattern The glob pattern to expand
/// @returns The paths matching the pattern
/// @throws std::runtime_error if nothing matches the pattern or if glob()
///         reports any other error
std::vector<std::string> expand_glob(const std::string& pattern) {
  // Value-initialize so that globfree() always sees a well defined structure
  glob_t glob_result{};
  // Release whatever glob() allocated on every exit path, including the
  // no-match case and exceptions thrown while building the result vector
  struct GlobGuard {
    glob_t* handle;
    ~GlobGuard() {
      globfree(handle);
    }
  } guard{&glob_result};

  const auto retv = glob(pattern.c_str(), GLOB_TILDE | GLOB_BRACE | GLOB_NOMAGIC, glob_err_handler, &glob_result);
  if (retv == GLOB_NOMATCH) {
    throw std::runtime_error("Glob expansion found no matches for pattern: " + pattern);
  }
  if (retv != 0) {
    throw std::runtime_error("Glob expansion error");
  }
  // Copy the C strings into owning std::strings before the guard frees them
  return std::vector<std::string>(glob_result.gl_pathv, glob_result.gl_pathv + glob_result.gl_pathc);
}

#else

/// Fallback for platforms without <glob.h>: no expansion is possible here, so
/// a pattern without wildcards is passed through unchanged and anything that
/// looks like a glob pattern is rejected.
///
/// @param pattern The pattern (or plain path) to handle
/// @returns A vector holding the unmodified pattern as its only element
/// @throws std::runtime_error if the pattern contains glob wildcards
std::vector<std::string> expand_glob(const std::string& pattern) {
  if (is_glob_pattern(pattern)) {
    throw std::runtime_error("Glob expansion is not supported on this platform");
  }
  return {pattern};
}

#endif // __has_include(<glob.h>)

} // namespace podio::utils
4 changes: 3 additions & 1 deletion src/Reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include "podio/SIOReader.h"
#endif

#include "podio/utilities/Glob.h"

#include "TFile.h"
#include "TKey.h"
#include <memory>
Expand All @@ -19,7 +21,7 @@ Reader::Reader(std::unique_ptr<T> reader) : m_self(std::make_unique<ReaderModel<
}

Reader makeReader(const std::string& filename) {
return makeReader(std::vector<std::string>{filename});
return makeReader(utils::expand_glob(filename));
}

Reader makeReader(const std::vector<std::string>& filenames) {
Expand Down
3 changes: 3 additions & 0 deletions src/selection.xml
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,8 @@
<class name="podio::LinkData"/>
<class name="std::vector<podio::LinkData>"/>

<!-- Glob helpers are declared in namespace podio::utils (podio/utilities/Glob.h) -->
<function name="podio::utils::is_glob_pattern"/>
<function name="podio::utils::expand_glob"/>
m-fila marked this conversation as resolved.
Show resolved Hide resolved

</selection>
</lcgdict>
2 changes: 2 additions & 0 deletions tests/CTestCustom.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ if ((NOT "@FORCE_RUN_ALL_TESTS@" STREQUAL "ON") AND (NOT "@USE_SANITIZER@" STREQ

write_frame_root
read_frame_root
read_glob
read_python_multiple

write_interface_root
read_interface_root
Expand Down
6 changes: 6 additions & 0 deletions tests/root_io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(root_dependent_tests
read_and_write_frame_root.cpp
write_interface_root.cpp
read_interface_root.cpp
read_glob.cpp
)
if(ENABLE_RNTUPLE)
set(root_dependent_tests
Expand Down Expand Up @@ -39,11 +40,16 @@ set_tests_properties(
read_frame_root
read_frame_root_multiple
read_and_write_frame_root
read_glob

PROPERTIES
DEPENDS write_frame_root
)

# Python test running tests/root_io/read_multiple.py. It is ordered after
# write_frame_root because the script reads the example_frame*.root files
# that are produced by that test.
add_test(NAME read_python_multiple COMMAND python3 ${PROJECT_SOURCE_DIR}/tests/root_io/read_multiple.py)
# NOTE(review): presumably sets up the environment needed to run the podio tests - confirm
PODIO_SET_TEST_ENV(read_python_multiple)
set_property(TEST read_python_multiple PROPERTY DEPENDS write_frame_root)

if(ENABLE_RNTUPLE)
set_property(TEST read_rntuple PROPERTY DEPENDS write_rntuple)
set_property(TEST read_interface_rntuple PROPERTY DEPENDS write_interface_rntuple)
Expand Down
4 changes: 4 additions & 0 deletions tests/root_io/read_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@
rdf = CreateDataFrame("example_frame.root")

assert rdf.Count().GetValue() == 10

# The single character wildcard is expected to match example_frame_0.root and
# example_frame_1.root, i.e. two files, hence twice the events of a single file
rdf = CreateDataFrame("example_frame_?.root")

assert rdf.Count().GetValue() == 20
40 changes: 40 additions & 0 deletions tests/root_io/read_glob.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#include "podio/Reader.h"
#include "podio/utilities/Glob.h"
#if PODIO_ENABLE_DATASOURCE
#include "podio/DataSource.h"
#endif

#include <stdexcept>

// Throw a std::runtime_error carrying msg if condition does not hold.
// Wrapped in do { } while (false) so that the macro expands to exactly one
// statement and can be used safely in unbraced if / else branches.
#define ASSERT(condition, msg)                                                                                         \
  do {                                                                                                                 \
    if (!(condition)) {                                                                                                \
      throw std::runtime_error(msg);                                                                                   \
    }                                                                                                                  \
  } while (false)

// Checks the globbing utilities on their own and through makeReader and (if
// enabled) CreateDataFrame, once with an externally expanded list of files and
// once passing the pattern directly
int main() {
  const char* const globPattern = "example_frame_?.root";
  const int nExpectedEvents = 20;

  // The utilities on their own: recognize the pattern and expand it
  ASSERT(podio::utils::is_glob_pattern(globPattern), "Failed to recognize glob pattern");
  const auto matchedFiles = podio::utils::expand_glob(globPattern);
  ASSERT(matchedFiles.size() == 2, "Glob expanded to a wrong number of files");
  ASSERT(matchedFiles.front() == "example_frame_0.root", "Glob expanded to an unexpected file");
  ASSERT(matchedFiles.back() == "example_frame_1.root", "Glob expanded to an unexpected file");

  {
    // Pass the already expanded list of files
    const auto listReader = podio::makeReader(matchedFiles);
    ASSERT((listReader.getEvents() == nExpectedEvents), "Reader read invalid number of events");
#if PODIO_ENABLE_DATASOURCE
    auto listFrame = podio::CreateDataFrame(matchedFiles);
    ASSERT(listFrame.Count().GetValue() == nExpectedEvents, "DataSource read invalid number of events");
#endif // PODIO_ENABLE_DATASOURCE
  }

  {
    // Pass the pattern itself and rely on the expansion happening internally
    const auto patternReader = podio::makeReader(globPattern);
    ASSERT((patternReader.getEvents() == nExpectedEvents), "Reader read invalid number of events");
#if PODIO_ENABLE_DATASOURCE
    auto patternFrame = podio::CreateDataFrame(globPattern);
    ASSERT(patternFrame.Count().GetValue() == nExpectedEvents, "DataSource read invalid number of events");
#endif // PODIO_ENABLE_DATASOURCE
  }

  return 0;
}
13 changes: 13 additions & 0 deletions tests/root_io/read_multiple.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Small test case for checking get_reader working with
a single file and a list of files obtained by expanding a glob pattern"""

import podio

GLOB_PATTERN = "example_frame_?.root"

# The pattern has to be recognized as a glob and expand to the two numbered files
assert podio.utils.is_glob_pattern(GLOB_PATTERN)
matched_files = podio.utils.expand_glob(GLOB_PATTERN)
assert matched_files == ["example_frame_0.root", "example_frame_1.root"]

# A single file name passed as a string
frame_reader = podio.reading.get_reader("example_frame.root")
assert len(frame_reader.get("events")) == 10
# The list of files from the glob expansion; events of both files are expected
frame_reader = podio.reading.get_reader(matched_files)
assert len(frame_reader.get("events")) == 20
8 changes: 7 additions & 1 deletion tests/root_io/write_frame_root.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@

#include "podio/ROOTWriter.h"

#include <filesystem>

int main(int, char**) {
write_frames<podio::ROOTWriter>("example_frame.root");
const auto filename = "example_frame.root";
write_frames<podio::ROOTWriter>(filename);
// copy file multiple times for tests with glob
std::filesystem::copy_file(filename, "example_frame_0.root", std::filesystem::copy_options::overwrite_existing);
std::filesystem::copy_file(filename, "example_frame_1.root", std::filesystem::copy_options::overwrite_existing);
return 0;
}
Loading