From 45bbd5c7fb957079c391e01a313b4d7b31911c57 Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Thu, 2 Jan 2025 09:41:27 -0500 Subject: [PATCH 1/6] Writing RNTuple with no data now works --- src/uproot/models/RNTuple.py | 2 +- src/uproot/writing/_cascade.py | 79 ++++--- src/uproot/writing/_cascadentuple.py | 322 ++++++++++++++------------- 3 files changed, 209 insertions(+), 194 deletions(-) diff --git a/src/uproot/models/RNTuple.py b/src/uproot/models/RNTuple.py index 6652aec29..033a0292a 100644 --- a/src/uproot/models/RNTuple.py +++ b/src/uproot/models/RNTuple.py @@ -270,7 +270,7 @@ def read_members(self, chunk, cursor, context, file): -_rntuple_anchor_format.size - _rntuple_anchor_checksum_format.size : -_rntuple_anchor_checksum_format.size ] - ) + ), "Anchor checksum does not match! File is corrupted or incompatible." cursor.skip(-_rntuple_anchor_checksum_format.size) self._header_chunk_ready = False diff --git a/src/uproot/writing/_cascade.py b/src/uproot/writing/_cascade.py index fc37e4a45..1d4651894 100644 --- a/src/uproot/writing/_cascade.py +++ b/src/uproot/writing/_cascade.py @@ -27,6 +27,7 @@ import uuid import numpy +import xxhash import uproot.compression import uproot.const @@ -1730,58 +1731,62 @@ def add_tree( def add_rntuple(self, sink, name, title, akform): import uproot.writing._cascadentuple + rntuple_spec_version_epoch = 1 + rntuple_spec_version_major = 0 + rntuple_spec_version_minor = 0 + rntuple_spec_version_patch = 0 + anchor = uproot.writing._cascadentuple.NTuple_Anchor( - None, 0, 0, 48, None, None, None, None, None, None, 0 + None, + rntuple_spec_version_epoch, + rntuple_spec_version_major, + rntuple_spec_version_minor, + rntuple_spec_version_patch, + None, + None, + None, + None, + None, + None, + 0, # TODO: Fix this ) header = uproot.writing._cascadentuple.NTuple_Header(None, name, "", akform) footer = uproot.writing._cascadentuple.NTuple_Footer( - None, 0, header._crc32, akform + None, 0, header._checksum, akform ) - # the empty page list is hard-coded bytes which represents: - # 0 1 2 3 - # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - # | Envelope Version | Minimum Version | - # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - # | Size |T| - # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - # | Number of Items (for list frames) |Reserv.| - # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - # | FRAME PAYLOAD | - # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - # | CRC32 | - # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - # - # - Envelope Version = 1 (0x0100) - # - Minimum Version = 1 (0x0100) - # - Size = -8 (0xf8ffffff) [value is negative because this is a list] - # - Number of Items = 0 (0x00000000) [empty list] - # - FRAME PAYLOAD = empty [because number of items is 0] - # - CRC32 = 2678769841 - # manually calculate CRC32: - - # In [1]: zlib.crc32(b'\x01\x00\x01\x00\xf8\xff\xff\xff\00\00\00\00') - # Out[1]: 2678769841 - # In [2]: np.array([177, 200, 170, 159], dtype=np.uint8).view("uint32") - # Out[2]: array([2678769841], dtype=uint32) - - empty_page_list_bytes = numpy.array( - [1, 0, 1, 0, 248, 255, 255, 255, 0, 0, 0, 0, 177, 200, 170, 159], - dtype=numpy.uint8, + empty_page_list_headerbytes = ( + uproot.writing._cascadentuple._serialize_envelope_header( + uproot.const.RNTupleEnvelopeType.PAGELIST, 48 + ) ) - offset = self._freesegments.allocate(16) - footer.cluster_group_record_frames[0].page_list_envlink.locator = ( - uproot.writing._cascadentuple.NTuple_Locator(16, offset) + header.serialize() # so that checksum is computed + empty_page_list_payloadbytes = ( + uproot.models.RNTuple._rntuple_checksum_format.pack(header._checksum) + ) + empty_page_list_payloadbytes += ( + uproot.writing._cascadentuple._serialize_rntuple_list_frame([]) + ) # cluster summaries + empty_page_list_payloadbytes += ( + uproot.writing._cascadentuple._serialize_rntuple_list_frame([]) + ) # page locations + empty_page_list_bytes = ( + empty_page_list_headerbytes + empty_page_list_payloadbytes + ) + + empty_page_checksum = xxhash.xxh3_64_intdigest(empty_page_list_bytes) + checksum_bytes = uproot.models.RNTuple._rntuple_checksum_format.pack( + empty_page_checksum ) + empty_page_list_bytes += checksum_bytes ntuple = uproot.writing._cascadentuple.NTuple( self, name, title, akform, self._freesegments, header, footer, [], anchor ) - sink.write(offset, empty_page_list_bytes) + # sink.write(offset, empty_page_list_bytes) ntuple.write(sink) sink.flush() return ntuple diff --git a/src/uproot/writing/_cascadentuple.py b/src/uproot/writing/_cascadentuple.py index 8fcb39142..0892b81b2 100644 --- a/src/uproot/writing/_cascadentuple.py +++ b/src/uproot/writing/_cascadentuple.py @@ -16,6 +16,7 @@ import awkward import numpy +import xxhash import uproot import uproot.compression @@ -23,24 +24,32 @@ import uproot.reading import uproot.serialization from uproot.models.RNTuple import ( + _rntuple_anchor_checksum_format, _rntuple_anchor_format, + _rntuple_checksum_format, _rntuple_cluster_group_format, _rntuple_cluster_summary_format, _rntuple_column_record_format, + _rntuple_env_header_format, + _rntuple_envlink_size_format, _rntuple_feature_flag_format, _rntuple_field_description_format, + _rntuple_frame_num_items_format, _rntuple_frame_size_format, + _rntuple_locator_offset_format, _rntuple_locator_size_format, ) from uproot.writing._cascade import CascadeLeaf, CascadeNode, Key, String +_rntuple_string_length_format = struct.Struct(" Date: Thu, 2 Jan 2025 10:39:39 -0500 Subject: [PATCH 2/6] Re-enabled existing writing tests and fixed a few things --- src/uproot/writing/_cascadentuple.py | 35 +++++------- src/uproot/writing/writable.py | 2 +- tests/test_0705_rntuple_writing_metadata.py | 62 ++++++++++----------- 3 files changed, 44 insertions(+), 55 deletions(-) diff --git a/src/uproot/writing/_cascadentuple.py b/src/uproot/writing/_cascadentuple.py index 0892b81b2..e270fa99d 100644 --- a/src/uproot/writing/_cascadentuple.py +++ b/src/uproot/writing/_cascadentuple.py @@ -67,27 +67,20 @@ # "splitint16": 21, } _ak_primitive_to_num_dict = { - "i64": 1, - "i32": 2, - # "switch": 3, - # "byte": 4, - # "char": 5, - "bool": 6, - "float64": 7, - "float32": 8, - "float16": 9, - "int64": 10, - "int32": 11, - "int16": 12, - "int8": 13, - # "splitindex64": 14, - # "splitindex32": 15, - # "splitreal64": 16, - # "splitreal32": 17, - # "splitreal16": 18, - # "splitin64": 19, - # "splitint32": 20, - # "splitint16": 21, + "bool": 0x00, + "int8": 0x03, + "uint8": 0x04, + "int16": 0x05, + "uint16": 0x06, + "int32": 0x07, + "uint32": 0x08, + "int64": 0x09, + "uint64": 0x0A, + "float16": 0x0B, + "float32": 0x0C, + "float64": 0x0D, + "i32": 0x0E, + "i64": 0x0F, } diff --git a/src/uproot/writing/writable.py b/src/uproot/writing/writable.py index 5c0c8aec6..921ebe0b4 100644 --- a/src/uproot/writing/writable.py +++ b/src/uproot/writing/writable.py @@ -1009,7 +1009,7 @@ def _get(self, name, cycle): raise TypeError( "WritableDirectory cannot view preexisting TTrees; open the file with uproot.open instead of uproot.recreate or uproot.update" ) - elif key.classname.string == "ROOT::Experimental::RNTuple": + elif key.classname.string == "ROOT::RNTuple": if self._file._has_ntuple(key.seek_location): return self._file._get_ntuple(key.seek_location) else: diff --git a/tests/test_0705_rntuple_writing_metadata.py b/tests/test_0705_rntuple_writing_metadata.py index 5c652f281..10a447cb0 100644 --- a/tests/test_0705_rntuple_writing_metadata.py +++ b/tests/test_0705_rntuple_writing_metadata.py @@ -14,9 +14,6 @@ ak = pytest.importorskip("awkward") -@pytest.mark.skip( - reason="RNTuple writing is pending until specification 1.0.0 is released." -) def test_header(tmp_path): filepath = os.path.join(tmp_path, "test.root") @@ -34,7 +31,7 @@ def test_header(tmp_path): file = uproot.open(filepath)["ntuple"] header = file.header - assert header.crc32 == file.footer.header_crc32 + assert header.checksum == file.footer.header_checksum frs = header.field_records assert frs[0].parent_field_id == 0 @@ -45,23 +42,20 @@ def test_header(tmp_path): assert frs[2].field_name == "three" assert frs[0].type_name == "double" assert frs[1].type_name == "std::int32_t" - assert frs[2].type_name == "bit" + assert frs[2].type_name == "bool" crs = header.column_records - assert crs[0].type == 7 - assert crs[1].type == 11 - assert crs[2].type == 6 + assert crs[0].type == uproot.const.rntuple_col_type_to_num_dict["real64"] + assert crs[1].type == uproot.const.rntuple_col_type_to_num_dict["int32"] + assert crs[2].type == uproot.const.rntuple_col_type_to_num_dict["bit"] assert crs[0].field_id == 0 assert crs[1].field_id == 1 assert crs[2].field_id == 2 - assert crs[0].nbits == 64 - assert crs[1].nbits == 32 - assert crs[2].nbits == 1 + assert crs[0].nbits == uproot.const.rntuple_col_num_to_size_dict[crs[0].type] + assert crs[1].nbits == uproot.const.rntuple_col_num_to_size_dict[crs[1].type] + assert crs[2].nbits == uproot.const.rntuple_col_num_to_size_dict[crs[2].type] -@pytest.mark.skip( - reason="RNTuple writing is pending until specification 1.0.0 is released." -) def test_writable(tmp_path): filepath = os.path.join(tmp_path, "test.root") @@ -72,28 +66,30 @@ def test_writable(tmp_path): ], ["one"], ) - file.mkrntuple("ntuple", akform) + rn = file.mkrntuple("ntuple", akform) + print(rn) assert type(file["ntuple"]).__name__ == "WritableNTuple" -# FIXME get ROOT to recognize it -# ROOT = pytest.importorskip("ROOT") +ROOT = pytest.importorskip("ROOT") +if ROOT.gROOT.GetVersionInt() < 63500: + pytest.skip("ROOT version does not support RNTuple v1.0.0.0") -# def test_ROOT(tmp_path, capfd): -# filepath = os.path.join(tmp_path, "test.root") +def test_ROOT(tmp_path, capfd): + filepath = os.path.join(tmp_path, "test.root") -# with uproot.recreate(filepath) as file: -# akform = ak.forms.RecordForm( -# [ -# ak.forms.NumpyForm("float64"), -# ak.forms.NumpyForm("int32"), -# ], -# ["one", "two"], -# ) -# file.mkrntuple("ntuple", akform) -# RT = ROOT.Experimental.RNTupleReader.Open("ntuple", filepath) -# RT.PrintInfo() -# out = capfd.readouterr().out -# assert "* Field 1 : one (double)" in out -# assert "* Field 2 : two (std::int32_t)" in out + with uproot.recreate(filepath) as file: + akform = ak.forms.RecordForm( + [ + ak.forms.NumpyForm("float64"), + ak.forms.NumpyForm("int32"), + ], + ["one", "two"], + ) + file.mkrntuple("ntuple", akform) + RT = ROOT.Experimental.RNTupleReader.Open("ntuple", filepath) + RT.PrintInfo() + out = capfd.readouterr().out + assert "* Field 1 : one (double)" in out + assert "* Field 2 : two (std::int32_t)" in out From bcd82d1862e33b38ef836eb1c98c77fc213e1653 Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Tue, 11 Feb 2025 10:42:44 -0500 Subject: [PATCH 3/6] It works now for flat arrays of native types --- src/uproot/writing/_cascadentuple.py | 199 +++++++++++++++++++-------- 1 file changed, 142 insertions(+), 57 deletions(-) diff --git a/src/uproot/writing/_cascadentuple.py b/src/uproot/writing/_cascadentuple.py index e270fa99d..9c3407dbf 100644 --- a/src/uproot/writing/_cascadentuple.py +++ b/src/uproot/writing/_cascadentuple.py @@ -12,10 +12,8 @@ import datetime import struct -import zlib import awkward -import numpy import xxhash import uproot @@ -29,6 +27,8 @@ _rntuple_checksum_format, _rntuple_cluster_group_format, _rntuple_cluster_summary_format, + _rntuple_column_compression_settings_format, + _rntuple_column_element_offset_format, _rntuple_column_record_format, _rntuple_env_header_format, _rntuple_envlink_size_format, @@ -38,6 +38,7 @@ _rntuple_frame_size_format, _rntuple_locator_offset_format, _rntuple_locator_size_format, + _rntuple_page_num_elements_format, ) from uproot.writing._cascade import CascadeLeaf, CascadeNode, Key, String @@ -120,13 +121,19 @@ def _record_frame_wrap(payload, includeself=True): return raw_bytes -def _serialize_rntuple_list_frame(items, wrap=True): +def _serialize_rntuple_list_frame(items, wrap=True, rawinput=False, extra_payload=None): # when items is [], b'\xf4\xff\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00' n_items = len(items) - if wrap: + if wrap and rawinput: + payload_bytes = b"".join([_record_frame_wrap(x) for x in items]) + elif rawinput: + payload_bytes = b"".join(items) + elif wrap: payload_bytes = b"".join([_record_frame_wrap(x.serialize()) for x in items]) else: payload_bytes = b"".join([x.serialize() for x in items]) + if extra_payload is not None: + payload_bytes += extra_payload size = ( _rntuple_frame_size_format.size + _rntuple_frame_num_items_format.size @@ -350,9 +357,7 @@ def serialize(self): ) out.append(_record_frame_wrap(schema_extension_payload)) - out.append( - _serialize_rntuple_list_frame(self.cluster_group_record_frames) - ) # never empty + out.append(_serialize_rntuple_list_frame(self.cluster_group_record_frames)) payload = b"".join(out) env_header = _serialize_envelope_header( @@ -402,13 +407,81 @@ def __repr__(self): return f"{type(self).__name__}({self.uncomp_size}, {self.locator})" +class NTuple_PageListEnvelope: + def __init__( + self, header_checksum, cluster_summaries, page_data, compression_settings=0 + ): + self.header_checksum = header_checksum + self.cluster_summaries = cluster_summaries + self.page_data = page_data + self.compression_settings = compression_settings + self._checksum = None + assert len(cluster_summaries) == len(page_data) + + def serialize(self): + # For now we, only support one cluster per page list envelope + nested_pagelist_rawbytes = _serialize_rntuple_list_frame( + [ # list of clusters + _serialize_rntuple_list_frame( + [ # list of columns + _serialize_rntuple_list_frame( + [ # list of pages + NTuple_PageDescription(page[1], page[0]) for page in col + ], + wrap=False, + extra_payload=b"".join( + [ + _rntuple_column_element_offset_format.pack( + col[0][2] + ), + _rntuple_column_compression_settings_format.pack( + self.compression_settings + ), + ] + ), + ) + for col in cluster_page_locations + ], + rawinput=True, + wrap=False, + ) + for cluster_page_locations in self.page_data + ], + rawinput=True, + wrap=False, + ) + out = [ + _rntuple_checksum_format.pack(self.header_checksum), + _serialize_rntuple_list_frame(self.cluster_summaries), + nested_pagelist_rawbytes, + ] + payload = b"".join(out) + + env_header = _serialize_envelope_header( + uproot.const.RNTupleEnvelopeType.PAGELIST, + len(payload) + + _rntuple_env_header_format.size + + _rntuple_checksum_format.size, + ) + header_and_payload = b"".join([env_header, payload]) + self._checksum = xxhash.xxh3_64_intdigest(header_and_payload) + checksum_bytes = _rntuple_checksum_format.pack(self._checksum) + + final_bytes = b"".join([header_and_payload, checksum_bytes]) + return final_bytes + + class NTuple_ClusterGroupRecord: - def __init__(self, num_clusters, page_list_envlink): + def __init__(self, min_entry, entry_span, num_clusters, page_list_envlink): + self.min_entry = min_entry + self.entry_span = entry_span self.num_clusters = num_clusters self.page_list_envlink = page_list_envlink def serialize(self): - header_bytes = _rntuple_cluster_group_format.pack(0, 1, self.num_clusters) + header_bytes = _rntuple_cluster_group_format.pack( + self.min_entry, self.entry_span, self.num_clusters + ) page_list_link_bytes = self.page_list_envlink.serialize() return header_bytes + page_list_link_bytes @@ -417,20 +490,23 @@ def __repr__(self): class NTuple_ClusterSummary: - def __init__(self, num_first_entry, num_entries): + def __init__(self, num_first_entry, num_entries, flags=0): self.num_first_entry = num_first_entry self.num_entries = num_entries + self.flags = flags def serialize(self): - # from spec: - # to save space, the page descriptions (inner items) are not in a record frame. + # Highest 8 bits are flags reserved for future use + assert 0 <= self.num_first_entry < 2**56 + assert 0 <= self.flags < 2**8 + num_entries = (self.flags << 56) | self.num_entries payload_bytes = _rntuple_cluster_summary_format.pack( - self.num_first_entry, self.num_entries + self.num_first_entry, num_entries ) return payload_bytes def __repr__(self): - return f"{type(self).__name__}({self.num_first_entry}, {self.num_entries})" + return f"{type(self).__name__}({self.num_first_entry}, {self.num_entries}, {self.flags})" class NTuple_InnerListLocator: @@ -448,16 +524,20 @@ def __repr__(self): class NTuple_PageDescription: - def __init__(self, num_elements, locator): - assert num_elements <= 65536 - self.num_elements = num_elements + def __init__(self, num_entries, locator): + assert num_entries <= 65536 + self.num_entries = num_entries self.locator = locator def serialize(self): - return struct.Struct(" Date: Wed, 12 Feb 2025 13:45:48 -0500 Subject: [PATCH 4/6] Fix test --- tests/test_0705_rntuple_writing_metadata.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_0705_rntuple_writing_metadata.py b/tests/test_0705_rntuple_writing_metadata.py index 10a447cb0..6eb4e07d1 100644 --- a/tests/test_0705_rntuple_writing_metadata.py +++ b/tests/test_0705_rntuple_writing_metadata.py @@ -71,12 +71,11 @@ def test_writable(tmp_path): assert type(file["ntuple"]).__name__ == "WritableNTuple" -ROOT = pytest.importorskip("ROOT") -if ROOT.gROOT.GetVersionInt() < 63500: - pytest.skip("ROOT version does not support RNTuple v1.0.0.0") - - def test_ROOT(tmp_path, capfd): + ROOT = pytest.importorskip("ROOT") + if ROOT.gROOT.GetVersionInt() < 63500: + pytest.skip("ROOT version does not support RNTuple v1.0.0.0") + filepath = os.path.join(tmp_path, "test.root") with uproot.recreate(filepath) as file: From 50260d367e31446e728501c7f342f19cbde64525 Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Wed, 12 Feb 2025 14:31:29 -0500 Subject: [PATCH 5/6] Some cleanup --- src/uproot/const.py | 3 + src/uproot/writing/_cascade.py | 48 ++------- src/uproot/writing/_cascadentuple.py | 139 ++------------------------- 3 files changed, 19 insertions(+), 171 deletions(-) diff --git a/src/uproot/const.py b/src/uproot/const.py index 9bed664ac..64a48abc0 100644 --- a/src/uproot/const.py +++ b/src/uproot/const.py @@ -118,6 +118,9 @@ kStreamedMemberWise = numpy.uint16(1 << 14) ############ RNTuple https://github.com/root-project/root/blob/0b9cdbcfd326ba50ee6c2f202675656129eafbe7/tree/ntuple/v7/doc/BinaryFormatSpecification.md + +rntuple_version_for_writing = (1, 0, 0, 0) + rntuple_col_num_to_dtype_dict = { 0x00: "bit", 0x01: "uint8", # uninterpreted byte diff --git a/src/uproot/writing/_cascade.py b/src/uproot/writing/_cascade.py index 1d4651894..690cd9c1c 100644 --- a/src/uproot/writing/_cascade.py +++ b/src/uproot/writing/_cascade.py @@ -27,7 +27,6 @@ import uuid import numpy -import xxhash import uproot.compression import uproot.const @@ -1728,20 +1727,12 @@ def add_tree( tree.write_anew(sink) return tree - def add_rntuple(self, sink, name, title, akform): + def add_rntuple(self, sink, name, title, akform, description=""): import uproot.writing._cascadentuple - rntuple_spec_version_epoch = 1 - rntuple_spec_version_major = 0 - rntuple_spec_version_minor = 0 - rntuple_spec_version_patch = 0 - anchor = uproot.writing._cascadentuple.NTuple_Anchor( None, - rntuple_spec_version_epoch, - rntuple_spec_version_major, - rntuple_spec_version_minor, - rntuple_spec_version_patch, + *uproot.const.rntuple_version_for_writing, None, None, None, @@ -1751,44 +1742,17 @@ def add_rntuple(self, sink, name, title, akform): 0, # TODO: Fix this ) - header = uproot.writing._cascadentuple.NTuple_Header(None, name, "", akform) - - footer = uproot.writing._cascadentuple.NTuple_Footer( - None, 0, header._checksum, akform + header = uproot.writing._cascadentuple.NTuple_Header( + None, name, description, akform ) - empty_page_list_headerbytes = ( - uproot.writing._cascadentuple._serialize_envelope_header( - uproot.const.RNTupleEnvelopeType.PAGELIST, 48 - ) - ) - header.serialize() # so that checksum is computed - empty_page_list_payloadbytes = ( - uproot.models.RNTuple._rntuple_checksum_format.pack(header._checksum) - ) - empty_page_list_payloadbytes += ( - uproot.writing._cascadentuple._serialize_rntuple_list_frame([]) - ) # cluster summaries - empty_page_list_payloadbytes += ( - uproot.writing._cascadentuple._serialize_rntuple_list_frame([]) - ) # page locations - empty_page_list_bytes = ( - empty_page_list_headerbytes + empty_page_list_payloadbytes - ) - - empty_page_checksum = xxhash.xxh3_64_intdigest(empty_page_list_bytes) - checksum_bytes = uproot.models.RNTuple._rntuple_checksum_format.pack( - empty_page_checksum - ) - empty_page_list_bytes += checksum_bytes + footer = uproot.writing._cascadentuple.NTuple_Footer(None, header._checksum) ntuple = uproot.writing._cascadentuple.NTuple( - self, name, title, akform, self._freesegments, header, footer, [], anchor + self, akform, self._freesegments, header, footer, [], anchor ) - # sink.write(offset, empty_page_list_bytes) ntuple.write(sink) - sink.flush() return ntuple diff --git a/src/uproot/writing/_cascadentuple.py b/src/uproot/writing/_cascadentuple.py index 9c3407dbf..ddd9f6ea6 100644 --- a/src/uproot/writing/_cascadentuple.py +++ b/src/uproot/writing/_cascadentuple.py @@ -319,10 +319,8 @@ def serialize(self): # https://github.com/root-project/root/blob/8cd9eed6f3a32e55ef1f0f1df8e5462e753c735d/tree/ntuple/v7/doc/BinaryFormatSpecification.md#footer-envelope class NTuple_Footer(CascadeLeaf): - def __init__(self, location, feature_flags, header_checksum, akform): - self._feature_flags = feature_flags + def __init__(self, location, header_checksum): self._header_checksum = header_checksum - self._akform = akform self.extension_field_record_frames = [] self.extension_column_record_frames = [] @@ -343,7 +341,7 @@ def serialize(self): out = [] out.extend( [ - _rntuple_feature_flag_format.pack(self._feature_flags), + _rntuple_feature_flag_format.pack(0), _rntuple_checksum_format.pack(self._header_checksum), ] ) @@ -640,8 +638,6 @@ class NTuple(CascadeNode): def __init__( self, directory, - name, - title, ak_form, freesegments, header, @@ -651,8 +647,6 @@ def __init__( ): super().__init__(footer, anchor, freesegments) self._directory = directory - self._name = name - self._title = title self._header = header self._footer = footer self._cluster_metadata = cluster_metadata @@ -664,7 +658,7 @@ def __init__( self._num_entries = 0 def __repr__(self): - return f"{type(self).__name__}({self._directory}, {self._name}, {self._title}, {self._header}, {self._footer}, {self._cluster_metadata}, {self._anchor}, {self._freesegments})" + return f"{type(self).__name__}({self._directory}, {self._header}, {self._footer}, {self._cluster_metadata}, {self._anchor}, {self._freesegments})" @property def directory(self): @@ -702,125 +696,12 @@ def location(self): def num_entries(self): return self._num_entries - def actually_use(self, array): - pass - # print(type(array)) - # print(f"using {array!r}") - - def array_to_type(self, array, type): - if isinstance(type, awkward.types.ArrayType): - type = type.content - # type is unknown - if isinstance(type, awkward.types.UnknownType): - raise TypeError("cannot write data of unknown type to RNTuple") - - # type is primitive (e.g. "float32") - elif isinstance(type, awkward.types.NumpyType): - if isinstance(array, awkward.contents.IndexedArray): - self.array_to_type(array.project(), type) # always project IndexedArray - return - elif isinstance(array, awkward.contents.EmptyArray): - self.array_to_type( - array.to_NumpyArray( - awkward.types.numpytype.primitive_to_dtype(type.primitive) - ), - type, - ) - return - elif isinstance(array, awkward.contents.NumpyArray): - if array.form.type != type: - raise TypeError(f"expected {type!s}, found {array.form.type!s}") - else: - self.actually_use(array.data) - return - else: - raise TypeError(f"expected {type!s}, found {array.form.type!s}") - - # type is regular-length lists (e.g. "3 * float32") - elif isinstance(type, awkward.types.RegularType): - if isinstance(array, awkward.contents.IndexedArray): - self.array_to_type(array.project(), type) # always project IndexedArray - return - elif isinstance(array, awkward.contents.RegularArray): - if array.size != type.size: - raise TypeError(f"expected {type!s}, found {array.form.type!s}") - else: - if type.parameter("__array__") == "string": - # maybe the fact that this is a string changes how it's used - self.actually_use(f"regular strings of length {type.size}") - else: - self.actually_use(f"regular lists of length {type.size}") - self.array_to_type(array.content, type.content) - return - else: - raise TypeError(f"expected {type!s}, found {array.form.type!s}") - - # type is variable-length lists (e.g. "var * float32") - elif isinstance(type, awkward.types.ListType): - if isinstance(array, awkward.contents.IndexedArray): - self.array_to_type(array.project(), type) # always project IndexedArray - return - elif isinstance(array, awkward.contents.ListArray): - self.array_to_type(array.toListOffsetArray64(True), type) - return - elif isinstance(array, awkward.contents.ListOffsetArray): - if type.parameter("__array__") == "string": - # maybe the fact that this is a string changes how it's used - self.actually_use("variable-length strings") - else: - self.actually_use("variable-length lists") - self.actually_use(array.offsets.data) - self.array_to_type(array.content, type.content) - return - else: - raise TypeError(f"expected {type!s}, found {array.form.type!s}") - - # type is potentially missing data (e.g. "?float32") - elif isinstance(type, awkward.types.OptionType): - raise NotImplementedError("RNTuple does not yet have an option-type") - - # type is struct-like records (e.g. "{x: float32, y: var * int64}") - elif isinstance(type, awkward.types.RecordType): - if isinstance(array, awkward.contents.IndexedArray): - self.array_to_type(array.project(), type) # always project IndexedArray - return - elif isinstance(array, awkward.contents.RecordArray): - self.actually_use("begin record") - for field, subtype in zip(type.fields, type.contents): - self.actually_use(f"field {field}") - self.array_to_type(array[field], subtype) - self.actually_use("end record") - return - else: - raise TypeError(f"expected {type!s}, found {array.form.type!s}") - - # type is heterogeneous unions/variants (e.g. "union[float32, var * int64]") - elif isinstance(type, awkward.types.UnionType): - if isinstance(array, awkward.contents.IndexedArray): - self.array_to_type(array.project(), type) # always project IndexedArray - return - elif isinstance(array, awkward.contents.UnionArray): - self.actually_use("begin union") - self.actually_use(array.tags.data) - self.actually_use(array.index.data) - for index, subtype in enumerate(type.contents): - self.actually_use(f"index {index}") - self.array_to_type(array.project(index), subtype) - self.actually_use("end union") - return - else: - raise TypeError(f"expected {type!s}, found {array.form.type!s}") - - else: - raise AssertionError(f"type must be an Awkward Type, not {type!r}") - def extend(self, file, sink, data): """ 1. Write pages 2. Write page list for new cluster group - 3. page list envelopes - 4. relocate footer - 5. update anchor's foot metadata values in-place + 3. Relocate footer + 4. Update anchor's foot metadata values in-place """ if data.layout.form != self._header._akform: @@ -870,9 +751,8 @@ def extend(self, file, sink, data): self._footer.cluster_group_record_frames.append(cluster_group) - # self.array_to_type(data.layout, data.type) # TODO: what does this do? + # 3. Relocate footer - #### relocate Footer ############################## old_footer_key = self._footer_key self._freesegments.release( old_footer_key.location, old_footer_key.location + old_footer_key.allocation @@ -885,7 +765,8 @@ def extend(self, file, sink, data): big=False, ) - ### update anchor + # 4. Update anchor's foot metadata values in-place + self._anchor.seek_footer = ( self._footer_key.location + self._footer_key.allocation ) @@ -974,8 +855,8 @@ def write(self, sink): self._key = self._directory.add_object( sink, "ROOT::RNTuple", - self._name, - self._title, + self._header._name, + self._header._name, anchor_raw_data, len(anchor_raw_data), replaces=self._key, From aa62a00f18c3eabfda75afdebeb7bce1a4bf5276 Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Wed, 12 Feb 2025 15:08:11 -0500 Subject: [PATCH 6/6] Added test for basic writing --- tests/test_1356_basic_rntuple_writing.py | 56 ++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 tests/test_1356_basic_rntuple_writing.py diff --git a/tests/test_1356_basic_rntuple_writing.py b/tests/test_1356_basic_rntuple_writing.py new file mode 100644 index 000000000..b8bdaad51 --- /dev/null +++ b/tests/test_1356_basic_rntuple_writing.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE + +import json +import os +import queue +import sys + +import numpy +import pytest +import skhep_testdata + +import uproot + +ak = pytest.importorskip("awkward") + + +def test_flat_arrays(tmp_path): + filepath = os.path.join(tmp_path, "test.root") + + with uproot.recreate(filepath) as file: + data = ak.Array({"one": [1, 2, 3], "two": [1.1, 2.2, 3.3]}) + obj = file.mkrntuple("ntuple", data.layout.form) + obj.extend(data) + + obj = uproot.open(filepath)["ntuple"] + arrays = obj.arrays() + + assert arrays.one.tolist() == data.one.tolist() + assert arrays.two.tolist() == data.two.tolist() + + +def test_flat_arrayst_ROOT(tmp_path, capfd): + ROOT = pytest.importorskip("ROOT") + if ROOT.gROOT.GetVersionInt() < 63500: + pytest.skip("ROOT version does not support RNTuple v1.0.0.0") + + filepath = os.path.join(tmp_path, "test.root") + + with uproot.recreate(filepath) as file: + data = ak.Array({"one": [1, 2, 3], "two": [1.1, 2.2, 3.3]}) + obj = file.mkrntuple("ntuple", data.layout.form) + obj.extend(data) + + RT = ROOT.Experimental.RNTupleReader.Open("ntuple", filepath) + RT.PrintInfo() + RT.Show(0) + RT.Show(2) + out = capfd.readouterr().out + assert "* N-Tuple : ntuple" in out + assert "* Entries : 3" in out + assert "* Field 1 : one (std::int64_t)" in out + assert "* Field 2 : two (double)" in out + assert ' "one": 1,' in out + assert ' "two": 1.1' in out + assert ' "one": 3' in out + assert ' "two": 3.3' in out