diff --git a/CITATION.cff b/CITATION.cff index b6bfb95f..e2e64b23 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,6 +1,9 @@ cff-version: 1.1.0 message: "Cite as" authors: + - family-names: Bauer + given-names: Daniel + orcid: https://orcid.org/0000-0001-9447-460X - family-names: Chadwick given-names: Eli orcid: https://orcid.org/0000-0002-0035-6475 diff --git a/README.md b/README.md index ed67a13b..50488e96 100644 --- a/README.md +++ b/README.md @@ -451,6 +451,7 @@ Options: * Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH * Copyright 2024 Data Centre, SciLifeLab, SE * Copyright 2024 National Institute of Informatics (NII), JP +* Copyright 2025 Senckenberg Society for Nature Research (SGN), DE Licensed under the Apache License, version 2.0 <https://www.apache.org/licenses/LICENSE-2.0>, diff --git a/examples/fastapi/main.py b/examples/fastapi/main.py new file mode 100644 index 00000000..a5c5ea57 --- /dev/null +++ b/examples/fastapi/main.py @@ -0,0 +1,64 @@ +# Copyright 2019-2024 The University of Manchester, UK +# Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), ES +# Copyright 2020-2024 Center for Advanced Studies, Research and Development in Sardinia (CRS4), IT +# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH +# Copyright 2024 Data Centre, SciLifeLab, SE +# Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Streaming RO-Crates from a web server + +This example demonstrates how to create an RO-Crate on-the-fly +and stream the result to the client. +By using `stream_zip`, the RO-Crate is not written to disk and remote +data is only fetched on the fly. + +To run: `fastapi dev main.py`, then visit http://localhost:8000/crate +""" + +from fastapi import FastAPI +from fastapi.responses import StreamingResponse +from rocrate.rocrate import ROCrate +from io import StringIO + +app = FastAPI() + + +@app.get("/crate") +async def get(): + crate = ROCrate() + + # Add a remote file + crate.add_file( + "https://raw.githubusercontent.com/ResearchObject/ro-crate-py/refs/heads/master/test/test-data/sample_file.txt", + fetch_remote=True + ) + + # Add a file containing a string to the crate + crate.add_file( + source=StringIO("Hello, World!"), + dest_path="test-data/hello.txt" + ) + + # Stream crate to client as a zip file + return StreamingResponse( + crate.stream_zip(), + media_type="application/rocrate+zip", + headers={ + "Content-Disposition": "attachment; filename=crate.zip", + } + ) diff --git a/examples/fastapi/requirements.txt b/examples/fastapi/requirements.txt new file mode 100644 index 00000000..09eae111 --- /dev/null +++ b/examples/fastapi/requirements.txt @@ -0,0 +1,3 @@ +../../ +fastapi +fastapi-cli diff --git a/examples/read_test_metadata.py b/examples/read_test_metadata.py index 722d0a16..88b77d52 100644 --- a/examples/read_test_metadata.py +++ b/examples/read_test_metadata.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/__init__.py b/rocrate/__init__.py index b4ef8599..53905bd9 100644 --- a/rocrate/__init__.py +++ b/rocrate/__init__.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,6 +31,7 @@ """ __author__ = ", ".join(( + 'Daniel Bauer', 'Eli Chadwick', 'Paul De Geest', 'Bert Droesbeke', @@ -52,6 +54,7 @@ Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH Copyright 2024 Data Centre, SciLifeLab, SE Copyright 2024 National Institute of Informatics (NII), JP +Copyright 2025 Senckenberg Society for Nature Research (SGN), DE """ __license__ = ("Apache License, version 2.0 " "<https://www.apache.org/licenses/LICENSE-2.0>") diff --git a/rocrate/cli.py b/rocrate/cli.py index a042dbd9..64693906 100644 --- a/rocrate/cli.py +++ b/rocrate/cli.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/memory_buffer.py b/rocrate/memory_buffer.py new file mode 100644 index 00000000..9ed5b789 --- /dev/null +++ b/rocrate/memory_buffer.py @@ -0,0 +1,52 @@ +# Copyright 2019-2024 The University of Manchester, UK +# Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), ES +# Copyright 2020-2024 Center for Advanced Studies, Research and Development in Sardinia (CRS4), IT +# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH +# Copyright 2024 Data Centre, SciLifeLab, SE +# Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from io import RawIOBase + + +class MemoryBuffer(RawIOBase): + """ + A buffer class that supports reading and writing binary data. + The buffer automatically resets upon reading to make sure all data is read only once. 
+ """ + + def __init__(self): + self._buffer = b'' + + def write(self, data): + if self.closed: + raise ValueError('write to closed file') + self._buffer += data + return len(data) + + def read(self, size=-1): + if self.closed: + raise ValueError('read from closed file') + if size < 0: + data = self._buffer + self._buffer = b'' + else: + data = self._buffer[:size] + self._buffer = self._buffer[size:] + return data + + def __len__(self): + return len(self._buffer) diff --git a/rocrate/metadata.py b/rocrate/metadata.py index 29cdb218..7a5ea720 100644 --- a/rocrate/metadata.py +++ b/rocrate/metadata.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/__init__.py b/rocrate/model/__init__.py index 5ae3c862..2c482d59 100644 --- a/rocrate/model/__init__.py +++ b/rocrate/model/__init__.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/computationalworkflow.py b/rocrate/model/computationalworkflow.py index 1ca93773..6754583e 100644 --- a/rocrate/model/computationalworkflow.py +++ b/rocrate/model/computationalworkflow.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/computerlanguage.py b/rocrate/model/computerlanguage.py index 88546f97..511c53ce 100644 --- a/rocrate/model/computerlanguage.py +++ b/rocrate/model/computerlanguage.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/contextentity.py b/rocrate/model/contextentity.py index 9ccc2fb9..1bd94db8 100644 --- a/rocrate/model/contextentity.py +++ b/rocrate/model/contextentity.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/creativework.py b/rocrate/model/creativework.py index 32e4341d..1b1b2498 100644 --- a/rocrate/model/creativework.py +++ b/rocrate/model/creativework.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/data_entity.py b/rocrate/model/data_entity.py index 22e2f01e..2c44e5ab 100644 --- a/rocrate/model/data_entity.py +++ b/rocrate/model/data_entity.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,3 +29,14 @@ class DataEntity(Entity): def write(self, base_path): pass + + def stream(self, chunk_size=8192): + """ Stream the data from the source. Each chunk of the content is yielded as a tuple + containing the name of the destination file relative to the crate and the chunk of data. + The destination file name is required because a DataEntity can be a file or a + collection of files (Dataset) and the caller needs to know to which file a chunk belongs. + For collection of files, the caller can assume that files are streamed one after another, + meaning once the destination name changes, a file can be closed and the next one can be + opened. 
+ """ + yield from () diff --git a/rocrate/model/dataset.py b/rocrate/model/dataset.py index 0e0e52ff..77f4f93c 100644 --- a/rocrate/model/dataset.py +++ b/rocrate/model/dataset.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,7 +23,7 @@ import errno import os -import shutil +import warnings from pathlib import Path from urllib.request import urlopen @@ -43,37 +44,84 @@ def _empty(self): def format_id(self, identifier): return identifier.rstrip("/") + "/" + def _write_from_url(self, base_path): + if self.validate_url and not self.fetch_remote: + with urlopen(self.source) as _: + self._jsonld['sdDatePublished'] = iso_now() + if self.fetch_remote: + out_file_path, out_file = None, None + for rel_path, chunk in self._stream_folder_from_url(): + path = base_path / rel_path + if path != out_file_path: + if out_file: + out_file.close() + out_file_path = Path(path) + out_file_path.parent.mkdir(parents=True, exist_ok=True) + out_file = open(out_file_path, 'wb') + out_file.write(chunk) + if out_file: + out_file.close() + + def _copy_folder(self, base_path): + abs_out_path = base_path / self.id + if self.source is None: + abs_out_path.mkdir(parents=True, exist_ok=True) + else: + if not Path(self.source).exists(): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) + ) + abs_out_path.mkdir(parents=True, exist_ok=True) + if not self.crate.source: + self.crate._copy_unlisted(self.source, abs_out_path) + def write(self, base_path): - out_path = Path(base_path) / self.id + base_path = Path(base_path) if is_url(str(self.source)): - if self.validate_url and not self.fetch_remote: + 
self._write_from_url(base_path) + else: + self._copy_folder(base_path) + + def stream(self, chunk_size=8192): + if self.source is None: + return + elif is_url(str(self.source)): + yield from self._stream_folder_from_url(chunk_size) + else: + yield from self._stream_folder_from_path(chunk_size) + + def _stream_folder_from_path(self, chunk_size=8192): + if not Path(str(self.source)).exists(): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) + ) + if not self.crate.source: + for root, _, files in os.walk(self.source): + root = Path(root) + for name in files: + source = root / name + dest = source.relative_to(Path(self.source).parent) + with open(source, 'rb') as f: + while chunk := f.read(chunk_size): + yield str(dest), chunk + + def _stream_folder_from_url(self, chunk_size=8192): + if not self.fetch_remote: + if self.validate_url: with urlopen(self.source) as _: self._jsonld['sdDatePublished'] = iso_now() - if self.fetch_remote: - self.__get_parts(out_path) else: - if self.source is None: - out_path.mkdir(parents=True, exist_ok=True) - else: - if not Path(self.source).exists(): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), str(self.source) - ) - out_path.mkdir(parents=True, exist_ok=True) - if not self.crate.source: - self.crate._copy_unlisted(self.source, out_path) + base = self.source.rstrip("/") + for entry in self._jsonld.get("hasPart", []): + try: + part = entry["@id"] + if is_url(part) or part.startswith("/"): + raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path") + part_uri = f"{base}/{part}" + rel_out_path = Path(self.id) / part - def __get_parts(self, out_path): - out_path.mkdir(parents=True, exist_ok=True) - base = self.source.rstrip("/") - for entry in self._jsonld.get("hasPart", []): - try: - part = entry["@id"] - except KeyError: - continue - if is_url(part) or part.startswith("/"): - raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path") - 
part_uri = f"{base}/{part}" - part_out_path = out_path / part - with urlopen(part_uri) as r, open(part_out_path, 'wb') as f: - shutil.copyfileobj(r, f) + with urlopen(part_uri) as response: + while chunk := response.read(chunk_size): + yield str(rel_out_path), chunk + except KeyError: + warnings.warn(f"'hasPart' entry in {self.id} is missing '@id'. Skipping.") diff --git a/rocrate/model/entity.py b/rocrate/model/entity.py index 0e504f06..d0cbcd62 100644 --- a/rocrate/model/entity.py +++ b/rocrate/model/entity.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/file.py b/rocrate/model/file.py index 8cd95286..d0363b76 100644 --- a/rocrate/model/file.py +++ b/rocrate/model/file.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -40,41 +41,97 @@ def _empty(self): } return val + def _has_writeable_stream(self): + if isinstance(self.source, (BytesIO, StringIO)): + return True + elif is_url(str(self.source)): + return self.fetch_remote + else: + return self.source is not None + + def _write_from_stream(self, out_file_path): + if not self._has_writeable_stream(): + # if this does not correspond to a writeable stream (i.e. it is a url but fetch_remote is False), + # we still want to consume the stream to consume file headers, run the size calculation, etc. 
+ all(self.stream()) + return + + out_file_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_file_path, 'wb') as out_file: + for _, chunk in self.stream(): + out_file.write(chunk) + + def _copy_file(self, path, out_file_path): + out_file_path.parent.mkdir(parents=True, exist_ok=True) + if not out_file_path.exists() or not out_file_path.samefile(path): + shutil.copy(path, out_file_path) + if self.record_size: + self._jsonld['contentSize'] = str(out_file_path.stat().st_size) + def write(self, base_path): out_file_path = Path(base_path) / self.id - if isinstance(self.source, (BytesIO, StringIO)): - out_file_path.parent.mkdir(parents=True, exist_ok=True) - mode = 'w' + ('b' if isinstance(self.source, BytesIO) else 't') - kw = {} if isinstance(self.source, BytesIO) else {'encoding': 'utf-8'} - with open(out_file_path, mode, **kw) as out_file: - content = self.source.getvalue() - out_file.write(content) + if isinstance(self.source, (BytesIO, StringIO)) or is_url(str(self.source)): + self._write_from_stream(out_file_path) + elif self.source is None: + # Allows to record a File entity whose @id does not exist, see #73 + warnings.warn(f"No source for {self.id}") + else: + self._copy_file(self.source, out_file_path) + + def _stream_from_stream(self, stream): + size = 0 + read = stream.read() + if isinstance(self.source, StringIO): + read = read.encode('utf-8') + while len(read) > 0: + yield self.id, read + size += len(read) + read = stream.read() + if isinstance(self.source, StringIO): + read = read.encode('utf-8') + + if self.record_size: + self._jsonld['contentSize'] = str(size) + + def _stream_from_url(self, url, chunk_size=8192): + if self.fetch_remote or self.validate_url: + if self.validate_url: + if url.startswith("http"): + with requests.head(url) as response: + self._jsonld.update({ + 'contentSize': response.headers.get('Content-Length'), + 'encodingFormat': response.headers.get('Content-Type') + }) + if not self.fetch_remote: + date_published = 
response.headers.get("Last-Modified", iso_now()) + self._jsonld['sdDatePublished'] = date_published + if self.fetch_remote: + size = 0 + self._jsonld['contentUrl'] = str(url) + with urllib.request.urlopen(url) as response: + while chunk := response.read(chunk_size): + yield self.id, chunk + size += len(chunk) + if self.record_size: - self._jsonld['contentSize'] = str(len(content)) + self._jsonld['contentSize'] = str(size) + + def _stream_from_file(self, path, chunk_size=8192): + size = 0 + with open(path, 'rb') as f: + while chunk := f.read(chunk_size): + yield self.id, chunk + size += len(chunk) + if self.record_size: + self._jsonld['contentSize'] = str(size) + + def stream(self, chunk_size=8192): + if isinstance(self.source, (BytesIO, StringIO)): + yield from self._stream_from_stream(self.source) elif is_url(str(self.source)): - if self.fetch_remote or self.validate_url: - if self.validate_url: - if self.source.startswith("http"): - with requests.head(self.source) as response: - self._jsonld.update({ - 'contentSize': response.headers.get('Content-Length'), - 'encodingFormat': response.headers.get('Content-Type') - }) - if not self.fetch_remote: - date_published = response.headers.get("Last-Modified", iso_now()) - self._jsonld['sdDatePublished'] = date_published - if self.fetch_remote: - out_file_path.parent.mkdir(parents=True, exist_ok=True) - urllib.request.urlretrieve(self.source, out_file_path) - self._jsonld['contentUrl'] = str(self.source) - if self.record_size: - self._jsonld['contentSize'] = str(out_file_path.stat().st_size) + yield from self._stream_from_url(self.source, chunk_size) elif self.source is None: # Allows to record a File entity whose @id does not exist, see #73 warnings.warn(f"No source for {self.id}") else: - out_file_path.parent.mkdir(parents=True, exist_ok=True) - if not out_file_path.exists() or not out_file_path.samefile(self.source): - shutil.copy(self.source, out_file_path) - if self.record_size: - self._jsonld['contentSize'] = 
str(out_file_path.stat().st_size) + yield from self._stream_from_file(self.source, chunk_size) diff --git a/rocrate/model/file_or_dir.py b/rocrate/model/file_or_dir.py index 4193b53e..a66e3c50 100644 --- a/rocrate/model/file_or_dir.py +++ b/rocrate/model/file_or_dir.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/metadata.py b/rocrate/model/metadata.py index aa7cf801..5d432a87 100644 --- a/rocrate/model/metadata.py +++ b/rocrate/model/metadata.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -74,11 +75,16 @@ def generate(self): context = context[0] return {'@context': context, '@graph': graph} - def write(self, base_path): - write_path = Path(base_path) / self.id - as_jsonld = self.generate() - with open(write_path, 'w', encoding='utf-8') as outfile: - json.dump(as_jsonld, outfile, indent=4, sort_keys=True) + def stream(self, chunk_size=8192): + content = self.generate() + yield self.id, str.encode(json.dumps(content, indent=4, sort_keys=True), encoding='utf-8') + + def _has_writeable_stream(self): + return True + + def write(self, dest_base): + write_path = Path(dest_base) / self.id + super()._write_from_stream(write_path) @property def root(self) -> Dataset: diff --git a/rocrate/model/person.py b/rocrate/model/person.py index c6b6e6df..cfe7ec1f 100644 --- a/rocrate/model/person.py +++ b/rocrate/model/person.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/preview.py b/rocrate/model/preview.py index 12abf817..3f5e08b8 100644 --- a/rocrate/model/preview.py +++ b/rocrate/model/preview.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -90,11 +91,15 @@ def is_object_list(a): out_html = src.render(crate=self.crate, context=context_entities, data=data_entities) return out_html - def write(self, dest_base): + def stream(self, chunk_size=8192): if self.source: - super().write(dest_base) + yield from super().stream() else: - write_path = Path(dest_base) / self.id - out_html = self.generate_html() - with open(write_path, 'w', encoding='utf-8') as outfile: - outfile.write(out_html) + yield self.id, str.encode(self.generate_html(), encoding='utf-8') + + def _has_writeable_stream(self): + return True + + def write(self, dest_base): + write_path = Path(dest_base) / self.id + super()._write_from_stream(write_path) diff --git a/rocrate/model/root_dataset.py b/rocrate/model/root_dataset.py index 2d52b2ee..ebef3814 100644 --- a/rocrate/model/root_dataset.py +++ b/rocrate/model/root_dataset.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/softwareapplication.py b/rocrate/model/softwareapplication.py index 874294b4..2cd3c530 100644 --- a/rocrate/model/softwareapplication.py +++ b/rocrate/model/softwareapplication.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/testdefinition.py b/rocrate/model/testdefinition.py index 3de24afd..a17677c1 100644 --- a/rocrate/model/testdefinition.py +++ b/rocrate/model/testdefinition.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/testinstance.py b/rocrate/model/testinstance.py index 94d9f0ef..956f4a33 100644 --- a/rocrate/model/testinstance.py +++ b/rocrate/model/testinstance.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/model/testservice.py b/rocrate/model/testservice.py index c5f44c12..c2b65988 100644 --- a/rocrate/model/testservice.py +++ b/rocrate/model/testservice.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/rocrate/model/testsuite.py b/rocrate/model/testsuite.py index b99b103c..ba0442e0 100644 --- a/rocrate/model/testsuite.py +++ b/rocrate/model/testsuite.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index b489c948..6694c081 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,6 +32,7 @@ from pathlib import Path from urllib.parse import urljoin +from .memory_buffer import MemoryBuffer from .model import ( ComputationalWorkflow, ComputerLanguage, @@ -469,15 +471,56 @@ def write(self, base_path): def write_zip(self, out_path): out_path = Path(out_path) - if out_path.suffix == ".zip": - out_path = out_path.parent / out_path.stem - tmp_dir = tempfile.mkdtemp(prefix="rocrate_") - try: - self.write(tmp_dir) - archive = shutil.make_archive(out_path, "zip", tmp_dir) - finally: - shutil.rmtree(tmp_dir) - return archive + with open(out_path, "wb") as f: + for chunk in self._stream_zip(out_path=out_path): + f.write(chunk) + return out_path + + def stream_zip(self, chunk_size=8192): + """ Create a stream of bytes representing the RO-Crate as a ZIP file. 
""" + yield from self._stream_zip(chunk_size=chunk_size) + + def _stream_zip(self, chunk_size=8192, out_path=None): + """ Create a stream of bytes representing the RO-Crate as a ZIP file. + The out_path argument is used to exclude the file from the ZIP stream if the output is inside the crate folder + and can be omitted if the stream is not written into a file inside the crate dir. + """ + with MemoryBuffer() as buffer: + with zipfile.ZipFile(buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive: + for writeable_entity in self.data_entities + self.default_entities: + current_file_path, current_out_file = None, None + for path, chunk in writeable_entity.stream(chunk_size=chunk_size): + if path != current_file_path: + if current_out_file: + current_out_file.close() + current_file_path = path + current_out_file = archive.open(path, mode='w', force_zip64=True) + current_out_file.write(chunk) + while len(buffer) >= chunk_size: + yield buffer.read(chunk_size) + if current_out_file: + current_out_file.close() + + # add additional unlisted files to stream + listed_files = [archived_file for archived_file in archive.namelist()] + for root, dirs, files in walk(str(self.source), exclude=self.exclude): + for name in files: + source = Path(root) / name + + # ignore out_path to not include a zip in itself + if out_path and out_path.samefile(source): + continue + + rel = source.relative_to(self.source) + if not self.dereference(str(rel)) and not str(rel) in listed_files: + with archive.open(str(rel), mode='w') as out_file, open(source, 'rb') as in_file: + while chunk := in_file.read(chunk_size): + out_file.write(chunk) + while len(buffer) >= chunk_size: + yield buffer.read(chunk_size) + + while chunk := buffer.read(chunk_size): + yield chunk def add_workflow( self, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None, diff --git a/rocrate/utils.py b/rocrate/utils.py index aa1aeab2..5f565187 100644 --- a/rocrate/utils.py +++ 
b/rocrate/utils.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/rocrate/vocabs.py b/rocrate/vocabs.py index e492294f..902a682d 100644 --- a/rocrate/vocabs.py +++ b/rocrate/vocabs.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/setup.py b/setup.py index c4fc1236..bc4a0765 100755 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -58,6 +59,7 @@ long_description_content_type='text/markdown', long_description=long_description, author=", ".join(( + 'Daniel Bauer', 'Eli Chadwick', 'Paul De Geest', 'Bert Droesbeke', diff --git a/test/conftest.py b/test/conftest.py index 3dfee400..b7488d51 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_cli.py b/test/test_cli.py index b4fbb94c..353e9aea 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_jsonld.py b/test/test_jsonld.py index 7c9759c9..44ee9492 100644 --- a/test/test_jsonld.py +++ b/test/test_jsonld.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/test/test_metadata.py b/test/test_metadata.py index e6ba0cea..ffacb5c1 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_model.py b/test/test_model.py index 165084fd..08400871 100644 --- a/test/test_model.py +++ b/test/test_model.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_read.py b/test/test_read.py index c3b7a608..1b60b876 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/test/test_readwrite.py b/test/test_readwrite.py index 0fab0d90..0c894049 100644 --- a/test/test_readwrite.py +++ b/test/test_readwrite.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_test_metadata.py b/test/test_test_metadata.py index 71c5e0ab..5645fe58 100644 --- a/test/test_test_metadata.py +++ b/test/test_test_metadata.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_utils.py b/test/test_utils.py index a73b4a1e..1476e9a5 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/test/test_workflow_ro_crate.py b/test/test_workflow_ro_crate.py index eb4f5471..edad0018 100644 --- a/test/test_workflow_ro_crate.py +++ b/test/test_workflow_ro_crate.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/test_write.py b/test/test_write.py index 94835dab..8c09debb 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
def test_write_zip_copy_unlisted(test_data_dir, tmpdir):
    """Files present in the crate directory but not listed in the metadata
    must still end up in the archive produced by ``write_zip``."""
    crate = ROCrate(test_data_dir / 'ro-crate-galaxy-sortchangecase')

    archive_path = tmpdir / 'ro_crate_out.crate.zip'
    crate.write_zip(archive_path)

    extracted_dir = tmpdir / 'ro_crate_out'
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(extracted_dir)

    test1_dir = extracted_dir / "test" / "test1"
    for unlisted_name in ("input.bed", "output_exp.bed"):
        assert (test1_dir / unlisted_name).is_file()


def test_stream(test_data_dir, tmpdir):
    """The chunks produced by ``stream_zip`` must form a valid ZIP archive
    holding the expected, non-empty crate files."""
    crate = ROCrate(test_data_dir / "read_crate")

    # Persist the streamed chunks so the result can be inspected as a file.
    streamed_zip = tmpdir / 'ro_crate_out.zip'
    with open(streamed_zip, "wb") as sink:
        for piece in crate.stream_zip():
            sink.write(piece)

    # testzip() returns the name of the first corrupt member, or None.
    with zipfile.ZipFile(streamed_zip, "r") as archive:
        first_bad_member = archive.testzip()
        assert not first_bad_member
        for member in archive.infolist():
            assert member.file_size > 0

    unpacked_dir = tmpdir / 'ro_crate_out'
    with zipfile.ZipFile(streamed_zip, "r") as archive:
        archive.extractall(unpacked_dir)

    expected_files = (
        unpacked_dir / "ro-crate-metadata.jsonld",
        unpacked_dir / "examples" / "README.txt",
        unpacked_dir / "test" / "test-metadata.json",
    )
    for expected in expected_files:
        assert expected.is_file()
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/add_boilerplate.py b/tools/add_boilerplate.py index 22ad428f..73ba0e58 100644 --- a/tools/add_boilerplate.py +++ b/tools/add_boilerplate.py @@ -5,6 +5,7 @@ # Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH # Copyright 2024 Data Centre, SciLifeLab, SE # Copyright 2024 National Institute of Informatics (NII), JP +# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.