Skip to content

Commit

Permalink
Merge pull request #212 from dnlbauer/stream_crate
Browse files Browse the repository at this point in the history
Stream RO-Crate Zip
  • Loading branch information
simleo authored Feb 4, 2025
2 parents e22ff83 + 04128fa commit efc865c
Show file tree
Hide file tree
Showing 45 changed files with 448 additions and 80 deletions.
5 changes: 4 additions & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
cff-version: 1.1.0
message: "Cite as"
authors:
author:
- family-names: Bauer
given-names: Daniel
orcid: https://orcid.org/0000-0001-9447-460X
- family-names: Chadwick
given-names: Eli
orcid: https://orcid.org/0000-0002-0035-6475
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ Options:
* Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
* Copyright 2024 Data Centre, SciLifeLab, SE
* Copyright 2024 National Institute of Informatics (NII), JP
* Copyright 2025 Senckenberg Society for Nature Research (SGN), DE

Licensed under the
Apache License, version 2.0 <https://www.apache.org/licenses/LICENSE-2.0>,
Expand Down
64 changes: 64 additions & 0 deletions examples/fastapi/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright 2019-2024 The University of Manchester, UK
# Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE
# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), ES
# Copyright 2020-2024 Center for Advanced Studies, Research and Development in Sardinia (CRS4), IT
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Streaming RO-Crates from a web server
This example demonstrates how to create an RO-Crate on-the-fly
and stream the result to the client.
By using `stream_zip`, the RO-Crate is not written to disk and remote
data is only fetched on the fly.
To run: `fastapi dev main.py`, then visit http://localhost:8000/crate
"""

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from rocrate.rocrate import ROCrate
from io import StringIO

# FastAPI application instance; the `/crate` route defined in this file is registered on it.
app = FastAPI()


@app.get("/crate")
async def get():
    """Assemble a small example RO-Crate and return it as a streamed zip download."""
    ro_crate = ROCrate()

    # First entry: a file referenced by URL, flagged to be fetched into the crate
    remote_url = "https://raw.githubusercontent.com/ResearchObject/ro-crate-py/refs/heads/master/test/test-data/sample_file.txt"
    ro_crate.add_file(remote_url, fetch_remote=True)

    # Second entry: a file whose content comes from an in-memory text buffer
    greeting = StringIO("Hello, World!")
    ro_crate.add_file(source=greeting, dest_path="test-data/hello.txt")

    # Pass the zip stream straight to the response instead of building an archive first
    zip_chunks = ro_crate.stream_zip()
    download_headers = {"Content-Disposition": "attachment; filename=crate.zip"}
    return StreamingResponse(
        zip_chunks,
        media_type="application/rocrate+zip",
        headers=download_headers,
    )
3 changes: 3 additions & 0 deletions examples/fastapi/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
../../
fastapi
fastapi-cli
1 change: 1 addition & 0 deletions examples/read_test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
3 changes: 3 additions & 0 deletions rocrate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -30,6 +31,7 @@
"""

__author__ = ", ".join((
'Daniel Bauer',
'Eli Chadwick',
'Paul De Geest',
'Bert Droesbeke',
Expand All @@ -52,6 +54,7 @@
Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
Copyright 2024 Data Centre, SciLifeLab, SE
Copyright 2024 National Institute of Informatics (NII), JP
Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
"""
__license__ = ("Apache License, version 2.0 "
"<https://www.apache.org/licenses/LICENSE-2.0>")
Expand Down
1 change: 1 addition & 0 deletions rocrate/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
52 changes: 52 additions & 0 deletions rocrate/memory_buffer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright 2019-2024 The University of Manchester, UK
# Copyright 2020-2024 Vlaams Instituut voor Biotechnologie (VIB), BE
# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), ES
# Copyright 2020-2024 Center for Advanced Studies, Research and Development in Sardinia (CRS4), IT
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from io import RawIOBase


class MemoryBuffer(RawIOBase):
    """
    A buffer class that supports reading and writing binary data.
    The buffer automatically resets upon reading to make sure all data is read only once.

    Written data is appended to an internal buffer; every read removes the
    returned bytes from the front of that buffer. ``len(buffer)`` is the number
    of bytes that are currently waiting to be read.
    """

    def __init__(self):
        super().__init__()
        # A bytearray grows and shrinks in place. Rebuilding an immutable bytes
        # object on every write/read would copy the whole pending buffer each time.
        self._buffer = bytearray()

    def write(self, data):
        """
        Append a bytes-like object to the buffer.

        Returns ``len(data)``.
        Raises ValueError if the buffer has been closed.
        """
        if self.closed:
            raise ValueError('write to closed file')
        self._buffer += data
        return len(data)

    def read(self, size=-1):
        """
        Remove and return up to ``size`` bytes from the front of the buffer.

        If ``size`` is None or negative, everything currently buffered is
        returned and the buffer is left empty. An empty bytes object is
        returned when nothing is buffered.
        Raises ValueError if the buffer has been closed.
        """
        if self.closed:
            raise ValueError('read from closed file')
        # None is accepted as "read everything", like a negative size
        if size is None or size < 0:
            data = bytes(self._buffer)
            self._buffer.clear()
        else:
            data = bytes(self._buffer[:size])
            del self._buffer[:size]
        return data

    def __len__(self):
        """Return the number of bytes written but not yet read."""
        return len(self._buffer)
1 change: 1 addition & 0 deletions rocrate/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
1 change: 1 addition & 0 deletions rocrate/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
1 change: 1 addition & 0 deletions rocrate/model/computationalworkflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
1 change: 1 addition & 0 deletions rocrate/model/computerlanguage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
1 change: 1 addition & 0 deletions rocrate/model/contextentity.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
1 change: 1 addition & 0 deletions rocrate/model/creativework.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
12 changes: 12 additions & 0 deletions rocrate/model/data_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -28,3 +29,14 @@ class DataEntity(Entity):

def write(self, base_path):
    """ Base implementation: does nothing with base_path.
    NOTE(review): presumably subclasses override this to write their content below
    base_path - confirm against the subclasses.
    """
    pass

def stream(self, chunk_size=8192):
    """ Stream the data from the source. Each chunk of the content is yielded as a tuple
    containing the name of the destination file relative to the crate and the chunk of data.
    The destination file name is required because a DataEntity can be a file or a
    collection of files (Dataset) and the caller needs to know to which file a chunk belongs.
    For collections of files, the caller can assume that files are streamed one after another,
    meaning once the destination name changes, a file can be closed and the next one can be
    opened.

    This base implementation ignores chunk_size and yields nothing.
    """
    yield from ()
106 changes: 77 additions & 29 deletions rocrate/model/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -22,7 +23,7 @@

import errno
import os
import shutil
import warnings
from pathlib import Path
from urllib.request import urlopen

Expand All @@ -43,37 +44,84 @@ def _empty(self):
def format_id(self, identifier):
    """Return identifier with any trailing slashes collapsed into exactly one."""
    trimmed = identifier.rstrip("/")
    return f"{trimmed}/"

def _write_from_url(self, base_path):
    """
    Handle a dataset whose source is a URL.

    If URL validation is requested and the content is not fetched, the source
    URL is opened once and 'sdDatePublished' is set to the value of iso_now().
    If the content is fetched, the (relative path, chunk) pairs produced by
    _stream_folder_from_url() are written to files below base_path, creating
    parent directories as needed.

    :param base_path: directory (a Path) below which the files are written
    """
    if self.validate_url and not self.fetch_remote:
        with urlopen(self.source) as _:
            self._jsonld['sdDatePublished'] = iso_now()
    if self.fetch_remote:
        out_file_path, out_file = None, None
        try:
            for rel_path, chunk in self._stream_folder_from_url():
                path = base_path / rel_path
                # A change of destination path marks the start of the next file
                if path != out_file_path:
                    if out_file:
                        out_file.close()
                        out_file = None
                    out_file_path = Path(path)
                    out_file_path.parent.mkdir(parents=True, exist_ok=True)
                    out_file = open(out_file_path, 'wb')
                out_file.write(chunk)
        finally:
            # Close the last file even when streaming or writing fails part-way,
            # so no handle is leaked and already written data reaches the disk
            if out_file:
                out_file.close()

def _copy_folder(self, base_path):
    """
    Create this dataset's directory below base_path.

    Without a source only the directory is created. With a source, the source
    must exist (FileNotFoundError otherwise); the directory is created and, if
    the crate itself has no source, the crate's _copy_unlisted() is called with
    the source and the created directory.
    """
    target_dir = base_path / self.id
    has_source = self.source is not None
    if has_source and not Path(self.source).exists():
        raise FileNotFoundError(
            errno.ENOENT, os.strerror(errno.ENOENT), str(self.source)
        )
    target_dir.mkdir(parents=True, exist_ok=True)
    if has_source and not self.crate.source:
        self.crate._copy_unlisted(self.source, target_dir)

def write(self, base_path):
out_path = Path(base_path) / self.id
base_path = Path(base_path)
if is_url(str(self.source)):
if self.validate_url and not self.fetch_remote:
self._write_from_url(base_path)
else:
self._copy_folder(base_path)

def stream(self, chunk_size=8192):
    """
    Yield the dataset content as produced by the matching folder streamer.

    Nothing is yielded when the dataset has no source. A URL source is
    delegated to _stream_folder_from_url(), any other source to
    _stream_folder_from_path(); chunk_size is passed through to either.
    """
    if self.source is None:
        return
    if is_url(str(self.source)):
        producer = self._stream_folder_from_url
    else:
        producer = self._stream_folder_from_path
    yield from producer(chunk_size)

def _stream_folder_from_path(self, chunk_size=8192):
    """
    Stream the files of a local source folder as (destination name, chunk) pairs.

    The destination name of a file is its path relative to the parent of the
    source folder. Files are streamed one after another in os.walk() order, in
    chunks of at most chunk_size bytes. An empty file is reported with a single
    empty chunk so that it is not lost from the stream. Nothing is yielded when
    the crate itself has a source.

    :param chunk_size: maximum number of bytes per yielded chunk
    :raises FileNotFoundError: if the source path does not exist (raised when
        iteration starts, since this is a generator)
    """
    if not Path(str(self.source)).exists():
        raise FileNotFoundError(
            errno.ENOENT, os.strerror(errno.ENOENT), str(self.source)
        )
    if not self.crate.source:
        # Loop-invariant: destination names are relative to the source's parent
        dest_root = Path(self.source).parent
        for root, _, files in os.walk(self.source):
            root = Path(root)
            for name in files:
                source = root / name
                dest = str(source.relative_to(dest_root))
                is_empty = True
                with open(source, 'rb') as f:
                    while chunk := f.read(chunk_size):
                        is_empty = False
                        yield dest, chunk
                # A zero-length file never enters the loop above; emit it once
                # with an empty chunk so consumers still create it
                if is_empty:
                    yield dest, b""

def _stream_folder_from_url(self, chunk_size=8192):
if not self.fetch_remote:
if self.validate_url:
with urlopen(self.source) as _:
self._jsonld['sdDatePublished'] = iso_now()
if self.fetch_remote:
self.__get_parts(out_path)
else:
if self.source is None:
out_path.mkdir(parents=True, exist_ok=True)
else:
if not Path(self.source).exists():
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), str(self.source)
)
out_path.mkdir(parents=True, exist_ok=True)
if not self.crate.source:
self.crate._copy_unlisted(self.source, out_path)
base = self.source.rstrip("/")
for entry in self._jsonld.get("hasPart", []):
try:
part = entry["@id"]
if is_url(part) or part.startswith("/"):
raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path")
part_uri = f"{base}/{part}"
rel_out_path = Path(self.id) / part

def __get_parts(self, out_path):
out_path.mkdir(parents=True, exist_ok=True)
base = self.source.rstrip("/")
for entry in self._jsonld.get("hasPart", []):
try:
part = entry["@id"]
except KeyError:
continue
if is_url(part) or part.startswith("/"):
raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path")
part_uri = f"{base}/{part}"
part_out_path = out_path / part
with urlopen(part_uri) as r, open(part_out_path, 'wb') as f:
shutil.copyfileobj(r, f)
with urlopen(part_uri) as response:
while chunk := response.read(chunk_size):
yield str(rel_out_path), chunk
except KeyError:
warnings.warn(f"'hasPart' entry in {self.id} is missing '@id'. Skipping.")
1 change: 1 addition & 0 deletions rocrate/model/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Copyright 2022-2024 École Polytechnique Fédérale de Lausanne, CH
# Copyright 2024 Data Centre, SciLifeLab, SE
# Copyright 2024 National Institute of Informatics (NII), JP
# Copyright 2025 Senckenberg Society for Nature Research (SGN), DE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
Loading

0 comments on commit efc865c

Please sign in to comment.