Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
zimeon committed Nov 20, 2024
1 parent 59a3bee commit f1c051c
Show file tree
Hide file tree
Showing 12 changed files with 172 additions and 52 deletions.
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ functions.
ocfl.version
ocfl.inventory_validator
ocfl.version_metadata
ocfl.new_version
ocfl.constants
10 changes: 10 additions & 0 deletions docs/ocfl.new_version.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
``ocfl.NewVersion``
===================

.. automodule:: ocfl.new_version

.. autoclass:: ocfl.NewVersion
:members:

.. autoclass:: ocfl.NewVersionException
:members:
2 changes: 1 addition & 1 deletion ocfl-object.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def do_object_operation(args):
obj = ocfl.Object(identifier=args.id,
spec_version=args.spec_version,
digest_algorithm=args.digest,
filepath_normalization=args.normalization,
content_path_normalization=args.normalization,
forward_delta=not args.no_forward_delta,
dedupe=not args.no_dedupe,
lax_digests=args.lax_digests,
Expand Down
1 change: 1 addition & 0 deletions ocfl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .digest import file_digest, string_digest, digest_regex, normalized_digest
from .inventory import Inventory, Version, InventoryException
from .inventory_validator import InventoryValidator
from .new_version import NewVersion, NewVersionException
from .object import Object
from .object_utils import find_path_type, ObjectException
from .storage_root import StorageRoot, StorageRootException
Expand Down
6 changes: 6 additions & 0 deletions ocfl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,9 @@

DEFAULT_SPEC_VERSION = "1.1"
"""str: OCFL specification version number to assume if none specified."""

DEFAULT_DIGEST_ALGORITHM = "sha512"
"""str: default digest algorithm to use for content addressing."""

DEFAULT_CONTENT_DIRECTORY = "content"
"""str: default content directory name if none is specified."""
3 changes: 2 additions & 1 deletion ocfl/inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
import os.path
import re

from .constants import DEFAULT_CONTENT_DIRECTORY
from .digest import normalized_digest
from .object_utils import first_version_directory, next_version_directory, \
parse_version_directory, make_unused_filepath
Expand Down Expand Up @@ -177,7 +178,7 @@ def content_directory(self):
@property
def content_directory_to_use(self):
"""Get contentDirectory to use, default 'content' if not specified."""
return self.data.get("contentDirectory", "content")
return self.data.get("contentDirectory", DEFAULT_CONTENT_DIRECTORY)

@content_directory.setter
def content_directory(self, value):
Expand Down
4 changes: 2 additions & 2 deletions ocfl/inventory_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"""
import re

from .constants import SPEC_VERSIONS_SUPPORTED
from .constants import SPEC_VERSIONS_SUPPORTED, DEFAULT_CONTENT_DIRECTORY
from .digest import digest_regex, normalized_digest
from .validation_logger import ValidationLogger
from .w3c_datetime import str_to_datetime
Expand Down Expand Up @@ -87,7 +87,7 @@ def __init__(self, *, log=None, where="???",
self.id = None
self.spec_version = self.default_spec_version
self.digest_algorithm = "sha512"
self.content_directory = "content"
self.content_directory = DEFAULT_CONTENT_DIRECTORY
self.content_directory_set = False
self.all_versions = []
self.manifest_files = None
Expand Down
146 changes: 120 additions & 26 deletions ocfl/new_version.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
"""NewVersion class to assemble what will become a new Object version."""
"""NewVersion class to assemble what will become a new Object version.
It is expected that instances of this class will only be created
and used through ocfl.Object, see the start_new_version() and
write_new_version() methods.
"""
import copy
import hashlib
import logging
from urllib.parse import quote as urlquote

import fs.path

from .constants import DEFAULT_DIGEST_ALGORITHM, DEFAULT_CONTENT_DIRECTORY
from .digest import file_digest
from .inventory import InventoryException
from .inventory import Inventory, InventoryException
from .object_utils import make_unused_filepath
from .pyfs import pyfs_openfs


Expand All @@ -15,54 +26,88 @@ class NewVersion():
"""Class to represent a new version to be added to an Object."""

def __init__(self, *,
inventory,
inventory=None,
objdir=None,
srcdir=None,
metadata=None,
digest_algorithm=None,
content_directory=None,
content_path_normalization="uri",
carry_content_forward=True,
forward_delta=True,
dedupe=False,
old_digest_algorithm=None):
"""Create NewVersion object.
Arguments:
object (ocfl.Object): instance for which this is a new version.
inventory (ocfl.Inventory): inventory that we will modify to build
the new version.
content_path_normalization (str): the path normalization strategy
to use with content paths when files are added to this object
(default "uri")
carry_content_forward (bool): True to carry forward the state from
the last current version as a starting point. False to start
with empty version state.
Example:
# mkdir tmp
# cp -rp fixtures/1.1/good-objects/spec-ex-full tmp/spec-ex-full
# python
>>> import ocfl
>>> object = ocfl.Object()
>>> nv = object.start_new_version(objdir="tmp/spec-ex-full", carry_content_forward=True)
>>> nv.inventory.current_version.logical_paths
['foo/bar.xml', 'empty2.txt', 'image.tiff']
>>> nv.delete("foo/bar.xml")
>>> nv.rename("empty2.txt", "empty3.txt")
>>> nv.add_content("fixtures/1.1/content/README.md", "readme", "v4/readme")
>>> object.commit_new_version(nv)
INFO:root:Updated OCFL object ark:/12345/bcd987 in tmp/spec-ex-full by adding v4
<ocfl.inventory.Inventory object at 0x1014e6cd0>
Example use:
>>> # Prep:
>>> # mkdir tmp
>>> # cp -rp fixtures/1.1/good-objects/spec-ex-full tmp/spec-ex-full
>>>
>>> import ocfl
>>> object = ocfl.Object()
>>> nv = object.start_new_version(objdir="tmp/spec-ex-full", carry_content_forward=True)
>>> nv.inventory.current_version.logical_paths
['foo/bar.xml', 'empty2.txt', 'image.tiff']
>>> nv.delete("foo/bar.xml")
>>> nv.rename("empty2.txt", "empty3.txt")
>>> nv.add("fixtures/1.1/content/README.md", "readme", "v4/readme")
>>> object.write_new_version(nv)
INFO:root:Updated OCFL object ark:/12345/bcd987 in tmp/spec-ex-full by adding v4
<ocfl.inventory.Inventory object at 0x1014e6cd0>
"""
# Configuration
self.inventory = inventory
self.objdir = objdir
self.srcdir = srcdir
self.src_fs = None
self.content_path_normalization = content_path_normalization
self.forward_delta = forward_delta
self.dedupe = dedupe
# Additional state needed for final commit
self.old_digest_algorithm = old_digest_algorithm
self.files_to_copy = {} # dict: src_path -> content_path
self._start_new_version(carry_content_forward=carry_content_forward,
metadata=metadata)
if inventory is None:
self._start_first_version(digest_algorithm=digest_algorithm,
content_directory=content_directory,
metadata=metadata)
else:
self._start_next_version(carry_content_forward=carry_content_forward,
metadata=metadata)
self.src_fs = pyfs_openfs(self.srcdir)

def _start_first_version(self, *,
                         digest_algorithm=None,
                         content_directory=None,
                         metadata):
    """Create a fresh inventory holding the first version of this object.

    Arguments:
        digest_algorithm (str or None): digest algorithm to record in the
            new inventory, DEFAULT_DIGEST_ALGORITHM is used if None.
        content_directory (str or None): content directory name, recorded
            in the inventory only when it is given and differs from
            DEFAULT_CONTENT_DIRECTORY.
        metadata: version metadata passed through unchanged to
            Inventory.add_version().
    """
    first = Inventory()
    first.add_version(metadata=metadata)  # also sets head "v1"
    first.digest_algorithm = (DEFAULT_DIGEST_ALGORITHM
                              if digest_algorithm is None
                              else digest_algorithm)
    # Leave contentDirectory unset in the inventory when the default applies
    uses_default_dir = (content_directory is None
                        or content_directory == DEFAULT_CONTENT_DIRECTORY)
    if not uses_default_dir:
        first.content_directory = content_directory
    self.inventory = first

def _start_new_version(self, *, metadata, carry_content_forward=False):
def _start_next_version(self, *, metadata, carry_content_forward=False):
"""Start the new version by adjusting inventory.
If carry_content_forward is set then the state block of the previous
Expand All @@ -80,9 +125,50 @@ def _start_new_version(self, *, metadata, carry_content_forward=False):
if carry_content_forward:
state = copy.deepcopy(self.inventory.current_version.state)
self.inventory.add_version(state=state, metadata=metadata)
self.src_fs = pyfs_openfs(self.srcdir)

def add_content(self, src_path, logical_path, content_path=None):
@property
def content_directory(self):
    """Content directory name for this object, with the default applied.

    Delegates to the inventory's content_directory_to_use property.
    """
    inventory = self.inventory
    return inventory.content_directory_to_use

def _map_filepath(self, filepath):
    """Map source filepath to a content path within the object.

    The purpose of the mapping might be normalization, sanitization,
    content distribution, or something else. The mapping is selected by
    the content_path_normalization attribute:

      * "uri" - URI-encode the filepath and also encode a leading period
      * "md5" - use the first 16 hex digits of the MD5 of the filepath
      * None - no mapping, the source file name and path are preserved

    Arguments:
        filepath (str): the source filepath (possibly including
            directories) that will be mapped into the object content path.

    Returns:
        str: the full content path for this content that starts
        with `vdir/content_directory/`.

    Raises:
        NewVersionException: if content_path_normalization is not one of
            the supported values.
    """
    if self.content_path_normalization == "uri":
        filepath = urlquote(filepath)
        # Also encode any leading period to unhide files. Use startswith()
        # rather than indexing so that an empty filepath does not raise
        # IndexError in this mode only.
        if filepath.startswith("."):
            filepath = "%2E" + filepath[1:]
    elif self.content_path_normalization == "md5":
        # Truncated MD5 hash of the _filepath_ as an illustration of diff
        # paths for the specification. Not sure whether there should be any
        # real application of this
        filepath = hashlib.md5(filepath.encode("utf-8")).hexdigest()[0:16]
    elif self.content_path_normalization is not None:
        raise NewVersionException("Unknown filepath normalization '%s' requested"
                                  % (self.content_path_normalization))
    # Path relative to the object root, including v#/content_directory
    vfilepath = fs.path.join(self.inventory.head, self.content_directory, filepath)
    # Check we don't already have this vfilepath from many to one
    # normalization, add suffix to distinguish if necessary
    used = self.inventory.content_paths
    if vfilepath in used:
        vfilepath = make_unused_filepath(vfilepath, used)
    return vfilepath

def add(self, src_path, logical_path, content_path=None):
"""Add a file to the new version.
Arguments:
Expand All @@ -91,8 +177,16 @@ def add_content(self, src_path, logical_path, content_path=None):
logical_path (str): logical filepath that this content should
have within the version of the object
"""
logging.debug("add_content(%s %s %s)", src_path, content_path, logical_path)
inventory = self.inventory
if content_path is None:
content_path = self._map_filepath(src_path)
elif not content_path.startswith(inventory.head + "/" + self.content_directory + "/"):
raise NewVersionException("Bad content path %s, must start with version directory and content directory path elements"
% (content_path))
elif content_path in inventory.content_paths:
raise NewVersionException("Bad content path %s, already exists!"
% (content_path))
logging.debug("add(%s %s %s)", src_path, content_path, logical_path)
# Does this logical path already exist?
if logical_path in inventory.current_version.logical_paths:
raise NewVersionException("Logical path %s already exists in new version %s" % (logical_path, inventory.head))
Expand Down Expand Up @@ -126,7 +220,7 @@ def delete(self, logical_path):
the previous state (initialization with carry_content_forward=True).
Assumes that the content is used in a previous version and so will not
check to delete content from the manifest. Thus add_content() followed
check to delete content from the manifest. Thus add() followed
by delete() could leave the new version in a bad state.
Arguments:
Expand Down
Loading

0 comments on commit f1c051c

Please sign in to comment.