From f1c051c28b6a4822f229c2ce8c32751b073665ab Mon Sep 17 00:00:00 2001 From: Simeon Warner Date: Wed, 20 Nov 2024 12:55:51 -0500 Subject: [PATCH] WIP --- docs/api.rst | 1 + docs/ocfl.new_version.rst | 10 +++ ocfl-object.py | 2 +- ocfl/__init__.py | 1 + ocfl/constants.py | 6 ++ ocfl/inventory.py | 3 +- ocfl/inventory_validator.py | 4 +- ocfl/new_version.py | 146 +++++++++++++++++++++++++++++------- ocfl/object.py | 41 +++++----- ocfl/object_utils.py | 1 - ocfl/validator.py | 5 +- tests/test_object.py | 4 +- 12 files changed, 172 insertions(+), 52 deletions(-) create mode 100644 docs/ocfl.new_version.rst diff --git a/docs/api.rst b/docs/api.rst index 44391c1..90106fb 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -14,4 +14,5 @@ functions. ocfl.version ocfl.inventory_validator ocfl.version_metadata + ocfl.new_version ocfl.constants diff --git a/docs/ocfl.new_version.rst b/docs/ocfl.new_version.rst new file mode 100644 index 0000000..fe19528 --- /dev/null +++ b/docs/ocfl.new_version.rst @@ -0,0 +1,10 @@ +``ocfl.NewVersion`` +=================== + +.. automodule:: ocfl.new_version + +.. autoclass:: ocfl.NewVersion + :members: + +.. 
autoclass:: ocfl.NewVersionException + :members: diff --git a/ocfl-object.py b/ocfl-object.py index 043e022..5007255 100755 --- a/ocfl-object.py +++ b/ocfl-object.py @@ -127,7 +127,7 @@ def do_object_operation(args): obj = ocfl.Object(identifier=args.id, spec_version=args.spec_version, digest_algorithm=args.digest, - filepath_normalization=args.normalization, + content_path_normalization=args.normalization, forward_delta=not args.no_forward_delta, dedupe=not args.no_dedupe, lax_digests=args.lax_digests, diff --git a/ocfl/__init__.py b/ocfl/__init__.py index 81f6de1..40073e4 100644 --- a/ocfl/__init__.py +++ b/ocfl/__init__.py @@ -6,6 +6,7 @@ from .digest import file_digest, string_digest, digest_regex, normalized_digest from .inventory import Inventory, Version, InventoryException from .inventory_validator import InventoryValidator +from .new_version import NewVersion, NewVersionException from .object import Object from .object_utils import find_path_type, ObjectException from .storage_root import StorageRoot, StorageRootException diff --git a/ocfl/constants.py b/ocfl/constants.py index 5889870..ab09d90 100644 --- a/ocfl/constants.py +++ b/ocfl/constants.py @@ -8,3 +8,9 @@ DEFAULT_SPEC_VERSION = "1.1" """str: OCFL specification version number to assume if none specified.""" + +DEFAULT_DIGEST_ALGORITHM = "sha512" +"""str: default digest algorithm to use for content addressing.""" + +DEFAULT_CONTENT_DIRECTORY = "content" +"""str: default content directory name if none is specified.""" diff --git a/ocfl/inventory.py b/ocfl/inventory.py index 6f4b00f..033b3b1 100644 --- a/ocfl/inventory.py +++ b/ocfl/inventory.py @@ -73,6 +73,7 @@ import os.path import re +from .constants import DEFAULT_CONTENT_DIRECTORY from .digest import normalized_digest from .object_utils import first_version_directory, next_version_directory, \ parse_version_directory, make_unused_filepath @@ -177,7 +178,7 @@ def content_directory(self): @property def content_directory_to_use(self): """Get 
contentDirectory to use, default 'content' is not specified.""" - return self.data.get("contentDirectory", "content") + return self.data.get("contentDirectory", DEFAULT_CONTENT_DIRECTORY) @content_directory.setter def content_directory(self, value): diff --git a/ocfl/inventory_validator.py b/ocfl/inventory_validator.py index 2b9b21b..2abc6d4 100644 --- a/ocfl/inventory_validator.py +++ b/ocfl/inventory_validator.py @@ -27,7 +27,7 @@ """ import re -from .constants import SPEC_VERSIONS_SUPPORTED +from .constants import SPEC_VERSIONS_SUPPORTED, DEFAULT_CONTENT_DIRECTORY from .digest import digest_regex, normalized_digest from .validation_logger import ValidationLogger from .w3c_datetime import str_to_datetime @@ -87,7 +87,7 @@ def __init__(self, *, log=None, where="???", self.id = None self.spec_version = self.default_spec_version self.digest_algorithm = "sha512" - self.content_directory = "content" + self.content_directory = DEFAULT_CONTENT_DIRECTORY self.content_directory_set = False self.all_versions = [] self.manifest_files = None diff --git a/ocfl/new_version.py b/ocfl/new_version.py index f684fd2..3aa982d 100644 --- a/ocfl/new_version.py +++ b/ocfl/new_version.py @@ -1,9 +1,20 @@ -"""NewVersion class to assemble what will become a new Object version.""" +"""NewVersion class to assemble what will become a new Object version. + +It is expected that instances of this class will only be created +and used through ocfl.Object, see the start_new_version() and +write_new_version() methods. 
+""" import copy +import hashlib import logging +from urllib.parse import quote as urlquote + +import fs.path +from .constants import DEFAULT_DIGEST_ALGORITHM, DEFAULT_CONTENT_DIRECTORY from .digest import file_digest -from .inventory import InventoryException +from .inventory import Inventory, InventoryException +from .object_utils import make_unused_filepath from .pyfs import pyfs_openfs @@ -15,10 +26,13 @@ class NewVersion(): """Class to represent a new version to be added to an Object.""" def __init__(self, *, - inventory, + inventory=None, objdir=None, srcdir=None, metadata=None, + digest_algorithm=None, + content_directory=None, + content_path_normalization="uri", carry_content_forward=True, forward_delta=True, dedupe=False, @@ -26,43 +40,74 @@ def __init__(self, *, """Create NewVersion object. Arguments: - object (ocfl.Object): instance for which this is a new version. inventory (ocfl.Inventory): inventory that we will modify to build the new version. + content_path_normalization (str): the path normalization strategy + to use with content paths when files are added to this object + (default "uri") carry_content_forward (bool): True to carry forward the state from the last current version as a starting point. False to start with empty version state. 
- Example: - # mkdir tmp - # cp -rp fixtures/1.1/good-objects/spec-ex-full tmp/spec-ex-full - # python - >>> import ocfl - >>> object = ocfl.Object() - >>> nv = object.start_new_version(objdir="tmp/spec-ex-full", carry_content_forward=True) - >>> nv.inventory.current_version.logical_paths - ['foo/bar.xml', 'empty2.txt', 'image.tiff'] - >>> nv.delete("foo/bar.xml") - >>> nv.rename("empty2.txt", "empty3.txt") - >>> nv.add_content("fixtures/1.1/content/README.md", "readme", "v4/readme") - >>> object.commit_new_version(nv) - INFO:root:Updated OCFL object ark:/12345/bcd987 in tmp/spec-ex-full by adding v4 - + Example use: + + >>> # Prep: + >>> # mkdir tmp + >>> # cp -rp fixtures/1.1/good-objects/spec-ex-full tmp/spec-ex-full + >>> + >>> import ocfl + >>> object = ocfl.Object() + >>> nv = object.start_new_version(objdir="tmp/spec-ex-full", carry_content_forward=True) + >>> nv.inventory.current_version.logical_paths + ['foo/bar.xml', 'empty2.txt', 'image.tiff'] + >>> nv.delete("foo/bar.xml") + >>> nv.rename("empty2.txt", "empty3.txt") + >>> nv.add("fixtures/1.1/content/README.md", "readme", "v4/readme") + >>> object.write_new_version(nv) + INFO:root:Updated OCFL object ark:/12345/bcd987 in tmp/spec-ex-full by adding v4 + """ # Configuration self.inventory = inventory self.objdir = objdir self.srcdir = srcdir self.src_fs = None + self.content_path_normalization = content_path_normalization self.forward_delta = forward_delta self.dedupe = dedupe # Additional state needed for final commit self.old_digest_algorithm = old_digest_algorithm self.files_to_copy = {} # dict: src_path -> content_path - self._start_new_version(carry_content_forward=carry_content_forward, - metadata=metadata) + if inventory is None: + self._start_first_version(digest_algorithm=digest_algorithm, + content_directory=content_directory, + metadata=metadata) + else: + self._start_next_version(carry_content_forward=carry_content_forward, + metadata=metadata) + self.src_fs = pyfs_openfs(self.srcdir) + + def 
_start_first_version(self, *, + digest_algorithm=None, + content_directory=None, + metadata): + """Start the first version for this object. + + Arguments: + digest_algorithm (str or None): + content_directory (str or None): + """ + inventory = Inventory() + inventory.add_version(metadata=metadata) # also sets head "v1" + if digest_algorithm is None: + digest_algorithm = DEFAULT_DIGEST_ALGORITHM + inventory.digest_algorithm = digest_algorithm + if (content_directory is not None + and content_directory != DEFAULT_CONTENT_DIRECTORY): + inventory.content_directory = content_directory + self.inventory = inventory - def _start_new_version(self, *, metadata, carry_content_forward=False): + def _start_next_version(self, *, metadata, carry_content_forward=False): """Start the new version by adjusting inventory. If carry_content_forward is set then the state block of the previous @@ -80,9 +125,50 @@ def _start_new_version(self, *, metadata, carry_content_forward=False): if carry_content_forward: state = copy.deepcopy(self.inventory.current_version.state) self.inventory.add_version(state=state, metadata=metadata) - self.src_fs = pyfs_openfs(self.srcdir) - def add_content(self, src_path, logical_path, content_path=None): + @property + def content_directory(self): + """Get content directory catering for default.""" + return self.inventory.content_directory_to_use + + def _map_filepath(self, filepath): + """Map source filepath to a content path within the object. + + The purpose of the mapping might be normalization, sanitization, + content distribution, or something else. The mapping is set by the + content_path_normalization attribute where None indicates no mapping, the + source file name and path are preserved. + + Arguments: + filepath: the source filepath (possibly including directories) that + will be mapped into the object content path. + + Returns: + str: the full content path for this content that starts + with `vdir/content_directory/`. 
+ """ + if self.content_path_normalization == "uri": + filepath = urlquote(filepath) + # also encode any leading period to unhide files + if filepath[0] == ".": + filepath = "%2E" + filepath[1:] + elif self.content_path_normalization == "md5": + # Truncated MD5 hash of the _filepath_ as an illustration of diff + # paths for the specification. Not sure whether there should be any + # real application of this + filepath = hashlib.md5(filepath.encode("utf-8")).hexdigest()[0:16] + elif self.content_path_normalization is not None: + raise NewVersionException("Unknown filepath normalization '%s' requested" + % (self.content_path_normalization)) + vfilepath = fs.path.join(self.inventory.head, self.content_directory, filepath) # path relative to root, inc v#/content + # Check we don't already have this vfilepath from many to one + # normalization, add suffix to distinguish if necessary + used = self.inventory.content_paths + if vfilepath in used: + vfilepath = make_unused_filepath(vfilepath, used) + return vfilepath + + def add(self, src_path, logical_path, content_path=None): """Add a file to the new version. Arguments: @@ -91,8 +177,16 @@ def add_content(self, src_path, logical_path, content_path=None): logical_path (str): logical filepath that this content should have within the version of the object """ - logging.debug("add_content(%s %s %s)", src_path, content_path, logical_path) inventory = self.inventory + if content_path is None: + content_path = self._map_filepath(src_path) + elif not content_path.startswith(inventory.head + "/" + self.content_directory + "/"): + raise NewVersionException("Bad content path %s, must start with version directory and content directory path elements" + % (content_path)) + elif content_path in inventory.content_paths: + raise NewVersionException("Bad content path %s, already exists!" + % (content_path)) + logging.debug("add(%s %s %s)", src_path, content_path, logical_path) # Does this logical path already exist? 
if logical_path in inventory.current_version.logical_paths: raise NewVersionException("Logical path %s already exists in new version %s" % (logical_path, inventory.head)) @@ -126,7 +220,7 @@ def delete(self, logical_path): the previous state (initialization with carry_content_forward=True). Assumes that the content is used in a previous version to will not - check to delete content from the manifest. Thus add_content() followed + check to delete content from the manifest. Thus add() followed but delete_content() could leave the new version in a bad state. Arguments: diff --git a/ocfl/object.py b/ocfl/object.py index f22c149..d9eb6d5 100755 --- a/ocfl/object.py +++ b/ocfl/object.py @@ -18,7 +18,7 @@ import fs.path import fs.copy -from .constants import INVENTORY_FILENAME +from .constants import INVENTORY_FILENAME, DEFAULT_CONTENT_DIRECTORY from .digest import file_digest from .inventory import Inventory from .inventory_validator import InventoryValidator @@ -65,8 +65,8 @@ class Object(): # pylint: disable=too-many-public-methods (default "content") digest_algorithm (str): the digest algorithm used for content addressing within this object (default "sha512") - filepath_normalization (str): the filepath normalization strategy to use - when files are added to this object (default "uri") + content_path_normalization (str): the filepath normalization strategy to + use when files are added to this object (default "uri") spec_version (str): OCFL specification version of this object forward_delta (bool): if True then indicates that forward delta file versioning should be used when files are added, not if False @@ -81,8 +81,9 @@ class Object(): # pylint: disable=too-many-public-methods """ - def __init__(self, *, identifier=None, content_directory="content", - digest_algorithm="sha512", filepath_normalization="uri", + def __init__(self, *, identifier=None, + content_directory=DEFAULT_CONTENT_DIRECTORY, + digest_algorithm="sha512", content_path_normalization="uri", 
spec_version="1.1", forward_delta=True, dedupe=True, lax_digests=False, fixity=None, obj_fs=None, path=None, create=False): @@ -92,7 +93,7 @@ def __init__(self, *, identifier=None, content_directory="content", identifier: id for this object content_directory: allow override of the default "content" digest_algorithm: allow override of the default "sha512" - filepath_normalization: allow override of default "uri" + content_path_normalization: allow override of default "uri" spec_version: OCFL specification version forward_delta: set False to turn off foward delta. With forward delta turned off, the same content will be repeated in a new version @@ -113,7 +114,7 @@ def __init__(self, *, identifier=None, content_directory="content", self.id = identifier self.content_directory = content_directory self.digest_algorithm = digest_algorithm - self.filepath_normalization = filepath_normalization + self.content_path_normalization = content_path_normalization self.spec_version = spec_version self.forward_delta = forward_delta self.dedupe = dedupe @@ -152,9 +153,11 @@ def copy_into_object(self, src_fs, srcfile, filepath, create_dirs=False): def map_filepath(self, filepath, vdir, used): """Map source filepath to a content path within the object. + FIXME - Remove this method in favor of NewVersion._map_filepath + The purpose of the mapping might be normalization, sanitization, content distribution, or something else. The mapping is set by the - filepath_normalization attribute where None indicates no mapping, the + content_path_normalization attribute where None indicates no mapping, the source file name and path are preserved. Arguments: @@ -167,18 +170,18 @@ def map_filepath(self, filepath, vdir, used): Returns vfilepath, the version filepath for this content that starts with `vdir/content_directory/`. 
""" - if self.filepath_normalization == "uri": + if self.content_path_normalization == "uri": filepath = urlquote(filepath) # also encode any leading period to unhide files if filepath[0] == ".": filepath = "%2E" + filepath[1:] - elif self.filepath_normalization == "md5": + elif self.content_path_normalization == "md5": # Truncated MD5 hash of the _filepath_ as an illustration of diff # paths for the specification. Not sure whether there should be any # real application of this filepath = hashlib.md5(filepath.encode("utf-8")).hexdigest()[0:16] - elif self.filepath_normalization is not None: - raise Exception("Unknown filepath normalization '%s' requested" % (self.filepath_normalization)) + elif self.content_path_normalization is not None: + raise Exception("Unknown filepath normalization '%s' requested" % (self.content_path_normalization)) vfilepath = fs.path.join(vdir, self.content_directory, filepath) # path relative to root, inc v#/content # Check we don"t already have this vfilepath from many to one # normalization, add suffix to distinguish if necessary @@ -198,7 +201,7 @@ def start_inventory(self): inventory.digest_algorithm = self.digest_algorithm inventory.init_manifest_and_versions() # Add contentDirectory if not "content" - if self.content_directory != "content": + if self.content_directory != DEFAULT_CONTENT_DIRECTORY: inventory.content_directory = self.content_directory # Add fixity section if requested if self.fixity is not None and len(self.fixity) > 0: @@ -481,9 +484,9 @@ def add_version_with_content(self, objdir="", srcdir=None, metadata=None): for src_path in sorted(src_fs.walk.files()): src_path = os.path.relpath(src_path, "/") obj_path = self.map_filepath(src_path, new_version.inventory.head, used={}) - new_version.add_content(src_path, src_path, obj_path) + new_version.add(src_path, src_path, obj_path) # Write the new version - return self.commit_new_version(new_version) + return self.write_new_version(new_version) def start_new_version(self, *, 
objdir=None, @@ -512,7 +515,7 @@ def start_new_version(self, *, Returns: ocfl.NewVersion: object where the new version will be built before - finally be added with commit_new_version() + finally being added with write_new_version() """ # Check the current object self.open_obj_fs(objdir) @@ -579,10 +582,14 @@ def start_new_version(self, *, objdir=objdir, srcdir=srcdir, metadata=metadata, + content_directory=self.content_directory, + content_path_normalization=self.content_path_normalization, + forward_delta=self.forward_delta, + dedupe=self.dedupe, carry_content_forward=carry_content_forward, old_digest_algorithm=old_digest_algorithm) - def commit_new_version(self, new_version): + def write_new_version(self, new_version): """Update this object with the specified new version. Arguments: diff --git a/ocfl/object_utils.py b/ocfl/object_utils.py index 6738493..1f9b724 100755 --- a/ocfl/object_utils.py +++ b/ocfl/object_utils.py @@ -105,7 +105,6 @@ def make_unused_filepath(filepath, used, separator="__"): supplied filepath if that hasn't been used, otherwise a filepath based on that with separator and a sequence integer added. 
""" - print("#USED= " + str(used)) n = 1 f = filepath while f in used: diff --git a/ocfl/validator.py b/ocfl/validator.py index e3d53d5..bc1edd9 100644 --- a/ocfl/validator.py +++ b/ocfl/validator.py @@ -13,7 +13,8 @@ import re import fs -from .constants import INVENTORY_FILENAME, SPEC_VERSIONS_SUPPORTED +from .constants import INVENTORY_FILENAME, SPEC_VERSIONS_SUPPORTED, \ + DEFAULT_CONTENT_DIRECTORY from .digest import file_digest, normalized_digest from .inventory_validator import InventoryValidator from .namaste import find_namastes @@ -90,7 +91,7 @@ def initialize(self): self.spec_version = self.default_spec_version self.log.spec_version = self.spec_version self.digest_algorithm = "sha512" - self.content_directory = "content" + self.content_directory = DEFAULT_CONTENT_DIRECTORY self.inventory_digest_files = {} # index by version_dir, algorithms may differ self.root_inv_validator = None self.obj_fs = None diff --git a/tests/test_object.py b/tests/test_object.py index 5cd6840..fde6510 100644 --- a/tests/test_object.py +++ b/tests/test_object.py @@ -383,12 +383,12 @@ def test_map_filepath(self): self.assertEqual(oo.map_filepath('a', 'v1', {'v1/content/a': True}), 'v1/content/a__2') # md5 oo = Object() - oo.filepath_normalization = 'md5' + oo.content_path_normalization = 'md5' self.assertEqual(oo.map_filepath('a', 'v1', {}), 'v1/content/0cc175b9c0f1b6a8') self.assertEqual(oo.map_filepath('a', 'v1', {'v1/content/0cc175b9c0f1b6a8': True}), 'v1/content/0cc175b9c0f1b6a8__2') # error case oo = Object() - oo.filepath_normalization = '???' + oo.content_path_normalization = '???' self.assertRaises(Exception, oo.map_filepath, 'a', 'v1', {}) def test_extract(self):