From b15cc0ae935aee932b77e5d5dccf516e1c725660 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 27 Dec 2024 16:47:50 +0800 Subject: [PATCH 01/32] doc updates --- docs/Makefile | 12 ++- docs/conf.py | 174 ++++++++++++++++++++++++++++--------- docs/config.rst | 47 ---------- docs/faq.rst | 18 ++-- docs/high/attr.rst | 11 +++ docs/high/dataset.rst | 73 +++++----------- docs/high/dims.rst | 2 +- docs/high/file.rst | 166 ++++++++++++++++++++++------------- docs/high/group.rst | 73 ++++++++++++++-- docs/high/lowlevel.rst | 2 +- docs/index.rst | 36 +++++--- docs/mpi.rst | 14 +-- docs/quick.rst | 82 ++++++++++++++--- docs/requirements-docs.txt | 3 - docs/strings.rst | 22 ++++- docs/swmr.rst | 4 +- docs/vds.rst | 12 +-- docs/whatsnew/0.9.0.rst | 17 ---- docs/whatsnew/index.rst | 26 +++++- 19 files changed, 514 insertions(+), 280 deletions(-) delete mode 100644 docs/requirements-docs.txt delete mode 100644 docs/whatsnew/0.9.0.rst diff --git a/docs/Makefile b/docs/Makefile index 6d9eed9e..faad298b 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,4 +1,5 @@ # Makefile for Sphinx documentation +# # You can set these variables from the command line. SPHINXOPTS = -W @@ -84,17 +85,17 @@ qthelp: @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/h5py.qhcp" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/h5pyd.qhcp" @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/h5py.qhc" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/h5pyd.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/h5py" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/h5py" + @echo "# mkdir -p $$HOME/.local/share/devhelp/h5pyd" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/h5pyd" @echo "# devhelp" epub: @@ -174,3 +175,6 @@ pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +show: + @python -m webbrowser -t "file://$(shell pwd)/_build/html/index.html" diff --git a/docs/conf.py b/docs/conf.py index 6dde65b6..dab4ab26 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,32 +1,45 @@ +# -*- coding: utf-8 -*- +# +# h5pyd documentation build configuration file, +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + import sys import os -from importlib import metadata # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -# sys.path.insert(0, os.path.abspath('.')) +#sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' +#needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - 'sphinx.ext.intersphinx', - 'sphinx.ext.extlinks', - 'sphinx.ext.mathjax', + 'sphinx.ext.intersphinx', + 'sphinx.ext.extlinks', + 'sphinx.ext.mathjax', ] -# intersphinx_mapping = {'low': ('https://api.h5py.org', None)} +intersphinx_mapping = {'low': ('https://api.h5py.org', None)} extlinks = { - 'issue': ('https://github.com/HDFGroup/h5pyd/issues/%s', 'GH'), - 'pr': ('https://github.com/HDFGroup/h5pyd/pulls/%s', 'PR '), + 'issue': ('https://github.com/h5py/h5pyd/issues/%s', 'GH%s'), + 'pr': ('https://github.com/h5py/h5pyd/pull/%s', 'PR %s'), } + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -34,33 +47,33 @@ source_suffix = '.rst' # The encoding of source files. -# source_encoding = 'utf-8-sig' +#source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' # General information about the project. project = 'h5pyd' -copyright = '2014 The HDF Group' +copyright = '2017, The HDF Group' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The full version, including alpha/beta/rc tags. -release = metadata.version('h5pyd') +release = '0.20.0' # The short X.Y version. version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -# language = None +#language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -# today = '' +#today = '' # Else, today_fmt is used as the format for a strftime call. -# today_fmt = '%B %d, %Y' +#today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -68,58 +81,58 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -# default_role = None +#default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True +#add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -# add_module_names = True +#add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -# show_authors = False +#show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] +#modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False +#keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'furo' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -# html_theme_options = {} +#html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] +#html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". 
-# html_title = None +#html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -# html_short_title = None +#html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -# html_logo = None +#html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -# html_favicon = None +#html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -129,48 +142,127 @@ # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -# html_extra_path = [] +#html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -# html_last_updated_fmt = '%b %d, %Y' +#html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -# html_use_smartypants = True +#html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -# html_sidebars = {} +#html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -# html_additional_pages = {} +#html_additional_pages = {} # If false, no module index is generated. -# html_domain_indices = True +#html_domain_indices = True # If false, no index is generated. -# html_use_index = True +#html_use_index = True # If true, the index is split into individual pages for each letter. -# html_split_index = False +#html_split_index = False # If true, links to the reST sources are added to the pages. -# html_show_sourcelink = True +#html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# html_show_sphinx = True +#html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# html_show_copyright = True +#html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -# html_use_opensearch = '' +#html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = None +#html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'h5pyddoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'h5pyd.tex', 'h5pyd Documentation', + 'The HDF Group', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. 
+#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'h5pyd', 'h5pyd Documentation', + ['The HDF Group'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'h5pyd', 'h5pyd Documentation', + 'The HDF Group', 'h5pyd', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/config.rst b/docs/config.rst index 9eb188b3..2c8ded3f 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -27,50 +27,3 @@ attributes: particular container by specifying ``track_order`` argument to :class:`h5py.File`, :meth:`h5py.Group.create_group`, :meth:`h5py.Group.create_dataset`. The default is ``False``. - - -IPython -------- - -H5py ships with a custom ipython completer, which provides object introspection -and tab completion for h5py objects in an ipython session. For example, if a -file contains 3 groups, "foo", "bar", and "baz":: - - In [4]: f['b - bar baz - - In [4]: f['f - # Completes to: - In [4]: f['foo' - - In [4]: f['foo']. - f['foo'].attrs f['foo'].items f['foo'].ref - f['foo'].copy f['foo'].iteritems f['foo'].require_dataset - f['foo'].create_dataset f['foo'].iterkeys f['foo'].require_group - f['foo'].create_group f['foo'].itervalues f['foo'].values - f['foo'].file f['foo'].keys f['foo'].visit - f['foo'].get f['foo'].name f['foo'].visititems - f['foo'].id f['foo'].parent - -The easiest way to enable the custom completer is to do the following in an -IPython session:: - - In [1]: import h5py - - In [2]: h5py.enable_ipython_completer() - -It is also possible to configure IPython to enable the completer every time you -start a new session. For >=ipython-0.11, "h5py.ipy_completer" just needs to be -added to the list of extensions in your ipython config file, for example -:file:`~/.config/ipython/profile_default/ipython_config.py` (if this file does -not exist, you can create it by invoking `ipython profile create`):: - - c = get_config() - c.InteractiveShellApp.extensions = ['h5py.ipy_completer'] - -For >> arr = np.arange(100) >>> dset = f.create_dataset("init", data=arr) +Assigning an array into a group works like specifying ``data`` and no other +parameters:: + + >>> f["init"] = arr + Keywords ``shape`` and ``dtype`` may be specified along with ``data``; if so, they will override ``data.shape`` and ``data.dtype``. 
It's required that (1) the total number of points in ``shape`` match the total number of points @@ -58,7 +63,7 @@ the requested ``dtype``. Reading & writing data ---------------------- -HDF5 datasets re-use the NumPy slicing syntax to read and write to the file. +HDF5 datasets reuse the NumPy slicing syntax to read and write to the file. Slice specifications are translated directly to HDF5 "hyperslab" selections, and are a fast and efficient way to access data in the file. The following slicing arguments are recognized: @@ -209,6 +214,10 @@ axes using ``None``:: >>> dset = f.create_dataset("unlimited", (10, 10), maxshape=(None, 10)) +For a 1D dataset, ``maxshape`` can be an integer instead of a tuple. But to make an +unlimited 1D dataset, ``maxshape`` must be a tuple ``(None,)``. Passing ``None`` gives +the default behaviour, where the initial size is also the maximum. + .. note:: Resizing an array with existing data works differently than in NumPy; if any axis shrinks, the data in the missing region is discarded. Data does not "rearrange" itself as it does when resizing a NumPy array. @@ -267,10 +276,10 @@ The ``compression_opts`` parameter will then be passed to this filter. A Python package of several popular filters, including Blosc, LZ4 and ZFP, for convenient use with h5py - `HDF5 Filter Plugins `_ + `HDF5 Filter Plugins `_ A collection of filters as a single download from The HDF Group - `Registered filter plugins `_ + `Registered filter plugins `_ The index of publicly announced filter plugins .. note:: The underlying implementation of the compression filter will have the @@ -340,7 +349,7 @@ A MultiBlockSlice can be used in place of a slice to select a number of (count) blocks of multiple elements separated by a stride, rather than a set of single elements separated by a step. -For an explanation of how this slicing works, see the `HDF5 documentation `_. +For an explanation of how this slicing works, see the `HDF5 documentation `_. For example:: @@ -436,47 +445,6 @@ The dtype of the dataset can be accessed via ``.dtype`` as per normal. As empty datasets cannot be sliced, some methods of datasets such as ``read_direct`` will raise a ``TypeError`` exception if used on a empty dataset. -Reading and Writing to Multiple Datasets ------------------------------------------------------------- -The MultiManager interface enables reading and writing to multiple datasets -in parallel. A MultiManager requires a list of datasets to operate on, and then accepts -slice arguments for reading and writing like a typical Dataset. - -Reading datasets through a MultiManager returns a list where each entry is an array containing -the values read from the corresponding data. - - >>> mm = MultiManager(datasets=[dset1, dset2, dset3]) - >>> data = mm[...] # read all elements from each dataset - >>> data[0] # data read from dset1 - [0, 1, 2, 3] - >>> data[1] # data read from dset2 - [0, 2, 3, 4] - -Writing to datasets through a MultiManager requires a list where each entry is an array containing -the values to be written to each dataset. - - >>> mm[0] = [[1], [2], [3]] # write a different element to index 0 in each dataset - >>> data = mm[...] - >>> data[0] - [1, 1, 2, 3] - >>> data[1] - [2, 2, 3, 4] - -Multiple selections can be provided to read or write to a different region on each dataset in the MultiManager. 
- - >>> selections = [np.s_[0:2], np.s_[1:4], np.s_[2:4]] - >>> data = mm[selections] - >>> data[0] - [1, 1] - >>> data[1] - [2, 3, 4] - >>> mm[selections] == [[0, 1], [4, 5, 6], [7, 8]] - >>> data = mm[...] - >>> data[0] - [0, 1, 2, 3] - >>> data[1] - [2, 4, 5, 6] - Reference --------- @@ -505,7 +473,7 @@ Reference >>> dset = f["MyDS"] >>> f.close() >>> if dset: - ... print("datset accessible") + ... print("dataset accessible") ... else: ... print("dataset inaccessible") dataset inaccessible @@ -544,12 +512,8 @@ Reference >>> out.dtype dtype('int16') - .. versionchanged:: 3.0 - Allowed reading through the wrapper object. In earlier versions, - :meth:`astype` had to be used as a context manager: - - >>> with dset.astype('int16'): - ... out = dset[:] + .. versionchanged:: 3.9 + :meth:`astype` can no longer be used as a context manager. .. method:: asstr(encoding=None, errors='strict') @@ -703,6 +667,11 @@ Reference Access to :ref:`dimension_scales`. + .. attribute:: is_scale + + Return ``True`` if the dataset is also a :ref:`dimension scale `, + ``False`` otherwise. + .. attribute:: attrs :ref:`attributes` for this dataset. diff --git a/docs/high/dims.rst b/docs/high/dims.rst index 8fe913ad..77719ea6 100644 --- a/docs/high/dims.rst +++ b/docs/high/dims.rst @@ -95,7 +95,7 @@ returned. There is no guarantee that the name of the dimension scale is unique. Nested dimension scales are not permitted: if a dataset has a dimension scale attached to it, converting the dataset to a dimension scale will fail, since the -`HDF5 specification doesn't allow this `_. :: +`HDF5 specification doesn't allow this `_. :: >>> f['data'].make_scale() RuntimeError: Unspecified error in H5DSset_scale (return value <0) diff --git a/docs/high/file.rst b/docs/high/file.rst index b3394334..22713b7c 100644 --- a/docs/high/file.rst +++ b/docs/high/file.rst @@ -9,6 +9,8 @@ File objects serve as your entry point into the world of HDF5. In addition to the File-specific capabilities listed here, every File instance is also an :ref:`HDF5 group ` representing the `root group` of the file. +Note: Python "File-like" objects are not supported. + .. _file_open: Opening & creating files @@ -96,67 +98,37 @@ of supported drivers and their options: Raw data filename extension. Default is '-r.h5'. 'ros3' - Allows read only access to HDF5 files on S3. Keywords: + Enables read-only access to HDF5 files in the AWS S3 or S3-compatible object + stores. HDF5 file name must be one of \http://, \https://, or s3:// + resource location. An s3:// location will be translated into an AWS + `path-style `_ + location by h5py. Keywords: aws_region: - Name of the AWS "region" where the S3 bucket with the file is, e.g. ``b"us-east-1"``. Default is ``b''``. + AWS region of the S3 bucket with the file, e.g. ``b"us-east-1"``. + Default is ``b''``. Required for s3:// locations. secret_id: - "Access ID" for the resource. Default is ``b''``. + AWS access key ID. Default is ``b''``. secret_key: - "Secret Access Key" associated with the ID and resource. Default is ``b''``. - - The argument values must be ``bytes`` objects. - - -.. _file_fileobj: - -Python file-like objects ------------------------- - -.. versionadded:: 2.9 - -The first argument to :class:`File` may be a Python file-like object, such as -an :class:`io.BytesIO` or :class:`tempfile.TemporaryFile` instance. -This is a convenient way to create temporary HDF5 files, e.g. for testing or to -send over the network. 
- -The file-like object must be open for binary I/O, and must have these methods: -``read()`` (or ``readinto()``), ``write()``, ``seek()``, ``tell()``, -``truncate()`` and ``flush()``. - - - >>> tf = tempfile.TemporaryFile() - >>> f = h5py.File(tf, 'w') - -Accessing the :class:`File` instance after the underlying file object has been -closed will result in undefined behaviour. + AWS secret access key. Default is ``b''``. -When using an in-memory object such as :class:`io.BytesIO`, the data written -will take up space in memory. If you want to write large amounts of data, -a better option may be to store temporary data on disk using the functions in -:mod:`tempfile`. + session_token: + AWS temporary session token. Default is ``b''``.' Must be used + together with temporary secret_id and secret_key. Available from HDF5 1.14.2. -.. literalinclude:: ../../examples/bytesio.py + The argument values must be ``bytes`` objects. Arguments aws_region, + secret_id, and secret_key are required to activate AWS authentication. -.. warning:: + .. note:: + Pre-built h5py packages on PyPI do not include ros3 driver support. If + you want this feature, you could use packages from conda-forge, or + :ref:`build h5py from source ` against an HDF5 build + with ros3. Alternatively, use the :ref:`file-like object + ` support with a package like s3fs. - When using a Python file-like object for an HDF5 file, make sure to close - the HDF5 file before closing the file object it's wrapping. If there is an - error while trying to close the HDF5 file, segfaults may occur. -.. note:: - - Using a Python file-like object for HDF5 is internally more complex, - as the HDF5 C code calls back into Python to access it. It inevitably - has more ways to go wrong, and the failures may be less clear when it does. - For some common use cases, you can easily avoid it: - - - To create a file in memory and never write it to disk, use the ``'core'`` - driver with ``mode='w', backing_store=False`` (see :ref:`file_driver`). - - To use a temporary file securely, make a temporary directory and - :ref:`open a file path ` inside it. .. _file_version: @@ -308,10 +280,10 @@ given dataset's chunks are controlled when creating the dataset, but it is possible to adjust the behavior of the chunk *cache* when opening the file. The parameters controlling this behavior are prefixed by ``rdcc``, for *raw data -chunk cache*. +chunk cache*. They apply to all datasets unless specifically changed for each one. * ``rdcc_nbytes`` sets the total size (measured in bytes) of the raw data chunk - cache for each dataset. The default size is 1 MB. + cache for each dataset. The default size is 1 MiB. This should be set to the size of each chunk times the number of chunks that are likely to be needed in cache. * ``rdcc_w0`` sets the policy for chunks to be @@ -346,8 +318,41 @@ chunk cache*. approximately 100 times that number of chunks. The default value is 521. Chunks and caching are described in greater detail in the `HDF5 documentation -`_. +`_. + +.. _file_alignment: + +Data alignment +-------------- + +When creating datasets within files, it may be advantageous to align the offset +within the file itself. This can help optimize read and write times if the data +become aligned with the underlying hardware, or may help with parallelism with +MPI. Unfortunately, aligning small variables to large blocks can leave a lot of +empty space in a file. To this effect, application developers are left with two +options to tune the alignment of data within their file. 
The two variables +``alignment_threshold`` and ``alignment_interval`` in the :class:`File` +constructor help control the threshold in bytes where the data alignment policy +takes effect and the alignment in bytes within the file. The alignment is +measured from the end of the user block. +For more information, see the official HDF5 documentation `H5P_SET_ALIGNMENT +`_. + +.. _file_meta_block_size: + +Meta block size +--------------- + +Space for metadata is allocated in blocks within the HDF5 file. The argument +``meta_block_size`` of the :class:`File` constructor sets the minimum size of +these blocks. Setting a large value can consolidate metadata into a small +number of regions. Setting a small value can reduce the overall file size, +especially in combination with the ``libver`` option. This controls how the +overall data and metadata are laid out within the file. + +For more information, see the official HDF5 documentation `H5P_SET_META_BLOCK_SIZE +`_. Reference --------- @@ -358,15 +363,17 @@ Reference HDF5 name of the root group, "``/``". To access the on-disk name, use :attr:`File.filename`. -.. class:: File(name, mode=None, driver=None, libver=None, \ - userblock_size=None, swmr=False, rdcc_nslots=None, rdcc_nbytes=None, \ - rdcc_w0=None, track_order=None, fs_strategy=None, fs_persist=False, \ - fs_threshold=1, **kwds) +.. class:: File(name, mode='r', driver=None, libver=None, userblock_size=None, \ + swmr=False, rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, \ + track_order=None, fs_strategy=None, fs_persist=False, fs_threshold=1, \ + fs_page_size=None, page_buf_size=None, min_meta_keep=0, min_raw_keep=0, \ + locking=None, alignment_threshold=1, alignment_interval=1, **kwds) Open or create a new file. - Note that in addition to the File-specific methods and properties listed - below, File objects inherit the full interface of :class:`Group`. + Note that in addition to the :class:`File`-specific methods and properties + listed below, :class:`File` objects inherit the full interface of + :class:`Group`. :param name: Name of file (`bytes` or `str`), or an instance of :class:`h5f.FileID` to bind to an existing @@ -392,13 +399,47 @@ Reference ``h5.get_config().track_order``. :param fs_strategy: The file space handling strategy to be used. Only allowed when creating a new file. One of "fsm", "page", - "aggregate", "none", or None (to use the HDF5 default). + "aggregate", "none", or ``None`` (to use the HDF5 default). :param fs_persist: A boolean to indicate whether free space should be persistent or not. Only allowed when creating a new file. The default is False. + :param fs_page_size: File space page size in bytes. Only use when + fs_strategy="page". If ``None`` use the HDF5 default (4096 bytes). :param fs_threshold: The smallest free-space section size that the free space manager will track. Only allowed when creating a new file. The default is 1. + :param page_buf_size: Page buffer size in bytes. Only allowed for HDF5 files + created with fs_strategy="page". Must be a power of two value and + greater or equal than the file space page size when creating the + file. It is not used by default. + :param min_meta_keep: Minimum percentage of metadata to keep in the page + buffer before allowing pages containing metadata to be evicted. + Applicable only if ``page_buf_size`` is set. Default value is zero. + :param min_raw_keep: Minimum percentage of raw data to keep in the page + buffer before allowing pages containing raw data to be evicted. 
+ Applicable only if ``page_buf_size`` is set. Default value is zero. + :param locking: The file locking behavior. One of: + + - False (or "false") -- Disable file locking + - True (or "true") -- Enable file locking + - "best-effort" -- Enable file locking but ignore some errors + - None -- Use HDF5 defaults + + .. warning:: + + The HDF5_USE_FILE_LOCKING environment variable can override + this parameter. + + Only available with HDF5 >= 1.12.1 or 1.10.x >= 1.10.7. + :param alignment_threshold: Together with ``alignment_interval``, this + property ensures that any file object greater than or equal + in size to the alignment threshold (in bytes) will be + aligned on an address which is a multiple of alignment interval. + :param alignment_interval: This property should be used in conjunction with + ``alignment_threshold``. See the description above. For more + details, see :ref:`file_alignment`. + :param meta_block_size: Determines the current minimum size, in bytes, of + new metadata block allocations. See :ref:`file_meta_block_size`. :param kwds: Driver-specific keywords; see :ref:`file_driver`. .. method:: __bool__() @@ -452,3 +493,8 @@ Reference .. attribute:: userblock_size Size of user block (in bytes). Generally 0. See :ref:`file_userblock`. + + .. attribute:: meta_block_size + + Minimum size, in bytes, of metadata block allocations. Default: 2048. + See :ref:`file_meta_block_size`. diff --git a/docs/high/group.rst b/docs/high/group.rst index 60b032f9..7b222a06 100644 --- a/docs/high/group.rst +++ b/docs/high/group.rst @@ -135,7 +135,7 @@ If the target is removed, they will "dangle": External links ~~~~~~~~~~~~~~ -New in HDF5 1.8, external links are "soft links plus", which allow you to +External links are "soft links plus", which allow you to specify the name of the file as well as the path to the desired object. You can refer to objects in any file you wish. Use similar syntax as for soft links: @@ -277,6 +277,19 @@ Reference In this case `object` will be a :class:`Group` or :class:`Dataset` instance. + .. method:: visit_links(callable) + visititems_links(callable) + + These methods are like :meth:`visit` and :meth:`visititems`, but work on + the links in groups, rather than the objects those links point to. So if + you have two links pointing to the same object, these will 'see' both. + They also see soft & external links, which :meth:`visit` and + :meth:`visititems` ignore. + + The second argument to the callback for ``visititems_links`` is an + instance of one of the :ref:`link classes `. + + .. versionadded:: 3.11 .. method:: move(source, dest) @@ -292,9 +305,16 @@ Reference .. method:: copy(source, dest, name=None, shallow=False, expand_soft=False, expand_external=False, expand_refs=False, without_attrs=False) - Copy an object or group. The source and destination need not be in - the same file. If the source is a Group object, by default all objects - within that group will be copied recursively. + Copy an object or group. The source can be a path, Group, Dataset, or + Datatype object. The destination can be either a path or a Group + object. The source and destination need not be in the same file. + + If the source is a Group object, by default all objects within that + group will be copied recursively. + + When the destination is a Group object, by default the target will be + created in that group with its current name (basename of obj.name). You + can override that by setting "name" to a string. :param source: What to copy. 
May be a path in the file or a Group/Dataset object. :param dest: Where to copy it. May be a path or Group object. @@ -362,6 +382,15 @@ Reference :keyword fillvalue: This value will be used when reading uninitialized parts of the dataset. + :keyword fill_time: Control when to write the fill value. One of the + following choices: `alloc`, write fill value before writing + application data values or when the dataset is created; `never`, + never write fill value; `ifset`, write fill value if it is defined. + Default to `ifset`, which is the default of HDF5 library. If the + whole dataset is going to be written by the application, setting + this to `never` can avoid unnecessary writing of fill value and + potentially improve performance. + :keyword track_times: Enable dataset creation timestamps (**T**/F). :keyword track_order: Track attribute creation order if @@ -377,12 +406,37 @@ Reference it grow as needed. If only a name is given instead of an iterable of tuples, it is equivalent to ``[(name, 0, h5py.h5f.UNLIMITED)]``. + :keyword allow_unknown_filter: Do not check that the requested filter is available for use (T/F). This should only be set if you will write any data with ``write_direct_chunk``, compressing the data before passing it to h5py. - .. method:: require_dataset(name, shape=None, dtype=None, exact=None, **kwds) + :keyword rdcc_nbytes: Total size of the dataset's chunk cache in bytes. + The default size is 1024**2 (1 MiB). + + :keyword rdcc_w0: The chunk preemption policy for this dataset. This + must be between 0 and 1 inclusive and indicates the weighting + according to which chunks which have been fully read or written are + penalized when determining which chunks to flush from cache. A value + of 0 means fully read or written chunks are treated no differently + than other chunks (the preemption is strictly LRU) while a value of + 1 means fully read or written chunks are always preempted before + other chunks. If your application only reads or writes data once, + this can be safely set to 1. Otherwise, this should be set lower + depending on how often you re-read or re-write the same data. The + default value is 0.75. + + :keyword rdcc_nslots: The number of chunk slots in the dataset's chunk + cache. Increasing this value reduces the number of cache collisions, + but slightly increases the memory used. Due to the hashing strategy, + this value should ideally be a prime number. As a rule of thumb, + this value should be at least 10 times the number of chunks that can + fit in rdcc_nbytes bytes. For maximum performance, this value should + be set approximately 100 times that number of chunks. The default + value is 521. + + .. method:: require_dataset(name, shape, dtype, exact=False, **kwds) Open a dataset, creating it if it doesn't exist. @@ -390,11 +444,17 @@ Reference the same shape and a conversion-compatible dtype to be returned. If True, the shape and dtype must match exactly. + If keyword "maxshape" is given, the maxshape and dtype must match + instead. + + If any of the keywords "rdcc_nslots", "rdcc_nbytes", or "rdcc_w0" are + given, they will be used to configure the dataset's chunk cache. + Other dataset keywords (see create_dataset) may be provided, but are only used if a new dataset is to be created. Raises TypeError if an incompatible object already exists, or if the - shape or dtype don't match according to the above rules. + shape, maxshape or dtype don't match according to the above rules. 
:keyword exact: Require shape and type to match exactly (T/**F**)
@@ -478,6 +538,7 @@ Reference
 :class:`Group` instance containing this group.
+.. _group_link_classes:
 Link classes
 ------------
diff --git a/docs/high/lowlevel.rst b/docs/high/lowlevel.rst
index 3b35fc38..c6c92821 100644
--- a/docs/high/lowlevel.rst
+++ b/docs/high/lowlevel.rst
@@ -8,7 +8,7 @@ h5py also provides a low-level API, which more closely follows the HDF5 C API.
 .. seealso::
    - `h5py Low-Level API Reference `_
-   - `HDF5 C/Fortran Reference Manual `_
+   - `HDF5 C/Fortran Reference Manual `_
 You can easily switch between the two levels in your code:
diff --git a/docs/index.rst b/docs/index.rst
index 32b1508a..54645c95 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,13 +1,23 @@
-HDF5 for Python
-===============
+HDF5 for the Cloud
+==================
-The h5py package is a Pythonic interface to the HDF5 binary data format.
+The h5pyd package is a Python interface for accessing the Highly Scalable Data Service (HSDS)
+(https://www.hdfgroup.org/solutions/highly-scalable-data-service-hsds/). Whereas the HDF5 library enables
+complex scientific data to be stored in files, HSDS uses a cloud-native storage model that works well with
+object storage systems such as AWS S3 and Azure Blob Storage (regular POSIX stores are also supported).
-`HDF5 `_ lets you store huge amounts of numerical
-data, and easily manipulate that data from NumPy. For example, you can slice
-into multi-terabyte datasets stored on disk, as if they were real NumPy
-arrays. Thousands of datasets can be stored in a single file, categorized and
-tagged however you want.
+The native interface for HSDS is the HDF REST API (https://github.com/HDFGroup/hdf-rest-api), but the h5pyd package
+provides a convenient mode of access using the same API as the popular h5py package (https://docs.h5py.org/en/stable/index.html)
+that provides a Pythonic interface to the HDF5 library. In fact, many applications that use h5py can be
+converted to using h5pyd just by adding the statement ``import h5pyd as h5py``.
+
+However, not every h5py feature is supported in h5pyd (at least not yet!). For example, Virtual Datasets are not supported.
+For a complete list see the :ref:`FAQ `. In addition, there are some features that are supported, like Parallel HDF5,
+but work somewhat differently from the HDF5 library. And finally, there are features of h5pyd that don't have any
+correspondence to h5py, such as Folders.
+
+The h5pyd package also includes a set of command line tools for doing common tasks such as uploading HDF5 files to HSDS.
+See: tbd for a description of the CLI tools.
 Where to start
 --------------
@@ -20,8 +30,9 @@ Other resources
 ---------------
 * `Python and HDF5 O'Reilly book `_
-* `Ask questions on the mailing list at Google Groups `_
-* `GitHub project `_
+* `Ask questions on the HDF forum `_
+* `GitHub project for h5pyd `_
+* `GitHub project for HSDS `_
 Introductory info
@@ -61,10 +72,11 @@ Advanced topics
    mpi
    swmr
    vds
+   related_projects
-Meta-info about the h5py project
---------------------------------
+Meta-info about the h5pyd project
+---------------------------------
 .. toctree::
    :maxdepth: 1
diff --git a/docs/mpi.rst b/docs/mpi.rst
index fbe0e2c6..c0729599 100644
--- a/docs/mpi.rst
+++ b/docs/mpi.rst
@@ -3,11 +3,11 @@
 Parallel HDF5
 =============
-Read-only parallel access to HDF5 files works with no special preparation:
-each process should open the file independently and read data normally
-(avoid opening the file and then forking).
+Parallel read access to HDF5 files is possible from separate processes (but not
+threads) with no special features. It's advised to open the file independently
+in each reader process; opening the file once and then forking may cause issues.
-`Parallel HDF5 `_ is a
+**Parallel HDF5** is a
 feature built on MPI which also supports *writing* an HDF5 file in parallel.
 To use this, both HDF5 and h5py must be compiled with MPI support turned on,
 as described below.
@@ -21,7 +21,7 @@ Passing Interface) standard for interprocess communication.  Consequently,
 when using Parallel HDF5 from Python, your application will also have to use
 the MPI library.
-This is accomplished through the `mpi4py `_ Python package, which provides
+This is accomplished through the `mpi4py `_ Python package, which provides
 excellent, complete Python bindings for MPI.  Here's an example
 "Hello World" using ``mpi4py``::
@@ -41,7 +41,7 @@ The ``mpi4py`` package includes all kinds of mechanisms to share data between
 processes, synchronize, etc.  It's a different flavor of parallelism than,
 say, threads or ``multiprocessing``, but easy to get used to.
-Check out the `mpi4py web site `_ for more information
+Check out the `mpi4py web site `_ for more information
 and a great tutorial.
@@ -138,7 +138,7 @@ On the other hand, writing data to a dataset can be done independently::
 MPI atomic mode
 ---------------
-HDF5 versions 1.8.9+ support the MPI "atomic" file access mode, which trades
+HDF5 supports the MPI "atomic" file access mode, which trades
 speed for more stringent consistency requirements.  Once you've opened a file
 with the ``mpio`` driver, you can place it in atomic mode using the settable
 ``atomic`` property::
diff --git a/docs/quick.rst b/docs/quick.rst
index 4e035800..9331d6f9 100644
--- a/docs/quick.rst
+++ b/docs/quick.rst
@@ -3,34 +3,92 @@
 Quick Start Guide
 =================
+GitHub Codespaces
+-----------------
+
+The quickest way to get started with h5pyd and HSDS is to use GitHub Codespaces. You can launch a
+codespace that includes h5pyd and HSDS by clicking here: .
+Try out some of the included examples and Python notebooks to get familiar with the features
+offered by the package.
+
+Read on to run h5pyd on your laptop or desktop system...
+
 Install
 -------
-With `Anaconda `_ or
-`Miniconda `_::
+You can install ``h5pyd`` via pip::
-    conda install h5py
+    pip install h5pyd
+If you will be running your own HSDS, install the ``hsds`` package as well::
-If there are wheels for your platform (mac, linux, windows on x86) and
-you do not need MPI you can install ``h5py`` via pip::
+    pip install hsds
-    pip install h5py
+HSDS Setup
+----------
+
+If you will be using an existing HSDS instance, your sysadmin will provide you
+with the http endpoint, username, and password used to access HSDS.  You can
+skip the rest of this section and go to h5pyd configuration below.
+
+HSDS can be installed on different platforms such as Kubernetes, Docker, or DC/OS.  For
+these options see the relevant install guide in .
+
+For now though, we will just run HSDS locally.  Follow these steps:
+
+* Make a directory that will be used for data storage.  For example: ``mkdir ~/hsds_data``
+* Start the HSDS service: ``hsds --root_dir ~/hsds_data``
+* Once you see the output: ``READY! use endpoint: http://localhost:5101``, you can open the HSDS status page at: 
-With `Enthought Canopy `_, use
-the GUI package manager or::
+When you are ready to shut down HSDS, just hit Ctrl-C in the terminal window where you started it.
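If you want to confirm that the server is responding before continuing, one option is to query
the HDF REST API ``/about`` endpoint directly. The snippet below is just an illustrative sketch:
it assumes a local HSDS instance started as described above and the ``requests`` package installed,
and the exact fields returned will depend on the HSDS version::

    import requests

    # the endpoint printed by HSDS when it started up
    endpoint = "http://localhost:5101"

    # /about does not require authentication
    rsp = requests.get(endpoint + "/about")
    print(rsp.status_code)   # expect 200 when the server is up
    print(rsp.json())        # server state and version information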
-    enpkg h5py
+h5pyd configuration
+-------------------
+
+Typically, HTTP requests to HSDS need to be authenticated (likely you wouldn't want just anyone messing around with your HDF data!).
+HSDS supports several authentication protocols, but the simplest (if not most secure) to use when getting started is what is
+known as HTTP Basic Authentication. In this protocol, your HSDS username and password are encoded in the HTTP header of
+each request. The h5pyd package will do this automatically, storing your credentials in the file ``~/.hscfg``.
+
+You can edit this file by hand, or use the hsconfigure tool included with h5pyd:
+
+* Start the app by running: ``hsconfigure`` in a terminal
+* You'll be prompted for a server endpoint. If running HSDS locally, enter: ``http://localhost:5101``
+* Next you'll be asked for your username. Enter your system username if running HSDS locally, or the name given by your sysadmin otherwise
+* Next you'll be asked for your password. Again, use your system password if running HSDS locally, or the password provided by your sysadmin
+* For API Key, just hit enter
+* Access to HSDS and your credentials will be verified, and if ok, you will see ``connection ok``
+* Type ``Y`` to save your information to the .hscfg file
+
+At any time, you can verify access to HSDS by running ``hsabout``. This utility will use your saved credentials to fetch
+status information from the server and display it.
-To install from source see :ref:`install`.
 Core concepts
 -------------
-An HDF5 file is a container for two kinds of objects: `datasets`, which are
+While the HDF5 library works with files on a POSIX filesystem (typically a local disk or network mount),
+with h5pyd all access to data storage is mediated by HSDS. For example, HSDS may be configured to use
+AWS storage that you don't have permissions to view directly.
+
+To make keeping track of everything easier, HSDS manages storage using three levels of organization:
+
+* Buckets are collections of Folders and Domains
+* Folders live in buckets and work much like directories in a POSIX file system
+* Domains are the equivalent of HDF5 files
+
+Buckets are set up by the HSDS administrator and will correspond to AWS S3 Buckets, Azure Blob Containers, or POSIX directories.
+Buckets cannot be created using the h5pyd package (these need to be set up by the HSDS administrator),
+but the h5pyd File and Folder objects have an optional bucket parameter to specify which
+bucket to access. Typically HSDS will be set up with a default bucket that will be used if no bucket name is given explicitly.
+
+Folders can be created using h5pyd (or the hstouch CLI tool). Likewise Domains can be created using ``h5pyd.File`` or the
+hstouch CLI tool, e.g. ``hstouch /home/$USER/myfile.h5``.
+
+As with HDF5 files, HSDS domains are containers for two kinds of objects: `datasets`, which are
 array-like collections of data, and `groups`, which are folder-like containers
 that hold datasets and other groups.
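For illustration, here is a short h5pyd session that creates a domain, adds a group and a dataset,
and then lists the contents of the enclosing folder. The folder path, domain name, and bucket name
below are hypothetical; substitute values appropriate for your server and account::

    import h5pyd as h5py

    # create a new domain (the HSDS analogue of creating an HDF5 file)
    f = h5py.File("/home/myuser/quickstart.h5", "w")
    # f = h5py.File("/home/myuser/quickstart.h5", "w", bucket="mybucket")  # explicit bucket
    grp = f.create_group("grp1")
    dset = f.create_dataset("dset1", shape=(10,), dtype="i4")
    dset[:] = list(range(10))
    f.close()

    # iterate over the enclosing folder, similar to a directory listing
    folder = h5py.Folder("/home/myuser/")
    for name in folder:
        print(name)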
The most fundamental thing to remember -when using h5py is: +when using h5py(d) is: **Groups work like dictionaries, and datasets work like NumPy arrays** diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt deleted file mode 100644 index 325a62a4..00000000 --- a/docs/requirements-docs.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx>=4.3.1 -furo==2021.11.23 -rstcheck==3.3.1 diff --git a/docs/strings.rst b/docs/strings.rst index 834ea189..b6be9830 100644 --- a/docs/strings.rst +++ b/docs/strings.rst @@ -17,7 +17,8 @@ for variable-length strings, or numpy bytes arrays (``'S'`` dtypes) for fixed-length strings. Use :meth:`.Dataset.asstr` to retrieve ``str`` objects. Variable-length strings in attributes are read as ``str`` objects. These are -decoded as UTF-8 with surrogate escaping for unrecognised bytes. +decoded as UTF-8 with surrogate escaping for unrecognised bytes. Fixed-length +strings are read as numpy bytes arrays, the same as for datasets. Storing strings --------------- @@ -25,7 +26,24 @@ Storing strings When creating a new dataset or attribute, Python ``str`` or ``bytes`` objects will be treated as variable-length strings, marked as UTF-8 and ASCII respectively. Numpy bytes arrays (``'S'`` dtypes) make fixed-length strings. -You can use :func:`.string_dtype` to explicitly specify any HDF5 string datatype. +You can use :func:`.string_dtype` to explicitly specify any HDF5 string datatype, +as shown in the examples below:: + + string_data = ["varying", "sizes", "of", "strings"] + + # Variable length strings (implicit) + f['vlen_strings1'] = string_data + + # Variable length strings (explicit) + ds = f.create_dataset('vlen_strings2', shape=4, dtype=h5py.string_dtype()) + ds[:] = string_data + + # Fixed length strings (implicit) - longer strings are truncated + f['fixed_strings1'] = np.array(string_data, dtype='S6') + + # Fixed length strings (explicit) - longer strings are truncated + ds = f.create_dataset('fixed_strings2', shape=4, dtype=h5py.string_dtype(length=6)) + ds[:] = string_data When writing data to an existing dataset or attribute, data passed as bytes is written without checking the encoding. Data passed as Python ``str`` objects diff --git a/docs/swmr.rst b/docs/swmr.rst index 89580a62..2c1192d6 100644 --- a/docs/swmr.rst +++ b/docs/swmr.rst @@ -37,7 +37,7 @@ creating the file. The HDF Group has documented the SWMR features in details on the website: -`Single-Writer/Multiple-Reader (SWMR) Documentation `_. +`Single-Writer/Multiple-Reader (SWMR) Documentation `_. This is highly recommended reading for anyone intending to use the SWMR feature even through h5py. For production systems in particular pay attention to the file system requirements regarding POSIX I/O semantics. @@ -99,7 +99,7 @@ example uses the the linux inotify (`pyinotify `_ python bindings) to receive a signal each time the target file has been updated. -.. literalinclude:: ../examples/swmr_inotify_example.py +.. literalinclude:: ../examples/swmr_multiprocess.py Multiprocess concurrent write and read ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/vds.rst b/docs/vds.rst index a9a7c7f6..a26d862a 100644 --- a/docs/vds.rst +++ b/docs/vds.rst @@ -25,11 +25,11 @@ HDF5 dataset. .. Warning:: - Virtual dataset files cannot be opened with versions of the hdf5 library + Virtual dataset files cannot be opened with versions of the HDF5 library older than 1.10. -The HDF Group has documented the VDS features in detail on the website: -`Virtual Datasets (VDS) Documentation `_. 
+The HDF Group has overview of the VDS features on their website: +`Virtual Datasets (VDS) Documentation `_. .. _creating_vds: @@ -89,7 +89,7 @@ found in the examples folder: - `dataset_concatenation.py `_ illustrates virtually stacking datasets together along a new axis. - A number of examples are based on the sample use cases presented in the - `virtual datasets RFC `__: + `virtual datasets RFC `__: - `excalibur_detector_modules.py `_ - `dual_pco_edge.py `_ @@ -124,8 +124,8 @@ Reference slice it to indicate which regions should be used in the virtual dataset. When `creating a virtual dataset `_, paths to sources present - in the same file are changed to a ".", refering to the current file (see - `H5Pset_virtual `_). + in the same file are changed to a ".", referring to the current file (see + `H5Pset_virtual `_). This will keep such sources valid in case the file is renamed. :param path_or_dataset: diff --git a/docs/whatsnew/0.9.0.rst b/docs/whatsnew/0.9.0.rst deleted file mode 100644 index 464f7064..00000000 --- a/docs/whatsnew/0.9.0.rst +++ /dev/null @@ -1,17 +0,0 @@ -What's new in h5pyd 0.9.0 -========================= - -New features ------------- - -* - -Deprecations ------------- - -* - -Development ------------ - -* diff --git a/docs/whatsnew/index.rst b/docs/whatsnew/index.rst index bb72c479..3f7010a0 100644 --- a/docs/whatsnew/index.rst +++ b/docs/whatsnew/index.rst @@ -8,4 +8,28 @@ These document the changes between minor (or major) versions of h5py. .. toctree:: - 0.9.0 + 3.12 + 3.11 + 3.10 + 3.9 + 3.8 + 3.7 + 3.6 + 3.5 + 3.4 + 3.3 + 3.2 + 3.1 + 3.0 + 2.10 + 2.9 + 2.8 + 2.7.1 + 2.7 + 2.6 + 2.5 + 2.4 + 2.3 + 2.2 + 2.1 + 2.0 From 961d874152a62e933c335f31e8d8ff715766fda2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 31 Dec 2024 10:45:48 +0800 Subject: [PATCH 02/32] fix file cloning --- docs/high/file.rst | 327 +++++-------------------------------------- docs/index.rst | 10 +- docs/quick.rst | 59 +++++--- h5pyd/_hl/files.py | 70 +++++---- test/hl/test_file.py | 27 ++++ 5 files changed, 148 insertions(+), 345 deletions(-) diff --git a/docs/high/file.rst b/docs/high/file.rst index 22713b7c..250a3de9 100644 --- a/docs/high/file.rst +++ b/docs/high/file.rst @@ -13,16 +13,16 @@ Note: Python "File-like" objects are not supported. .. _file_open: -Opening & creating files ------------------------- +Opening & creating domains +-------------------------- -HDF5 files work generally like standard Python file objects. They support +HSDS domains work generally like standard Python file objects. They support standard modes like r/w/a, and should be closed when they are no longer in use. However, there is obviously no concept of "text" vs "binary" mode. >>> f = h5py.File('myfile.hdf5','r') -The file name may be a byte string or unicode string. Valid modes are: +The file name may be a string (i.e. Python 3 unicode string). Valid modes are: ======== ================================================ r Readonly, file must exist (default) @@ -32,138 +32,35 @@ The file name may be a byte string or unicode string. Valid modes are: a Read/write if exists, create otherwise ======== ================================================ -.. versionchanged:: 3.0 - Files are now opened read-only by default. Earlier versions of h5py would - pick different modes depending on the presence and permissions of the file. + Files are opened read-only by default. So the file mode parameter is + only required for one of the writable modes. .. 
_file_driver: -File drivers ------------- - -HDF5 ships with a variety of different low-level drivers, which map the logical -HDF5 address space to different storage mechanisms. You can specify which -driver you want to use when the file is opened:: - - >>> f = h5py.File('myfile.hdf5', driver=, ) - -For example, the HDF5 "core" driver can be used to create a purely in-memory -HDF5 file, optionally written out to disk when it is closed. Here's a list -of supported drivers and their options: - - None - **Strongly recommended.** Use the standard HDF5 driver appropriate - for the current platform. On UNIX, this is the H5FD_SEC2 driver; - on Windows, it is H5FD_WINDOWS. - - 'sec2' - Unbuffered, optimized I/O using standard POSIX functions. - - 'stdio' - Buffered I/O using functions from stdio.h. - - 'core' - Store and manipulate the data in memory, and optionally write it - back out when the file is closed. Using this with an existing file - and a reading mode will read the entire file into memory. Keywords: - - backing_store: - If True (default), save changes to the real file at the specified - path on :meth:`~.File.close` or :meth:`~.File.flush`. - If False, any changes are discarded when the file is closed. - - block_size: - Increment (in bytes) by which memory is extended. Default is 64k. - - 'family' - Store the file on disk as a series of fixed-length chunks. Useful - if the file system doesn't allow large files. Note: the filename - you provide *must* contain a printf-style integer format code - (e.g. %d"), which will be replaced by the file sequence number. - Keywords: - - memb_size: Maximum file size (default is 2**31-1). - - 'fileobj' - Store the data in a Python file-like object; see below. - This is the default if a file-like object is passed to :class:`File`. - - 'split' - Splits the meta data and raw data into separate files. Keywords: - - meta_ext: - Metadata filename extension. Default is '-m.h5'. - - raw_ext: - Raw data filename extension. Default is '-r.h5'. - - 'ros3' - Enables read-only access to HDF5 files in the AWS S3 or S3-compatible object - stores. HDF5 file name must be one of \http://, \https://, or s3:// - resource location. An s3:// location will be translated into an AWS - `path-style `_ - location by h5py. Keywords: - - aws_region: - AWS region of the S3 bucket with the file, e.g. ``b"us-east-1"``. - Default is ``b''``. Required for s3:// locations. - - secret_id: - AWS access key ID. Default is ``b''``. - - secret_key: - AWS secret access key. Default is ``b''``. - - session_token: - AWS temporary session token. Default is ``b''``.' Must be used - together with temporary secret_id and secret_key. Available from HDF5 1.14.2. - - The argument values must be ``bytes`` objects. Arguments aws_region, - secret_id, and secret_key are required to activate AWS authentication. - - .. note:: - Pre-built h5py packages on PyPI do not include ros3 driver support. If - you want this feature, you could use packages from conda-forge, or - :ref:`build h5py from source ` against an HDF5 build - with ros3. Alternatively, use the :ref:`file-like object - ` support with a package like s3fs. - - - -.. _file_version: - -Version bounding ----------------- - -HDF5 has been evolving for many years now. By default, the library will write -objects in the most compatible fashion possible, so that older versions will -still be able to read files generated by modern programs. 
However, there can be -feature or performance advantages if you are willing to forgo a certain level of -backwards compatibility. By using the "libver" option to :class:`File`, you can -specify the minimum and maximum sophistication of these structures: - - >>> f = h5py.File('name.hdf5', libver='earliest') # most compatible - >>> f = h5py.File('name.hdf5', libver='latest') # most modern - -Here "latest" means that HDF5 will always use the newest version of these -structures without particular concern for backwards compatibility. The -"earliest" option means that HDF5 will make a *best effort* to be backwards -compatible. - -The default is "earliest". - -Specifying version bounds has changed from HDF5 version 1.10.2. There are two new -compatibility levels: `v108` (for HDF5 1.8) and `v110` (for HDF5 1.10). This -change enables, for example, something like this: - - >>> f = h5py.File('name.hdf5', libver=('earliest', 'v108')) - -which enforces full backward compatibility up to HDF5 1.8. Using any HDF5 -feature that requires a newer format will raise an error. - -`latest` is now an alias to another bound label that represents the latest -version. Because of this, the `File.libver` property will not use `latest` in -its output for HDF5 1.10.2 or later. +Unsupported options +------------------- + +The following options are used with h5py.File, but are not supported with h5pyd: + +* driver +* libver +* userblock_size +* rdcc_nbytes +* rdcc_w0 +* rdcc_nslots +* fs_strategy +* fs_persist +* fs_page_size +* fs_threshold +* page_buf_size +* min_meta_keep +* min_raw_keep +* locking +* alignment_threshold +* alignment_interval +* meta_block_size + +For the most part these relate to concepts that don't apply to HSDS, so are not included. .. _file_closing: @@ -181,7 +78,7 @@ HDF5 calls 'weak' closing. .. code-block:: - with h5py.File('f1.h5', 'r') as f1: + with h5py.File('/a_folder/f1.h5', 'r') as f1: ds = f1['dataset'] # ERROR - can't access dataset, because f1 is closed: @@ -197,162 +94,8 @@ HDF5 calls 'weak' closing. del ds # Now f2.h5 will be closed +.. -.. _file_userblock: - -User block ----------- - -HDF5 allows the user to insert arbitrary data at the beginning of the file, -in a reserved space called the `user block`. The length of the user block -must be specified when the file is created. It can be either zero -(the default) or a power of two greater than or equal to 512. You -can specify the size of the user block when creating a new file, via the -``userblock_size`` keyword to File; the userblock size of an open file can -likewise be queried through the ``File.userblock_size`` property. - -Modifying the user block on an open file is not supported; this is a limitation -of the HDF5 library. However, once the file is closed you are free to read and -write data at the start of the file, provided your modifications don't leave -the user block region. - - -.. _file_filenames: - -Filenames on different systems ------------------------------- - -Different operating systems (and different file systems) store filenames with -different encodings. Additionally, in Python there are at least two different -representations of filenames, as encoded ``bytes`` or as a Unicode string -(``str`` on Python 3). - -h5py's high-level interfaces always return filenames as ``str``, e.g. -:attr:`File.filename`. h5py accepts filenames as either ``str`` or ``bytes``. -In most cases, using Unicode (``str``) paths is preferred, but there are some -caveats. - -.. 
note:: - - HDF5 handles filenames as bytes (C ``char *``), and the h5py :doc:`lowlevel` - matches this. - -macOS (OSX) -........... -macOS is the simplest system to deal with, it only accepts UTF-8, so using -Unicode paths will just work (and should be preferred). - -Linux (and non-macOS Unix) -.......................... -Filenames on Unix-like systems are natively bytes. By convention, the locale -encoding is used to convert to and from unicode; on most modern systems this -will be UTF-8 by default (especially since Python 3.7, with :pep:`538`). - -Passing Unicode paths will mostly work, and Unicode paths from system -functions like ``os.listdir()`` should always work. But if there are filenames -that aren't in the expected encoding (e.g. on a network filesystem or a -removable drive, or because something is misconfigured), you may want to handle -them as bytes. - -Windows -....... -Windows systems natively handle filenames as Unicode, and with HDF5 1.10.6 and -above filenames passed to h5py as bytes will be used as UTF-8 encoded text, -regardless of system configuration. - -HDF5 1.10.5 and below could only use filenames with characters from the active -code page, e.g. `Windows-1252 `_ on -many systems configured for European languages. This limitation applies whether -you use ``str`` or ``bytes`` with h5py. - -.. _file_cache: - -Chunk cache ------------ - -:ref:`dataset_chunks` allows datasets to be stored on disk in separate pieces. -When a part of any one of these pieces is needed, the entire chunk is read into -memory before the requested part is copied to the user's buffer. To the extent -possible those chunks are cached in memory, so that if the user requests a -different part of a chunk that has already been read, the data can be copied -directly from memory rather than reading the file again. The details of a -given dataset's chunks are controlled when creating the dataset, but it is -possible to adjust the behavior of the chunk *cache* when opening the file. - -The parameters controlling this behavior are prefixed by ``rdcc``, for *raw data -chunk cache*. They apply to all datasets unless specifically changed for each one. - -* ``rdcc_nbytes`` sets the total size (measured in bytes) of the raw data chunk - cache for each dataset. The default size is 1 MiB. - This should be set to the size of each chunk times the number of - chunks that are likely to be needed in cache. -* ``rdcc_w0`` sets the policy for chunks to be - removed from the cache when more space is needed. If the value is set to 0, - then the library will always evict the least recently used chunk in cache. If - the value is set to 1, the library will always evict the least recently used - chunk which has been fully read or written, and if none have been fully read - or written, it will evict the least recently used chunk. If the value is - between 0 and 1, the behavior will be a blend of the two. Therefore, if the - application will access the same data more than once, the value should be set - closer to 0, and if the application does not, the value should be set closer - to 1. -* ``rdcc_nslots`` is the number of chunk slots in - the cache for each dataset. In order to allow the chunks to be looked up - quickly in cache, each chunk is assigned a unique hash value that is used to - look up the chunk. The cache contains a simple array of pointers to chunks, - which is called a hash table. A chunk's hash value is simply the index into - the hash table of the pointer to that chunk. 
While the pointer at this - location might instead point to a different chunk or to nothing at all, no - other locations in the hash table can contain a pointer to the chunk in - question. Therefore, the library only has to check this one location in the - hash table to tell if a chunk is in cache or not. This also means that if two - or more chunks share the same hash value, then only one of those chunks can be - in the cache at the same time. When a chunk is brought into cache and another - chunk with the same hash value is already in cache, the second chunk must be - evicted first. Therefore it is very important to make sure that the size of - the hash table (which is determined by the ``rdcc_nslots`` parameter) is large - enough to minimize the number of hash value collisions. Due to the hashing - strategy, this value should ideally be a prime number. As a rule of thumb, - this value should be at least 10 times the number of chunks that can fit in - ``rdcc_nbytes`` bytes. For maximum performance, this value should be set - approximately 100 times that number of chunks. The default value is 521. - -Chunks and caching are described in greater detail in the `HDF5 documentation -`_. - -.. _file_alignment: - -Data alignment --------------- - -When creating datasets within files, it may be advantageous to align the offset -within the file itself. This can help optimize read and write times if the data -become aligned with the underlying hardware, or may help with parallelism with -MPI. Unfortunately, aligning small variables to large blocks can leave a lot of -empty space in a file. To this effect, application developers are left with two -options to tune the alignment of data within their file. The two variables -``alignment_threshold`` and ``alignment_interval`` in the :class:`File` -constructor help control the threshold in bytes where the data alignment policy -takes effect and the alignment in bytes within the file. The alignment is -measured from the end of the user block. - -For more information, see the official HDF5 documentation `H5P_SET_ALIGNMENT -`_. - -.. _file_meta_block_size: - -Meta block size ---------------- - -Space for metadata is allocated in blocks within the HDF5 file. The argument -``meta_block_size`` of the :class:`File` constructor sets the minimum size of -these blocks. Setting a large value can consolidate metadata into a small -number of regions. Setting a small value can reduce the overall file size, -especially in combination with the ``libver`` option. This controls how the -overall data and metadata are laid out within the file. - -For more information, see the official HDF5 documentation `H5P_SET_META_BLOCK_SIZE -`_. Reference --------- @@ -360,7 +103,7 @@ Reference .. note:: Unlike Python file objects, the attribute :attr:`File.name` gives the - HDF5 name of the root group, "``/``". To access the on-disk name, use + HDF5 name of the root group, "``/``". To access the domain name, use :attr:`File.filename`. .. class:: File(name, mode='r', driver=None, libver=None, userblock_size=None, \ @@ -369,13 +112,13 @@ Reference fs_page_size=None, page_buf_size=None, min_meta_keep=0, min_raw_keep=0, \ locking=None, alignment_threshold=1, alignment_interval=1, **kwds) - Open or create a new file. + Open or create a new HSDS domain. Note that in addition to the :class:`File`-specific methods and properties listed below, :class:`File` objects inherit the full interface of :class:`Group`. 
- :param name: Name of file (`bytes` or `str`), or an instance of + :param name: Name of domain (`str`), or an instance of :class:`h5f.FileID` to bind to an existing file identifier, or a file-like object (see :ref:`file_fileobj`). diff --git a/docs/index.rst b/docs/index.rst index 54645c95..c04b773a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,17 +4,19 @@ HDF5 for the Cloud The h5pyd package is a Python interface for accessing the Highly Scalable Data Service (HSDS) (https://www.hdfgroup.org/solutions/highly-scalable-data-service-hsds/). Whereas the HDF5 library enables complex scientific data to be stored in files, HSDS uses a cloud-native storage model that works well with -object storage systems such as AWS S3 and Azure Blob Storage (regular POSIX stores are also supported). +object storage systems such as AWS S3 and Azure Blob Storage (regular POSIX stores are also supported). As +a service, HSDS can be running on the same system as the application using h5pyd, or it can be running on a server +(e.g. running in an AWS data center co-located with S3 storage) while clients send requests over the network. The native interface for HSDS is the HDF REST API (https://github.com/HDFGroup/hdf-rest-api), but the h5pyd package -provides a convienent mode of access using the same API as the popular h5py package (https://docs.h5py.org/en/stable/index.html) +provides a convenient mode of access using the same API as the popular h5py package (https://docs.h5py.org/en/stable/index.html) that provides a Pythonic interface to the HDF5 library. In fact, many applications that use h5py can be -converted to using h5py, just by adding the statment ``import h5pyd as h5py``. +converted to using h5py, just by adding the statement ``import h5pyd as h5py``. However, not every h5py feature is supported in h5pyd (at least not yet!). For example Virtual Datasets are not supported. For a complete list see the: :ref:`FAQ `. In addition, there are some features that are supported, like Parallel HDF5, but work somewhat differently from the HDF5 library. And finally, there are features of h5pyd, that don't have any -correspondance to h5py, such as Folders. +correspondence to h5py, such as Folders. The h5pyd package also includes a set of command line tools, for doing common tasks such as uploading HDF5 files to HSDS. See: tbd for a description of the CLI tools. diff --git a/docs/quick.rst b/docs/quick.rst index 9331d6f9..0dc60d32 100644 --- a/docs/quick.rst +++ b/docs/quick.rst @@ -20,7 +20,7 @@ You can install ``h5pyd`` via pip:: pip install h5py -If you will be running your own HSDS, install the ``hsds`` package as well:: +If you will be running your own HSDS instance, install the ``hsds`` package as well:: pip install hsds @@ -45,12 +45,14 @@ When you are ready to shut down HSDS, just hit Ctlr-C in the terminal window whe h5pyd configuration ------------------- -Typically, http requests to HSDS need to authenticated (likely you wouldn't want just anyone messing around with your HDF data!). -HSDS supports several authentication protocols, but the simplest (if not most secure) to use when getting started is what is +Typically, http requests to HSDS need to authenticated (likely you don't want just anyone messing around with your HDF data!). +HSDS supports several authentication protocols, but the simplest (if not the most secure) to use when getting started is what is known as HTTP Basic Authentication. In this protocol, your HSDS username and password is encoded in the HTTP header of -each request. 
The h5pyd package will do this automatically, storing your credentials in the file ``~/.hscfg``. +each request. On receiving the request, HSDS will decode the autentication header and then compare with a list of valid +usernames/passwords stored in a text file. The h5pyd package will do this automatically, using your credentials stored +in the file ``~/.hscfg`` to add the authentication header to each HSDS request. -You can edit this file by hand, or use the hsconfigure tool included with h5pyd: +You can edit hscfg file by hand, or use the hsconfigure tool included with h5pyd as follows: * Start the app by running: ``hsconfigure`` in a terminal * You'll be prompted for a server endpoint. If running HSDS locally, enter: ``http://localhost:5101`` @@ -77,27 +79,45 @@ To make keeping track of everything easier, HSDS manages storage using three le * Folders live in buckets are work much like directories in a POSIX file system * Domains are the equivalent to HDF5 files -Buckets are setup by the HSDS administrator and will correspond to AWS S3 Buckets, Azure Blob Containers, or POSIX directories. +Buckets are the top-level of storage used by HSDS and will correspond to AWS S3 Buckets, Azure Blob Containers, or POSIX directories. Buckets can not be created using the h5pyd package (these need to be setup by the HSDS administrator), but the h5pyd File and Folder object have an optional bucket parameter to specify which bucket to access. Typically HSDS will be setup with a default bucket that will be used if no bucket name is given explicitly. -Folders can be created using h5pyd (or the hstouch CLI tool). Likewise Domains can be created using ``h5pyd.File`` or the -hstouch CLI tool, e.g. ``hstouch /home/$USER/myfile.h5``. +Folders can be created using h5pyd (or the hstouch CLI tool). To use hstouch to create a folder, run hstouch followed by +the path to the desired folder. E.g. ``hstouch /home/$USER/myfolder/``. In h5oyd, there no 'current folder' concept, +so the path must always start with ``/`` (or if desired, ``hdf5://`` to distinguish from a POSIX path). Also, to create +folder, the path must end in a slash. To view the contents of a folder, use the ``hsls`` tool. E.g.: +``hsls /home/$USER/myfolder/``. -As with HDF5 files, HSDS domains are containers for two kinds of objects: `datasets`, which are +Folder can contain sub-folders, but also domains (equivalent to an HDF5 file). As with HDF5 files, +HSDS domains are containers for two kinds of objects: `datasets`, which are array-like collections of data, and `groups`, which are folder-like containers that hold datasets and other groups. The most fundamental thing to remember -when using h5py(d) is: +when using h5pyd (as with h5py) is: **Groups work like dictionaries, and datasets work like NumPy arrays** -Suppose someone has sent you a HDF5 file, :code:`mytestfile.hdf5`. (To create this file, read `Appendix: Creating a file`_.) The very first thing you'll need to do is to open the file for reading:: +Domains can be created programmatically, or using the CLI tools. E.g. ``hstouch /home/$USER/myfolder/mytestfile.h5``. +To convert an HDF5 file to an HSDS domain, you can use the hscp command: ``hscp mytestfile.h5 /home/$USER/myfolder/``. - >>> import h5py - >>> f = h5py.File('mytestfile.hdf5', 'r') +A quick note on domain permissions: When you create a new domain, it will only be accessible using your +credentials. You can enable who else can access the domain using the hsacl tool. 
For example, to enable +other users to read a domain (but not modify it) use: ``hsacl /home/$USER/myfolder/mytest.h5 +r default``. +For details of using hsacl, see: tbd. + +To programmatically access a domain for reading for reading, use the h5pyd.File object:: + + >>> import h5pyd as h5py + >>> f = h5py.File('/home/test_user1/mytestfile.h5', 'r') # replace test_user1 with your user name + +The :ref:`File object ` is your starting point. If you are familiar with h5py, the rest of this section will be +exactly the same as to what you'd expect opening an HDF5 file. In you are not familiar with h5py, keep reading, but +keep in mind that this will apply to both h5py and h5pyd. + -The :ref:`File object ` is your starting point. What is stored in this file? Remember :py:class:`h5py.File` acts like a Python dictionary, thus we can check the keys, +What is stored in the domain? Remember :py:class:`h5pyd.File` +acts like a Python dictionary, thus we can check the keys, >>> list(f.keys()) ['mydataset'] @@ -128,11 +148,10 @@ from a dataset in the file:: For more, see :ref:`file` and :ref:`dataset`. -Appendix: Creating a file -+++++++++++++++++++++++++ +Creating a domain programmatically +++++++++++++++++++++++++++++++++++ -At this point, you may wonder how :code:`mytestdata.hdf5` is created. -We can create a file by setting the :code:`mode` to :code:`w` when +You can create a domain by setting the :code:`mode` to :code:`w` when the File object is initialized. Some other modes are :code:`a` (for read/write/create access), and :code:`r+` (for read/write access). @@ -140,7 +159,7 @@ A full list of file access modes and their meanings is at :ref:`file`. :: >>> import h5py >>> import numpy as np - >>> f = h5py.File("mytestfile.hdf5", "w") + >>> f = h5py.File("/home/test_user1/myfolder/mytestfile.hdf5", "w") The :ref:`File object ` has a couple of methods which look interesting. One of them is ``create_dataset``, which as the name suggests, creates a data set of given shape and dtype :: @@ -173,7 +192,7 @@ created is itself a group, in this case the `root group`, named ``/``: Creating a subgroup is accomplished via the aptly-named ``create_group``. But we need to open the file in the "append" mode first (Read/write if exists, create otherwise) :: - >>> f = h5py.File('mydataset.hdf5', 'a') + >>> f = h5py.File('/home/test_user1/myfolder/mydataset.h5', 'a') >>> grp = f.create_group("subgroup") All ``Group`` objects also have the ``create_*`` methods like File:: diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index c9d4be5a..4bfeb0db 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -198,7 +198,11 @@ def attrs(self): @property def filename(self): """File name on disk""" - return self.id.http_conn.domain + if self.id.http_conn: + filename = self.id.http_conn.domain + else: + filename = None + return filename @property def driver(self): @@ -207,7 +211,11 @@ def driver(self): @property def mode(self): """Python mode used to open file""" - return self.id.http_conn.mode + if self.id.http_conn: + mode = self.id.http_conn.mode + else: + mode = None + return mode @property def fid(self): @@ -329,19 +337,23 @@ def __init__( """ groupid = None dn_ids = [] + root_json = None + cfg = config.get_config() # pulls in state from a .hscfg file (if found). + + if mode and mode not in ("r", "r+", "w", "w-", "x", "a"): + raise ValueError("Invalid mode; must be one of r, r+, w, w-, x, a") + + if mode is None: + mode = "r" + # if we're passed a GroupId as domain, just initialize the file object # with that. 
This will be faster and enable the File object to share the same http connection. - no_endpoint_info = endpoint is None and username is None and password is None - if (mode is None and no_endpoint_info and isinstance(domain, GroupID)): + # no_endpoint_info = endpoint is None and username is None and password is None + if isinstance(domain, GroupID): groupid = domain else: - if mode and mode not in ("r", "r+", "w", "w-", "x", "a"): - raise ValueError("Invalid mode; must be one of r, r+, w, w-, x, a") - - if mode is None: - mode = "r" - - cfg = config.get_config() # pulls in state from a .hscfg file (if found). + if not isinstance(domain, str): + raise IOError(400, "expected a str or GroupID object for domain") # accept domain values in the form: # http://server:port/home/user/myfile.h5 @@ -354,7 +366,7 @@ def __init__( # # For http prefixed values, extract the endpont and use the rest as domain path for protocol in ("http://", "https://", "hdf5://", "http+unix://"): - if domain and domain.startswith(protocol): + if isinstance(domain, str) and domain.startswith(protocol): if protocol.startswith("http"): domain = domain[len(protocol):] # extract the endpoint @@ -378,9 +390,8 @@ def __init__( if domain[0] != "/": raise IOError(400, "relative paths are not valid") - if endpoint is None: - if "hs_endpoint" in cfg: - endpoint = cfg["hs_endpoint"] + if endpoint is None and "hs_endpoint" in cfg: + endpoint = cfg["hs_endpoint"] # remove the trailing slash on endpoint if it exists if endpoint and endpoint.endswith('/'): @@ -421,8 +432,6 @@ def __init__( timeout=timeout, ) - root_json = None - # try to do a GET from the domain req = "/" params = {"getdnids": 1} # return dn ids if available @@ -499,20 +508,8 @@ def __init__( http_conn.close() raise IOError(404, "Unexpected error") - if "dn_ids" in root_json: - dn_ids = root_json["dn_ids"] - root_uuid = root_json["root"] - if "limits" in root_json: - self._limits = root_json["limits"] - else: - self._limits = None - if "version" in root_json: - self._version = root_json["version"] - else: - self._version = None - if mode == "a": # for append, verify we have 'update' permission on the domain # try first with getting the acl for the current user, then as default @@ -563,6 +560,21 @@ def __init__( self._dn_ids = dn_ids self._swmr_mode = swmr + if not root_json: + root_json = self.id.obj_json + if root_json: + if "dn_ids" in root_json: + dn_ids = root_json["dn_ids"] + + if "limits" in root_json: + self._limits = root_json["limits"] + else: + self._limits = None + if "version" in root_json: + self._version = root_json["version"] + else: + self._version = None + Group.__init__(self, self._id, track_order=track_order) def _getVerboseInfo(self): diff --git a/test/hl/test_file.py b/test/hl/test_file.py index a8f67605..b16ebbe1 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -207,6 +207,33 @@ def test_create(self): f.close() self.assertEqual(f.id.id, 0) + def test_file_clone(self): + # verify you can create a File object based on an existing reference + filename = self.getFileName("file_clone") + print("filename:", filename) + + f = h5py.File(filename, 'w') + self.assertEqual(f.filename, filename) + self.assertEqual(f.name, "/") + self.assertTrue(f.id.id is not None) + self.assertEqual(len(f.keys()), 0) + self.assertEqual(f.mode, 'r+') + self.assertTrue(h5py.is_hdf5(filename)) + + f.create_group("g1") + self.assertTrue("g1" in f) + + # get a new file instance using a File object + g = h5py.File(f.id) + self.assertEqual(g.filename, f.filename) + 
self.assertEqual(g.id.id, f.id.id) + self.assertTrue("g1" in g) + # print("f version:", f._version) + # print("g version:", g._version) + + f.close() + g.close() + def test_open_notfound(self): # verify open of non-existent file throws exception From 3913b733bf0d882b35951a338953c8010fca31b0 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 31 Dec 2024 10:55:41 +0800 Subject: [PATCH 03/32] fix flake8 errors --- docs/conf.py | 114 +++++++++++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index dab4ab26..2acefc59 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# h5pyd documentation build configuration file, +# h5pyd documentation build configuration file, # # This file is execfile()d with the current directory set to its # containing dir. @@ -17,20 +17,20 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.intersphinx', - 'sphinx.ext.extlinks', - 'sphinx.ext.mathjax', + 'sphinx.ext.intersphinx', + 'sphinx.ext.extlinks', + 'sphinx.ext.mathjax', ] intersphinx_mapping = {'low': ('https://api.h5py.org', None)} @@ -47,7 +47,7 @@ source_suffix = '.rst' # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' @@ -67,13 +67,13 @@ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -81,27 +81,27 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. 
-#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- @@ -113,26 +113,26 @@ # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -142,48 +142,48 @@ # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'h5pyddoc' @@ -192,43 +192,43 @@ # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', + # The paper size ('letterpaper' or 'a4paper'). 
+ # 'papersize': 'letterpaper', -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'h5pyd.tex', 'h5pyd Documentation', - 'The HDF Group', 'manual'), + ('index', 'h5pyd.tex', 'h5pyd Documentation', + 'The HDF Group', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -241,7 +241,7 @@ ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -250,19 +250,19 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'h5pyd', 'h5pyd Documentation', - 'The HDF Group', 'h5pyd', 'One line description of project.', - 'Miscellaneous'), + ('index', 'h5pyd', 'h5pyd Documentation', + 'The HDF Group', 'h5pyd', 'One line description of project.', + 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False From daa41b498ba9a030efd93611bf90e7a517f08008 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 31 Dec 2024 11:44:51 +0800 Subject: [PATCH 04/32] updated index --- docs/high/file.rst | 81 +++----------------------------------------- docs/index.rst | 2 -- test/hl/test_file.py | 4 +-- 3 files changed, 7 insertions(+), 80 deletions(-) diff --git a/docs/high/file.rst b/docs/high/file.rst index 250a3de9..96c0a264 100644 --- a/docs/high/file.rst +++ b/docs/high/file.rst @@ -106,11 +106,7 @@ Reference HDF5 name of the root group, "``/``". To access the domain name, use :attr:`File.filename`. -.. class:: File(name, mode='r', driver=None, libver=None, userblock_size=None, \ - swmr=False, rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, \ - track_order=None, fs_strategy=None, fs_persist=False, fs_threshold=1, \ - fs_page_size=None, page_buf_size=None, min_meta_keep=0, min_raw_keep=0, \ - locking=None, alignment_threshold=1, alignment_interval=1, **kwds) +.. class:: File(name, mode='r', swmr=False, track_order=None) Open or create a new HSDS domain. 
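As a quick illustration of the reduced constructor above, here is a minimal sketch of opening a domain for read/write; the domain path is a hypothetical example and the endpoint and credentials are assumed to come from ``~/.hscfg``:

.. code-block::

    import h5pyd

    # minimal sketch: open (or create) a domain with the simplified constructor;
    # '/home/test_user1/scratch.h5' is a hypothetical domain path
    with h5pyd.File('/home/test_user1/scratch.h5', mode='a', track_order=True) as f:
        grp = f.create_group('readings')   # File inherits the full Group interface
        print(f.filename, f.mode)

..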
@@ -120,70 +116,15 @@ Reference :param name: Name of domain (`str`), or an instance of :class:`h5f.FileID` to bind to an existing - file identifier, or a file-like object - (see :ref:`file_fileobj`). + file identifier. :param mode: Mode in which to open file; one of ("w", "r", "r+", "a", "w-"). See :ref:`file_open`. - :param driver: File driver to use; see :ref:`file_driver`. - :param libver: Compatibility bounds; see :ref:`file_version`. - :param userblock_size: Size (in bytes) of the user block. If nonzero, - must be a power of 2 and at least 512. See - :ref:`file_userblock`. :param swmr: If ``True`` open the file in single-writer-multiple-reader mode. Only used when mode="r". - :param rdcc_nbytes: Total size of the raw data chunk cache in bytes. The - default size is :math:`1024^2` (1 MiB) per dataset. - :param rdcc_w0: Chunk preemption policy for all datasets. Default value is - 0.75. - :param rdcc_nslots: Number of chunk slots in the raw data chunk cache for - this file. Default value is 521. :param track_order: Track dataset/group/attribute creation order under root group if ``True``. Default is ``h5.get_config().track_order``. - :param fs_strategy: The file space handling strategy to be used. - Only allowed when creating a new file. One of "fsm", "page", - "aggregate", "none", or ``None`` (to use the HDF5 default). - :param fs_persist: A boolean to indicate whether free space should be - persistent or not. Only allowed when creating a new file. The - default is False. - :param fs_page_size: File space page size in bytes. Only use when - fs_strategy="page". If ``None`` use the HDF5 default (4096 bytes). - :param fs_threshold: The smallest free-space section size that the free - space manager will track. Only allowed when creating a new file. - The default is 1. - :param page_buf_size: Page buffer size in bytes. Only allowed for HDF5 files - created with fs_strategy="page". Must be a power of two value and - greater or equal than the file space page size when creating the - file. It is not used by default. - :param min_meta_keep: Minimum percentage of metadata to keep in the page - buffer before allowing pages containing metadata to be evicted. - Applicable only if ``page_buf_size`` is set. Default value is zero. - :param min_raw_keep: Minimum percentage of raw data to keep in the page - buffer before allowing pages containing raw data to be evicted. - Applicable only if ``page_buf_size`` is set. Default value is zero. - :param locking: The file locking behavior. One of: - - - False (or "false") -- Disable file locking - - True (or "true") -- Enable file locking - - "best-effort" -- Enable file locking but ignore some errors - - None -- Use HDF5 defaults - - .. warning:: - - The HDF5_USE_FILE_LOCKING environment variable can override - this parameter. - - Only available with HDF5 >= 1.12.1 or 1.10.x >= 1.10.7. - :param alignment_threshold: Together with ``alignment_interval``, this - property ensures that any file object greater than or equal - in size to the alignment threshold (in bytes) will be - aligned on an address which is a multiple of alignment interval. - :param alignment_interval: This property should be used in conjunction with - ``alignment_threshold``. See the description above. For more - details, see :ref:`file_alignment`. - :param meta_block_size: Determines the current minimum size, in bytes, of - new metadata block allocations. See :ref:`file_meta_block_size`. - :param kwds: Driver-specific keywords; see :ref:`file_driver`. + .. 
method:: __bool__() @@ -224,20 +165,8 @@ Reference True if the file access is using :doc:`/swmr`. Use :attr:`mode` to distinguish SWMR read from write. - .. attribute:: driver - String giving the driver used to open the file. Refer to - :ref:`file_driver` for a list of drivers. + .. attribute:: version - .. attribute:: libver + HSDS version string - 2-tuple with library version settings. See :ref:`file_version`. - - .. attribute:: userblock_size - - Size of user block (in bytes). Generally 0. See :ref:`file_userblock`. - - .. attribute:: meta_block_size - - Minimum size, in bytes, of metadata block allocations. Default: 2048. - See :ref:`file_meta_block_size`. diff --git a/docs/index.rst b/docs/index.rst index c04b773a..e9980f20 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,7 +25,6 @@ Where to start -------------- * :ref:`Quick-start guide ` -* :ref:`Installation ` Other resources @@ -44,7 +43,6 @@ Introductory info :maxdepth: 1 quick - build High-level API reference diff --git a/test/hl/test_file.py b/test/hl/test_file.py index b16ebbe1..f5bf2c7f 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -228,8 +228,8 @@ def test_file_clone(self): self.assertEqual(g.filename, f.filename) self.assertEqual(g.id.id, f.id.id) self.assertTrue("g1" in g) - # print("f version:", f._version) - # print("g version:", g._version) + print("f version:", f._version) + print("g version:", g._version) f.close() g.close() From f4da4560615f9877539b3c3fa617dd6c3ec1c09a Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 31 Dec 2024 12:07:25 +0800 Subject: [PATCH 05/32] support file attributes when cloning --- h5pyd/_hl/files.py | 36 ++++++++++++++++++++++++------------ test/hl/test_file.py | 3 +-- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index 4bfeb0db..2859799e 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -561,19 +561,31 @@ def __init__( self._swmr_mode = swmr if not root_json: - root_json = self.id.obj_json - if root_json: - if "dn_ids" in root_json: - dn_ids = root_json["dn_ids"] + # fetch the root_json + req = "/" + params = {"getdnids": 1} # return dn ids if available - if "limits" in root_json: - self._limits = root_json["limits"] - else: - self._limits = None - if "version" in root_json: - self._version = root_json["version"] - else: - self._version = None + if use_cache and mode == "r": + params["getobjs"] = "T" + params["include_attrs"] = "T" + if bucket: + params["bucket"] = bucket + rsp = self.id.http_conn.GET(req, params=params) + if rsp.status_code != 200: + raise IOError(rsp.status_code, rsp.reason) + root_json = json.loads(rsp.text) + + if "dn_ids" in root_json: + dn_ids = root_json["dn_ids"] + + if "limits" in root_json: + self._limits = root_json["limits"] + else: + self._limits = None + if "version" in root_json: + self._version = root_json["version"] + else: + self._version = None Group.__init__(self, self._id, track_order=track_order) diff --git a/test/hl/test_file.py b/test/hl/test_file.py index f5bf2c7f..cfe5e7f0 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -228,8 +228,7 @@ def test_file_clone(self): self.assertEqual(g.filename, f.filename) self.assertEqual(g.id.id, f.id.id) self.assertTrue("g1" in g) - print("f version:", f._version) - print("g version:", g._version) + self.assertEqual(f._version, g._version) f.close() g.close() From ff571516fc2c397e25b3b9bf6a5a518d48c8751e Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 2 Jan 2025 14:27:51 +0800 Subject: [PATCH 
06/32] added docs for File --- docs/high/file.rst | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/docs/high/file.rst b/docs/high/file.rst index 96c0a264..03289725 100644 --- a/docs/high/file.rst +++ b/docs/high/file.rst @@ -43,7 +43,6 @@ Unsupported options The following options are used with h5py.File, but are not supported with h5pyd: * driver -* libver * userblock_size * rdcc_nbytes * rdcc_w0 @@ -119,11 +118,41 @@ Reference file identifier. :param mode: Mode in which to open file; one of ("w", "r", "r+", "a", "w-"). See :ref:`file_open`. - :param swmr: If ``True`` open the file in single-writer-multiple-reader + :param endpoint: HSDS http endpoint. If None, the endpoint given by HS_ENDPOINT environment + variable will be used if set. Otherwise, the endpoint given in the + .hscfg file will be used + :param username: HSDS username. If None, the username given by the HS_USERNAME environment + variable will be used if set. Otherwise, the username given in the + .hscfg file will be used + :param password: HSDS password. If None, the password given by the HS_PASSWORD environment + variable will be used if set. Otherwise, the password given in the + .hscfg file will be used + :param bucket: Name of bucket the domain is expected to be found in. If None, the + default HSDS bucket name will be used + :param api_key: API key (e.g. a JSON Web Token) to use for authentication. If provided, + username and password parameters will be ignored + :param session: Keep http connection alive between requests (more efficient than + re-creating the connection on each request) + :param use_cache: Save domain state locally rather than fetching needed state from HSDS + as needed. Set use_cache to False when opening a domain if you expect + other clients to be modifying domain metadata (e.g. adding links or attributes). + :param swmr: If ``True`` open the file in single-writer-multiple-reader. Has the same + effect as setting use_cache to False. mode. Only used when mode="r". + :param libver: For compatibility with h5py - library version bounds. Has no effect other + than returning given value as a property. + :param owner: For new domains, the owner username to be used for the domain. Can only be + set if username is an HSDS admin user. If owner is None, username will be + assigned as owner. + :param linked_domain: For new domain, use the root object of the linked_domain. + :param logger: Logger object ot be used for logging. :param track_order: Track dataset/group/attribute creation order under root group if ``True``. Default is ``h5.get_config().track_order``. + :param retries: Number of retries to use if an http request fails + (e.g. on a 503 Service Unavailable response). + :param timeout: Number of seconds to wait on a http response before failing. + .. method:: __bool__() From 9ae650e49698c2fb822cd9168acf9a1e2de2ae73 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 6 Jan 2025 14:56:25 +0800 Subject: [PATCH 07/32] update to File docs --- docs/high/file.rst | 428 ++++++++++++++++++++++++++++++++++++++------- docs/quick.rst | 6 +- h5pyd/_hl/files.py | 6 +- 3 files changed, 369 insertions(+), 71 deletions(-) diff --git a/docs/high/file.rst b/docs/high/file.rst index 03289725..0615e0b6 100644 --- a/docs/high/file.rst +++ b/docs/high/file.rst @@ -5,37 +5,270 @@ File Objects ============ -File objects serve as your entry point into the world of HDF5. In addition +File objects serve as your entry point into the world of HDF5. 
While an h5py file object +corresponds to a POSIX file, in h5pyd a file object represents a HSDS ``domain`` . +Like HDF5 files, an HSDS domain is a hierarchical collection of groups and datasets. +Unlike an HDF5 file though, the storage for a domain is managed by HSDS and clients may +not have direct access to the storage medium (e.g. an S3 bucket in which the user does not +have authorization to access directly). + +For the most part you work with file objects in h5pyd in the same manner as you would with h5py. +The primary difference being that while you provide a file path in the h5py file constructor, you +use a domain path in h5pyd (see "Opening & Creating domains" below). + +In addition to the File-specific capabilities listed here, every File instance is -also an :ref:`HDF5 group ` representing the `root group` of the file. +also an :ref:`HDF5 group ` representing the `root group` of the domain. -Note: Python "File-like" objects are not supported. .. _file_open: Opening & creating domains -------------------------- -HSDS domains work generally like standard Python file objects. They support +File objects in h5pyd work generally like standard Python file objects. They support standard modes like r/w/a, and should be closed when they are no longer in use. However, there is obviously no concept of "text" vs "binary" mode. - >>> f = h5py.File('myfile.hdf5','r') +Domains are identified by a sequence of folder names and finally the domain name +all delimitated by ``/`` characters. +While the result looks like a standard POSIX path, +the path is only relevant to the particular HSDS instance you are connecting to. +You may prefer to use the optional ``hdf5://`` prefix as a reminder that the path +is not actually referencing a POSIX file. -The file name may be a string (i.e. Python 3 unicode string). Valid modes are: + >>> f = h5pyd.File('/home/test_user1/mydomain.h5', 'r') + >>> f = h5pyd.File('hdf5:://home/test_user1/mydomain.h5', 'r') # equivalent + +The first argument is the path to the domain. The path must be a string (i.e. Python 3 unicode string) and +must be an absolute path (starting with '/' or 'hdf5://'). If you are unsure about what domains are present, +you can use the ``hsls`` utility to list the contents of a folder. E.g. ``$ hsls /home/test_user1/``. + +Note: Python "File-like" objects are not supported as the domain path. + +Domains live in buckets, and if a bucket name is not provided, the default bucket that has been +configured in the HSDS instance will be used. To explicitly give a bucket name, use the bucket parameter: + + >>> f = h5py.File('/home/test_user1/mydomain.h5','r', bucket='mybucket') + +The second argument is the domain access mode. +Valid modes are: ======== ================================================ - r Readonly, file must exist (default) - r+ Read/write, file must exist - w Create file, truncate if exists - w- or x Create file, fail if exists + r Readonly, domain must exist (default) + r+ Read/write, domain must exist + w Create domain, delete existing domain if found + w- or x Create domain, fail if exists a Read/write if exists, create otherwise ======== ================================================ - Files are opened read-only by default. So the file mode parameter is - only required for one of the writable modes. +Domains are opened read-only by default. So the file mode parameter is +only required for one of the writable modes. + +Note: that unlike with h5py and the HDF5 library, there's no concept of file locking. 
The +same domain can be opened multiple times in the same or different thread, or even on a +different machine. Multiple clients can access the same domain for modification, but this won't +result in the domain becoming corrupted (though nothing in HSDS guards against clients over-writing +each others updates). + +Whatever the mode used, if the domain is not configured to authorize the desired action, a +``403 - Forbidden`` error will be raised. See the next sections on authentication and authorization. + +In addition to instantiating a file object with a domain path, you can pass the low level id of an +existing file object. The two file objects will share the root group, but methods (e.g. flush) +can be invoked independently. + + >>> f = h5pyd.File('/home/test_user1/mydomain.h5', 'w') # create a new domain + >>> g = h5pyd.File(f.id, "r") # this handle can only be used for reading + >>> f.close() # close f, g is still open + >>> g.filename # returns '/home/test_user1/mydomain.h5' + +.. _file_authentication: + +Authentication +-------------- + +In most cases HSDS will reject requests that don't provide some form of authentication. +The HSDS username and password can be supplied by using the ``username`` and ``password`` +arguments. In addition the desired HSDS endpoint can be specified using ``endpoint``. +For example: + +.. code-block:: + + path = "hdf5://home/test_user1/mydomain.h5" + username = "test_user1" + password = "12345" + endpoint = "http://hsds.hdf.test" + f = h5pyd.File(path, 'r', username=username, password=password, endpoint=endpoint) + +.. + +The username, password, and endpoint provided will be stored with the File object and used to +authenticate any requests sent to HSDS for operations on this domain. If the username +and password given are invalid, a ``401 - Unauthorized`` error will be raised. + +Of course it's not best practice to hardcode usernames and passwords, so alternatively the environment variables +``HS_USERNAME``, ``HS_PASSWORD``, and ``HS_ENDPOINT`` can be used to store the user credentials and endpoint. If +username, password, and endpoint arguments are not provided, the respective environment variables will be used +if set. + +If neither named parameters or environment variables are supplied, this information will be read from +the file ``.hscfg`` in the users home directory. The ``.hscfg`` can be created using the ``hsconfigure`` +utility (see: tbd). + +Finally, if no credentials are found using any of these methods, anonymous requests (http requests that don't include +an authentication header) will be used. +Depending on the permission settings of the domain and whether the HSDS instance has been configured to allow +anonymous requests, this will allow read-only actions on the domain. + +.. _file_authorization: + +Authorization +------------- + +HSDS uses the concept of ``Access Control Lists (ACLs)`` to determine what actions a given user can perform on a domain. +A domain can have one or more ACLs associated with it. Each ACL consist of the following fields: + +* user (string) - user or group name (a group is a set of users) +* create (T/F) - permission to create new objects (domains, groups, datasets, etc.) +* read (T/F) - permission to read data and metadata (e.g. list links in a group) +* update (T/F) - permission to modify metadata or dataset data +* delete (T/F) - permission to delete objects (including the domain itself) +* readACL (T/F) - permission to view permissions (i.e. 
read the list of ACLs for a domain) +* updateACL (T/F) - permission to add, delete, or modify ACLs + +When HSDS receives a request, it will determine what type of action is requiring (read, update, delete, etc.), and +then review the ACLs for the domain to determine if the action is authorized. If there is an ACL for the particular +user making the request, then the relevant flag for that ACL will be used. Otherwise, if there is a group ACL which +authorizes the request and the user is a member of that group, the request will be authorized. There is a special +group name: ``default`` that includes all users. In any case, if no authorizing ACL is found, +a `403 - Forbidden`` error will be raised. + +When a new domain is created (e.g. by using h5pyd.File with the `w` mode), typically it will have one ACL that gives +the owner of the domain (the authenticated user unless the 'owner' argument is given) full control. Other users would not have +permissions to even read the domain. These permissions can be adjusted, or new ACLs added programmatically (using tbd), +or using the ``hsacl`` tool (see: tbd). + +Folders (every domain lives in specific folder) also have ACLs. To create a new domain, the authenticating user +needs to have create permissions for the domain's folder. + +Finally, there are special users that can be configured in HSDS known as ``admin`` users. Admin users can perform any +action regardless of the ACLs. With great power comes great responsibility, so it's best practice to only use +admin credentials when there's no alternative (e.g. you accidentally removed permissions for a domain you own). + + +.. _file_cache: + +Caching +------- + +When a domain is open for reading, h5pyd will by default, cache certain metadata from the domain +(e.g. links in a group), so that it doesn't +have to repeatedly request information from the HSDS instance associated with the domain. This is good for performance +(requests to HSDS generally have higher latency than reading from a file), but in cases where the domain is being actively modified, +it may not be what you want. For example, suppose a sensor of some sort was setup so that readings from the previous time +period was appended to a dataset every second. By default, h5pyd won't know to check that the dataset shape has +been modified, so a program written to plot real-time readings wouldn't see any updates. +To avoid this, setting ``use_swmr`` to True will instruct h5pyd to not cache any data, so +any operation will fetch the current data from HSDS. See: (tbd) for more details. + +.. _file_flush: + +Flushing +-------- + +For performance reasons, HSDS will not immediately write updates to a domain while processing +the request that made the update. +Rather, the modifications will live in a server-side memory cache of "dirty" objects. +These objects will get written to storage periodically (every one second by default). +This is very similar in concept to how writes to a POSIX file don't immediately +get written to disk, but will be managed by the file controller. +With h5pyd, if HSDS unfortunately crashed just after processing a series of +PUT or POST requests, these changes would not get published to the storage device and as a result be lost. + +If you need to make absolutely certain that recent updates have been persisted, use the flush method. This call +won't return until HSDS has verified that all pending updates have been written to permanent storage. + + +.. 
_file_closing: + +Closing domains +--------------- + +Objects in HSDS are stateless - i.e. at the level of the REST interface, the server doesn't +utilize any session information in responding to requests. So an "open" vs. "closed" +domain is a concept that only applies at the client level. The h5pyd file object +does use the close method to do some internal housekeeping however. For example, closing +the http connection with the HSDS. So invoking close on h5pyd file object is good best practice, +but not a critical as with h5py. + +The close method will be invoked automatically when you leave the ``with h5py.File(...)`` block. + +The close method does have an optional parameter not found in h5yd: ``flush``. +See See :ref:`file_flush` . + + +.. _file_delete: + + +Deleting Domains +---------------- + +With h5py and the HDF5 library you would normally delete HDF5 files using your systems file browser, or the "rm" +command. Programmatically you could delete a HDF5 file using the standard Python Path.unlink method. +Neither of these options are possible with HSDS domains, but the ``hsrm`` (see: tbd) command is included with +h5pyd and works like the standard ``rm`` command with domain paths used instead of file paths. + +Programmatically, you can delete domains using the del method of the folder object (see: tbd). + +.. _file_summary: + +Summary data +------------ + +Due to the way in which domains are stored, certain information about the domain would be unfeasible to +determine on demand. For example to compute the total amount of storage used would require summing the size +of each piece of object metadata and each dataset chunk, which for large domains could require fetching +attributes for millions of objects. So for these properties, the server runs asynchronous tasks to compile +summary information about the domain. + +The impact of this is that some properties of the file object will only be reflect the +state of the domain as of the last time HSDS ran this asynchronous task (typically a few seconds to a minute +after the last update to the domain). + +Properties for which this applies are: + +* num_objects +* num_datatypes +* num_groups +* num_datasets +* num_linked_chunks +* total_size +* metadata_bytes +* linked_bytes +* allocated_bytes +* md5_sum + +The last_scan property returns the timestamp at which the scan was run. You can use this property to determine when +HSDS has updated the summary data for a domain. The following illustrates how to get summary data +for a recent update: + +.. code-block:: + + time_stamp = f.last_scan # get the last scan time + f.create_group("g1") # create a new group + while f.last_scan == time_stamp: + time.sleep(0.1) # wait for summary data to be updated + # print affected summary properties + print("num_groups:", num_groups) + print("num_objects:", num_objects) + print("metadata_bytes:", metadata_bytes) + print("total_size:", total_size) + +.. + -.. _file_driver: +.. _file_unsupported: Unsupported options ------------------- @@ -61,39 +294,6 @@ The following options are used with h5py.File, but are not supported with h5pyd: For the most part these relate to concepts that don't apply to HSDS, so are not included. -.. _file_closing: - -Closing files -------------- - -If you call :meth:`File.close`, or leave a ``with h5py.File(...)`` block, -the file will be closed and any objects (such as groups or datasets) you have -from that file will become unusable. This is equivalent to what HDF5 calls -'strong' closing. 
- -If a file object goes out of scope in your Python code, the file will only -be closed when there are no remaining objects belonging to it. This is what -HDF5 calls 'weak' closing. - -.. code-block:: - - with h5py.File('/a_folder/f1.h5', 'r') as f1: - ds = f1['dataset'] - - # ERROR - can't access dataset, because f1 is closed: - ds[0] - - def get_dataset(): - f2 = h5py.File('f2.h5', 'r') - return f2['dataset'] - ds = get_dataset() - - # OK - f2 is out of scope, but the dataset reference keeps it open: - ds[0] - - del ds # Now f2.h5 will be closed - -.. Reference @@ -114,9 +314,9 @@ Reference :class:`Group`. :param name: Name of domain (`str`), or an instance of - :class:`h5f.FileID` to bind to an existing - file identifier. - :param mode: Mode in which to open file; one of + :class:`Group.id` to bind to an existing + domain identifier. + :param mode: Mode in which to open the domain; one of ("w", "r", "r+", "a", "w-"). See :ref:`file_open`. :param endpoint: HSDS http endpoint. If None, the endpoint given by HS_ENDPOINT environment variable will be used if set. Otherwise, the endpoint given in the @@ -136,7 +336,7 @@ Reference :param use_cache: Save domain state locally rather than fetching needed state from HSDS as needed. Set use_cache to False when opening a domain if you expect other clients to be modifying domain metadata (e.g. adding links or attributes). - :param swmr: If ``True`` open the file in single-writer-multiple-reader. Has the same + :param swmr: If ``True`` open the domain in single-writer-multiple-reader. Has the same effect as setting use_cache to False. mode. Only used when mode="r". :param libver: For compatibility with h5py - library version bounds. Has no effect other @@ -145,7 +345,7 @@ Reference set if username is an HSDS admin user. If owner is None, username will be assigned as owner. :param linked_domain: For new domain, use the root object of the linked_domain. - :param logger: Logger object ot be used for logging. + :param logger: Logger object to be used for logging. :param track_order: Track dataset/group/attribute creation order under root group if ``True``. Default is ``h5.get_config().track_order``. @@ -157,45 +357,143 @@ Reference .. method:: __bool__() - Check that the file descriptor is valid and the file open: + Check that the file descriptor is valid and the domain is open: - >>> f = h5py.File(filename) + >>> f = h5pyd.File(domainpath) >>> f.close() >>> if f: - ... print("file is open") + ... print("domain is open") ... else: - ... print("file is closed") - file is closed + ... print("domain is closed") + domain is closed - .. method:: close() + .. method:: close(flush=False) - Close this file. All open objects will become invalid. + Close this domain. All open objects will become invalid. If flush is True, will + invoke a flush operation before closing the domain. .. method:: flush() - Request that the HDF5 library flush its buffers to disk. + Request that HSDS persist any recent updates to permanent storage + + .. method:: getACLs() + + Return a list of ACLs associated with the domain. See: tbd + + .. method:: getACL(username) + + Returns the ACL for the given user or group name. Raises a ``401 - Not Found`` error + if no ACL with that name exists + + .. method:: run_scan() + + Force a re-compilation of summary data (see tbd). Requires write intent on the domain .. attribute:: id - Low-level identifier (an instance of :class:`FileID `). + Low-level identifier (an instance of :class:`GroupID`). .. 
attribute:: filename - Name of this file on disk, as a Unicode string. + Path to the domain, as a Unicode string. .. attribute:: mode - String indicating if the file is open readonly ("r") or read-write + String indicating if the domain is open readonly ("r") or read-write ("r+"). Will always be one of these two values, regardless of the - mode used to open the file. + mode used to open the domain. .. attribute:: swmr_mode - True if the file access is using :doc:`/swmr`. Use :attr:`mode` to + True if the domain access is using :doc:`/swmr`. Use :attr:`mode` to distinguish SWMR read from write. + .. attribute:: libver - .. attribute:: version + Compatibility place holder for HDF5 library version. + + .. attribute:: driver + + Compatibility place holder for HDF5 file driver. Returns: ``rest_driver``. + + .. attribute:: serverver HSDS version string + .. attribute:: userblock_size + + Compatibility place holder. Always returns 0. + + .. attribute:: created + + Time (in seconds since epoch) that the domain was created. + + .. attribute:: modified + + Time (in seconds since epoch) the the domain was last modified + + .. attribute:: owner + + Name of user who created the domain + + .. attribute:: num_objects + + Number of objects (groups, datases, named datatypes) that are in the domain + + .. attribute:: num_datatypes + + Number of named datatypes in the domain + + .. attribute:: num_datasets + + Number of datasets in the domain + + .. attribute:: num_groups + + Number of groups in the domain + + .. attribute:: num_chunks + + Number of chunks (sum of number of chunks for each dataset) in the domain + + .. attribute:: num_linked_chunks + + Number of linked chunks (chunks that reference HDF5 file chunks) in the domain + + .. attribute:: allocated_bytes + + Number of bytes that have been allocated (i.e. the sum of the size of each chunk that has + been created) for the domain + + .. attribute:: metadata_bytes + + Number of bytes that been used for metadata (object properties, links, attributes, etc.) in + the domain + + .. attribute:: linked_bytes + + Number of bytes contained in chunks that links to HDF5 file chunks + + .. attribute:: total_size + + Total amount of storage used for metadata, chunk data, and linked chunks in the domain + + .. attribute:: md5_sum + + MD5 checksum for domain - a 32 character hexadecimal string. Will change whenever any metadata + or dataset data is modified + + .. attribute:: last_scan + + Time (in seconds since epoch) that the last domain scan was performed + + .. attribute:: limits + + Server defined limits. Currently returns a dictionary with the keys + ``min_chunk_size``, ``max_chunk_size``, and ``max_request_size``. + + .. attribute:: compressors + + Compression filters supported by HSDS. See: tbd + + diff --git a/docs/quick.rst b/docs/quick.rst index 0dc60d32..f3fb015b 100644 --- a/docs/quick.rst +++ b/docs/quick.rst @@ -6,10 +6,10 @@ Quick Start Guide Github Codespaces ----------------- -The quickest way to get started with h5pyd and HSDS is to use Github Codespaces. You can launch A -codespace the incudes h5pyd and HSDS by clicking here: . +The quickest way to get started with h5pyd and HSDS is to use Github Codespaces. You can launch a +codespace session that incudes h5pyd and HSDS by clicking here: . Try out some of the included examples and Python notebooks to get familiar with the features -offered by the package. +offered by h5pyd and HSDS. Read on to run h5pyd on your laptop or desktop system... 
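As a preview of what a local setup lets you do, the sketch below opens a domain and prints a few of the
server and summary properties described in the File reference above. It is only a sketch: the domain path
``/home/test_user1/quickstart.h5`` is a hypothetical example, and the summary values reflect the most
recent HSDS scan rather than the instantaneous state of the domain.

.. code-block::

    import h5pyd

    # the domain path below is just an example
    with h5pyd.File("/home/test_user1/quickstart.h5", "a") as f:
        f.require_group("results")            # create a group if it doesn't already exist
        print("owner:", f.owner)              # user who created the domain
        print("created:", f.created)          # creation time, seconds since epoch
        print("total_size:", f.total_size)    # storage used, as of the last scan
        print("compressors:", f.compressors)  # compression filters supported by HSDS
        print("limits:", f.limits)            # server-defined size limits
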
diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index 2859799e..fed49419 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -294,8 +294,8 @@ def __init__( See the h5py user guide for a detailed explanation of the options. domain - URI of the domain name to access. E.g.: /home/username/tall.h5. Can also - use DNS style: tall.username.home + URI of the domain name to access. E.g.: /home/username/tall.h5. Alternatively a GroupID + (low-level group identifier object can be used) mode Access mode: 'r', 'r+', 'w', or 'a' endpoint @@ -736,7 +736,7 @@ def compressors(self): compressors = [] return compressors - # override base implemention of ACL methods to use the domain rather than update root group + # override base implementation of ACL methods to use the domain rather than update root group def getACL(self, username): req = "/acls/" + username rsp_json = self.GET(req) From 272ab38dbf72ae9c435247ef45943f0f170385cc Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 6 Jan 2025 20:58:50 +0800 Subject: [PATCH 08/32] update for group docs --- docs/high/file.rst | 43 ++++---- docs/high/group.rst | 203 ++++++++++++++++++------------------ examples/check_last_scan.py | 20 ++++ h5pyd/_hl/base.py | 3 +- h5pyd/_hl/group.py | 68 ++---------- 5 files changed, 154 insertions(+), 183 deletions(-) create mode 100644 examples/check_last_scan.py diff --git a/docs/high/file.rst b/docs/high/file.rst index 0615e0b6..f19f75eb 100644 --- a/docs/high/file.rst +++ b/docs/high/file.rst @@ -1,4 +1,4 @@ -.. currentmodule:: h5py +.. currentmodule:: h5pyd .. _file: @@ -44,7 +44,9 @@ The first argument is the path to the domain. The path must be a string (i.e. P must be an absolute path (starting with '/' or 'hdf5://'). If you are unsure about what domains are present, you can use the ``hsls`` utility to list the contents of a folder. E.g. ``$ hsls /home/test_user1/``. -Note: Python "File-like" objects are not supported as the domain path. +.. note:: + + Python "File-like" objects are not supported as the domain path. Domains live in buckets, and if a bucket name is not provided, the default bucket that has been configured in the HSDS instance will be used. To explicitly give a bucket name, use the bucket parameter: @@ -65,11 +67,13 @@ Valid modes are: Domains are opened read-only by default. So the file mode parameter is only required for one of the writable modes. -Note: that unlike with h5py and the HDF5 library, there's no concept of file locking. The -same domain can be opened multiple times in the same or different thread, or even on a -different machine. Multiple clients can access the same domain for modification, but this won't -result in the domain becoming corrupted (though nothing in HSDS guards against clients over-writing -each others updates). +.. note:: + + Unlike with h5py and the HDF5 library, there's no concept of file locking. The + same domain can be opened multiple times in the same or different thread, or even on a + different machine. Multiple clients can access the same domain for modification, but this won't + result in the domain becoming corrupted (though nothing in HSDS guards against clients over-writing + each others updates). Whatever the mode used, if the domain is not configured to authorize the desired action, a ``403 - Forbidden`` error will be raised. See the next sections on authentication and authorization. @@ -144,9 +148,10 @@ authorizes the request and the user is a member of that group, the request will group name: ``default`` that includes all users. 
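To see how a domain's permissions are set up, you can read its ACLs with the ``getACLs`` and ``getACL``
methods documented in the File reference. The following is a sketch only: the domain path and the
``test_user1`` account are example values, and reading ACLs is itself subject to the domain's permissions.

.. code-block::

    import h5pyd

    # domain path and username are examples
    with h5pyd.File("/home/test_user1/tall.h5", "r") as f:
        for acl in f.getACLs():        # every ACL defined on the domain
            print(acl)
        print(f.getACL("test_user1"))  # ACL for one specific user (or user group)
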
In any case, if no authorizing ACL is found, a `403 - Forbidden`` error will be raised. -When a new domain is created (e.g. by using h5pyd.File with the `w` mode), typically it will have one ACL that gives -the owner of the domain (the authenticated user unless the 'owner' argument is given) full control. Other users would not have -permissions to even read the domain. These permissions can be adjusted, or new ACLs added programmatically (using tbd), +When a new domain is created (e.g. by using h5pyd.File with the `w` access mode), an ACL that gives +the owner of the domain (the authenticated user making the request unless the 'owner' argument is given) full control. +Other users would not have permissions to even read the domain. +These permissions can be adjusted, or new ACLs added programmatically (using tbd), or using the ``hsacl`` tool (see: tbd). Folders (every domain lives in specific folder) also have ACLs. To create a new domain, the authenticating user @@ -216,7 +221,7 @@ Deleting Domains With h5py and the HDF5 library you would normally delete HDF5 files using your systems file browser, or the "rm" command. Programmatically you could delete a HDF5 file using the standard Python Path.unlink method. -Neither of these options are possible with HSDS domains, but the ``hsrm`` (see: tbd) command is included with +None of these options are possible with HSDS domains, but the ``hsrm`` (see: tbd) command is included with h5pyd and works like the standard ``rm`` command with domain paths used instead of file paths. Programmatically, you can delete domains using the del method of the folder object (see: tbd). @@ -229,11 +234,11 @@ Summary data Due to the way in which domains are stored, certain information about the domain would be unfeasible to determine on demand. For example to compute the total amount of storage used would require summing the size of each piece of object metadata and each dataset chunk, which for large domains could require fetching -attributes for millions of objects. So for these properties, the server runs asynchronous tasks to compile -summary information about the domain. +attributes for millions of objects. So for these properties, the server periodically runs asynchronous tasks +to compile summary information about the domain. -The impact of this is that some properties of the file object will only be reflect the -state of the domain as of the last time HSDS ran this asynchronous task (typically a few seconds to a minute +The impact of this is that some properties of the file object will only reflect the +domain state as of the last time HSDS ran this asynchronous task (typically a few seconds to a minute after the last update to the domain). Properties for which this applies are: @@ -260,10 +265,10 @@ for a recent update: while f.last_scan == time_stamp: time.sleep(0.1) # wait for summary data to be updated # print affected summary properties - print("num_groups:", num_groups) - print("num_objects:", num_objects) - print("metadata_bytes:", metadata_bytes) - print("total_size:", total_size) + print("num_groups:", f.num_groups) + print("num_objects:", f.num_objects) + print("metadata_bytes:", f.metadata_bytes) + print("total_size:", f.total_size) .. diff --git a/docs/high/group.rst b/docs/high/group.rst index 7b222a06..f0637a94 100644 --- a/docs/high/group.rst +++ b/docs/high/group.rst @@ -1,4 +1,4 @@ -.. currentmodule:: h5py +.. currentmodule:: h5pyd .. _group: @@ -6,24 +6,30 @@ Groups ====== -Groups are the container mechanism by which HDF5 files are organized. 
From -a Python perspective, they operate somewhat like dictionaries. In this case +Groups are the container mechanism by which HSDS domains (ans well +as HDF5 files) are organized. +From a Python perspective, they operate somewhat like dictionaries. In this case the "keys" are the names of group members, and the "values" are the members themselves (:class:`Group` and :class:`Dataset`) objects. Group objects also contain most of the machinery which makes HDF5 useful. The :ref:`File object ` does double duty as the HDF5 *root group*, and -serves as your entry point into the file: +serves as your entry point into the domain: - >>> f = h5py.File('foo.hdf5','w') + >>> f = h5py.File('/home/test_user1/test/foo.hdf5','w') >>> f.name '/' >>> list(f.keys()) [] -Names of all objects in the file are all text strings (``str``). These will be encoded with the HDF5-approved UTF-8 -encoding before being passed to the HDF5 C library. Objects may also be -retrieved using byte strings, which will be passed on to HDF5 as-is. +Names of all objects in the domain are all text strings (``str``). +These will be encoded using UTF-8 for transmission in http requests and by +HSDS when written to the storage medium. + +.. note:: + + Objects can be retrieved using byte strings, but these will decoded using + UTF8 before sending the request to HSDS. .. _group_create: @@ -49,6 +55,29 @@ Multiple intermediate groups can also be created implicitly:: >>> grp3.name '/some/long' +.. _group_anonymous: + +Anonymous groups +---------------- + +An anonymous group (a group that nothing links to) +can be created by using ``None`` as the group name. +The new group can either be set as a link target later, +or kept as a "hidden" group of the domain. + +An anonymous group can be accessed using it's low-level id as +in this example: + + >>> anon_grp = f.create_group(None) + >>> grp = f.getObjByUuid(anon_grp.id.id) # another reference to anon_grp + >>> f["g1"] = anon_grp # link the grpup as "g1" of the root group + +.. note:: + + Unlike with HDF5, anonymous objects won't be released when the file is closed. + They will need to be explicitly deleted if they are desired to be + used temporarily. + .. _group_links: @@ -63,24 +92,38 @@ they support the indexing syntax, and standard exceptions: >>> missing = subgrp["missing"] KeyError: "Name doesn't exist (Symbol table: Object not found)" -Objects can be deleted from the file using the standard syntax:: - - >>> del subgroup["MyDataset"] - .. note:: - When using h5py from Python 3, the keys(), values() and items() methods + The keys(), values() and items() methods will return view-like objects instead of lists. These objects support membership testing and iteration, but can't be sliced like lists. -By default, objects inside group are iterated in alphanumeric order. +By default, objects inside a group are iterated in alphanumeric order. However, if group is created with ``track_order=True``, the insertion -order for the group is remembered (tracked) in HDF5 file, and group +order for the group is remembered (tracked) in the domain, and group contents are iterated in that order. The latter is consistent with Python 3.7+ dictionaries. The default ``track_order`` for all new groups can be specified -globally with ``h5.get_config().track_order``. +globally with ``h5pyd.get_config().track_order``. + +Links can be deleted from a group using the standard Python syntax:: + + >>> del subgroup["MyDataset"] + +.. 
note:: + + Unlike with h5py and HDF5, in h5pyd deleting the last link to an object will + not cause the target object to be deleted. Instead the object needs to be + explicitly deleted using its UUID rather than the link name. + See the example below. + +To delete the object a link refers to, pass the UUID identifier of the +object as the argument: + >>> g1 = f.create_group('g1') # create a new object + >>> del f[g1.id.id] # now delete the object + >>> 'g1' in f # link "g1" still exists + >>> del f['g1'] # delete the link .. _group_hardlinks: @@ -117,11 +160,11 @@ Soft links Also like a UNIX filesystem, HDF5 groups can contain "soft" or symbolic links, which contain a text path instead of a pointer to the object itself. You -can easily create these in h5py by using ``h5py.SoftLink``:: +can easily create these in h5pyd by using ``h5pyd.SoftLink``:: - >>> myfile = h5py.File('foo.hdf5','w') + >>> myfile = h5pyd.File('/home/test_user1/foo.hdf5','w') >>> group = myfile.create_group("somegroup") - >>> myfile["alias"] = h5py.SoftLink('/somegroup') + >>> myfile["alias"] = h5pyd.SoftLink('/somegroup') If the target is removed, they will "dangle": @@ -140,43 +183,33 @@ specify the name of the file as well as the path to the desired object. You can refer to objects in any file you wish. Use similar syntax as for soft links: - >>> myfile = h5py.File('foo.hdf5','w') - >>> myfile['ext link'] = h5py.ExternalLink("otherfile.hdf5", "/path/to/resource") + >>> f = h5pyd.File('/home/test_user1/foo.hdf5','w') + >>> f['ext link'] = h5pyd.ExternalLink("/home/test_user1/otherfile.hdf5", "/path/to/resource") -When the link is accessed, the file "otherfile.hdf5" is opened, and object at +When the link is accessed, the domain "/home/test_user1/otherfile.hdf5" is opened, and object at "/path/to/resource" is returned. -Since the object retrieved is in a different file, its ".file" and ".parent" -properties will refer to objects in that file, *not* the file in which the +Since the object retrieved is in a different domain, its ".file" and ".parent" +properties will refer to objects in that domain, *not* the domain in which the link resides. .. note:: - Currently, you can't access an external link if the file it points to is - already open. This is related to how HDF5 manages file permissions - internally. - -.. note:: + To specify an externlink to a domain in different bucket, pre-append the + target bucket name to the external path. E.g. ``otherbucket/home/test_user1/otherfile.hdf5`` - The filename is stored in the file as bytes, normally UTF-8 encoded. - In most cases, this should work reliably, but problems are possible if a - file created on one platform is accessed on another. Older versions of HDF5 - may have problems on Windows in particular. See :ref:`file_filenames` for - more details. Reference --------- .. class:: Group(identifier) - Generally Group objects are created by opening objects in the file, or - by the method :meth:`Group.create_group`. Call the constructor with - a :class:`GroupID ` instance to create a new Group - bound to an existing low-level identifier. + Generally Group objects are created by opening objects in the domain, or + by the method :meth:`Group.create_group`. .. method:: __iter__() - Iterate over the names of objects directly attached to the group. + Iterate over the names of the links in the group. Use :meth:`Group.visit` or :meth:`Group.visititems` for recursive access to group members. @@ -198,17 +231,7 @@ Reference .. method:: __bool__() Check that the group is accessible. 
- A group could be inaccessible for several reasons. For instance, the - group, or the file it belongs to, may have been closed elsewhere. - - >>> f = h5py.open(filename) - >>> group = f["MyGroup"] - >>> f.close() - >>> if group: - ... print("group is accessible") - ... else: - ... print("group is inaccessible") - group is inaccessible + Will always return True for a valid group reference .. method:: keys() @@ -289,8 +312,6 @@ Reference The second argument to the callback for ``visititems_links`` is an instance of one of the :ref:`link classes `. - .. versionadded:: 3.11 - .. method:: move(source, dest) Move an object or link in the file. If `source` is a hard link, this @@ -302,6 +323,10 @@ Reference :param dest: New location for object or link. :type dest: String + .. note:: + + This method is not yet supported, and will raise an error if invoked. + .. method:: copy(source, dest, name=None, shallow=False, expand_soft=False, expand_external=False, expand_refs=False, without_attrs=False) @@ -326,6 +351,11 @@ Reference :param expand_refs: Copy objects which are pointed to by references. :param without_attrs: Copy object(s) without copying HDF5 attributes. + .. note:: + + This method is not yet supported, and will raise an error if invoked. + + .. method:: create_group(name, track_order=None) @@ -363,7 +393,7 @@ Reference :param data: Initialize dataset to this (NumPy array). - :keyword chunks: Chunk shape, or True to enable auto-chunking. + :keyword chunks: Chunk shape, or True for auto-chunking. :keyword maxshape: Dataset will be resizable up to this shape (Tuple). Automatically enables chunking. Use None for the @@ -382,59 +412,18 @@ Reference :keyword fillvalue: This value will be used when reading uninitialized parts of the dataset. - :keyword fill_time: Control when to write the fill value. One of the - following choices: `alloc`, write fill value before writing - application data values or when the dataset is created; `never`, - never write fill value; `ifset`, write fill value if it is defined. - Default to `ifset`, which is the default of HDF5 library. If the - whole dataset is going to be written by the application, setting - this to `never` can avoid unnecessary writing of fill value and - potentially improve performance. - - :keyword track_times: Enable dataset creation timestamps (**T**/F). :keyword track_order: Track attribute creation order if ``True``. Default is ``h5.get_config().track_order``. - :keyword external: Store the dataset in one or more external, non-HDF5 - files. This should be an iterable (such as a list) of tuples of - ``(name, offset, size)`` to store data from ``offset`` to - ``offset + size`` in the named file. Each name must be a str, - bytes, or os.PathLike; each offset and size, an integer. The last - file in the sequence may have size ``h5py.h5f.UNLIMITED`` to let - it grow as needed. If only a name is given instead of an iterable - of tuples, it is equivalent to - ``[(name, 0, h5py.h5f.UNLIMITED)]``. - - :keyword allow_unknown_filter: Do not check that the requested filter is - available for use (T/F). This should only be set if you will - write any data with ``write_direct_chunk``, compressing the - data before passing it to h5py. - - :keyword rdcc_nbytes: Total size of the dataset's chunk cache in bytes. - The default size is 1024**2 (1 MiB). - - :keyword rdcc_w0: The chunk preemption policy for this dataset. 
This - must be between 0 and 1 inclusive and indicates the weighting - according to which chunks which have been fully read or written are - penalized when determining which chunks to flush from cache. A value - of 0 means fully read or written chunks are treated no differently - than other chunks (the preemption is strictly LRU) while a value of - 1 means fully read or written chunks are always preempted before - other chunks. If your application only reads or writes data once, - this can be safely set to 1. Otherwise, this should be set lower - depending on how often you re-read or re-write the same data. The - default value is 0.75. - - :keyword rdcc_nslots: The number of chunk slots in the dataset's chunk - cache. Increasing this value reduces the number of cache collisions, - but slightly increases the memory used. Due to the hashing strategy, - this value should ideally be a prime number. As a rule of thumb, - this value should be at least 10 times the number of chunks that can - fit in rdcc_nbytes bytes. For maximum performance, this value should - be set approximately 100 times that number of chunks. The default - value is 521. + + :keyword initializer: Dataset initializer method - a method that will be + invoked each time a dataset chunk is initialized. methods + currently available: arange, or None for no initializer + + :keyword initializer_args: List of arguments for dataset initializer args + .. method:: require_dataset(name, shape, dtype, exact=False, **kwds) @@ -487,6 +476,11 @@ Reference :param fillvalue: The value to use where there is no data. + .. note: + + This is a place holder method until Virtual Datasets are supported. + Invoking the method will raise an error + .. method:: build_virtual_dataset() Assemble a virtual dataset in this group. @@ -507,6 +501,11 @@ Reference (optional). Use None for unlimited dimensions. :param fillvalue: The value used where no data is available. + .. note: + + This is a place holder method until Virtual Datasets are supported. + Invoking the method will raise an error + .. attribute:: attrs :ref:`attributes` for this group. @@ -574,7 +573,7 @@ Link classes .. attribute:: filename - Name of the external file as a Unicode string + Path to a domain as a Unicode string .. 
attribute:: path diff --git a/examples/check_last_scan.py b/examples/check_last_scan.py new file mode 100644 index 00000000..a2b9346b --- /dev/null +++ b/examples/check_last_scan.py @@ -0,0 +1,20 @@ +import time +import h5pyd + +domain_path = "/home/test_user1/test/one_group.h5" + +f = h5pyd.File(domain_path, 'w') + +time_stamp = f.last_scan # get the last scan time +f.create_group("g1") # create a new group +ts = time.time() +print("waiting for scan update") +while f.last_scan == time_stamp: + time.sleep(0.1) # wait for summary data to be updated +wait_time = time.time() - ts +print(f"last_scan updated after: {wait_time:6.2f} seconds") +# print affected summary properties +print("num_groups:", f.num_groups) +print("num_objects:", f.num_objects) +print("metadata_bytes:", f.metadata_bytes) +print("total_size:", f.total_size) diff --git a/h5pyd/_hl/base.py b/h5pyd/_hl/base.py index db136d6a..42f99c66 100644 --- a/h5pyd/_hl/base.py +++ b/h5pyd/_hl/base.py @@ -1126,8 +1126,7 @@ def __ne__(self, other): return not self.__eq__(other) def __bool__(self): - with phil: - return bool(self.id) + return bool(self.id) def getACL(self, username): req = self._req_prefix + '/acls/' + username diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index b7a6501c..55845a7a 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -358,8 +358,6 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): (Scalar) Use this value for uninitialized parts of the dataset. track_oder (T/F) List attributes by creation_time if set - track_times - (T/F) Enable dataset creation timestamps. initializer (String) chunk initializer function initializer_args @@ -448,6 +446,11 @@ def create_dataset_like(self, name, other, **kwupdate): kwupdate.setdefault('maxshape', other.maxshape) return self.create_dataset(name, **kwupdate) + + def create_virtual_dataset(name, layout, fillvalue=None): + """ Create a virtual dataset """ + # not currently supported + raise IOError("Not supported") def create_table(self, name, numrows=None, dtype=None, data=None, **kwds): """ Create a new Table - a one dimensional HDF5 Dataset with a compound type @@ -1076,50 +1079,7 @@ def copy(self, source, dest, name=None, ['MyGroup', 'MyCopy'] """ - pass - """ - with phil: - if isinstance(source, HLObject): - source_path = '.' - else: - # Interpret source as a path relative to this group - source_path = source - source = self - - if isinstance(dest, Group): - if name is not None: - dest_path = name - else: - # copy source into dest group: dest_name/source_name - dest_path = pp.basename(h5i.get_name(source[source_path].id)) - - elif isinstance(dest, HLObject): - raise TypeError("Destination must be path or Group object") - else: - # Interpret destination as a path relative to this group - dest_path = dest - dest = self - - flags = 0 - if shallow: - flags |= h5o.COPY_SHALLOW_HIERARCHY_FLAG - if expand_soft: - flags |= h5o.COPY_EXPAND_SOFT_LINK_FLAG - if expand_external: - flags |= h5o.COPY_EXPAND_EXT_LINK_FLAG - if expand_refs: - flags |= h5o.COPY_EXPAND_REFERENCE_FLAG - if without_attrs: - flags |= h5o.COPY_WITHOUT_ATTR_FLAG - if flags: - copypl = h5p.create(h5p.OBJECT_COPY) - copypl.set_copy_object(flags) - else: - copypl = None - - h5o.copy(source.id, self._e(source_path), dest.id, self._e(dest_path), - copypl, base.dlcpl) - """ + raise IOError("Not implemented") def move(self, source, dest): """ Move a link to a new location in the file. 
@@ -1128,14 +1088,7 @@ def move(self, source, dest): "source" is a soft or external link, the link itself is moved, with its value unmodified. """ - pass - """ - with phil: - if source == dest: - return - self.id.links.move(self._e(source), self.id, self._e(dest), - lapl=self._lapl, lcpl=self._lcpl) - """ + raise IOError("Not supported") def visit(self, func): """ Recursively visit all names in this group and subgroups (HDF5 1.8). @@ -1158,12 +1111,7 @@ def visit(self, func): >>> f.visit(list_of_names.append) """ return self.visititems(func) - """ - with phil: - def proxy(name): - return func(self._d(name)) - return h5o.visit(self.id, proxy) - """ + def visititems(self, func): """ Recursively visit names and objects in this group (HDF5 1.8). From f7b0f8ad6351244a7354ac9d3165a99b6401dfbf Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 7 Jan 2025 15:33:46 +0800 Subject: [PATCH 09/32] speed up test_file test --- docs/high/group.rst | 14 +++++++++++--- h5pyd/_hl/group.py | 5 ++--- test/hl/test_file.py | 18 ++++++++++++------ test/hl/test_group.py | 22 +++++++++++----------- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/docs/high/group.rst b/docs/high/group.rst index f0637a94..a59cbdd2 100644 --- a/docs/high/group.rst +++ b/docs/high/group.rst @@ -139,6 +139,14 @@ is to create an :ref:`HDF5 datasets `:: >>> out +If the object is a Numpy datatype, the default is to create a committed datatype object. + + >> dt = numpy.dtype("int32") + >> grp["name"] = dt + >> out = grp["name"] + >> out + + When the object being stored is an existing Group or Dataset, a new link is made to the object:: @@ -513,7 +521,7 @@ Reference .. attribute:: id The groups's low-level identifier; an instance of - :class:`GroupID `. + :class:`GroupID `. .. attribute:: ref @@ -565,10 +573,10 @@ Link classes Like :class:`SoftLink`, only they specify a filename in addition to a path. See :ref:`group_extlinks`. - :param filename: Name of the file to which the link points + :param filename: Path to the domain to which the link points :type filename: String - :param path: Path to the object in the external file. + :param path: Path to the object in the external domain. :type path: String .. attribute:: filename diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index 55845a7a..8e395c58 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -446,7 +446,7 @@ def create_dataset_like(self, name, other, **kwupdate): kwupdate.setdefault('maxshape', other.maxshape) return self.create_dataset(name, **kwupdate) - + def create_virtual_dataset(name, layout, fillvalue=None): """ Create a virtual dataset """ # not currently supported @@ -1111,7 +1111,6 @@ def visit(self, func): >>> f.visit(list_of_names.append) """ return self.visititems(func) - def visititems(self, func): """ Recursively visit names and objects in this group (HDF5 1.8). @@ -1303,7 +1302,7 @@ class ExternalLink(object): """ Represents an HDF5 external link. Paths may be absolute or relative. - No checking is performed to ensure either the target or file exists. + No checking is performed to ensure either the target or the domain exists. 
""" @property diff --git a/test/hl/test_file.py b/test/hl/test_file.py index cfe5e7f0..7dd76854 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -54,6 +54,7 @@ def test_create(self): filename = self.getFileName("new_file") print("filename:", filename) now = time.time() + print("test create") f = h5py.File(filename, 'w') self.assertEqual(f.filename, filename) self.assertEqual(f.name, "/") @@ -128,13 +129,10 @@ def test_create(self): f = h5py.File(filename, "a") f.create_group("foo") del f["foo"] + f.close() # re-open as read-only - if h5py.__name__ == "h5pyd": - wait_time = 90 # change to >90 to test async updates - print("waiting {wait_time:d} seconds for root scan sync".format(wait_time=wait_time)) - time.sleep(wait_time) # let async process update obj number f = h5py.File(filename, 'r') self.assertEqual(f.filename, filename) self.assertEqual(f.name, "/") @@ -166,10 +164,18 @@ def test_create(self): if h5py.__name__ == "h5pyd": # check properties that are only available for h5pyd # Note: num_groups won't reflect current state since the - # data is being updated asynchronously + # data is being updated asynchronously, so wait for a scan update + logging.info("waiting on scan update") + ts = time.time() + while not f.last_scan: + time.sleep(0.1) + elapsed = time.time() - ts + if elapsed > 90: + logging.error("scan not complete after 90 seconds") + self.assertTrue(False) + logging.info(f"last_scan updated after {elapsed:6.2f} seconds") self.assertEqual(f.num_objects, 3) self.assertEqual(f.num_groups, 3) - self.assertEqual(f.num_datasets, 0) self.assertEqual(f.num_datatypes, 0) self.assertTrue(f.allocated_bytes == 0) diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 40b4fccb..bdb7161b 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -192,22 +192,25 @@ def test_create(self): self.assertTrue(name in f) self.assertTrue("/g1/g1.1" in f) g1_1 = f["/g1/g1.1"] + r = f["/"] - if is_hsds: - linkee_class = r.get('mysoftlink', getclass=True) - # TBD: investigate why h5py returned None here - self.assertEqual(linkee_class, h5py.Group) - link_class = r.get('mysoftlink', getclass=True, getlink=True) - self.assertEqual(link_class, h5py.SoftLink) - softlink = r.get('mysoftlink', getlink=True) - self.assertEqual(softlink.path, '/g1/g1.1') + linkee_class = r.get('mysoftlink', getclass=True) + self.assertEqual(linkee_class, h5py.Group) + link_class = r.get('mysoftlink', getclass=True, getlink=True) + self.assertEqual(link_class, h5py.SoftLink) + softlink = r.get('mysoftlink', getlink=True) + self.assertEqual(softlink.path, '/g1/g1.1') linked_obj = f["mysoftlink"] self.assertEqual(linked_obj.id, g1_1.id) if is_hsds: # for h5pyd we should be able to retrieve the anon group + anon_group = f.getObjByUuid(anon_group_id) + self.assertEqual(anon_group_id, anon_group.id.id) + # can also get anon group using groups/ key anon_group = f[f"groups/{anon_group_id}"] self.assertEqual(anon_group_id, anon_group.id.id) + f.close() def test_nested_create(self): @@ -234,9 +237,6 @@ def test_nested_create(self): def test_external_links(self): # create a file for use a link target - if config.get("use_h5py"): - # for some reason this test is failing in Travis - return linked_filename = self.getFileName("linked_file") abs_filepath = os.path.abspath(linked_filename) if config.get("use_h5py"): From d70f6344c107150f76e096b861f7718554a19c41 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 7 Jan 2025 15:46:44 +0800 Subject: [PATCH 10/32] fix flake8 error --- examples/check_last_scan.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/check_last_scan.py b/examples/check_last_scan.py index a2b9346b..7719ae7b 100644 --- a/examples/check_last_scan.py +++ b/examples/check_last_scan.py @@ -10,7 +10,7 @@ ts = time.time() print("waiting for scan update") while f.last_scan == time_stamp: - time.sleep(0.1) # wait for summary data to be updated + time.sleep(0.1) # wait for summary data to be updated wait_time = time.time() - ts print(f"last_scan updated after: {wait_time:6.2f} seconds") # print affected summary properties From b06a242b65771399b7d2e2ace37880928a517f10 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 7 Jan 2025 16:04:42 +0800 Subject: [PATCH 11/32] fix flake8 error --- test/hl/test_file.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/hl/test_file.py b/test/hl/test_file.py index 7dd76854..ee38edc0 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -54,7 +54,6 @@ def test_create(self): filename = self.getFileName("new_file") print("filename:", filename) now = time.time() - print("test create") f = h5py.File(filename, 'w') self.assertEqual(f.filename, filename) self.assertEqual(f.name, "/") @@ -234,7 +233,6 @@ def test_file_clone(self): self.assertEqual(g.filename, f.filename) self.assertEqual(g.id.id, f.id.id) self.assertTrue("g1" in g) - self.assertEqual(f._version, g._version) f.close() g.close() From fda4def48f24802613af55046d7dee9aa2b216fb Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 7 Jan 2025 22:04:39 +0800 Subject: [PATCH 12/32] updated group doc --- docs/high/group.rst | 160 ++++++++++++++++++++---------------------- h5pyd/_hl/group.py | 17 +---- test/hl/test_file.py | 6 +- test/hl/test_group.py | 53 ++++++-------- 4 files changed, 105 insertions(+), 131 deletions(-) diff --git a/docs/high/group.rst b/docs/high/group.rst index a59cbdd2..131a28aa 100644 --- a/docs/high/group.rst +++ b/docs/high/group.rst @@ -123,7 +123,7 @@ object as the argument: >>> g1 = f.create_group('g1') # create a new object >>> del f[g1.id.id] # now delete the object >>> 'g1' in f # link "g1" still exists - >>> del f['g1'] # delete the link + >>> del f['g1'] # delete the link .. _group_hardlinks: @@ -187,7 +187,7 @@ External links ~~~~~~~~~~~~~~ External links are "soft links plus", which allow you to -specify the name of the file as well as the path to the desired object. You +specify the name of the domain as well as the path to the desired object. You can refer to objects in any file you wish. Use similar syntax as for soft links: @@ -206,6 +206,56 @@ link resides. To specify an externlink to a domain in different bucket, pre-append the target bucket name to the external path. E.g. ``otherbucket/home/test_user1/otherfile.hdf5`` +.. _group_multilink: + +Multi-linking +~~~~~~~~~~~~~ + +Compared with accessing a disk file using HDF5, each request that is sent to HSDS will have higher +latency. For best performance, you'll want to reduce the number of requests being sent to the +server as much as possible. Multi-linking helps in this area by allowing multiple links to be +created, accessed, or deleted in one request. + +Consider the case where you'd like to add three soft links to create in the root group. 
+The traditional way this would be done in h5py would be to add each link in turn: + + >>> f = h5py.File('foo.hdf5', 'w') + >>> f['x'] = h5py.SoftLink('/g1.1/x') + >>> f['y'] = h5py.SoftLink('/g2.2/y') + >>> f['z'] = h5py.SoftLink('/g3.3/z') + +While these method works with h5pyd as well, it would be more efficient to +utilize multi-linking in this way: + + >>> f = h5pyd.File('/home/test_user1/foo,h5', 'w') + >>> links = [] + >>> links.append(h5py.SoftLink('/g1.1/x')) + >>> links.append(h5py.SoftLink('/g2.2/y')) + >>> links.append(h5py.SoftLink('/g3.3/z')) + >>> names = ['x', 'y', 'z'] + >>> f[names] = links # 3 links will be created in one request + +To create multiple links in one call, just use a list of link names +as the key and a list of link objects (HardLink, SoftLik, or ExternalLink) +as the value (where the number of names is equal to the number of links). +The result will be the same as if you created the links +one by one, but the operation will take less time. + +Multi-linking can be used to fetch links as well. +If you need to fetch a specific set of link names +from a group, you can do this: + + >>> names = ['ACH293', 'BUR389', 'CDJ982'] + >>> f.get(names, getlink=True) + {'ACH293': , + 'BUR389': , + 'CDJ392': } + +Multiple links can also be deleted simultaneously. For example, + + >>> names = ['ACH293', 'BUR389', 'CDJ982'] + >>> del f[names] + Reference --------- @@ -263,7 +313,7 @@ Reference :return: a set-like object. - .. method:: get(name, default=None, getclass=False, getlink=False) + .. method:: get(name, default=None, getclass=False, getlink=False, track_order=None) Retrieve an item, or information about an item. `name` and `default` work like the standard Python ``dict.get``. @@ -277,6 +327,10 @@ Reference :class:`SoftLink` or :class:`ExternalLink` instance. If ``getclass`` is also True, returns the corresponding Link class without instantiating it. + :param track_order: If True, return links by creation order. If False, + return link by alphanumeric order, if None, return links + based on the track_order setting in effect when the + group was created. .. method:: visit(callable) @@ -320,50 +374,6 @@ Reference The second argument to the callback for ``visititems_links`` is an instance of one of the :ref:`link classes `. - .. method:: move(source, dest) - - Move an object or link in the file. If `source` is a hard link, this - effectively renames the object. If a soft or external link, the - link itself is moved. - - :param source: Name of object or link to move. - :type source: String - :param dest: New location for object or link. - :type dest: String - - .. note:: - - This method is not yet supported, and will raise an error if invoked. - - - .. method:: copy(source, dest, name=None, shallow=False, expand_soft=False, expand_external=False, expand_refs=False, without_attrs=False) - - Copy an object or group. The source can be a path, Group, Dataset, or - Datatype object. The destination can be either a path or a Group - object. The source and destination need not be in the same file. - - If the source is a Group object, by default all objects within that - group will be copied recursively. - - When the destination is a Group object, by default the target will be - created in that group with its current name (basename of obj.name). You - can override that by setting "name" to a string. - - :param source: What to copy. May be a path in the file or a Group/Dataset object. - :param dest: Where to copy it. May be a path or Group object. 
- :param name: If the destination is a Group object, use this for the - name of the copied object (default is basename). - :param shallow: Only copy immediate members of a group. - :param expand_soft: Expand soft links into new objects. - :param expand_external: Expand external links into new objects. - :param expand_refs: Copy objects which are pointed to by references. - :param without_attrs: Copy object(s) without copying HDF5 attributes. - - .. note:: - - This method is not yet supported, and will raise an error if invoked. - - .. method:: create_group(name, track_order=None) @@ -472,47 +482,32 @@ Reference shape and dtype, in which case the provided values take precedence over those from `other`. - .. method:: create_virtual_dataset(name, layout, fillvalue=None) - - Create a new virtual dataset in this group. See :doc:`/vds` for more - details. + .. method:: create_table(name, numrows=None, dtype=None, data=None, **kwds) - :param str name: - Name of the dataset (absolute or relative). - :param VirtualLayout layout: - Defines what source data fills which parts of the virtual dataset. - :param fillvalue: - The value to use where there is no data. + Create a new table (one-dimensional dataset). Options are explained in tbd . - .. note: - - This is a place holder method until Virtual Datasets are supported. - Invoking the method will raise an error + :param name: Name of table to create. May be an absolute + or relative path. Provide None to create an anonymous + dataset, to be linked into the file later. - .. method:: build_virtual_dataset() + :param numrows: Number of initial rows - Assemble a virtual dataset in this group. + :param dtype: Data type for new table - This is used as a context manager:: + :param data: Initialize table to this (NumPy array). - with f.build_virtual_dataset('virt', (10, 1000), np.uint32) as layout: - layout[0] = h5py.VirtualSource('foo.h5', 'data', (1000,)) + :keyword chunks: Chunk shape, or True for auto-chunking. - Inside the context, you populate a :class:`VirtualLayout` object. - The file is only modified when you leave the context, and if there's - no error. + - :param str name: Name of the dataset (absolute or relative) - :param tuple shape: Shape of the dataset - :param dtype: A numpy dtype for data read from the virtual dataset - :param tuple maxshape: Maximum dimensions if the dataset can grow - (optional). Use None for unlimited dimensions. - :param fillvalue: The value used where no data is available. + .. method:: getObjByUuid(obj_uuid) - .. note: + Returns the object in the domain with the given low-level identifier UUID. + Raises an IOError ("401 - Not Found") + if no object with the given identifier exists. - This is a place holder method until Virtual Datasets are supported. - Invoking the method will raise an error + :param str obj_uuid: + Object identifier of the object to be returned. .. attribute:: attrs @@ -528,11 +523,6 @@ Reference An HDF5 object reference pointing to this group. See :ref:`refs_object`. - .. attribute:: regionref - - A proxy object allowing you to interrogate region references. - See :ref:`refs_region`. - .. attribute:: name String giving the full path to this group. @@ -545,6 +535,10 @@ Reference :class:`Group` instance containing this group. + .. attribute:: modified + + datetime object giving the time object was last modified + .. 
_group_link_classes: Link classes diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index 8e395c58..1682ae41 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -59,21 +59,6 @@ def __init__(self, bind, track_order=None, **kwargs): if not isinstance(bind, GroupID): raise ValueError(f"{bind} is not a GroupID") HLObject.__init__(self, bind, track_order=track_order, **kwargs) - """ - if track_order is None: - # set order based on group creation props - gcpl = self.id.gcpl_json - if "CreateOrder" in gcpl: - createOrder = gcpl["CreateOrder"] - if not createOrder or createOrder == "0": - self._track_order = False - else: - self._track_order = True - else: - self._track_order = False - else: - self._track_order = track_order - """ self._req_prefix = "/groups/" + self.id.uuid self._link_db = {} # cache for links @@ -356,7 +341,7 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): conjunction with the scale/offset filter. fillvalue (Scalar) Use this value for uninitialized parts of the dataset. - track_oder + track_order (T/F) List attributes by creation_time if set initializer (String) chunk initializer function diff --git a/test/hl/test_file.py b/test/hl/test_file.py index ee38edc0..979e94de 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -182,7 +182,7 @@ def test_create(self): f.close() self.assertEqual(f.id.id, 0) - # re-open using hdf5:// prefix + # re-open using hdf5:// prefix (only for h5pyd) if h5py.__name__ == "h5pyd": if filename[0] == '/': filepath = "hdf5:/" + filename @@ -266,10 +266,12 @@ def test_auth(self): self.assertTrue(f.id.id is not None) self.assertEqual(len(f.keys()), 2) + if h5py.__name__ == "h5py": + return # no ACLs in h5py + # no explicit ACLs yet file_acls = f.getACLs() self.assertTrue(len(file_acls) >= 1) # Should have at least the test_user1 acl - username = f.owner file_acl = f.getACL(username) diff --git a/test/hl/test_group.py b/test/hl/test_group.py index bdb7161b..7be32be4 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -28,9 +28,6 @@ def test_create(self): filename = self.getFileName("create_group") print("filename:", filename) f = h5py.File(filename, 'w') - is_hsds = False - if isinstance(f.id.id, str) and f.id.id.startswith("g-"): - is_hsds = True # HSDS has different permission defaults self.assertTrue('/' in f) r = f['/'] @@ -178,6 +175,7 @@ def test_create(self): # Check group's last modified time if h5py.__name__ == "h5pyd": self.assertTrue(isinstance(g1.modified, datetime)) + print("modified", type(g1.modified)) # try creating an anon group anon_group = g1.create_group(None) @@ -203,7 +201,7 @@ def test_create(self): linked_obj = f["mysoftlink"] self.assertEqual(linked_obj.id, g1_1.id) - if is_hsds: + if h5py.__name__ == "h5pyd": # for h5pyd we should be able to retrieve the anon group anon_group = f.getObjByUuid(anon_group_id) self.assertEqual(anon_group_id, anon_group.id.id) @@ -238,12 +236,14 @@ def test_nested_create(self): def test_external_links(self): # create a file for use a link target linked_filename = self.getFileName("linked_file") + + f = h5py.File(linked_filename, 'w') abs_filepath = os.path.abspath(linked_filename) - if config.get("use_h5py"): - rel_filepath = os.path.relpath(linked_filename) - else: + if h5py.__name__ == "h5pyd": rel_filepath = "linked_file.h5" - f = h5py.File(linked_filename, 'w') + else: + rel_filepath = os.path.relpath(linked_filename) + g1 = f.create_group("g1") dset = g1.create_dataset('ds', (5, 7), dtype='f4') dset_id = dset.id.id @@ -271,7 +271,7 
@@ def test_external_links(self): linked_obj = f["relpath_link"] self.assertTrue(linked_obj.name, "/g1/ds") self.assertEqual(linked_obj.shape, (5, 7)) - if not config.get("use_h5py"): + if h5py.__name__ == "h5pyd": self.assertEqual(linked_obj.id.id, dset_id) f.close() @@ -310,11 +310,11 @@ def get_count(grp): def test_link_multi_removal(self): # create a file for use a link target - if config.get("use_h5py"): - return + if h5py.__name__ == "h5py": + return # multilink is for h5pyd only filename = self.getFileName("test_link_multi_removal") print(f"filename: {filename}") - + f = h5py.File(filename, 'w') g1 = f.create_group("g1") g1_clone = f["g1"] @@ -354,9 +354,8 @@ def test_link_multi_removal(self): f.close() def test_link_multi_create(self): - if config.get("use_h5py"): - return - + if h5py.__name__ == "h5py": + return # multi create h5pyd only feature filename = self.getFileName("test_link_multi_create") print(f"filename: {filename}") @@ -434,11 +433,10 @@ def test_link_multi_create(self): self.assertEqual(link.filename, links[i % num_links]._filename) def test_link_get_multi(self): - if config.get("use_h5py"): - return - filename = self.getFileName("test_link_get_multi") print(f"filename: {filename}") + if h5py.__name__ == "h5py": + return # no multi link for h5py f = h5py.File(filename, 'w') g1 = f.create_group("g1") @@ -633,38 +631,34 @@ def test_no_track_order(self): self.assertEqual(list(reversed(g)), list(reversed(ref))) def test_get_dataset_track_order(self): - - # h5py does not support track_order on group.get() - if config.get("use_h5py"): - return - + filename = self.getFileName("test_get_dataset_track_order") print(f"filename: {filename}") + if h5py.__name__ == "h5py": + return # h5py does not support track_order on group.get() + with h5py.File(filename, 'w') as f: g = f.create_group('order') - dset = g.create_dataset('dset', (10,), dtype='i4') dset2 = g.create_dataset('dset2', (10,), dtype='i4') - self.populate_attrs(dset) self.populate_attrs(dset2) with h5py.File(filename) as f: g = f['order'] - d = g.get('dset', track_order=True) self.assertEqual(list(d.attrs), list(self.titles)) - d2 = g.get('dset2', track_order=False) ref = sorted(self.titles) self.assertEqual(list(d2.attrs), ref) def test_get_group_track_order(self): # h5py does not support track_order on group.get() - if config.get("use_h5py"): - return filename = self.getFileName("test_get_group_track_order") print(f"filename: {filename}") + if h5py.__name__ == "h5py": + return # h5py does not support track_order on group.get() + with h5py.File(filename, 'w') as f: g = f.create_group('order') g._track_order = True @@ -682,7 +676,6 @@ def test_get_group_track_order(self): subg2 = g.get('subgroup', track_order=False) self.assertEqual(list(subg2), sorted(self.titles)) - if __name__ == '__main__': loglevel = logging.ERROR logging.basicConfig(format='%(asctime)s %(message)s', level=loglevel) From 64d5a4d3851ea3869b2818437f8e951e80b4111c Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 9 Jan 2025 15:13:07 +0800 Subject: [PATCH 13/32] sort links client side --- docs/high/group.rst | 10 +- h5pyd/_hl/group.py | 270 ++++++++++++++++++++++++------------------ h5pyd/_hl/httpconn.py | 1 - test/hl/test_file.py | 2 +- test/hl/test_group.py | 21 ++-- 5 files changed, 179 insertions(+), 125 deletions(-) diff --git a/docs/high/group.rst b/docs/high/group.rst index 131a28aa..80a2471e 100644 --- a/docs/high/group.rst +++ b/docs/high/group.rst @@ -106,6 +106,10 @@ Python 3.7+ dictionaries. 
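For example, a group created with ``track_order=True`` will iterate its members in the order in which
they were added (a sketch; the domain path is a hypothetical example):

    >>> f = h5pyd.File('/home/test_user1/order_demo.h5', 'w')
    >>> g = f.create_group('letters', track_order=True)
    >>> g.create_group('c')
    >>> g.create_group('a')
    >>> g.create_group('b')
    >>> list(g)    # members returned in creation order
    ['c', 'a', 'b']
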
The default ``track_order`` for all new groups can be specified globally with ``h5pyd.get_config().track_order``. +If the group has already been created and you wish to fetch the links +in a specific order, you can invoke the ``get`` method with the desired +``track_order`` argument. + Links can be deleted from a group using the standard Python syntax:: >>> del subgroup["MyDataset"] @@ -212,11 +216,11 @@ Multi-linking ~~~~~~~~~~~~~ Compared with accessing a disk file using HDF5, each request that is sent to HSDS will have higher -latency. For best performance, you'll want to reduce the number of requests being sent to the +latency. Therefore for best performance, you'll want to reduce the number of requests being sent to the server as much as possible. Multi-linking helps in this area by allowing multiple links to be created, accessed, or deleted in one request. -Consider the case where you'd like to add three soft links to create in the root group. +Consider the case where you'd like to add three soft links to the root group. The traditional way this would be done in h5py would be to add each link in turn: >>> f = h5py.File('foo.hdf5', 'w') @@ -224,7 +228,7 @@ The traditional way this would be done in h5py would be to add each link in turn >>> f['y'] = h5py.SoftLink('/g2.2/y') >>> f['z'] = h5py.SoftLink('/g3.3/z') -While these method works with h5pyd as well, it would be more efficient to +While this method works with h5pyd as well, with h5pyd it would be more efficient to utilize multi-linking in this way: >>> f = h5pyd.File('/home/test_user1/foo,h5', 'w') diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index 1682ae41..cef155e6 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -60,7 +60,64 @@ def __init__(self, bind, track_order=None, **kwargs): raise ValueError(f"{bind} is not a GroupID") HLObject.__init__(self, bind, track_order=track_order, **kwargs) self._req_prefix = "/groups/" + self.id.uuid - self._link_db = {} # cache for links + self._link_db = None # cache for links + + def _refresh_link_cache(self, force=False): + if self._link_db is not None and not force and self.id.http_conn.mode == 'r': + # already initialized and we are in read-only mode, just return + return + + objdb = self.id._http_conn.getObjDb() + group_id = self.id.uuid + + if not force and objdb and group_id in objdb: + # _objdb is meta-data pulled from the domain on open. 
+ # see if we can extract the link json from there + self.log.debug(f"found {group_id} in objdb") + group_json = objdb[group_id] + links = group_json["links"] + # mix in a "collection key for compatibility with server GET links request + for title in links: + link = links[title] + if 'class' not in link: + self.log.error(f"expected to find class key in link {link}") + continue + link['title'] = title + link_class = link['class'] + if link_class == 'H5L_TYPE_HARD': + if 'id' not in link: + self.log.error(f"expected to find id key in hard link: {link}") + continue + link_id = link['id'] + if not link_id: + self.log.error(f"null id for hard link: {link}") + continue + if link_id.startswith("g-"): + link['collection'] = "groups" + elif link_id.startswith("d-"): + link['collection'] = "datasets" + elif link_id.startswith("t-"): + link["collection"] = "datatypes" + else: + self.log.error(f"unexpected id string for hard link: {link}") + else: + pass # no collection for non hardlink + link_db = links + else: + # make server request + self.log.debug(f"requesting links for {group_id}") + req = "/groups/" + group_id + "/links" + rsp_json = self.GET(req, use_cache=False) + links = rsp_json['links'] + link_db = {} + for link in links: + title = link['title'] + link_db[title] = link + + self.log.debug(f"_refresh_link_cache - found {len(links)} for {group_id}") + + # reset the link cache + self._link_db = link_db def _get_link_json(self, h5path): """ Return parent_uuid and json description of link for given path """ @@ -84,12 +141,17 @@ def _get_link_json(self, h5path): # asking for the root, just return the root link return parent_uuid, tgt_json else: - if in_group and h5path in self._link_db: - # link belonging to this group, see if it's in the cache - tgt_json = self._link_db[h5path] - parent_uuid = self.id.id + if in_group: + self._refresh_link_cache() + if h5path in self._link_db: + # link belonging to this group, return cache itm + tgt_json = self._link_db[h5path] + parent_uuid = self.id.id - return parent_uuid, tgt_json + return parent_uuid, tgt_json + else: + self.log.info(f"{h5path} not found") + raise KeyError("Unable to open object (Component not found)") path = h5path.split('/') @@ -126,7 +188,7 @@ def _get_link_json(self, h5path): group_uuid = tgt_json["id"] if tgt_json: - # mix in a "collection key for compatibilty wtth server GET links request + # mix in a "collection key for compatibility with server GET links request if group_uuid and group_uuid.startswith("g-"): tgt_json['collection'] = "groups" elif group_uuid and group_uuid.startswith("d-"): @@ -150,7 +212,7 @@ def _get_link_json(self, h5path): req = "/groups/" + parent_uuid + "/links/" + name try: - rsp_json = self.GET(req, params={"CreateOrder": "1" if self.track_order else "0"}) + rsp_json = self.GET(req) except IOError: raise KeyError("Unable to open object (Component not found)") @@ -158,10 +220,6 @@ def _get_link_json(self, h5path): raise IOError("Unexpected Error") tgt_json = rsp_json['link'] - if in_group: - # add to db to speed up future requests - self._link_db[name] = tgt_json - if tgt_json['class'] == 'H5L_TYPE_HARD': if tgt_json['collection'] == 'groups': parent_uuid = tgt_json['id'] @@ -170,18 +228,6 @@ def _get_link_json(self, h5path): return parent_uuid, tgt_json - def _get_objdb_links(self): - """ Return the links json from the objdb if present. 
- """ - objdb = self.id.http_conn.getObjDb() - if not objdb: - return None - if self.id.id not in objdb: - self.log.warning(f"{self.id.id} not found in objdb") - return None - group_json = objdb[self.id.id] - return group_json["links"] - def _make_group(self, parent_id=None, parent_name=None, link=None, track_order=None): """ helper function to make a group """ @@ -270,7 +316,7 @@ def create_group(self, h5path, track_order=None): parent_uuid = sub_group.id.id else: - # sub-group already exsits + # sub-group already exists self.log.debug(f"create group - found subgroup: {link}") if "link" not in rsp_json: raise IOError("Unexpected Error") @@ -499,6 +545,9 @@ def require_dataset(self, name, shape, dtype, exact=False, **kwds): shape or dtype don't match according to the above rules. """ + if isinstance(name, bytes): + name = name.decode('utf-8') + if name not in self: return self.create_dataset(name, *(shape, dtype), **kwds) @@ -526,6 +575,9 @@ def require_group(self, name): TypeError is raised if something with that name already exists that isn't a group. """ + if isinstance(name, bytes): + # convert byte input to string + name = name.decode("utf-8") if name not in self: return self.create_group(name) @@ -562,10 +614,9 @@ def getObjByUuid(self, uuid, collection_type=None, track_order=None): else: raise IOError(f"Unexpected uuid: {uuid}") objdb = self.id.http_conn.getObjDb() - if objdb and uuid in objdb and False: + if objdb and uuid in objdb: # we should be able to construct an object from objdb json obj_json = objdb[uuid] - print('fetch from db') else: # will need to get JSON from server req = f"/{collection_type}/{uuid}" @@ -722,6 +773,11 @@ def get(self, name, default=None, getclass=False, getlink=False, track_order=Non >>> if cls == SoftLink: ... print '"foo" is a soft link!' 
""" + kwd_args = ("limit", "marker", "pattern", "follow_links") + for kwd in kwds: + if kwd not in kwd_args: + raise TypeError(f"group.get() unexpected keyword argument: {kwd}") + if not (getclass or getlink): try: return self.__getitem__(name, track_order=track_order) @@ -767,8 +823,13 @@ def get(self, name, default=None, getclass=False, getlink=False, track_order=Non params["pattern"] = pattern if follow_links: params["follow_links"] = 1 + if track_order is not None: - params["CreateOrder"] = "1" if track_order else "0" + if limit or marker or pattern or follow_links: + # send server request, otherwise we'll just sort + # client side (so things will work even if we have + # the request in http cache) + params["CreateOrder"] = "1" if track_order else "0" if name: body = {} @@ -782,19 +843,37 @@ def get(self, name, default=None, getclass=False, getlink=False, track_order=Non if "links" in rsp: # Process list of link objects so they may be accessed by name - links = rsp['links'] - links_out = {} + links_out = collections.OrderedDict() + links = rsp["links"] if all([isUUID(k) for k in links]): # Multiple groups queried, links are returned under group ids for group_id in links: - group_links = {} - - for link in links[group_id]: + group_links = collections.OrderedDict() + + link_list = links[group_id] + if track_order is None: + pass # just use in the order we got from the server + elif track_order: + # sort by created key + link_list.sort(key=lambda d: d['created']) + else: + # sort by title + link_list.sort(key=lambda d: d['title']) + + for link in link_list: group_links[link["title"]] = self._objectify_link_Json(link) links_out[group_id] = group_links else: + if track_order is None: + pass # just use in the order we got from the server + elif track_order: + # sort by created key + links.sort(key=lambda d: d['created']) + else: + # sort by title + links.sort(key=lambda d: d['title']) for link in links: links_out[link["title"]] = self._objectify_link_Json(link) else: @@ -933,8 +1012,9 @@ def __setitem__(self, name, obj): arr = numpy.array(obj, dtype=dt) self.create_dataset(name, shape=arr.shape, dtype=arr.dtype, data=arr[...]) - # ds = self.create_dataset(None, data=obj, dtype=base.guess_dtype(obj)) - # h5o.link(ds.id, self.id, name, lcpl=lcpl) + if isinstance(name, str) and name.find('/') != -1: + # object in this group, update link db + self._refresh_link_cache() def __delitem__(self, name): """ Delete (unlink) an item from this group. 
""" @@ -964,66 +1044,61 @@ def __delitem__(self, name): self.DELETE(req) - for n in name: - if n.find('/') == -1 and n in self._link_db: - # remove from link cache - del self._link_db[name] + self._refresh_link_cache() def __len__(self): """ Number of members attached to this group """ - links_json = self._get_objdb_links() - # we can avoid a server request and just count the links in the obj json - if links_json: - return len(links_json) - - req = "/groups/" + self.id.uuid - params = {} - if self.track_order is not None: - params["CreateOrder"] = "1" if self.track_order else "0" - rsp_json = self.GET(req, params=params) - return rsp_json['linkCount'] + self._refresh_link_cache() + num_links = len(self._link_db) + return num_links + + def _get_link_list(self, track_order=None): + if track_order is None: + track_order = self.track_order + self._refresh_link_cache() + + # convert to a list of dicts + links = [] + for title in self._link_db: + link = self._link_db[title] + links.append(link) + + if track_order: + links.sort(key=lambda d: d['created']) + else: + links.sort(key=lambda d: d['title']) + return links def __iter__(self): """ Iterate over member names """ - links = self._get_objdb_links() - - if links is None: - req = "/groups/" + self.id.uuid + "/links" - params = {} - if self.track_order is not None: - params["CreateOrder"] = "1" if self.track_order else "0" - rsp_json = self.GET(req, params=params) - links = rsp_json['links'] - - # reset the link cache - self._link_db = {} - for link in links: - name = link["title"] - self._link_db[name] = link + links = self._get_link_list() - for x in links: - yield x['title'] - else: - if self.track_order: - links = sorted(links.items(), key=lambda x: x[1]['created']) - else: - links = sorted(links.items()) + for link in links: + yield link['title'] - ordered_links = {} - for link in links: - ordered_links[link[0]] = link[1] + def __reversed__(self): + """ Iterate over member names in reverse order """ + self._refresh_link_cache() + links = self._get_link_list() - for name in ordered_links: - yield name + for link in reversed(links): + yield link['title'] def __contains__(self, name): """ Test if a member name exists """ found = False - try: - self._get_link_json(name) - found = True - except KeyError: - pass # not found + + if name.find('/') == -1: + # a link in this group + self._refresh_link_cache() + if name in self._link_db: + found = True + else: + try: + self._get_link_json(name) + found = True + except KeyError: + pass # not found return found def copy(self, source, dest, name=None, @@ -1209,40 +1284,11 @@ def __repr__(self): r = f'' return r - def __reversed__(self): - """ Iterate over member names in reverse order """ - links = self._get_objdb_links() - - if links is None: - req = "/groups/" + self.id.uuid + "/links" - rsp_json = self.GET(req, params={"CreateOrder": "1" if self.track_order else "0"}) - links = rsp_json['links'] - - # reset the link cache - self._link_db = {} - for link in links: - name = link["title"] - self._link_db[name] = link - - for x in reversed(links): - yield x['title'] - else: - if self.track_order: - links = sorted(links.items(), key=lambda x: x[1]['created']) - else: - links = sorted(links.items()) - - ordered_links = {} - for link in links: - ordered_links[link[0]] = link[1] - - for name in reversed(ordered_links): - yield name - def refresh(self): """Refresh the group metadata by reloading from the file. 
""" self.id.refresh() + self._refresh_link_cache(force=True) class HardLink(object): diff --git a/h5pyd/_hl/httpconn.py b/h5pyd/_hl/httpconn.py index 8d55d6d2..8b981ff9 100644 --- a/h5pyd/_hl/httpconn.py +++ b/h5pyd/_hl/httpconn.py @@ -445,7 +445,6 @@ def GET(self, req, format="json", params=None, headers=None, use_cache=True): self.log.debug("httpcon - returning cache result") rsp = self._cache[req] return rsp - self.log.info(f"GET: {self._endpoint + req} [{params['domain']}] timeout: {self._timeout}") for k in params: diff --git a/test/hl/test_file.py b/test/hl/test_file.py index 979e94de..3e67a6d3 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -268,7 +268,7 @@ def test_auth(self): if h5py.__name__ == "h5py": return # no ACLs in h5py - + # no explicit ACLs yet file_acls = f.getACLs() self.assertTrue(len(file_acls) >= 1) # Should have at least the test_user1 acl diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 7be32be4..907465f6 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -30,7 +30,6 @@ def test_create(self): f = h5py.File(filename, 'w') self.assertTrue('/' in f) r = f['/'] - self.assertEqual(len(r), 0) self.assertTrue(isinstance(r, h5py.Group)) self.assertTrue(r.name, '/') @@ -175,7 +174,6 @@ def test_create(self): # Check group's last modified time if h5py.__name__ == "h5pyd": self.assertTrue(isinstance(g1.modified, datetime)) - print("modified", type(g1.modified)) # try creating an anon group anon_group = g1.create_group(None) @@ -236,14 +234,14 @@ def test_nested_create(self): def test_external_links(self): # create a file for use a link target linked_filename = self.getFileName("linked_file") - + f = h5py.File(linked_filename, 'w') abs_filepath = os.path.abspath(linked_filename) if h5py.__name__ == "h5pyd": rel_filepath = "linked_file.h5" else: rel_filepath = os.path.relpath(linked_filename) - + g1 = f.create_group("g1") dset = g1.create_dataset('ds', (5, 7), dtype='f4') dset_id = dset.id.id @@ -314,7 +312,7 @@ def test_link_multi_removal(self): return # multilink is for h5pyd only filename = self.getFileName("test_link_multi_removal") print(f"filename: {filename}") - + f = h5py.File(filename, 'w') g1 = f.create_group("g1") g1_clone = f["g1"] @@ -578,6 +576,12 @@ def test_track_order(self): self.assertEqual(title, self.titles[i]) i += 1 + # test with get and track_order=False + links = g.get(None, getlink=True, track_order=False) + ref = sorted(self.titles) + self.assertEqual(list(links), ref) + self.assertEqual(list(links), ref) + # re-opening the file should retain the track_order setting with h5py.File(filename) as f: g = f['order'] @@ -631,12 +635,12 @@ def test_no_track_order(self): self.assertEqual(list(reversed(g)), list(reversed(ref))) def test_get_dataset_track_order(self): - + filename = self.getFileName("test_get_dataset_track_order") print(f"filename: {filename}") if h5py.__name__ == "h5py": return # h5py does not support track_order on group.get() - + with h5py.File(filename, 'w') as f: g = f.create_group('order') dset = g.create_dataset('dset', (10,), dtype='i4') @@ -658,7 +662,7 @@ def test_get_group_track_order(self): print(f"filename: {filename}") if h5py.__name__ == "h5py": return # h5py does not support track_order on group.get() - + with h5py.File(filename, 'w') as f: g = f.create_group('order') g._track_order = True @@ -676,6 +680,7 @@ def test_get_group_track_order(self): subg2 = g.get('subgroup', track_order=False) self.assertEqual(list(subg2), sorted(self.titles)) + if __name__ == '__main__': loglevel = 
logging.ERROR logging.basicConfig(format='%(asctime)s %(message)s', level=loglevel) From c1bc51bd18faf9c4c714ad955d644d615740862e Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 9 Jan 2025 15:42:43 +0800 Subject: [PATCH 14/32] fix test for h5py compat --- test/hl/test_group.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 907465f6..581af8a7 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -576,11 +576,12 @@ def test_track_order(self): self.assertEqual(title, self.titles[i]) i += 1 - # test with get and track_order=False - links = g.get(None, getlink=True, track_order=False) - ref = sorted(self.titles) - self.assertEqual(list(links), ref) - self.assertEqual(list(links), ref) + if h5py.__name__ == "h5pyd": + # test with get and track_order=False + links = g.get(None, getlink=True, track_order=False) + ref = sorted(self.titles) + self.assertEqual(list(links), ref) + self.assertEqual(list(links), ref) # re-opening the file should retain the track_order setting with h5py.File(filename) as f: From 4bb9f4c4a2fd77c3b4b322e45013fbd83c96df3e Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 20 Jan 2025 13:49:43 +0800 Subject: [PATCH 15/32] first pass at objdb class --- h5pyd/_hl/attrs.py | 431 +++++----------- h5pyd/_hl/base.py | 290 +---------- h5pyd/_hl/dataset.py | 263 ++++------ h5pyd/_hl/datatype.py | 5 +- h5pyd/_hl/dims.py | 219 +++----- h5pyd/_hl/files.py | 78 +-- h5pyd/_hl/filters.py | 1 - h5pyd/_hl/group.py | 1029 +++++++++++-------------------------- h5pyd/_hl/httpconn.py | 270 +++++----- h5pyd/_hl/objdb.py | 406 +++++++++++++++ h5pyd/_hl/objectid.py | 413 +++++++++++---- h5pyd/_hl/selections.py | 4 +- h5pyd/_hl/table.py | 29 +- test/hl/test_attribute.py | 171 ------ test/hl/test_file.py | 3 +- test/hl/test_group.py | 258 +--------- test/hl/test_table.py | 1 + 17 files changed, 1579 insertions(+), 2292 deletions(-) create mode 100644 h5pyd/_hl/objdb.py diff --git a/h5pyd/_hl/attrs.py b/h5pyd/_hl/attrs.py index c1134f94..fcaaae3f 100644 --- a/h5pyd/_hl/attrs.py +++ b/h5pyd/_hl/attrs.py @@ -25,7 +25,6 @@ from . import base from .base import jsonToArray, Empty from .datatype import Datatype -from .objectid import GroupID, DatasetID, TypeID from .h5type import getTypeItem, createDataType, special_dtype, Reference @@ -50,29 +49,18 @@ class AttributeManager(base.MutableMappingHDF5, base.CommonStateObject): shape, use create(). """ - def __init__(self, parent): + def __init__(self, parent, track_order=None): """ Private constructor. """ self._parent = parent + self._track_order = track_order - if isinstance(parent.id, GroupID): - self._req_prefix = "/groups/" + parent.id.uuid + "/attributes/" - elif isinstance(parent.id, TypeID): - self._req_prefix = "/datatypes/" + parent.id.uuid + "/attributes/" - elif isinstance(parent.id, DatasetID): - self._req_prefix = "/datasets/" + parent.id.uuid + "/attributes/" + @property + def track_order(self): + if self._track_order is None: + return self._parent.track_order else: - # "unknown id" - self._req_prefix = "" - objid = self._parent.id.uuid - objdb = self._parent.id.http_conn.getObjDb() - if objdb and objid in objdb: - # _objdb is meta-data pulled from the domain on open. 
- # use the link json from there if present - obj_json = objdb[objid] - self._objdb_attributes = obj_json["attributes"] - else: - self._objdb_attributes = None + return self._track_order def _bytesArrayToList(self, data): """ @@ -114,28 +102,35 @@ def __getitem__(self, name): if isinstance(name, bytes): name = name.decode("utf-8") - if self._objdb_attributes is not None: - if name not in self._objdb_attributes: - raise KeyError - attr_json = self._objdb_attributes[name] - else: - req = self._req_prefix + name - try: - attr_json = self._parent.GET(req) - except IOError: - raise KeyError + attr_json = self._parent.get_attr(name) shape_json = attr_json['shape'] type_json = attr_json['type'] dtype = createDataType(type_json) - if shape_json['class'] == 'H5S_NULL': - return Empty(dtype) - value_json = attr_json['value'] - if 'dims' in shape_json: - shape = shape_json['dims'] - else: - shape = () + # The shape_json may actually be the shape value we passed + # to the server on PUT attributes rather than the GET response. + # Finagle the code here to do the right thing in both cases. + # TBD: Update HSDS to accept the the shape_json as shape + # parameter so this can be avoided + + if isinstance(shape_json, str): + # H5S_NULL should be the only possible value + if shape_json == 'H5S_NULL': + return Empty(dtype) + else: + raise TypeError(f"unexpected attr shape: {shape_json}") + elif isinstance(shape_json, tuple): + shape = shape_json + elif isinstance(shape_json, dict): + if shape_json['class'] == 'H5S_NULL': + return Empty(dtype) + if 'dims' in shape_json: + shape = shape_json['dims'] + else: + shape = () + + value_json = attr_json['value'] # Do this first, as we'll be fiddling with the dtype for top-level # array types @@ -158,71 +153,12 @@ def __getitem__(self, name): try: v.encode("utf-8") except UnicodeEncodeError: - self._parent.log.debug("converting utf8 unencodable string as bytes") + # converting utf8 unencodable string as bytes v = v.encode("utf-8", errors="surrogateescape") return v return arr - def get_attributes(self, names=None, pattern=None, limit=None, marker=None): - """ - Get all attributes or a subset of attributes from the target object. - If 'use_cache' is True, use the objdb cache if available. - The cache cannot be used with pattern, limit, or marker parameters. - - if 'pattern' is provided, retrieve all attributes with names that match the pattern - according to Unix pathname pattern expansion rules. - - if 'limit' is provided, retrieve at most 'limit' attributes. 
- - if 'marker' is provided, retrieve attributes whose names occur after the name 'marker' in the target object - """ - if names and (pattern or limit or marker): - raise ValueError("names cannot be used with pattern, limit or marker") - - if self._objdb_attributes is not None: - # use the objdb cache - out = {} - for a in self._objdb_attributes: - name = a['name'] - out[name] = self._objdb_attributes[name] - return out - - # Omit trailing slash - req = self._req_prefix[:-1] - - body = {} - params = {"IncludeData": 1} - - if pattern: - params["pattern"] = pattern - if limit: - params["Limit"] = limit - if marker: - params["Marker"] = marker - - if names: - if isinstance(names, list): - names = [name.decode('utf-8') if isinstance(name, bytes) else name for name in names] - else: - if isinstance(names, bytes): - names = names.decode("utf-8") - names = [names] - - body['attr_names'] = names - - if body: - rsp = self._parent.POST(req, body=body, params=params) - else: - rsp = self._parent.GET(req, params=params) - - attrs_json = rsp['attributes'] - names = [attr['name'] for attr in attrs_json] - values = [attr['value'] for attr in attrs_json] - out = {} - - for i in range(len(names)): - out[names[i]] = values[i] - - return out - def __setitem__(self, name, value): """ Set a new attribute, overwriting any existing attribute. @@ -230,29 +166,18 @@ def __setitem__(self, name, value): use a specific type or shape, or to preserve the type of an attribute, use the methods create() and modify(). """ - self.create(name, values=value, dtype=base.guess_dtype(value)) + self.create(name, value, dtype=base.guess_dtype(value)) def __delitem__(self, name): """ Delete an attribute (which must already exist). """ - params = {} + self._parent.del_attr(name) - if isinstance(name, list): - names = [name.decode('utf-8') if isinstance(name, bytes) else name for name in name] - # Omit trailing slash - req = self._req_prefix[:-1] - params["attr_names"] = "/".join(names) - else: - if isinstance(name, bytes): - name = name.decode("utf-8") - req = self._req_prefix + name - self._parent.DELETE(req, params=params) - - def create(self, names, values, shape=None, dtype=None): + def create(self, name, value, shape=None, dtype=None): """ Create new attribute(s), overwriting any existing attributes. - names + name Name of the new attribute or list of names (required) - values + value Array to initialize the attribute or list of arrays (required) shape Shape of the attribute. Overrides data.shape if both are @@ -261,149 +186,93 @@ def create(self, names, values, shape=None, dtype=None): Data type of the attribute. Overrides data.dtype if both are given. """ - self._parent.log.info(f"attrs.create({names})") - # Standardize single attribute arguments to lists - if not isinstance(names, list): - names = [names] - values = [values] + # First, make sure we have a NumPy array. We leave the data + # type conversion for HDF5 to perform. 
+ if isinstance(value, Reference): + dtype = special_dtype(ref=Reference) + if not isinstance(value, Empty): + value = numpy.asarray(value, dtype=dtype, order='C') - # Do not permit duplicate names - if len(names) != len(set(names)): - raise ValueError("Duplicate attribute names are not allowed") + if shape is None and not isinstance(value, Empty): + shape = value.shape - if shape is not None and not isinstance(shape, list): - shapes = [shape] - elif shape is None: - shapes = [None] * len(names) - else: - # Given shape is already a list of shapes - shapes = shape + use_htype = None # If a committed type is given, we must use it in h5a.create. + + if isinstance(dtype, Datatype): + use_htype = dtype.id + dtype = dtype.dtype - if dtype is not None and not isinstance(dtype, list): - dtypes = [dtype] + # Special case if data are complex numbers + is_complex = (value.dtype.kind == 'c') and (dtype.names is None) or ( + dtype.names != ('r', 'i')) or ( + any(dt.kind != 'f' for dt, off in dtype.fields.values())) or ( + dtype.fields['r'][0] == dtype.fields['i'][0]) + + if is_complex: + raise TypeError(f'Wrong committed datatype for complex numbers: {dtype.name}') elif dtype is None: - dtypes = [None] * len(names) - else: - # Given dtype is already a list of dtypes - dtypes = dtype - - type_jsons = [None] * len(names) - - if (len(names) != len(values)) or (shapes is not None and len(shapes) != len(values)) or\ - (dtypes is not None and len(dtypes) != len(values)): - raise ValueError("provided names, values, shapes and dtypes must have the same length") - - for i in range(len(names)): - # First, make sure we have a NumPy array. We leave the data - # type conversion for HDF5 to perform. - if isinstance(values[i], Reference): - dtypes[i] = special_dtype(ref=Reference) - if not isinstance(values[i], Empty): - values[i] = numpy.asarray(values[i], dtype=dtypes[i], order='C') - - if shapes[i] is None and not isinstance(values[i], Empty): - shapes[i] = values[i].shape - - use_htype = None # If a committed type is given, we must use it in h5a.create. - - if isinstance(dtypes[i], Datatype): - use_htype = dtypes[i].id - dtypes[i] = dtypes[i].dtype - - # Special case if data are complex numbers - is_complex = (values[i].dtype.kind == 'c') and (dtypes[i].names is None) or ( - dtypes[i].names != ('r', 'i')) or ( - any(dt.kind != 'f' for dt, off in dtypes[i].fields.values())) or ( - dtypes[i].fields['r'][0] == dtypes[i].fields['i'][0]) - - if is_complex: - raise TypeError( - f'Wrong committed datatype for complex numbers: {dtypes[i].name}') - elif dtypes[i] is None: - if values[i].dtype.kind == 'U': - # use vlen for unicode strings - dtypes[i] = special_dtype(vlen=str) - else: - dtypes[i] = values[i].dtype + if value.dtype.kind == 'U': + # use vlen for unicode strings + dtype = special_dtype(vlen=str) else: - dtypes[i] = numpy.dtype(dtypes[i]) # In case a string, e.g. 'i8' is passed - - # Where a top-level array type is requested, we have to do some - # fiddling around to present the data as a smaller array of - # subarrays. - if not isinstance(values[i], Empty): - if dtypes[i].subdtype is not None: + dtype = value.dtype + else: + dtype = numpy.dtype(dtype) # In case a string, e.g. 'i8' is passed - subdtype, subshape = dtypes[i].subdtype + # Where a top-level array type is requested, we have to do some + # fiddling around to present the data as a smaller array of + # subarrays. 
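        # (illustration, not from the patch: with dtype=numpy.dtype('(3,)f4') and a
        #  value of shape (2, 3), the trailing (3,) must match the subarray shape;
        #  the branch below then advertises the attribute as shape (2,) with the
        #  scalar float32 element type.)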
+ if not isinstance(value, Empty): + if dtype.subdtype is not None: + subdtype, subshape = dtype.subdtype - # Make sure the subshape matches the last N axes' sizes. - if shapes[i][-len(subshape):] != subshape: - raise ValueError(f"Array dtype shape {subshape} is incompatible with data shape {shapes[i]}") + # Make sure the subshape matches the last N axes' sizes. + if shape[-len(subshape):] != subshape: + raise ValueError(f"Array dtype shape {subshape} is incompatible with data shape {shape}") - # New "advertised" shape and dtype - shapes[i] = shapes[i][0:len(shapes[i]) - len(subshape)] - dtypes[i] = subdtype + # New "advertised" shape and dtype + shape = shape[0:len(shape) - len(subshape)] + dtype = subdtype - # Not an array type; make sure to check the number of elements - # is compatible, and reshape if needed. - else: - if numpy.prod(shapes[i]) != numpy.prod(values[i].shape): - raise ValueError("Shape of new attribute conflicts with shape of data") + # Not an array type; make sure to check the number of elements + # is compatible, and reshape if needed. + else: + if numpy.prod(shape) != numpy.prod(value.shape): + raise ValueError("Shape of new attribute conflicts with shape of data") - if shapes[i] != values[i].shape: - values[i] = values[i].reshape(shapes[i]) + if shape != value.shape: + value = value.reshape(shape) # We need this to handle special string types. - values[i] = numpy.asarray(values[i], dtype=dtypes[i]) + value = numpy.asarray(value, dtype=dtype) - # Make HDF5 datatype and dataspace for the H5A calls - if use_htype is None: - type_jsons[i] = getTypeItem(dtypes[i]) - self._parent.log.debug(f"attrs.create type_json: {format(type_jsons[i])}") + # Make HDF5 datatype and dataspace for the H5A calls + if use_htype is None: + type_json = getTypeItem(dtype) params = {} - body = {} params['replace'] = 1 - attributes = {} - - for i in range(len(names)): - attr = {} - attr['type'] = type_jsons[i] - if isinstance(values[i], Empty): - attr['shape'] = 'H5S_NULL' - else: - attr['shape'] = shapes[i] - if values[i].dtype.kind != 'c': - attr['value'] = self._bytesArrayToList(values[i]) - else: - # Special case: complex numbers - special_dt = createDataType(type_jsons[i]) - tmp = numpy.empty(shape=values[i].shape, dtype=special_dt) - tmp['r'] = values[i].real - tmp['i'] = values[i].imag - attr['value'] = json.loads(json.dumps(tmp.tolist())) - attributes[names[i]] = attr - - if len(names) > 1: - # Create multiple attributes - # Omit trailing slash - req = self._req_prefix[:-1] - body['attributes'] = attributes + attr = {} + attr['type'] = type_json + if isinstance(value, Empty): + attr['shape'] = 'H5S_NULL' else: - # Create single attribute - req = self._req_prefix + names[0] - for key in attributes[names[0]]: - body[key] = attributes[names[0]][key] + attr['shape'] = shape + if value.dtype.kind != 'c': + attr['value'] = self._bytesArrayToList(value) + else: + # Special case: complex numbers + special_dt = createDataType(type_json) + tmp = numpy.empty(shape=value.shape, dtype=special_dt) + tmp['r'] = value.real + tmp['i'] = value.imag + attr['value'] = json.loads(json.dumps(tmp.tolist())) - try: - self._parent.PUT(req, body=body, params=params) - except RuntimeError: - # 'replace' parameter is used, so failure is not due to attribute already existing - raise RuntimeError("Failued to create attribute(s)") + self._parent.set_attr(name, attr) def modify(self, name, value): """ Change the value of an attribute while preserving its type. 
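# Illustrative usage of the rewritten AttributeManager.create() (not part of the
# patch).  The domain path is hypothetical; Empty is imported from _hl.base as in
# this module.
import numpy as np
import h5pyd
from h5pyd._hl.base import Empty

with h5pyd.File("/home/test/attr_demo.h5", "a") as f:
    dset = f.require_dataset("values", shape=(10,), dtype="f4")
    dset.attrs.create("offset", 42, dtype="i8")           # scalar, explicit dtype
    dset.attrs.create("lut", np.arange(6), shape=(2, 3))  # value is reshaped to fit
    dset.attrs.create("placeholder", Empty("f4"))         # null dataspace (H5S_NULL)
    dset.attrs["title"] = "calibration block"             # dtype guessed (vlen str)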
@@ -437,97 +306,33 @@ def modify(self, name, value): def __len__(self): """ Number of attributes attached to the object. """ - - if self._objdb_attributes is not None: - count = len(self._objdb_attributes) - else: - # make a server requests - req = self._req_prefix - # backup over the '/attributes/' part of the req - req = req[:-(len('/attributes/'))] - rsp = self._parent.GET(req) # get parent obj - count = rsp['attributeCount'] - return count - - def __iter__(self): - """ Iterate over the names of attributes. """ - if self._objdb_attributes is not None: - if self._parent.track_order: - attrs = sorted(self._objdb_attributes.items(), key=lambda x: x[1]['created']) - else: - attrs = sorted(self._objdb_attributes.items()) - - ordered_attrs = {} - for a in attrs: - ordered_attrs[a[0]] = a[1] - - for name in ordered_attrs: - yield name - - else: - # make server request - req = self._req_prefix - # backup over the trailing slash in req - req = req[:-1] - rsp = self._parent.GET(req, params={"CreateOrder": "1" if self._parent.track_order else "0"}) - attributes = rsp['attributes'] - - attrlist = [] - for attr in attributes: - attrlist.append(attr['name']) - - for name in attrlist: - yield name + return self._parent.attr_count def __contains__(self, name): """ Determine if an attribute exists, by name. """ - exists = True if isinstance(name, bytes): name = name.decode("utf-8") - if self._objdb_attributes is not None: - exists = name in self._objdb_attributes + if self._parent.has_attr(name): + return True else: - # make server request - req = self._req_prefix + name - try: - self._parent.GET(req) - except IOError: - # todo - verify this is a 404 response - exists = False - return exists + return False def __repr__(self): if not self._parent.id.id: return "" - return f"" + return f"" + + def __iter__(self): + """ Iterate over the names of attributes. """ + # convert to a list of dicts + names = self._parent.get_attr_names(track_order=self.track_order) + for name in names: + yield name def __reversed__(self): """ Iterate over the names of attributes in reverse order. 
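# Illustrative sketch (not part of the patch): attribute iteration order with the
# new track_order handling.  The domain path is hypothetical; the ordering assumes
# get_attr_names() sorts by creation time when track_order is set.
import h5pyd

with h5pyd.File("/home/test/attr_order_demo.h5", "w") as f:
    grp = f.create_group("g1", track_order=True)
    for name in ("zeta", "alpha", "mid"):
        grp.attrs[name] = 1
    print(list(grp.attrs))             # expected creation order: zeta, alpha, mid
    print(list(reversed(grp.attrs)))   # expected: mid, alpha, zeta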
""" - if self._objdb_attributes is not None: - if self._parent.track_order: - attrs = sorted(self._objdb_attributes.items(), key=lambda x: x[1]['created']) - else: - attrs = sorted(self._objdb_attributes.items()) - - ordered_attrs = {} - for a in attrs: - ordered_attrs[a[0]] = a[1] - - for name in reversed(ordered_attrs): - yield name - - else: - # make server request - req = self._req_prefix - # backup over the trailing slash in req - req = req[:-1] - rsp = self._parent.GET(req, params={"CreateOrder": "1" if self._parent.track_order else "0"}) - attributes = rsp['attributes'] - - attrlist = [] - for attr in attributes: - attrlist.append(attr['name']) - - for name in reversed(attrlist): - yield name + names = self._parent.get_attr_names(track_order=self.track_order) + for name in reversed(names): + yield name + # done diff --git a/h5pyd/_hl/base.py b/h5pyd/_hl/base.py index 42f99c66..b6c1fac1 100644 --- a/h5pyd/_hl/base.py +++ b/h5pyd/_hl/base.py @@ -15,14 +15,13 @@ import posixpath import os import sys -import json import numpy as np import logging import logging.handlers from collections.abc import ( Mapping, MutableMapping, KeysView, ValuesView, ItemsView ) -from .objectid import GroupID +from .objectid import GroupID, ObjectID from .h5type import Reference, check_dtype, special_dtype numpy_integer_types = (np.int8, np.uint8, np.int16, np.int16, np.int32, np.uint32, np.int64, np.uint64) @@ -797,109 +796,14 @@ def file(self): from .files import File http_conn = self._id.http_conn root_uuid = http_conn.root_uuid - # construct a group json, so we don't need to do a request - group_json = {} - group_json["root"] = root_uuid - group_json["id"] = root_uuid - group_json["domain"] = http_conn.domain - group_json["created"] = http_conn.created - group_json["lastModified"] = http_conn.modified - - groupid = GroupID(None, group_json, http_conn=http_conn) + groupid = GroupID(root_uuid, http_conn=http_conn) return File(groupid) - def _getNameFromObjDb(self): - objdb = self._id._http_conn.getObjDb() - - if not objdb: - return None - - root_uuid = self._id.http_conn.root_uuid - objid = self._id.uuid - self.log.debug(f"_getNameFromObjDb: find name for: {objid}") - objids = set() - objids.add(objid) - h5path = "" - while not h5path.startswith("/"): - found_link = False - for id in objdb: - if id == objid: - self.log.debug(f"_getNameFromObjDb - skipping id {id} - obj cannot link to itself") - continue - self.log.debug(f"_getNameFromObjDb - searching id: {id}") - if not id.startswith("g-"): - continue # not a group, so no links - if id in objids: - continue # we've been here already - obj = objdb[id] - links = obj["links"] - for title in links: - self.log.debug(f"_getNameFromObjDb - looking at linK: {title}") - link = links[title] - link_class = link["class"] - if link_class != 'H5L_TYPE_HARD': - self.log.debug(f"_getNameFromObjDb - skipping link type: {link_class}") - continue - if link["id"] == objid: - # found a link to our target - found_link = True - if not h5path: - h5path = title - else: - h5path = title + '/' + h5path - self.log.debug(f"_getNameFromObjDb - update h5path: {h5path}") - objids.add(id) - if id == root_uuid: - h5path = '/' + h5path # we got to root - self.log.debug("_getNameFromObjDb - found root") - else: - objid = id - self.log.debug(f"_getNameFromObjDb - now looking for link to: {objid}") - break - if not found_link: - self.log.info("_getNameFromObjDb - could not find link") - break - if h5path.startswith("/"): - # found path to obj - self.log.debug(f"_getNameFromObjDb - returning: 
{h5path}") - return h5path - else: - self.log.debug("_getNameFromObjDb - could not find path") - return None - @property def name(self): """ Return the full name of this object. None if anonymous. """ - try: - obj_name = self._name - except AttributeError: - # name hasn't been assigned yet - obj_name = self._getNameFromObjDb() # pull from the objdb if present - if obj_name: - self._name = obj_name # save this - if not obj_name: - # query the server for the name - self.log.debug(f"querying server for name to: {self._id.id}") - req = None - if self._id.id.startswith("g-"): - req = "/groups/" + self._id.id - elif self._id.id.startswith("d-"): - req = "/datasets/" + self._id.id - elif self._id.id.startswith("t-"): - req = "/datatypes/" + self._id - if req: - params = params = {"getalias": 1} - self.log.info(f"sending get alias request for id: {self._id.id}") - obj_json = self.GET(req, params, use_cache=False) - if "alias" in obj_json: - alias = obj_json["alias"] - if len(alias) > 0: - obj_name = alias[0] - self._name = obj_name - - return obj_name - # return self._d(h5i.get_name(self.id)) + return self._name @property def parent(self): @@ -921,7 +825,6 @@ def id(self): def ref(self): """ An (opaque) HDF5 reference to this object """ return Reference(self) - # return h5r.create(self.id, b'.', h5r.OBJECT) @property def regionref(self): @@ -941,148 +844,33 @@ def regionref(self): def attrs(self): """ Attributes attached to this object """ from . import attrs - return attrs.AttributeManager(self) + return attrs.AttributeManager(self.id, track_order=self.track_order) @property def modified(self): """Last modified time as a datetime object""" - return self.id._modified + return self.id.modified @property def track_order(self): - return self._track_order - - def verifyCert(self): - # default to validate CERT for https requests, unless - # the H5PYD_VERIFY_CERT environment variable is set and True - # - # TBD: set default to True once the signing authority of data.hdfgroup.org is - # recognized - if "H5PYD_VERIFY_CERT" in os.environ: - verify_cert = os.environ["H5PYD_VERIFY_CERT"].upper() - if verify_cert.startswith('F'): - return False - return True - - def GET(self, req, params=None, use_cache=True, format="json"): - if self.id.http_conn is None: - raise IOError("object not initialized") - # This should be the default - but explictly set anyway - headers = {"Accept-Encoding": "deflate, gzip"} - - rsp = self.id._http_conn.GET(req, params=params, headers=headers, format=format, use_cache=use_cache) - if rsp.status_code != 200: - self.log.info(f"Got response: {rsp.status_code}") - raise IOError(rsp.status_code, rsp.reason) - if 'Content-Type' in rsp.headers and rsp.headers['Content-Type'] == "application/octet-stream": - if 'Content-Length' in rsp.headers: - # not available when http compression is used - self.log.debug("returning binary content, length: " + rsp.headers['Content-Length']) - else: - self.log.debug("returning binary content - length unknown") - HTTP_CHUNK_SIZE = 4096 - http_chunks = [] - downloaded_bytes = 0 - for http_chunk in rsp.iter_content(chunk_size=HTTP_CHUNK_SIZE): - if http_chunk: # filter out keep alive chunks - self.log.debug(f"got http_chunk - {len(http_chunk)} bytes") - downloaded_bytes += len(http_chunk) - http_chunks.append(http_chunk) - if len(http_chunks) == 0: - raise IOError("no data returned") - if len(http_chunks) == 1: - # can return first and only chunk as response - rsp_content = http_chunks[0] - else: - msg = f"retrieved {len(http_chunks)} http_chunks " - msg += 
f" {downloaded_bytes} total bytes" - self.log.info(msg) - rsp_content = bytearray(downloaded_bytes) - index = 0 - for http_chunk in http_chunks: - rsp_content[index:(index + len(http_chunk))] = http_chunk - index += len(http_chunk) - return rsp_content - else: - # assume JSON - rsp_json = json.loads(rsp.text) - self.log.debug(f"rsp_json - {len(rsp.text)} bytes") - return rsp_json - - def PUT(self, req, body=None, params=None, format="json", replace=False): - if self.id.http_conn is None: - raise IOError("object not initialized") - - # try to do a PUT to the domain - rsp = self._id._http_conn.PUT(req, body=body, params=params, format=format) - self.log.info(f"PUT rsp status_code: {rsp.status_code}") - - if rsp.status_code not in (200, 201, 204): - if rsp.status_code == 409: - # Conflict error - if replace: - self.log.info(f"replacing resource: {req}") - rsp = self.id._http_conn.DELETE(req) - if rsp.status_code != 200: - raise IOError(rsp.reason) - rsp = self._id._http_conn.PUT(req, body=body, params=params, format=format) - if rsp.status_code not in (200, 201): - raise IOError(rsp.reason) - else: - raise RuntimeError(rsp.reason) - else: - raise IOError(f"{rsp.reason}:{rsp.status_code}") - - if rsp.text: - rsp_json = json.loads(rsp.text) - return rsp_json - - def POST(self, req, body=None, params=None, format="json"): - if self.id.http_conn is None: - raise IOError("object not initialized") + track_order = self._track_order + if track_order is None and self.id.cpl.get('CreateOrder'): + track_order = True + return track_order - # try to do a POST to the domain - - self.log.info(f"POST: {req} [{self.id.domain}]") - - rsp = self.id._http_conn.POST(req, body=body, params=params, format=format) - if rsp.status_code == 409: - raise ValueError("name already exists") - if rsp.status_code not in (200, 201): - self.log.error(f"POST error - status_code: {rsp.status_code}, reason: {rsp.reason}") - raise IOError(rsp.reason) - - if 'Content-Type' in rsp.headers and rsp.headers['Content-Type'] == "application/octet-stream": - if 'Content-Length' in rsp.headers: - # not available when http compression is used - self.log.info("returning binary content, length: " + rsp.headers['Content-Length']) - else: - self.log.info("returning binary compressed content") - return rsp.content - else: - # assume JSON - rsp_json = json.loads(rsp.text) - return rsp_json - - def DELETE(self, req, params=None): - if self.id.http_conn is None: - raise IOError("object not initialized") - - # try to do a DELETE of the resource - - self.log.info(f"DEL: {req} [{self.id.domain}]") - rsp = self.id._http_conn.DELETE(req, params=params) - # self.log.info("RSP: " + str(rsp.status_code) + ':' + rsp.text) - if rsp.status_code != 200: - raise IOError(rsp.reason) - - def __init__(self, oid, file=None, track_order=None): + def __init__(self, oid, track_order=None): """ Setup this object, given its low-level identifier """ + if not isinstance(oid, ObjectID): + raise TypeError(f"unexpected type for HLObject.__init__: {type(oid)}") self._id = oid self.log = self._id.http_conn.logging - self.req_prefix = None # derived class should set this to the URI of the object - self._file = file - # self._name = None + if self.id.uuid == self.id.http_conn.root_uuid: + # set the name as the root group + self._name = "/" + else: + # allow super-class to set the name based on how this + # object was instantiated + self._name = None if not self.log.handlers: # setup logging @@ -1096,23 +884,7 @@ def __init__(self, oid, file=None, track_order=None): else: pass - 
if track_order is None: - # set order based on group creation props - obj_json = self.id.obj_json - if "creationProperties" in obj_json: - cpl = obj_json["creationProperties"] - else: - cpl = {} - if "CreateOrder" in cpl: - createOrder = cpl["CreateOrder"] - if not createOrder or createOrder == "0": - self._track_order = False - else: - self._track_order = True - else: - self._track_order = False - else: - self._track_order = track_order + self._track_order = track_order def __hash__(self): return hash(self.id.id) @@ -1128,28 +900,6 @@ def __ne__(self, other): def __bool__(self): return bool(self.id) - def getACL(self, username): - req = self._req_prefix + '/acls/' + username - rsp_json = self.GET(req) - acl_json = rsp_json["acl"] - return acl_json - - def getACLs(self): - req = self._req_prefix + '/acls' - rsp_json = self.GET(req) - acls_json = rsp_json["acls"] - return acls_json - - def putACL(self, acl): - if "userName" not in acl: - raise IOError("ACL has no 'userName' key") - perm = {} - for k in ("create", "read", "update", "delete", "readACL", "updateACL"): - perm[k] = acl[k] - - req = self._req_prefix + '/acls/' + acl['userName'] - self.PUT(req, body=perm) - # --- Dictionary-style interface ---------------------------------------------- diff --git a/h5pyd/_hl/dataset.py b/h5pyd/_hl/dataset.py index b08ddffc..284bad33 100644 --- a/h5pyd/_hl/dataset.py +++ b/h5pyd/_hl/dataset.py @@ -14,7 +14,6 @@ import posixpath as pp from copy import copy -import sys import time import numpy import os @@ -30,7 +29,7 @@ from . import filters from . import selections as sel from .datatype import Datatype -from .h5type import getTypeItem, createDataType, check_dtype, special_dtype, getItemSize +from .h5type import getTypeItem, check_dtype, special_dtype, getItemSize from .. import config _LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10)) @@ -67,6 +66,7 @@ def readtime_dtype(basetype, names): def make_new_dset( parent, + name=None, shape=None, dtype=None, data=None, @@ -88,8 +88,9 @@ def make_new_dset( Only creates anonymous datasets. 
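# Illustrative usage sketch (not part of the patch): typical keyword arguments that
# end up in the dataset creation property list built below.  The domain path is
# hypothetical.
import numpy as np
import h5pyd

with h5pyd.File("/home/test/dset_demo.h5", "w") as f:
    dset = f.create_dataset(
        "grid",
        data=np.zeros((100, 100), dtype="f4"),
        chunks=(50, 50),         # explicit chunk shape
        maxshape=(None, 100),    # None -> extendable (sent to the server as 0)
        compression="gzip",
        compression_opts=4,
        fillvalue=-999.0,        # stored as fillValue in the dcpl
    )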
""" - # fill in fields for the body of the POST request as we got - body = {} + if name and name.find('/') != -1: + raise ValueError("name cannot be a path") + cfg = config.get_config() # Convert data to a C-contiguous ndarray @@ -112,11 +113,6 @@ def make_new_dset( ): raise ValueError("Shape tuple is incompatible with data") - if shape is None: - body["shape"] = "H5S_NULL" - else: - body["shape"] = shape - if track_times is not None: if track_times not in (True, False): raise TypeError("invalid track_times") @@ -183,7 +179,6 @@ def make_new_dset( raise ValueError(errmsg) else: type_json = getTypeItem(dtype) - body["type"] = type_json layout = None if chunks is not None and isinstance(chunks, dict): @@ -253,45 +248,40 @@ def make_new_dset( dcpl["fillValue"] = fillvalue if track_order or cfg.track_order: - dcpl["CreateOrder"] = 1 + track_order = True if chunks and isinstance(chunks, dict): dcpl["layout"] = chunks - body["creationProperties"] = dcpl + maxdims = None if maxshape is not None and len(maxshape) > 0: if shape is not None: - maxshape = tuple(m if m is not None else 0 for m in maxshape) - body["maxdims"] = maxshape + maxdims = tuple(m if m is not None else 0 for m in maxshape) else: print("maxshape provided but no shape") - req = "/datasets" - - rsp = parent.POST(req, body=body) - - json_rep = {} - json_rep["id"] = rsp["id"] - - req = "/datasets/" + rsp["id"] - rsp = parent.GET(req) - - json_rep["shape"] = rsp["shape"] - json_rep["type"] = rsp["type"] - json_rep["lastModified"] = rsp["lastModified"] - if "creationProperties" in rsp: - json_rep["creationProperties"] = rsp["creationProperties"] + kwds = {"type_json": type_json, "cpl": dcpl} + if shape is None: + kwds["shape"] = "H5S_NULL" else: - json_rep["creationProperties"] = {} - if "layout" in rsp: - json_rep["layout"] = rsp["layout"] + kwds["shape"] = shape + if maxdims: + kwds["maxdims"] = maxdims + if track_order: + kwds["track_order"] = track_order + + dset_id = parent.id.make_obj(name, **kwds) # create the dataset - dset_id = DatasetID(parent, json_rep) + # for new datasets do a fetch to regularize the json values + if not dset_id.chunks: + # TBD - have HSDS return chunk shape on create, then + # we can avoid additional server request + dset_id.refresh() if data is not None: # init data - dset = Dataset(dset_id, track_order=(track_order or cfg.track_order)) + dset = Dataset(dset_id) dset[...] 
= data return dset_id @@ -544,38 +534,24 @@ def dims(self): @property def ndim(self): """Numpy-style attribute giving the number of dimensions""" - if self._shape is None: + if self.shape is None: return 0 else: - return len(self._shape) + return len(self.shape) @property def shape(self): """Numpy-style shape tuple giving dataset dimensions""" # just return the cached shape value # (although potentially it could have changed on server) - return self._shape - - def get_shape(self, check_server=False): - # this version will optionally refetch the shape from the server - # (if the dataset is resiable) shape_json = self.id.shape_json if shape_json["class"] == "H5S_NULL": return None if shape_json["class"] == "H5S_SCALAR": return () # return empty + dims = tuple(shape_json['dims']) - if "maxdims" not in shape_json or not check_server: - # not resizable, just return dims - dims = shape_json["dims"] - else: - # resizable, retrieve current shape - req = "/datasets/" + self.id.uuid + "/shape" - rsp = self.GET(req) - shape_json = rsp["shape"] - dims = shape_json["dims"] - self._shape = tuple(dims) - return self._shape + return dims @shape.setter def shape(self, shape): @@ -584,18 +560,18 @@ def shape(self, shape): @property def size(self): """Numpy-style attribute giving the total dataset size""" - if self._shape is None: + dims = self.shape + if dims is None: return None - return numpy.prod(self._shape, dtype=numpy.int64).item() + return numpy.prod(dims, dtype=numpy.int64).item() @property def nbytes(self): """Numpy-style attribute giving the raw dataset size as the number of bytes""" size = self.size - if ( - size is None - ): # if we are an empty 0-D array, then there are no bytes in the dataset + if (size is None): # if we are an empty 0-D array, then there are no bytes in the dataset return 0 + # TBD - this is not the actual size for vlen types return self.dtype.itemsize * size @property @@ -727,7 +703,7 @@ def maxshape(self): @property def fillvalue(self): """Fill value for this dataset (0 by default)""" - dcpl = self.id.dcpl_json + dcpl = self.id.cpl if "fillValue" in dcpl: fill_value = dcpl["fillValue"] if isinstance(fill_value, list): @@ -749,7 +725,7 @@ def fillvalue(self): @property def _is_empty(self): """check if this is a null-space datset""" - return self._shape is None + return self.shape is None @property def num_chunks(self): @@ -768,26 +744,15 @@ def __init__(self, bind, track_order=None): if not isinstance(bind, DatasetID): raise ValueError(f"{bind} is not a DatasetID") - HLObject.__init__(self, bind, track_order=track_order) + super().__init__(bind, track_order=track_order) - self._dcpl = self.id.dcpl_json - self._filters = filters.get_filters(self._dcpl) + self._filters = filters.get_filters(self.id.cpl) self._local = None # local() # make a numpy dtype out of the type json - self._dtype = createDataType(self.id.type_json) + self._dtype = self.id.get_type() self._item_size = getItemSize(self.id.type_json) - if track_order is None: - if "CreateOrder" in self._dcpl: - if not self._dcpl["CreateOrder"] or self._dcpl["CreateOrder"] == "0": - self._track_order = False - else: - self._track_order = True - else: - self._track_order = track_order - - self._shape = self.get_shape() self._num_chunks = None # aditional state we'll get when requested self._allocated_size = None # as above @@ -799,8 +764,7 @@ def _getVerboseInfo(self): now = time.time() if (self._verboseUpdated is None or now - self._verboseUpdated > VERBOSE_REFRESH_TIME): # resynch the verbose data - req = "/datasets/" + 
self.id.uuid + "?verbose=1" - rsp_json = self.GET(req) + rsp_json = self.id.getVerboseInfo() if "num_chunks" in rsp_json: self._num_chunks = rsp_json["num_chunks"] else: @@ -839,18 +803,10 @@ def resize(self, size, axis=None): except TypeError: raise TypeError("Argument must be a single int if axis is specified") - size = list(self._shape) + size = list(self.shape) size[axis] = newlen - size = tuple(size) - - # send the request to the server - body = {"shape": size} - req = "/datasets/" + self.id.uuid + "/shape" - self.PUT(req, body=body) - # self.id.set_extent(size) - # h5f.flush(self.id) # THG recommends - self._shape = size # save the new shape + self.id.resize(size) def __len__(self): """The size of the first axis. TypeError if scalar. @@ -858,10 +814,6 @@ def __len__(self): Limited to 2**32 on 32-bit systems; Dataset.len() is preferred. """ size = self.len() - if size > sys.maxsize: - raise OverflowError( - "Value too big for Python's __len__; use Dataset.len() instead." - ) return size def len(self): @@ -870,7 +822,8 @@ def len(self): Use of this method is preferred to len(dset), as Python's built-in len() cannot handle values greater then 2**32 on 32-bit systems. """ - shape = self._shape + + shape = self.shape if shape is None or len(shape) == 0: raise TypeError("Attempt to take len() of scalar dataset") return shape[0] @@ -880,7 +833,7 @@ def __iter__(self): BEWARE: Modifications to the yielded data are *NOT* written to file. """ - shape = self._shape + shape = self.shape # to reduce round trips, grab BUFFER_SIZE items at a time # TBD: set buffersize based on size of each row BUFFER_SIZE = 1000 @@ -920,7 +873,7 @@ def iter_chunks(self, sel=None): def _getQueryParam(self, start, stop, step=None): param = "" - rank = len(self._shape) + rank = len(self.shape) if rank == 0: return None if step is None: @@ -1019,31 +972,41 @@ def __getitem__(self, args, new_dtype=None): return Empty(self.dtype) raise ValueError("Empty datasets cannot be sliced") + shape = self.shape + if shape is None: + rank = None + else: + rank = len(shape) + # === Scalar dataspaces ================= - if self._shape == (): + if shape == (): selection = sel.select(self, args) self.log.info(f"selection.mshape: {selection.mshape}") - # TBD - refactor the following with the code for the non-scalar case - req = "/datasets/" + self.id.uuid + "/value" - rsp = self.GET(req, format="binary") + req = f"/datasets/{self.id.uuid}/value" + + rsp = self.id.http_conn.GET(req, format="binary") + if rsp.status_code != 200: + msg = f"Error retrieving data: {rsp.status_code}" + self.log.warning(msg) + raise IOError(msg) - if type(rsp) in (bytes, bytearray): + if rsp.is_binary: # got binary response self.log.info("got binary response for scalar selection") # arr = numpy.frombuffer(rsp, dtype=new_dtype) - arr = bytesToArray(rsp, new_dtype, self._shape) + arr = bytesToArray(rsp.text, new_dtype, shape) if not self.dtype.shape: - self.log.debug(f"reshape arr to: {self._shape}") - arr = numpy.reshape(arr, self._shape) + self.log.debug(f"reshape arr to: {shape}") + arr = numpy.reshape(arr, shape) else: # got JSON response # need some special conversion for compound types -- # each element must be a tuple, but the JSON decoder # gives us a list instead. 
- data = rsp["value"] + data = rsp.json()["value"] self.log.info("got json response for scalar selection") if len(mtype) > 1 and type(data) in (list, tuple): converted_data = [] @@ -1080,32 +1043,23 @@ def __getitem__(self, args, new_dtype=None): single_element = selection.mshape == () mshape = (1,) if single_element else selection.mshape - rank = len(self._shape) - - self.log.debug(f"dataset shape: {self._shape}") + self.log.debug(f"dataset shape: {shape}") self.log.debug(f"mshape: {mshape}") # Perfom the actual read rsp = None - req = "/datasets/" + self.id.uuid + "/value" + req = f"/datasets/{self.id.uuid}/value" params = {} if mtype.names != self.dtype.names: params["fields"] = ":".join(mtype.names) - if self.id._http_conn.mode == "r" and self.id._http_conn.cache_on: - # enables lambda to be used on server - self.log.debug("setting nonstrict parameter") - params["nonstrict"] = 1 - else: - self.log.debug("not settng nonstrict") - if isinstance(selection, sel.SimpleSelection): # Divy up large selections into pages, so no one request # to the server will take unduly long to process chunk_layout = self.id.chunks if chunk_layout is None: - chunk_layout = self._shape + chunk_layout = shape elif isinstance(chunk_layout, dict): # CHUNK_REF layout if "dims" not in chunk_layout: @@ -1126,8 +1080,8 @@ def __getitem__(self, args, new_dtype=None): # determine the dimension for paging for i in range(rank): stop = sel_start[i] + selection.count[i] * sel_step[i] - if stop > self._shape[i]: - stop = self._shape[i] + if stop > shape[i]: + stop = shape[i] sel_stop.append(stop) if scalar_selection[i]: # scalar index so will hit just one chunk @@ -1201,27 +1155,29 @@ def __getitem__(self, args, new_dtype=None): self.log.info(f"page_mshape: {page_mshape}") params["select"] = self._getQueryParam(page_start, page_stop, sel_step) - try: - rsp = self.GET(req, params=params, format="binary") - except IOError as ioe: - self.log.info(f"got IOError: {ioe.errno}") - if ioe.errno == 413 and chunks_per_page > 1: - # server rejected the request, reduce the page size - chunks_per_page //= 2 - self.log.info(f"New chunks_per_page: {chunks_per_page}") - break - else: - raise IOError(f"Error retrieving data: {ioe.errno}") + + rsp = self.id.http_conn.GET(req, params=params, format="binary") + if rsp.status_code == 413 and chunks_per_page > 1: + # server rejected the request, reduce the page size + chunks_per_page //= 2 + msg = f"Got 413 response, set chunks_per_page to: {chunks_per_page}" + self.log.info(msg) + break + elif rsp.status_code != 200: + msg = f"Error retrieving data: {rsp.status_code}" + self.log.warning(msg) + raise IOError(msg) + if isinstance(rsp, str): # hexencoded response? # this is returned by API Gateway for lamba responses rsp = bytes.fromhex(rsp) # from here treat it like a byte responses - if type(rsp) in (bytes, bytearray): + if rsp.is_binary: # got binary response # TBD - check expected number of bytes - self.log.info(f"binary response, {len(rsp)} bytes") - arr1d = bytesToArray(rsp, mtype, page_mshape) + self.log.info(f"binary response, {len(rsp.text)} bytes") + arr1d = bytesToArray(rsp.text, mtype, page_mshape) page_arr = numpy.reshape(arr1d, page_mshape) else: # got JSON response @@ -1229,8 +1185,9 @@ def __getitem__(self, args, new_dtype=None): # each element must be a tuple, but the JSON decoder # gives us a list instead. 
self.log.info("json response") + rsp_json = rsp.json() - data = rsp["value"] + data = rsp_json["value"] self.log.debug(data) page_arr = jsonToArray(page_mshape, mtype, data) @@ -1279,20 +1236,25 @@ def __getitem__(self, args, new_dtype=None): # use a post method to avoid long query strings self.log.info("using post select") try: - rsp = self.POST(req, body=params, format="binary") + rsp = self.id.http_conn.POST(req, body=params, format="binary") except IOError as ioe: self.log.info(f"got IOError: {ioe.errno}") raise IOError(f"Error retrieving data: {ioe.errno}") else: try: - rsp = self.GET(req, params=params, format="binary") + rsp = self.id.http_conn.GET(req, params=params, format="binary") except IOError as ioe: self.log.info(f"got IOError: {ioe.errno}") - raise IOError(f"Error retrieving data: {ioe.errno}") - if type(rsp) in (bytes, bytearray): + raise IOError(ioe.errno, "Error retrieving data") + + if rsp.status_code != 200: + self.log.info(f"got http error: {rsp.status_code}") + raise IOError(rsp.status_code, "Error retrieving data") + + if rsp.is_binary: # got binary response - self.log.info(f"binary response, {len(rsp)} bytes") - arr = bytesToArray(rsp, mtype, mshape) + self.log.info(f"binary response, {len(rsp.text)} bytes") + arr = bytesToArray(rsp.text, mtype, mshape) else: # got JSON response # need some special conversion for compound types -- @@ -1300,7 +1262,7 @@ def __getitem__(self, args, new_dtype=None): # gives us a list instead. self.log.info("json response") - data = rsp["value"] + data = rsp.json()["value"] # self.log.debug(data) arr = jsonToArray(mshape, mtype, data) @@ -1310,7 +1272,6 @@ def __getitem__(self, args, new_dtype=None): body = {} points = selection.points.tolist() - rank = len(self._shape) # verify the points are in range and strictly monotonic (for the 1d case) last_point = -1 @@ -1328,7 +1289,7 @@ def __getitem__(self, args, new_dtype=None): if len(point) != rank: raise ValueError("invalid point argument") for i in range(rank): - if point[i] < 0 or point[i] >= self._shape[i]: + if point[i] < 0 or point[i] >= shape[i]: raise IndexError("point out of range") if rank == 1: if point[0] <= last_point: @@ -1336,7 +1297,7 @@ def __getitem__(self, args, new_dtype=None): last_point = point[0] elif rank == 1 and isinstance(point, int): - if point < 0 or point > self._shape[0]: + if point < 0 or point > shape[0]: raise IndexError("point out of range") if point <= last_point: raise TypeError("index points must be strictly increasing") @@ -1349,16 +1310,19 @@ def __getitem__(self, args, new_dtype=None): body = arr_points.tobytes() self.log.info(f"point select binary request, num bytes: {len(body)}") - rsp = self.POST(req, format=format, body=body) - if type(rsp) in (bytes, bytearray): - elements_received = len(rsp) // mtype.itemsize + rsp = self.id.http_conn.POST(req, format=format, body=body) + if rsp.status_code != 200: + self.log.info(f"got http post error: {rsp.status_code}") + raise IOError(rsp.status_code, "Error on POST request") + if rsp.is_binary: + elements_received = len(rsp.text) // mtype.itemsize elements_expected = selection.mshape[0] if elements_received != elements_expected: msg = f"Expected {elements_expected} elements, but got {elements_received}" self.log.warning(msg) raise IOError(msg) - arr = numpy.frombuffer(rsp, dtype=mtype) + arr = numpy.frombuffer(rsp.text, dtype=mtype) else: data = rsp["value"] if len(data) != selection.mshape[0]: @@ -1404,7 +1368,7 @@ def __setitem__(self, args, val): self.log.debug("val not ndarray") pass # not a numpy 
object, just leave dtype as None - if self._shape is None: + if self.shape is None: # null space dataset if isinstance(val, Empty): return # nothing to do @@ -1592,7 +1556,7 @@ def __setitem__(self, args, val): if len(names) > 0: params["fields"] = ":".join(names) - self.PUT(req, body=body, format=format, params=params) + self.id.http_conn.PUT(req, body=body, format=format, params=params) def read_direct(self, dest, source_sel=None, dest_sel=None): """Read data directly from HDF5 into an existing NumPy array. @@ -1681,10 +1645,6 @@ def __array__(self, dtype=None, copy=True): f"but memory allocation cannot be avoided on read" ) - # Special case for (0,)*-shape datasets - if self._shape is None or numpy.prod(self._shape) == 0: - return numpy.empty(self._shape, dtype=self.dtype if dtype is None else dtype) - data = self[:] if dtype is not None: return data.astype(dtype, copy=False) @@ -1702,14 +1662,13 @@ def __repr__(self): namestr = f'"{name}"' else: namestr = "/" - r = f'' + r = f'' return r def refresh(self): """Refresh the dataset metadata by reloading from the file. """ self.id.refresh() - self._shape = self.get_shape() self._num_chunks = None # aditional state we'll get when requested self._allocated_size = None # as above self._verboseUpdated = None # when the verbose data was fetched diff --git a/h5pyd/_hl/datatype.py b/h5pyd/_hl/datatype.py index 309dae3c..f25f7573 100644 --- a/h5pyd/_hl/datatype.py +++ b/h5pyd/_hl/datatype.py @@ -37,16 +37,15 @@ def dtype(self): """Numpy dtype equivalent for this datatype""" return self._dtype - def __init__(self, bind): + def __init__(self, bind, track_order=None): """ Create a new Datatype object by binding to a low-level TypeID. """ if not isinstance(bind, TypeID): # todo: distinguish type from other hl objects raise ValueError(f"{bind} is not a TypeID") - HLObject.__init__(self, bind) + HLObject.__init__(self, bind, track_order=track_order) self._dtype = createDataType(self.id.type_json) - self._req_prefix = "/datatypes/" + self.id.uuid def __repr__(self): if not self.id: diff --git a/h5pyd/_hl/dims.py b/h5pyd/_hl/dims.py index e4b31e5a..f3616e12 100644 --- a/h5pyd/_hl/dims.py +++ b/h5pyd/_hl/dims.py @@ -11,65 +11,32 @@ ############################################################################## from __future__ import absolute_import -import json from . import base from .dataset import Dataset from .objectid import DatasetID -class DimensionProxy(base.CommonStateObject): - '''Represents an HDF5 'dimension'.''' +def _getAttrValue(objid, attr_name): + """ helper function to get an attribute value. 
+        Return None if attribute is not found,
+        else return attr_json['value'] """
+    if objid.has_attr(attr_name):
+        attr_json = objid.get_attr(attr_name)
+        return attr_json['value']
+    return None
 
-    def _getAttributeJson(self, attr_name, objid=None):
-        """ Helper function to get attribute json if present
-        """
-        if not objid:
-            objid = self._id.id
-        objdb = self._id.http_conn.getObjDb()
-        if objdb and objid in objdb:
-            dset_json = objdb[objid]
-            attrs_json = dset_json["attributes"]
-            if attr_name not in attrs_json:
-                return None
-            return attrs_json[attr_name]
-        # no objdb
-        req = "/datasets/" + objid + "/attributes/" + attr_name
-        rsp = self._id.http_conn.GET(req)
-        if rsp.status_code == 200:
-            attr_json = json.loads(rsp.text)
-            return attr_json
-        else:
-            return None
 
-    def _getDatasetJson(self, objid):
-        """ Helper function to get dataset json by id
-        """
-
-        objdb = self._id.http_conn.getObjDb()
-        if objdb and objid in objdb:
-            # objdb present, get JSON for this dataset
-            dset_json = objdb[objid]
-            return dset_json
-
-        # no objdb, make server request
-        req = "/datasets/" + objid
-        rsp = self._id.http_conn.GET(req)
-        if rsp.status_code == 200:
-            dset_json = json.loads(rsp.text)
-            return dset_json
-        else:
-            return None
+class DimensionProxy(base.CommonStateObject):
+    '''Represents an HDF5 'dimension'.'''
 
     @property
     def label(self):
         ''' Get the dimension scale label '''
-        labels_json = self._getAttributeJson('DIMENSION_LABELS')
+        label_values = _getAttrValue(self._id, 'DIMENSION_LABELS')
 
-        if not labels_json:
+        if label_values is None:
             return ''
 
-        label_values = labels_json["value"]
-
         if self._dimension >= len(label_values):
             # label get request out of range
             return ''
@@ -78,14 +45,11 @@ def label(self):
 
     @label.setter
     def label(self, val):
-        # pylint: disable=missing-docstring
-        dset = Dataset(self._id)
-        req = dset.attrs._req_prefix + 'DIMENSION_LABELS'
-        try:
-            labels = dset.GET(req)
-            dset.DELETE(req)
-        except IOError:
-            rank = len(dset.shape)
+        label_name = 'DIMENSION_LABELS'
+        if self._id.has_attr(label_name):
+            labels = self._id.get_attr(label_name)
+        else:
+            rank = self._id.rank
             labels = {
                 'shape': {
                     'class': 'H5S_SIMPLE',
@@ -100,9 +64,11 @@ def label(self, val):
                 'value': ['' for n in range(rank)]
             }
         labels['value'][self._dimension] = val
-        dset.PUT(req, body=labels, replace=True)
+        self._id.set_attr(label_name, labels)
 
     def __init__(self, id_, dimension):
+        if not isinstance(id_, DatasetID):
+            raise TypeError(f"expected DatasetID, but got: {type(id_)}")
         self._id = id_
         self._dimension = int(dimension)
 
@@ -117,10 +83,10 @@ def __iter__(self):
             yield k
 
     def __len__(self):
-        dimlist_json = self._getAttributeJson('DIMENSION_LIST')
-        if not dimlist_json:
+        dimlist_values = _getAttrValue(self._id, 'DIMENSION_LIST')
+        if not dimlist_values:
             return 0
-        dimlist_values = dimlist_json['value']
+
         if self._dimension >= len(dimlist_values):
             # dimension scale len request out of range
             return 0
@@ -128,27 +94,26 @@ def __len__(self):
 
     def __getitem__(self, item):
 
-        dimlist_attr_json = self._getAttributeJson('DIMENSION_LIST')
-        dimlist_attr_values = []
-        if dimlist_attr_json:
-            dimlist_attr_values = dimlist_attr_json["value"]
+        dimlist_attr_values = _getAttrValue(self._id, 'DIMENSION_LIST')
+        if dimlist_attr_values is None:
+            dimlist_attr_values = []
 
         if self._dimension >= len(dimlist_attr_values):
             # dimension scale len request out of range")
             return None
 
+        dimlist_values = dimlist_attr_values[self._dimension]
         dset_scale_id = None
         if isinstance(item, int):
             if item >= len(dimlist_values):
                 # no dimension scale
-                raise IndexError(
"No dimension scale found for index: {}".format(item)) + raise IndexError(f"No dimension scale found for index: {item}") ref_id = dimlist_values[item] if ref_id and not ref_id.startswith("datasets/"): - msg = "unexpected ref_id: {}".format(ref_id) + msg = f"unexpected ref_id: {ref_id}" raise IOError(msg) else: - dset_scale_id = ref_id[len("datasets/"):] + dset_scale_id = self._id.get(ref_id) else: # Iterate through the dimension scales finding one with the # correct name @@ -156,21 +121,17 @@ def __getitem__(self, item): if not ref_id: continue if not ref_id.startswith("datasets/"): - msg = "unexpected ref_id: {}".format(ref_id) - raise IOError(msg) - continue - dset_id = ref_id[len("datasets/"):] - attr_json = self._getAttributeJson('NAME', objid=dset_id) - if attr_json["value"] == item: + raise IOError(f"unexpected ref_id: {ref_id}") + dset_id = self._id.get(ref_id) + if item == _getAttrValue(dset_id, 'NAME'): # found it! dset_scale_id = dset_id break + if not dset_scale_id: - raise KeyError( - 'No dimension scale with name"{}" found'.format(item)) - dscale_json = self._getDatasetJson(dset_scale_id) - dscale = Dataset(DatasetID( - parent=None, item=dscale_json, http_conn=self._id.http_conn)) + raise KeyError(f"No dimension scale with name '{item}' found'") + dscale = Dataset(dset_scale_id) + return dscale def attach_scale(self, dscale): @@ -179,37 +140,27 @@ def attach_scale(self, dscale): Provide the Dataset of the scale you would like to attach. ''' dset = Dataset(self._id) - try: - rsp = dscale.GET(dscale.attrs._req_prefix + 'CLASS') - except IOError: + dscale_class = _getAttrValue(dscale.id, 'CLASS') + if dscale_class is None: dset.dims.create_scale(dscale) - rsp = None + dscale_class = _getAttrValue(dscale.id, 'CLASS') - if not rsp: - rsp = dscale.GET(dscale.attrs._req_prefix + 'CLASS') - if rsp['value'] != 'DIMENSION_SCALE': - raise RuntimeError( - '{} is not a dimension scale'.format(dscale.name)) + if dscale_class != 'DIMENSION_SCALE': + raise RuntimeError(f"{dscale.name} is not a dimension scale") - try: - rsp = dset.GET(dset.attrs._req_prefix + 'CLASS') - if rsp['value'] == 'DIMENSION_SCALE': - raise RuntimeError( - '{} cannot attach a dimension scale to a dimension scale' - .format(dset.name)) - except IOError: - pass + dset_class = _getAttrValue(self._id, 'CLASS') + if dset_class == 'DIMENSION_SCALE': + msg = f"{dset.name} cannot attach a dimension scale to a dimension scale" + raise RuntimeError(msg) # Create a DIMENSION_LIST attribute if needed - req = dset.attrs._req_prefix + 'DIMENSION_LIST' - rank = len(dset.shape) - value = [list() for r in range(rank)] - try: - dimlist = dset.GET(req) - value = dimlist["value"] - dset.DELETE(req) - except IOError: - pass + rank = self._id.rank + value = _getAttrValue(self._id, 'DIMENSION_LIST') + if value: + # delete and replace later + self._id.del_attr('DIMENSION_LIST') + else: + value = [list() for r in range(rank)] dimlist = { 'creationProperties': { @@ -231,14 +182,12 @@ def attach_scale(self, dscale): # Update the DIMENSION_LIST attribute with the object reference to the # dimension scale - dimlist['value'][self._dimension].append('datasets/' + dscale.id.id) - dset.PUT(req, body=dimlist, replace=True) + dimlist['value'][self._dimension].append('datasets/' + dscale.id.uuid) + self._id.set_attr('DIMENSION_list', dimlist) - req = dscale.attrs._req_prefix + 'REFERENCE_LIST' - - try: - old_reflist = dscale.GET(req) - except IOError: + if dscale.id.has_attr('REFERENCE_LIST'): + old_reflist = dscale.id.get_attr('REFERENCE_LIST') + else: 
             old_reflist = {
                 'creationProperties': {
                     'nameCharEncoding': 'H5T_CSET_ASCII'
@@ -276,38 +225,42 @@ def attach_scale(self, dscale):
                 reflist_value = []
         else:
             reflist_value = []
-        reflist_value.append(['datasets/' + dset.id.id, self._dimension])
+        reflist_value.append(['datasets/' + dset.id.uuid, self._dimension])
         new_reflist["value"] = reflist_value
         new_reflist["shape"]["dims"] = [len(reflist_value), ]
 
         # Update the REFERENCE_LIST attribute of the dimension scale
-        dscale.PUT(req, body=new_reflist, replace=True)
+        dscale.id.set_attr('REFERENCE_LIST', new_reflist)
 
     def detach_scale(self, dscale):
         ''' Remove a scale from this dimension.
 
         Provide the Dataset of the scale you would like to remove.
         '''
-        dset = Dataset(self._id)
-        req = dset.attrs._req_prefix + 'DIMENSION_LIST'
-        dimlist = dset.GET(req)
-        dset.DELETE(req)
+        if not self._id.has_attr('DIMENSION_LIST'):
+            raise IOError(f"no DIMENSION_LIST attr in {self._id}")
+        dimlist = self._id.get_attr('DIMENSION_LIST')
+        self._id.del_attr('DIMENSION_LIST')
+
         try:
-            ref = 'datasets/' + dscale.id.id
+            ref = 'datasets/' + dscale.id.uuid
             dimlist['value'][self._dimension].remove(ref)
         except Exception as e:
             # Restore the attribute's old value then raise the same
             # exception
-            dset.PUT(req, body=dimlist)
+            self._id.set_attr('DIMENSION_LIST', dimlist)
             raise e
-        dset.PUT(req, body=dimlist)
+        self._id.set_attr('DIMENSION_LIST', dimlist)
+
+        if dscale.id.has_attr('REFERENCE_LIST'):
+            old_reflist = dscale.id.get_attr('REFERENCE_LIST')
+        else:
+            old_reflist = {}
 
-        req = dscale.attrs._req_prefix + 'REFERENCE_LIST'
-        old_reflist = dscale.GET(req)
         if "value" in old_reflist and len(old_reflist["value"]) > 0:
             new_refs = list()
-            remove = ['datasets/' + dset.id.id, self._dimension]
+            remove = ['datasets/' + self._id.uuid, self._dimension]
             for el in old_reflist['value']:
                 if remove[0] != el[0] and remove[1] != el[1]:
                     new_refs.append(el)
@@ -317,14 +270,13 @@ def detach_scale(self, dscale):
         if len(new_refs) > 0:
             new_reflist["value"] = new_refs
             new_reflist["shape"] = [len(new_refs), ]
-            dscale.PUT(req, body=new_reflist, replace=True)
+            # tbd: replace = True
+            dscale.id.set_attr('REFERENCE_LIST', new_reflist)
         else:
             # Remove REFERENCE_LIST attribute if this dimension scale is
             # not attached to any dataset
-            try:
-                dscale.DELETE(req)
-            except OSError:
-                pass
+            if old_reflist:
+                dscale.id.del_attr('REFERENCE_LIST')
 
     def items(self):
         ''' Get a list of (name, Dataset) pairs with all scales on this
@@ -334,10 +286,7 @@ def items(self):
         num_scales = self.__len__()
         for i in range(num_scales):
             dscale = self.__getitem__(i)
-            name_attr_json = self._getAttributeJson('NAME', objid=dscale.id.id)
-            dscale_name = ''
-            if name_attr_json:
-                dscale_name = name_attr_json['value']
+            dscale_name = _getAttrValue(dscale.id, 'NAME')
             scales.append((dscale_name, dscale))
         return scales
 
@@ -352,7 +301,7 @@ def values(self):
     def __repr__(self):
         if not self._id:
             return ''
-        return f'<{self.label} dimension {self._dimension} of HDf5 dataset {self._id.id}>'
+        return f'<{self.label} dimension {self._dimension} of HDF5 dataset {self._id.uuid}>'
 
 
 class DimensionManager(base.MappingHDF5, base.CommonStateObject):
@@ -433,10 +382,8 @@ def create_scale(self, dset, name=''):
             },
             'value': name
         }
-        req_class = dset.attrs._req_prefix + 'CLASS'
-        req_name = dset.attrs._req_prefix + 'NAME'
 
-        dset.PUT(req_class, body=class_attr, replace=True)
+        self._id.set_attr('CLASS', class_attr)
         try:
-            dset.PUT(req_name, body=name_attr, replace=True)
+            self._id.set_attr('NAME', name_attr)
         except Exception:
-
dset.DELETE(req_class) + self._id.del_attr('CLASS') diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index fed49419..f15c141e 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -14,7 +14,6 @@ import io import os -import json import pathlib import time @@ -187,13 +186,9 @@ class File(Group): @property def attrs(self): """Attributes attached to this object""" - # hdf5 complains that a file identifier is an invalid location for an - # attribute. Instead of self, pass the root group to AttributeManager: from . import attrs - # parent_obj = {"id": self.id.uuid} - # return attrs.AttributeManager(self['/']) - return attrs.AttributeManager(self) + return attrs.AttributeManager(self.id) @property def filename(self): @@ -255,6 +250,11 @@ def swmr_mode(self): """ Controls use of cached metadata """ return self._swmr_mode + @property + def objdb(self): + """ Return ref to object database """ + return self._id.http_conn.objdb + @swmr_mode.setter def swmr_mode(self, value): # enforce the same rule as h5py - swrm_mode can't be changed after opening the file @@ -464,7 +464,7 @@ def __init__( connect_try += 1 if rsp.status_code == 200: - root_json = json.loads(rsp.text) + root_json = rsp.json() if rsp.status_code != 200 and mode in ("r", "r+"): # file must exist http_conn.close() @@ -503,7 +503,7 @@ def __init__( http_conn.close() raise IOError(rsp.status_code, rsp.reason) - root_json = json.loads(rsp.text) + root_json = rsp.json() if "root" not in root_json: http_conn.close() raise IOError(404, "Unexpected error") @@ -519,8 +519,8 @@ def __init__( req = "/acls/" + name rsp = http_conn.GET(req) if rsp.status_code == 200: - rspJson = json.loads(rsp.text) - domain_acl = rspJson["acl"] + rsp_json = rsp.json() + domain_acl = rsp_json["acl"] if not domain_acl["update"]: http_conn.close() raise IOError(403, "Forbidden") @@ -530,29 +530,17 @@ def __init__( if mode in ("w", "w-", "x", "a"): http_conn._mode = "r+" - group_json = None - # do we already have the group_json? 
- if "domain_objs" in root_json and mode == "r": - objdb = root_json["domain_objs"] - http_conn._objdb = objdb - if root_uuid in objdb: - group_json = objdb[root_uuid] - - if not group_json: - # get the group json for the root group - req = "/groups/" + root_uuid - - rsp = http_conn.GET(req) + objdb = http_conn.objdb - if rsp.status_code != 200: - http_conn.close() - raise IOError(rsp.status_code, "Unexpected Error") - group_json = json.loads(rsp.text) + if "domain_objs" in root_json: + domain_objs = root_json["domain_objs"] + objdb.load(domain_objs) + else: + objdb.reload() - groupid = GroupID(None, group_json, http_conn=http_conn) + groupid = GroupID(root_uuid, http_conn=http_conn) # end else - self._name = "/" self._id = groupid self._verboseInfo = None # additional state we'll get when requested self._verboseUpdated = None # when the verbose data was fetched @@ -573,7 +561,7 @@ def __init__( rsp = self.id.http_conn.GET(req, params=params) if rsp.status_code != 200: raise IOError(rsp.status_code, rsp.reason) - root_json = json.loads(rsp.text) + root_json = rsp.json() if "dn_ids" in root_json: dn_ids = root_json["dn_ids"] @@ -594,7 +582,11 @@ def _getVerboseInfo(self): if (self._verboseUpdated is None or now - self._verboseUpdated > VERBOSE_REFRESH_TIME): # resynch the verbose data req = "/?verbose=1" - rsp_json = self.GET(req, use_cache=False, params={"CreateOrder": "1" if self._track_order else "0"}) + rsp = self.id.http_conn.GET(req) + if rsp.status_code != 200: + self.log.error(f"got status code: {rsp.status_code} for get verbose") + raise IOError("unexpected error") + rsp_json = rsp.json() self.log.debug("get verbose info") props = {} @@ -739,13 +731,19 @@ def compressors(self): # override base implementation of ACL methods to use the domain rather than update root group def getACL(self, username): req = "/acls/" + username - rsp_json = self.GET(req) + rsp = self.id.http_conn.GET(req) + if rsp.status_code != 200: + raise IOError(rsp.status_code, "Unable to get ACL") + rsp_json = rsp.json() acl_json = rsp_json["acl"] return acl_json def getACLs(self): req = "/acls" - rsp_json = self.GET(req) + rsp = self.id.http_conn.GET(req) + if rsp.status_code != 200: + raise IOError(rsp.status_code, "Unable to get ACL") + rsp_json = rsp.json() acls_json = rsp_json["acls"] return acls_json @@ -759,7 +757,9 @@ def putACL(self, acl): perm[k] = acl[k] req = "/acls/" + acl["userName"] - self.PUT(req, body=perm) + rsp = self.id.http_conn.PUT(req, body=perm) + if rsp.status_code not in (200, 201): + raise IOError(rsp.status_code, "Failed to create ACL") def run_scan(self): MAX_WAIT = 10 @@ -796,9 +796,13 @@ def flush(self): self.log.info("sending PUT flush request") req = "/" body = {"flush": 1, "getdnids": 1} - rsp = self.PUT(req, body=body) - if "dn_ids" in rsp: - dn_ids = rsp["dn_ids"] + rsp = self.id.http_conn.PUT(req, body=body) + self.log.debug(f"got status code: {rsp.status_code} from flush") + if rsp.status_code != 200: + raise RuntimeError(f"got status code: {rsp.status_code} on flush") + rsp_json = rsp.json() + if "dn_ids" in rsp_json: + dn_ids = rsp_json["dn_ids"] orig_ids = set(self._dn_ids) current_ids = set(dn_ids) self._dn_ids = current_ids diff --git a/h5pyd/_hl/filters.py b/h5pyd/_hl/filters.py index bcfb6b78..4a5a03c2 100644 --- a/h5pyd/_hl/filters.py +++ b/h5pyd/_hl/filters.py @@ -309,7 +309,6 @@ def guess_chunk(shape, maxshape, typesize): ndims = len(shape) if ndims == 0: raise ValueError("Chunks not allowed for scalar datasets.") - chunks = np.array(shape, dtype="=f8") if not 
np.all(np.isfinite(chunks)): raise ValueError("Illegal value in chunk tuple") diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index cef155e6..dcc0fd46 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -17,32 +17,27 @@ import collections from .base import HLObject, MutableMappingHDF5, guess_dtype -from .objectid import TypeID, GroupID, DatasetID +from .objectid import TypeID, GroupID, DatasetID, isUUID from .h5type import special_dtype from . import dataset from .dataset import Dataset -from . import table from .table import Table from .datatype import Datatype from . import h5type -from .. import config -def isUUID(name): - # return True if name looks like an object id - # There are some additional checks we could add to reduce false positives - # (like checking for hyphens in the right places) - if isinstance(name, str) and len(name) >= 38: - if name.startswith("groups/") or name.startswith("g-"): - return True - elif name.startswith("datatypes/") or name.startswith("t-"): - return True - elif name.startswith("datasets/") or name.startswith("d-"): - return True - else: - return False - else: - return False +def _h5parent(path): + """ Return parent of given path """ + parent_path = op.dirname(path) + return parent_path + + +def _h5base(path): + """ Return base name of the given path """ + # TBD - this needs to fixed up to work as h5py does + # e.g. _h5base('x/y/') should return 'y' + base_path = op.basename(path) + return base_path class Group(HLObject, MutableMappingHDF5): @@ -51,290 +46,129 @@ class Group(HLObject, MutableMappingHDF5): """ def __init__(self, bind, track_order=None, **kwargs): - # print "group init, bind:", bind """ Create a new Group object by binding to a low-level GroupID. """ if not isinstance(bind, GroupID): raise ValueError(f"{bind} is not a GroupID") - HLObject.__init__(self, bind, track_order=track_order, **kwargs) - self._req_prefix = "/groups/" + self.id.uuid - self._link_db = None # cache for links - - def _refresh_link_cache(self, force=False): - if self._link_db is not None and not force and self.id.http_conn.mode == 'r': - # already initialized and we are in read-only mode, just return - return - - objdb = self.id._http_conn.getObjDb() - group_id = self.id.uuid - - if not force and objdb and group_id in objdb: - # _objdb is meta-data pulled from the domain on open. 
- # see if we can extract the link json from there - self.log.debug(f"found {group_id} in objdb") - group_json = objdb[group_id] - links = group_json["links"] - # mix in a "collection key for compatibility with server GET links request - for title in links: - link = links[title] - if 'class' not in link: - self.log.error(f"expected to find class key in link {link}") - continue - link['title'] = title - link_class = link['class'] - if link_class == 'H5L_TYPE_HARD': - if 'id' not in link: - self.log.error(f"expected to find id key in hard link: {link}") - continue - link_id = link['id'] - if not link_id: - self.log.error(f"null id for hard link: {link}") - continue - if link_id.startswith("g-"): - link['collection'] = "groups" - elif link_id.startswith("d-"): - link['collection'] = "datasets" - elif link_id.startswith("t-"): - link["collection"] = "datatypes" - else: - self.log.error(f"unexpected id string for hard link: {link}") - else: - pass # no collection for non hardlink - link_db = links - else: - # make server request - self.log.debug(f"requesting links for {group_id}") - req = "/groups/" + group_id + "/links" - rsp_json = self.GET(req, use_cache=False) - links = rsp_json['links'] - link_db = {} - for link in links: - title = link['title'] - link_db[title] = link - - self.log.debug(f"_refresh_link_cache - found {len(links)} for {group_id}") - - # reset the link cache - self._link_db = link_db - - def _get_link_json(self, h5path): - """ Return parent_uuid and json description of link for given path """ - self.log.debug("__get_link_json({})".format(h5path)) - parent_uuid = self.id.uuid - tgt_json = None - if isinstance(h5path, bytes): - h5path = h5path.decode('utf-8') - if h5path.find('/') == -1: - in_group = True # link owned by this group - else: - in_group = False # may belong to some other group - - if h5path[0] == '/': - # abs path, start with root - # get root_uuid - parent_uuid = self.id.http_conn.root_uuid - # make a fake tgt_json to represent 'link' to root group - tgt_json = {'collection': "groups", 'class': "H5L_TYPE_HARD", 'id': parent_uuid} - if h5path == '/': - # asking for the root, just return the root link - return parent_uuid, tgt_json - else: - if in_group: - self._refresh_link_cache() - if h5path in self._link_db: - # link belonging to this group, return cache itm - tgt_json = self._link_db[h5path] - parent_uuid = self.id.id - - return parent_uuid, tgt_json - else: - self.log.info(f"{h5path} not found") - raise KeyError("Unable to open object (Component not found)") + super().__init__(bind, track_order=track_order, **kwargs) - path = h5path.split('/') + def _get_bypath(self, h5path, create=False, track_order=None): + """ Return object id at given path. + If group_create, create any groups that don't already exists """ - objdb = self.id._http_conn.getObjDb() + self.log.info(f"_get_bypath: {h5path}") - if objdb: - # _objdb is meta-data pulled from the domain on open. 
- # see if we can extract the link json from there - self.log.debug(f"searching objdb for {h5path}") - group_uuid = parent_uuid + if h5path == "/": + # return root group + root_uuid = self.id.http_conn.root_uuid + root_id = self.id.get(root_uuid) # create a GroupID object + root_grp = Group(root_id, track_order=track_order) + return root_grp + elif h5path[0] == '/': + # absolute path - start with root + root_uuid = self.id.http_conn.root_uuid + parent_id = self.id.get(root_uuid) + parent_name = "/" + else: + # relative path - start with this object + parent_id = self.id + parent_name = self._name - for name in path: - if not name: - continue - if group_uuid not in objdb: - self.log.warning(f"objdb search: {group_uuid} not found in objdb") - tgt_json = None - break - group_json = objdb[group_uuid] - group_links = group_json["links"] - if name not in group_links: - self.log.debug(f"objdb search: {name} not found") - tgt_json = None - break - tgt_json = group_links[name] - - if tgt_json['class'] != 'H5L_TYPE_HARD': - # use server side look ups for non-hardlink paths - group_uuid = None - self.log.debug("objdb search: non-hardlink") - # tgt_json = None - # break - else: - group_uuid = tgt_json["id"] - - if tgt_json: - # mix in a "collection key for compatibility with server GET links request - if group_uuid and group_uuid.startswith("g-"): - tgt_json['collection'] = "groups" - elif group_uuid and group_uuid.startswith("d-"): - tgt_json['collection'] = "datasets" - elif group_uuid and group_uuid.startswith("t-"): - tgt_json["collection"] = "datatypes" + links = h5path.split('/') + for title in links: + if not title: + continue # skip + self.log.debug(f"_get_bypath - iterate for link: {title}") + if parent_id.has_link(title): + # the sub-group already exists, adjust parent and keep iterating + sub_link_json = parent_id.get_link(title) + link_class = sub_link_json['class'] + if link_class == 'H5L_TYPE_HARD': + parent_id = parent_id.get(sub_link_json['id']) + elif link_class == 'H5L_TYPE_SOFT': + slink_path = sub_link_json.get('h5path') + if not slink_path: + raise IOError(f"invalid softlink: {title}") + obj = self._get_bypath(slink_path) # recursive call + parent_id = obj.id + elif link_class == 'H5L_TYPE_EXTERNAL': + external_path = sub_link_json.get('h5path') + external_domain = sub_link_json.get('h5domain') + if not external_path or not external_domain: + raise IOError(f"invalid extenallink: {title}") + # TBD: how to handle external links to other buckets? 
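+                    # a relative external domain is resolved against the directory
+                    # of the current domain before the target file is opened below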
+ from .files import File + if not external_domain.startswith("hdf5://") and not op.isabs(external_domain): + current_domain = self._id.http_conn.domain + external_domain = op.join(op.dirname(current_domain), external_domain) + external_domain = op.normpath(external_domain) + try: + f = File(external_domain, track_order=track_order) + except IOError: + # unable to find external link + raise KeyError(f"Unable to open domain: {external_domain}") + return f[external_path] else: - self.log.debug("no collection for non hardlink") - - return group_uuid, tgt_json + raise IOError(f"Unexpected link_class: {link_class}") + elif create: + # create the sub-group + if self.id.http_conn.mode == 'r': + raise ValueError("Unable to create group (No write intent on file)") + + self.log.debug(f"_get_bypath - making subgroup: '{title}'") + parent_id = parent_id.make_obj(title, track_order=track_order) else: - raise KeyError("Unable to open object (Component not found)") - - for name in path: - if not name: - continue - - if not parent_uuid: - raise KeyError("Unable to open object (Component not found)") - - req = "/groups/" + parent_uuid + "/links/" + name - - try: - rsp_json = self.GET(req) - except IOError: - raise KeyError("Unable to open object (Component not found)") - - if "link" not in rsp_json: - raise IOError("Unexpected Error") - tgt_json = rsp_json['link'] - - if tgt_json['class'] == 'H5L_TYPE_HARD': - if tgt_json['collection'] == 'groups': - parent_uuid = tgt_json['id'] + self.log.warning(f"_get_bypath(h5path={h5path}, parent_id={parent_id}) not found") + raise KeyError(f"object {h5path} does not exists") + if parent_name: + if parent_name[-1] == '/': + parent_name = parent_name + title else: - parent_uuid = None - - return parent_uuid, tgt_json - - def _make_group(self, parent_id=None, parent_name=None, link=None, track_order=None): - """ helper function to make a group """ - - cfg = config.get_config() - - link_json = {} - if parent_id: - link_json["id"] = parent_id - - if link: - link_json["name"] = link - - body = {} - if link_json: - body["link"] = link_json - if track_order or cfg.track_order: - body["creationProperties"] = {"CreateOrder": 1} - - self.log.debug(f"create group with body: {body}") - rsp = self.POST('/groups', body=body) - - group_json = rsp - groupId = GroupID(self, group_json) - - sub_group = Group(groupId, track_order=(track_order or cfg.track_order)) - - if parent_name: - if parent_name[-1] == '/': - parent_name = parent_name + link + parent_name = f"{parent_name}/{title}" + self.log.debug(f"_get_bypath - parent name: {parent_name}") + + if isinstance(parent_id, GroupID): + tgt = Group(parent_id, track_order=track_order) + elif isinstance(parent_id, TypeID): + tgt = Datatype(parent_id, track_order=track_order) + elif isinstance(parent_id, DatasetID): + if parent_id.rank == 1 and parent_id.type_class == 'H5T_COMPOUND': + tgt = Table(parent_id, track_order=track_order) else: - parent_name = parent_name + '/' + link - self.log.debug(f"create group - parent name: {parent_name}") - sub_group._name = parent_name + tgt = Dataset(parent_id, track_order=track_order) + else: + raise TypeError(f"unexpected type: {type(parent_id)}") - return sub_group + tgt._name = parent_name + + return tgt - def create_group(self, h5path, track_order=None): + def create_group(self, h5path, track_order=None, ignore_exists=False): """ Create and return a new subgroup. Name may be absolute or relative. Fails if the target name already exists. 
""" - if isinstance(h5path, bytes): - h5path = h5path.decode('utf-8') - if h5path is None: - # anonymous group - sub_group = self._make_group(track_order=track_order) - return sub_group - - if h5path[-1] == '/': - raise ValueError("Invalid path for create_group") - elif h5path[0] == '/': - # absolute path - parent_uuid = self.file.id.id # uuid of root group - parent_name = "/" - else: - parent_uuid = self.id.id - parent_name = self._name - - self.log.info(f"create_group: {h5path}") - - links = h5path.split('/') - sub_group = None # the object we'll return - for link in links: - if not link: - continue # skip - self.log.debug(f"create_group - iterate for link: {link}") - create_group = False - req = "/groups/" + parent_uuid + "/links/" + link - - try: - rsp_json = self.GET(req) - except IOError as ioe: - self.log.debug(f"Got ioe: {ioe}") - create_group = True - - if create_group: - kwargs = {} - kwargs["parent_id"] = parent_uuid - kwargs["parent_name"] = parent_name - kwargs["link"] = link - kwargs["track_order"] = track_order - sub_group = self._make_group(**kwargs) - parent_uuid = sub_group.id.id - + obj_id = self.id.make_obj(None, track_order=track_order) + return Group(obj_id) + + if not ignore_exists: + # verify an existing link is not already present + h5parent = _h5parent(h5path) + h5base = _h5base(h5path) + if h5parent: + parent_group = self._get_bypath(h5parent, create=True) else: - # sub-group already exists - self.log.debug(f"create group - found subgroup: {link}") - if "link" not in rsp_json: - raise IOError("Unexpected Error") - link_json = rsp_json["link"] - if link_json["class"] != 'H5L_TYPE_HARD': - # TBD: get the referenced object for softlink? - raise IOError("cannot create subgroup of softlink") - parent_uuid = link_json["id"] - if parent_name: - if parent_name[-1] == '/': - parent_name = parent_name + link_json["title"] - else: - parent_name = parent_name + '/' + link_json["title"] - self.log.debug(f"create group - parent name: {parent_name}") - - if sub_group is None: - # didn't actually create anything - raise ValueError("name already exists") + parent_group = self + if parent_group.id.has_link(h5base): + self.log.warning("unable to create_group: {h5parent}, already exists") + raise ValueError("Unable to synchronously create group (name already exists)") + + sub_group = self._get_bypath(h5path, track_order=track_order, create=True) return sub_group @@ -402,39 +236,27 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): # convert byte input to string name = name.decode("utf-8") - dsid = dataset.make_new_dset(self, shape=shape, dtype=dtype, data=data, **kwds) - dset = dataset.Dataset(dsid) - - if name is not None: - items = name.split('/') - path = [] - for item in items: - if len(item) > 0: - path.append(item) # just get non-empty strings - - grp = self - - if len(path) == 0: - # no name, just return anonymous dataset - return dset - - dset_link = path[-1] - dset._name = self._name - if dset._name[-1] != '/': - dset._name += '/' - if len(path) > 1: - grp_path = path[:-1] - # create any grps along the path that don't already exist - for item in grp_path: - if item not in grp: - grp = grp.create_group(item) - else: - grp = grp[item] + if name: + if name[-1] == '/': + raise ValueError("Invalid path for create_dataset") + h5path = _h5parent(name) + if h5path: + parent_grp = self._get_bypath(h5path, create=True) + else: + parent_grp = self + base_name = _h5base(name) + else: + parent_grp = self + base_name = None - dset._name = dset._name + item + 
'/' + dset_id = dataset.make_new_dset(parent_grp, name=base_name, shape=shape, dtype=dtype, data=data, **kwds) + if dset_id.rank == 1 and dset_id.type_class == 'H5T_COMPOUND': + dset = Table(dset_id) + else: + dset = Dataset(dset_id) - dset._name += dset_link - grp[dset_link] = dset + if base_name: + dset._name = f"{self._name}/{base_name}" return dset @@ -459,10 +281,10 @@ def create_dataset_like(self, name, other, **kwupdate): kwupdate.setdefault(k, getattr(other, k)) # TODO: more elegant way to pass these (dcpl to create_dataset?) - dcpl_json = other.id.dcpl_json + cpl = other.id.cpl track_order = None - if "CreateOrder" in dcpl_json: - createOrder = dcpl_json["CreateOrder"] + if "CreateOrder" in cpl: + createOrder = cpl["CreateOrder"] if not createOrder or createOrder == "0": track_order = False else: @@ -528,8 +350,12 @@ def create_table(self, name, numrows=None, dtype=None, data=None, **kwds): ValueError("dtype must be compound") kwds["maxshape"] = (0,) dset = self.create_dataset(name, shape=shape, dtype=dtype, data=data, **kwds) - obj = table.Table(dset.id) - return obj + tbl = Table(dset.id) + + if name: + tbl._name = f"{self._name}/{name}" + + return tbl def require_dataset(self, name, shape, dtype, exact=False, **kwds): """ Open a dataset, creating it if it doesn't exist. @@ -569,80 +395,15 @@ def require_dataset(self, name, shape, dtype, exact=False, **kwds): return dset - def require_group(self, name): + def require_group(self, name, track_order=None): """ Return a group, creating it if it doesn't exist. TypeError is raised if something with that name already exists that isn't a group. """ - if isinstance(name, bytes): - # convert byte input to string - name = name.decode("utf-8") - - if name not in self: - return self.create_group(name) - grp = self[name] - if not isinstance(grp, Group): - raise TypeError(f"Incompatible object ({grp.__class__.__name__}) already exists") + grp = self._get_bypath(name, track_order=track_order, create=True) return grp - def getObjByUuid(self, uuid, collection_type=None, track_order=None): - """ Utility method to get an obj based on collection type and uuid """ - self.log.debug(f"getObjByUuid({uuid})") - obj_json = None - # need to do somee hacky code for h5serv vs hsds compatibility - # trim off any collection prefix from the input - if uuid.startswith("groups/"): - uuid = uuid[len("groups/"):] - if collection_type is None: - collection_type = 'groups' - elif uuid.startswith("datasets/"): - uuid = uuid[len("datasets/"):] - if collection_type is None: - collection_type = 'datasets' - elif uuid.startswith("datatypes/"): - uuid = uuid[len("datatypes/"):] - if collection_type is None: - collection_type = 'datatypes' - if collection_type is None: - if uuid.startswith("g-"): - collection_type = "groups" - elif uuid.startswith("t-"): - collection_type = "datatypes" - elif uuid.startswith("d-"): - collection_type = "datasets" - else: - raise IOError(f"Unexpected uuid: {uuid}") - objdb = self.id.http_conn.getObjDb() - if objdb and uuid in objdb: - # we should be able to construct an object from objdb json - obj_json = objdb[uuid] - else: - # will need to get JSON from server - req = f"/{collection_type}/{uuid}" - # make server request - params = {} - if track_order is not None: - params["CreateOrder"] = "1" if track_order else "0" - obj_json = self.GET(req, params=params) - - if collection_type == 'groups': - tgt = Group(GroupID(self, obj_json), track_order=track_order) - elif collection_type == 'datatypes': - tgt = Datatype(TypeID(self, obj_json)) - elif 
collection_type == 'datasets': - # create a Table if the dataset is one dimensional and compound - shape_json = obj_json["shape"] - dtype_json = obj_json["type"] - if "dims" in shape_json and len(shape_json["dims"]) == 1 and dtype_json["class"] == 'H5T_COMPOUND': - tgt = Table(DatasetID(self, obj_json), track_order=track_order) - else: - tgt = Dataset(DatasetID(self, obj_json), track_order=track_order) - else: - raise IOError(f"Unexpected collection_type: {collection_type}") - - return tgt - def __getitem__(self, name, track_order=None): """ Open an object in the file """ # convert bytes to str for PY3 @@ -650,85 +411,40 @@ def __getitem__(self, name, track_order=None): name = name.decode('utf-8') self.log.debug(f"group.__getitem__({name}, track_order={track_order})") - tgt = None + tgt_uuid = None if isinstance(name, h5type.Reference): tgt = name.objref() # weak reference to ref object if tgt is not None: return tgt # ref'd object has not been deleted - if isinstance(name.id, GroupID): - tgt = self.getObjByUuid(name.id.uuid, collection_type="groups", track_order=track_order) - elif isinstance(name.id, DatasetID): - tgt = self.getObjByUuid(name.id.uuid, collection_type="datasets", track_order=track_order) - elif isinstance(name.id, TypeID): - tgt = self.getObjByUuid(name.id.uuid, collection_type="datasets", track_order=track_order) else: - raise IOError("Unexpected Error - ObjectID type: " + name.__class__.__name__) - return tgt - - if isUUID(name): - tgt = self.getObjByUuid(name) - return tgt - - parent_uuid, link_json = self._get_link_json(name) - link_class = link_json['class'] - - if link_class == 'H5L_TYPE_HARD': - tgt = self.getObjByUuid(link_json['id'], collection_type=link_json['collection'], track_order=track_order) - elif link_class == 'H5L_TYPE_SOFT': - h5path = link_json['h5path'] - soft_parent_uuid, soft_json = self._get_link_json(h5path) - tgt = self.getObjByUuid(soft_json['id'], collection_type=soft_json['collection'], track_order=track_order) - - elif link_class == 'H5L_TYPE_EXTERNAL': - # try to get a handle to the file and return the linked object... 
- # Note: set use_session to false since file.close won't be called - # (and hince the httpconn socket won't be closed) - from .files import File - external_domain = link_json['h5domain'] - if not external_domain.startswith("hdf5://") and not op.isabs(external_domain): - current_domain = self._id.http_conn.domain - external_domain = op.join(op.dirname(current_domain), external_domain) - external_domain = op.normpath(external_domain) - try: - endpoint = self.id.http_conn.endpoint - username = self.id.http_conn.username - password = self.id.http_conn.password - f = File(external_domain, endpoint=endpoint, username=username, password=password, mode='r', - track_order=track_order) - except IOError: - # unable to find external link - raise KeyError("Unable to open file: " + link_json['h5domain']) - return f[link_json['h5path']] - - elif link_class == 'H5L_TYPE_USER_DEFINED': - raise IOError("Unable to fetch user-defined link") - else: - raise IOError("Unexpected error, invalid link class:" + link_json['class']) - - # assign name - if name[0] == '/': - tgt._name = name + tgt_uuid = name.id.id + elif isUUID(name): + tgt_uuid = name + elif name == "/": + # return root group + tgt_uuid = self.id.http_conn.root_uuid else: - if self.name: - if self.name[-1] == '/': - tgt._name = self.name + name + pass # will do a path lookup + + if tgt_uuid: + obj_id = self.id.get(tgt_uuid) + if isinstance(obj_id, GroupID): + tgt = Group(obj_id) + elif isinstance(obj_id, DatasetID): + if obj_id.rank == 1 and obj_id.type_class == 'H5T_COMPOUND': + tgt = Table(obj_id) else: - tgt._name = self.name + '/' + name + tgt = Dataset(obj_id) + elif isinstance(obj_id, TypeID): + tgt = Datatype(obj_id) else: - tgt._name = name - return tgt + raise IOError("Unexpected Error - ObjectID type: " + obj_id.__class__.__name__) + return tgt - def _objectify_link_Json(self, link_json): - if "id" in link_json: - link_obj = HardLink(link_json["id"]) - elif "h5path" in link_json and "h5domain" not in link_json: - link_obj = SoftLink(link_json["h5path"]) - elif "h5path" in link_json and "h5domain" in link_json: - link_obj = ExternalLink(link_json["h5domain"], link_json["h5path"]) - else: - raise ValueError("Invalid link JSON") + # get item by h5path + tgt = self._get_bypath(name, track_order=track_order) - return link_obj + return tgt def get(self, name, default=None, getclass=False, getlink=False, track_order=None, **kwds): """ Retrieve an item or other information. @@ -752,20 +468,6 @@ def get(self, name, default=None, getclass=False, getlink=False, track_order=Non List links and attributes by creation order if True, alphanumerically if False. If None, the track_order used when creating the group will be used. - "limit" is an integer: - If "name" is None, this will return the first "limit" links in the group. - - "marker" is a string: - If "name" is None, this will return only the links that come after the marker in the group's link ordering. - - "pattern" is a string: - If "name" is None, this will return only the links that match the given pattern - in the target group (and subgroups, if follow_links is provided). - Matching is done according to Unix pathname expansion rules. - - "follow_links" is True: - If "name" is None, subgroups of the target group will be recursively searched - for links that match the given names or pattern. 
Example: @@ -778,126 +480,76 @@ def get(self, name, default=None, getclass=False, getlink=False, track_order=Non if kwd not in kwd_args: raise TypeError(f"group.get() unexpected keyword argument: {kwd}") - if not (getclass or getlink): - try: - return self.__getitem__(name, track_order=track_order) - except KeyError: - return default - - if not isinstance(name, list) and name is not None and name not in self: - return default - - elif getclass and not getlink: - obj = self.__getitem__(name, track_order) - if obj is None: - return None - if obj.id.__class__ is GroupID: - return Group - elif obj.id.__class__ is DatasetID: - return Dataset - elif obj.id.__class__ is TypeID: - return Datatype - else: - raise TypeError("Unknown object type") - - elif getlink: - if name is None or isinstance(name, list): - # Get all links in target group(s) - # Retrieve "limit", "marker", and "pattern" from kwds - limit = kwds.get("limit", None) - marker = kwds.get("marker", None) - pattern = kwds.get("pattern", None) - follow_links = kwds.get("follow_links", False) - - if name and (limit or marker or pattern or follow_links): - raise ValueError("Cannot specify 'name' along with 'limit', 'marker', 'pattern', or 'follow_links'") - - req = "/groups/" + self.id.uuid + "/links" - params = {} - - if limit: - params["Limit"] = limit - if marker: - params["Marker"] = marker - if pattern: - params["pattern"] = pattern - if follow_links: - params["follow_links"] = 1 + if not name: + raise TypeError("Argument 'path' must not be None") - if track_order is not None: - if limit or marker or pattern or follow_links: - # send server request, otherwise we'll just sort - # client side (so things will work even if we have - # the request in http cache) - params["CreateOrder"] = "1" if track_order else "0" - - if name: - body = {} - - titles = [linkname.decode('utf-8') if - isinstance(linkname, bytes) else linkname for linkname in name] - body['titles'] = titles - rsp = self.POST(req, body=body, params=params) + h5path = _h5parent(name) + base_name = _h5base(name) + if not base_name: + # TBD: is this valid? 
+ raise IOError(f"invalid name: {name}") + + if not h5path: + parent = self + else: + parent = self.__getitem__(h5path) + if not isinstance(parent, Group): + self.log.error(f"unexpected object: {type(parent)}") + raise TypeError(name) + + if not isinstance(parent.id, GroupID): + self.log.error(f"unexpected object: {type(parent)}") + raise TypeError(name) + + if not parent.id.has_link(base_name): + raise IOError(f"{name} not found") + + if getlink: + link_json = parent.id.get_link(base_name) + link_class = link_json['class'] + if link_class == 'H5L_TYPE_HARD': + if getclass: + return HardLink else: - rsp = self.GET(req, params=params) - - if "links" in rsp: - # Process list of link objects so they may be accessed by name - links_out = collections.OrderedDict() - links = rsp["links"] - if all([isUUID(k) for k in links]): - # Multiple groups queried, links are returned under group ids - for group_id in links: - group_links = collections.OrderedDict() - - link_list = links[group_id] - if track_order is None: - pass # just use in the order we got from the server - elif track_order: - # sort by created key - link_list.sort(key=lambda d: d['created']) - else: - # sort by title - link_list.sort(key=lambda d: d['title']) - - for link in link_list: - group_links[link["title"]] = self._objectify_link_Json(link) - - links_out[group_id] = group_links - - else: - if track_order is None: - pass # just use in the order we got from the server - elif track_order: - # sort by created key - links.sort(key=lambda d: d['created']) - else: - # sort by title - links.sort(key=lambda d: d['title']) - for link in links: - links_out[link["title"]] = self._objectify_link_Json(link) + return HardLink() + elif link_class == 'H5L_TYPE_SOFT': + if getclass: + return SoftLink else: - raise ValueError("Can't parse server response to links query") - - return links_out + soft_path = link_json["h5path"] + return SoftLink(soft_path) + elif link_class == 'H5L_TYPE_EXTERNAL': + if getclass: + return ExternalLink + else: + ext_path = link_json['h5path'] + domain_path = link_json['h5domain'] + return ExternalLink(domain_path, ext_path) else: - parent_uuid, link_json = self._get_link_json(name) - typecode = link_json['class'] - - if typecode == 'H5L_TYPE_SOFT': - if getclass: - return SoftLink + self.log.info(f"user-defined link class: {link_class}") + if getclass: + return UserDefinedLink + else: + return UserDefinedLink() - return SoftLink(link_json['h5path']) - elif typecode == 'H5L_TYPE_EXTERNAL': - if getclass: - return ExternalLink + else: + tgt = self.__getitem__(name) - return ExternalLink(link_json['h5domain'], link_json['h5path']) - elif typecode == 'H5L_TYPE_HARD': - return HardLink if getclass else HardLink(link_json['id']) + if getclass: + # return class of object that link is pointing too + if isinstance(tgt.id, GroupID): + return Group + elif isinstance(tgt.id, TypeID): + return Datatype + elif isinstance(tgt, DatasetID): + return Dataset else: - raise TypeError("Unknown link type") + raise TypeError("Unexpected id class: {type(tgt)}") + else: + # getclass and getlink ar false, return the object + if track_order is not None: + tgt._track_order = track_order + return tgt def __setitem__(self, name, obj): """ Add an object to the group. The name must not already be in use. @@ -923,83 +575,42 @@ def __setitem__(self, name, obj): values are stored as scalar datasets. Raise ValueError if we can't understand the resulting array dtype. 
""" - if isinstance(name, list) and isinstance(obj, list): - if len(name) != len(obj): - raise ValueError("name and object list lengths do not match") - - links = {} - - for i in range(len(name)): - if isinstance(obj[i], HLObject): - links[name[i]] = {"id": obj[i].id.uuid} - elif isinstance(obj[i], SoftLink): - links[name[i]] = {"h5path": obj[i].path} - elif isinstance(obj[i], ExternalLink): - links[name[i]] = {"h5path": obj[i].path, "h5domain": obj[i].filename} - else: - raise ValueError("only links are supported for multiple object creation") - body = {"links": links} - req = "/groups/" + self.id.uuid + "/links" - self.PUT(req, body=body) + if not name: + raise TypeError("Argument 'path', must not be none") + if len(name) > 1 and name[-1] == '/': + name = name[:-1] + parent_path = _h5parent(name) + basename = _h5base(name) - elif name.find('/') != -1: - parent_path = op.dirname(name) - basename = op.basename(name) - if not basename: - raise KeyError("Group path can not end with '/'") - parent_uuid, link_json = self._get_link_json(parent_path) - if parent_uuid is None: - raise KeyError("group path: {} not found".format(parent_path)) - if link_json["class"] != 'H5L_TYPE_HARD': - raise IOError("cannot create subgroup of softlink") - parent_uuid = link_json["id"] - req = "/groups/" + parent_uuid - params = {} - if self.track_order is not None: - params["CreateOrder"] = "1" if self.track_order else "0" - group_json = self.GET(req, params=params) - tgt = Group(GroupID(self, group_json)) - tgt[basename] = obj - - elif isinstance(obj, HLObject): - body = {'id': obj.id.uuid} - req = "/groups/" + self.id.uuid + "/links/" + name - self.PUT(req, body=body) + if parent_path: + grp = self._get_bypath(parent_path, create=True) + else: + grp = self - elif isinstance(obj, SoftLink): - body = {'h5path': obj.path} - req = "/groups/" + self.id.uuid + "/links/" + name - self.PUT(req, body=body) - # self.id.links.create_soft(name, self._e(obj.path), - # lcpl=lcpl, lapl=self._lapl) + self.log.debug(f"got parent group for set: {grp.name}") + if basename in grp: + self.log.warning(f"link with {basename} already exists") + raise IOError("Unable to create link (name already exists)") + + if isinstance(obj, HLObject): + # create a hardlink to the given object + link_json = {'class': 'H5L_TYPE_HARD', 'id': obj.id.id} + grp.id.set_link(basename, link_json) + elif isinstance(obj, SoftLink): + link_json = {'class': 'H5L_TYPE_SOFT', 'h5path': obj.path} + grp.id.set_link(basename, link_json) elif isinstance(obj, ExternalLink): - body = {'h5path': obj.path, - 'h5domain': obj.filename} - req = "/groups/" + self.id.uuid + "/links/" + name - self.PUT(req, body=body) - # self.id.links.create_external(name, self._e(obj.filename), - # self._e(obj.path), lcpl=lcpl, lapl=self._lapl) + link_json = {'class': 'H5L_TYPE_EXTERNAL', 'h5path': obj.path} + link_json['h5domain'] = obj.filename + grp.id.set_link(basename, link_json) elif isinstance(obj, numpy.dtype): - # print "create named type" + self.log.info("create named type") type_json = h5type.getTypeItem(obj) - req = "/datatypes" - - body = {'type': type_json} - rsp = self.POST(req, body=body) - body['id'] = rsp['id'] - body['lastModified'] = rsp['lastModified'] - - type_id = TypeID(self, body) - req = "/groups/" + self.id.uuid + "/links/" + name - body = {'id': type_id.uuid} - self.PUT(req, body=body) - - # htype = h5t.py_create(obj) - # htype.commit(self.id, name, lcpl=lcpl) + grp.id.make_obj(name, type_json=type_json) else: if isinstance(obj, numpy.ndarray): @@ -1010,96 +621,69 
@@ def __setitem__(self, name, obj): else: dt = guess_dtype(obj) arr = numpy.array(obj, dtype=dt) - self.create_dataset(name, shape=arr.shape, dtype=arr.dtype, data=arr[...]) - - if isinstance(name, str) and name.find('/') != -1: - # object in this group, update link db - self._refresh_link_cache() + grp.create_dataset(basename, shape=arr.shape, dtype=arr.dtype, data=arr[...]) + # link was created for us def __delitem__(self, name): """ Delete (unlink) an item from this group. """ + objdb = self.id.http_conn.objdb if isUUID(name): - tgt = self.getObjByUuid(name) - if tgt: - if isinstance(tgt.id, GroupID): - req = "/groups/" + tgt.id.uuid - elif isinstance(tgt.id, DatasetID): - req = "/datasets/" + tgt.id.uuid - elif isinstance(tgt.id, TypeID): - req = "/datatypes/" + tgt.id.uuid - else: - raise TypeError(f"unexpected type for object id: {tgt.id}") + obj_id = op.basename(name) + if obj_id in objdb: + del objdb[obj_id] else: - raise IOError("Not found") - + self.log.warning(f"expected to find obj_id: {obj_id} for delete") else: - # delete the link(s), not an object - if isinstance(name, list): - # delete multiple links - req = "/groups/" + self.id.uuid + "/links?titles=" + '/'.join(name) - else: - # delete single link - req = "/groups/" + self.id.uuid + "/links/" + name + parent_path = _h5parent(name) + basename = _h5base(name) + if not basename: + raise KeyError("Group path can not end with '/'") - self.DELETE(req) + if parent_path: + grp = self._get_bypath(parent_path) + else: + grp = self - self._refresh_link_cache() + grp.id.del_link(basename) def __len__(self): """ Number of members attached to this group """ - self._refresh_link_cache() - num_links = len(self._link_db) + num_links = self.id.link_count return num_links - def _get_link_list(self, track_order=None): - if track_order is None: - track_order = self.track_order - self._refresh_link_cache() - - # convert to a list of dicts - links = [] - for title in self._link_db: - link = self._link_db[title] - links.append(link) - - if track_order: - links.sort(key=lambda d: d['created']) - else: - links.sort(key=lambda d: d['title']) - return links - def __iter__(self): """ Iterate over member names """ - links = self._get_link_list() + titles = self.id.get_link_titles(track_order=self.track_order) - for link in links: - yield link['title'] + for title in titles: + yield title def __reversed__(self): """ Iterate over member names in reverse order """ - self._refresh_link_cache() - links = self._get_link_list() + titles = self.id.get_link_titles(track_order=self.track_order) - for link in reversed(links): - yield link['title'] + for title in reversed(titles): + yield title def __contains__(self, name): """ Test if a member name exists """ - found = False + if name == "/": + return True + parent_path = _h5parent(name) + basename = _h5base(name) + if not basename: + raise KeyError("Group path can not end with '/'") + + if parent_path: + grp = self._get_bypath(parent_path) + else: + grp = self - if name.find('/') == -1: - # a link in this group - self._refresh_link_cache() - if name in self._link_db: - found = True + if grp.id.has_link(basename): + return True else: - try: - self._get_link_json(name) - found = True - except KeyError: - pass # not found - return found + return False def copy(self, source, dest, name=None, shallow=False, expand_soft=False, expand_external=False, @@ -1220,28 +804,8 @@ def visititems(self, func): visited[parent.id.uuid] = True if parent.id.__class__ is GroupID: # get group links - objdb = 
self.id._http_conn.getObjDb() - if objdb: - # should be able to retrieve from cache obj - if parent.id.uuid not in objdb: - raise IOError(f"expected to find id {parent.id.uuid} in objdb") - group_json = objdb[parent.id.uuid] - # make this look like the server response - links_json = group_json["links"] - links = [] - for k in links_json: - item = links_json[k] - item['title'] = k - links.append(item) - else: - # request from server - req = "/groups/" + parent.id.uuid + "/links" - params = {} - if self.track_order is not None: - params["CreateOrder"] = "1" if self.track_order else "0" - rsp_json = self.GET(req, params=params) - links = rsp_json['links'] - for link in links: + for title in parent.id.get_link_titles(): + link = parent.id.get_link(title) obj = None if link['class'] == 'H5L_TYPE_SOFT': # obj = SoftLink(link['h5path']) @@ -1254,7 +818,7 @@ def visititems(self, func): elif link['class'] == 'H5L_TYPE_HARD': if link['id'] in visited: continue # already been there - obj = parent.__getitem__(link['title']) + obj = parent.__getitem__(title) tovisit[obj.id.uuid] = obj obj = None if obj is not None: @@ -1288,7 +852,6 @@ def refresh(self): """Refresh the group metadata by reloading from the file. """ self.id.refresh() - self._refresh_link_cache(force=True) class HardLink(object): diff --git a/h5pyd/_hl/httpconn.py b/h5pyd/_hl/httpconn.py index 8b981ff9..f803258f 100644 --- a/h5pyd/_hl/httpconn.py +++ b/h5pyd/_hl/httpconn.py @@ -2,7 +2,7 @@ # Copyright by The HDF Group. # # All rights reserved. # # # -# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # +# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # # Utilities. The full HDF5 REST Server copyright notice, including # # terms governing use, modification, and redistribution, is contained in # # the file COPYING, which can be found at the root of the source code # @@ -25,11 +25,10 @@ import logging from . import openid +from .objdb import ObjDB from .. import config from . import requests_lambda -MAX_CACHE_ITEM_SIZE = 10000 # max size of an item to put in the cache - def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) @@ -40,30 +39,19 @@ def eprint(*args, **kwargs): 1000, ) # #20 # 180 # seconds - allow time for hsds service to bounce - -class CacheResponse(object): - """Wrap a json response in a Requests.Response looking class. 
- Note: we don't want to keep a proper requests obj in the cache since it - would contain refernces to other objects - """ - - def __init__(self, rsp): - # just save off what we need - self._text = rsp.text - self._status_code = rsp.status_code - self._headers = rsp.headers - - @property - def text(self): - return self._text - - @property - def status_code(self): - return self._status_code - - @property - def headers(self): - return self._headers +""" +def verifyCert(self): + # default to validate CERT for https requests, unless + # the H5PYD_VERIFY_CERT environment variable is set and True + # + # TBD: set default to True once the signing authority of data.hdfgroup.org is + # recognized + if "H5PYD_VERIFY_CERT" in os.environ: + verify_cert = os.environ["H5PYD_VERIFY_CERT"].upper() + if verify_cert.startswith('F'): + return False + return True +""" def getAzureApiKey(): @@ -146,6 +134,118 @@ def getKeycloakApiKey(): return api_key +class HttpResponse: + """ wrapper for http request responses """ + def __init__(self, rsp, logger=None): + self._rsp = rsp + self._logger = logger + if logger is None: + self.log = logging + else: + self.log = logging.getLogger(logger) + self._text = None + + @property + def status_code(self): + """ return response status code """ + return self._rsp.status_code + + @property + def reason(self): + """ return response reason """ + return self._rsp.reason + + @property + def content_type(self): + """ return content type """ + rsp = self._rsp + if 'Content-Type' in rsp.headers: + content_type = rsp.headers['Content-Type'] + else: + content_type = "" + return content_type + + @property + def content_length(self): + """ Return length of response if available """ + if 'Content-Length' in self._rsp.headers: + content_length = self._rsp.headers['Content-Length'] + else: + content_length = None + return content_length + + @property + def is_binary(self): + """ return True if the response indicates binary data """ + + if self.content_type == "application/octet-stream": + return True + else: + return False + + @property + def is_json(self): + """ return true if response indicates json """ + + if self.content_type.startswith("application/json"): + return True + else: + return False + + @property + def text(self): + """ getresponse content as bytes """ + + if not self._text: + rsp = self._rsp + if not self.is_binary: + # hex encoded response? 
+ # this is returned by API Gateway for lambda responses + self._text = bytes.fromhex(rsp.text) + else: + if self.content_length: + self.log.debug(f"got binary response, {self.content_length} bytes") + else: + self.log.debug("got binary response, content_length unknown") + + HTTP_CHUNK_SIZE = 4096 + http_chunks = [] + downloaded_bytes = 0 + for http_chunk in rsp.iter_content(chunk_size=HTTP_CHUNK_SIZE): + if http_chunk: # filter out keep alive chunks + self.log.debug(f"got http_chunk - {len(http_chunk)} bytes") + downloaded_bytes += len(http_chunk) + http_chunks.append(http_chunk) + if len(http_chunks) == 0: + raise IOError("no data returned") + if len(http_chunks) == 1: + # can return first and only chunk as response + self._text = http_chunks[0] + else: + msg = f"retrieved {len(http_chunks)} http_chunks " + msg += f" {downloaded_bytes} total bytes" + self.log.info(msg) + self._text = bytearray(downloaded_bytes) + index = 0 + for http_chunk in http_chunks: + self._text[index:(index + len(http_chunk))] = http_chunk + index += len(http_chunk) + + return self._text + + def json(self): + """ Return json from response""" + + rsp = self._rsp + + if not self.is_json: + raise IOError("response is not json") + + rsp_json = json.loads(rsp.text) + self.log.debug(f"rsp_json - {len(rsp.text)} bytes") + return rsp_json + + class HttpConn: """ Some utility methods based on equivalents in base class. @@ -166,6 +266,7 @@ def __init__( logger=None, retries=3, timeout=DEFAULT_TIMEOUT, + objdb=None, **kwds, ): self._domain = domain_name @@ -179,12 +280,7 @@ def __init__( self._api_key = api_key self._s = None # Sessions self._server_info = None - if use_cache: - self._cache = {} - self._objdb = {} - else: - self._cache = None - self._objdb = None + self._logger = logger if logger is None: self.log = logging @@ -315,6 +411,8 @@ def __init__( else: self.log.error(f"Unknown openid provider: {provider}") + self._objdb = ObjDB(self, use_cache=use_cache) + def __del__(self): if self._hsds: self.log.debug("hsds stop") @@ -329,6 +427,11 @@ def getHeaders(self, username=None, password=None, headers=None): if headers is None: headers = {} + + # This should be the default - but explicitly set anyway + if "Accept-Encoding" not in headers: + headers['Accept-Encoding'] = "deflate, gzip" + elif "Authorization" in headers: return headers # already have auth key if username is None: @@ -403,10 +506,11 @@ def verifyCert(self): return False return True - def getObjDb(self): + @property + def objdb(self): return self._objdb - def GET(self, req, format="json", params=None, headers=None, use_cache=True): + def GET(self, req, format="json", params=None, headers=None): if self._endpoint is None: raise IOError("object not initialized") # check that domain is defined (except for some specific requests) @@ -431,20 +535,6 @@ def GET(self, req, format="json", params=None, headers=None, use_cache=True): if format == "binary": headers["accept"] = "application/octet-stream" - # list of parameters which should disable cache usage - - check_cache = self._cache is not None and use_cache and format == "json" - check_cache = check_cache and params["domain"] == self._domain - check_cache = check_cache and "select" not in params and "query" not in params - check_cache = check_cache and "follow_links" not in params and "pattern" not in params - check_cache = check_cache and "Limit" not in params and "Marker" not in params - - if check_cache: - self.log.debug("httpcon - checking cache") - if req in self._cache: - self.log.debug("httpcon - returning 
cache result") - rsp = self._cache[req] - return rsp self.log.info(f"GET: {self._endpoint + req} [{params['domain']}] timeout: {self._timeout}") for k in params: @@ -480,68 +570,14 @@ def GET(self, req, format="json", params=None, headers=None, use_cache=True): self.log.error(f"got {type(e)} exception: {e}") raise IOError("Unexpected exception") - content_type = None - if rsp.status_code == 200 and self._cache is not None: - rsp_headers = rsp.headers - content_length = 0 - if "Content-Length" in rsp_headers: - try: - content_length = int(rsp_headers["Content-Length"]) - except ValueError: - content_length = MAX_CACHE_ITEM_SIZE + 1 - self.log.debug(f"content_length: {content_length}") - - if "Content-Type" in rsp_headers: - content_type = rsp_headers["Content-Type"] - self.log.debug(f"content_type: {content_type}") - - add_to_cache = content_type and content_type.startswith("application/json") - add_to_cache = add_to_cache and content_length < MAX_CACHE_ITEM_SIZE and not req.endswith("/value") - add_to_cache = add_to_cache and "follow_links" not in params and "pattern" not in params - add_to_cache = add_to_cache and "Limit" not in params and "Marker" not in params - - if add_to_cache: - # add to our _cache - cache_rsp = CacheResponse(rsp) - self.log.debug(f"adding {req} to cache") - self._cache[req] = cache_rsp - - if rsp.status_code == 200 and req == "/": - self.log.info(f"got domain json: {len(rsp.text)} bytes") - self._domain_json = json.loads(rsp.text) - - # when calling AWS Lambda thru API Gatway, the status_code - # indicates the Lambda request was successful, but not necessarily - # the requested HSDS action was. - # Check here and raise IOError is needed. - - json_success = (rsp.status_code == 200) and content_type and content_type.startswith("application/json") - - if json_success: - body = json.loads(rsp.text) - if "statusCode" in body: - status_code = body["statusCode"] - if status_code == 400: - raise IOError("Invalid request") - if status_code == 403: - raise IOError("Unauthorize") - if status_code == 404: - raise IOError("Not found") - if status_code == 410: - raise IOError("Conflict") - if status_code == 500: - raise IOError("Unexpected error") - - return rsp + return HttpResponse(rsp) def PUT(self, req, body=None, format="json", params=None, headers=None): if self._endpoint is None: raise IOError("object not initialized") if self._domain is None: raise IOError("no domain defined") - if self._cache is not None: - # update invalidate everything in cache - self._cache = {} + if params: self.log.info(f"PUT params: {params}") else: @@ -594,17 +630,13 @@ def PUT(self, req, body=None, format="json", params=None, headers=None): self.log.info("clearing domain_json cache") self._domain_json = None self.log.info(f"PUT returning: {rsp}") - return rsp + return HttpResponse(rsp) def POST(self, req, body=None, format="json", params=None, headers=None): if self._endpoint is None: raise IOError("object not initialized") if self._domain is None: raise IOError("no domain defined") - if self._cache is not None: - # invalidate cache for updates - # TBD: handle special case for point selection since that doesn't modify anything - self._cache = {} if params is None: params = {} @@ -660,13 +692,12 @@ def POST(self, req, body=None, format="json", params=None, headers=None): if rsp.status_code not in (200, 201): self.log.error(f"POST error: {rsp.status_code}") - return rsp + return HttpResponse(rsp) def DELETE(self, req, params=None, headers=None): if self._endpoint is None: raise IOError("object not 
initialized") - if self._cache is not None: - self._cache = {} + if req not in ("/domains", "/") and self._domain is None: raise IOError("no domain defined") if params is None: @@ -701,10 +732,10 @@ def DELETE(self, req, params=None, headers=None): raise IOError("Connection Error") if rsp.status_code == 200 and req == "/": - self.log.info("clearning domain_json cache") + self.log.info("clearing domain_json cache") self._domain_json = None - return rsp + return HttpResponse(rsp) @property def session(self): @@ -775,13 +806,6 @@ def password(self): def mode(self): return self._mode - @property - def cache_on(self): - if self._cache is None: - return False - else: - return True - @property def domain_json(self): if self._domain_json is None: @@ -789,7 +813,7 @@ def domain_json(self): if rsp.status_code != 200: raise IOError(rsp.reason) # assume JSON - self._domain_json = json.loads(rsp.text) + self._domain_json = rsp.json() return self._domain_json @property diff --git a/h5pyd/_hl/objdb.py b/h5pyd/_hl/objdb.py new file mode 100644 index 00000000..599ea46d --- /dev/null +++ b/h5pyd/_hl/objdb.py @@ -0,0 +1,406 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +from __future__ import absolute_import + +import time +# import weakref +from .. 
import config + + +def get_collection(uuid): + """ Return the collection type for the given obj uuid """ + + if uuid.startswith("g-"): + return "groups" + elif uuid.startswith("t-"): + return "datatypes" + elif uuid.startswith("d-"): + return "datasets" + else: + raise TypeError(f"unexpected uuid: {uuid}") + + +class ObjDB(): + """ Domain level object map """ + def __init__(self, http_conn, use_cache=True): + self._http_conn = http_conn # weakref.ref(http_conn) + self._objdb = {} + self._loadtime = {} + self._use_cache = use_cache + self.log = http_conn.logging + + def fetch(self, obj_uuid): + """ get obj_json for given obj_uuid from the server """ + + self.log.debug(f"ObjDB.fetch({obj_uuid})") + + if obj_uuid.startswith("g-"): + collection_type = "groups" + elif obj_uuid.startswith("t-"): + collection_type = "datatypes" + elif obj_uuid.startswith("d-"): + collection_type = "datasets" + else: + msg = f"Unexpected obj_uuid: {obj_uuid}" + self.log.error(msg) + raise IOError(msg) + + req = f"/{collection_type}/{obj_uuid}" + # make server request + params = {"include_attrs": 1} + if collection_type == "groups": + # get links as well + params["include_links"] = 1 + rsp = self._http_conn.GET(req, params=params) + if rsp.status_code in (404, 410): + self.log.warning(f"obj: {obj_uuid} not found") + return None + elif rsp.status_code != 200: + raise IOError(f"Unexpected error on request {req}: {rsp.status_code}") + obj_json = rsp.json() + self.__set_item__(obj_uuid, obj_json) + return obj_json + + def __set_item__(self, obj_uuid, obj_json): + """ set the obj_json in the db with obj_uuid as the key """ + + discard_keys = ('root', 'id', 'attributeCount', 'linkCount', 'hrefs', 'domain', 'bucket') + # tbd: should bucket be supported? Not being returned in GET request + for k in discard_keys: + if k in obj_json: + del obj_json[k] + + if "attributes" not in obj_json: + obj_json["attributes"] = {} + if get_collection(obj_uuid) == "groups" and "links" not in obj_json: + obj_json["links"] = {} + + # assign or replace current object + self._objdb[obj_uuid] = obj_json + self._loadtime[obj_uuid] = time.time() + + return obj_json + + def __getitem__(self, obj_uuid): + if obj_uuid not in self._objdb: + self.log.warning(f"id: {obj_uuid} not found in objDB") + raise KeyError(obj_uuid) + obj_json = self._objdb[obj_uuid] + return obj_json + + def __delitem__(self, obj_uuid): + if obj_uuid not in self._objdb: + self.log.warning(f"id: {obj_uuid} not found for deletion in objDB") + raise KeyError(obj_uuid) + del self._objdb[obj_uuid] + del self._loadtime[obj_uuid] + + def __len__(self): + return len(self._objdb) + + def __iter__(self): + for obj_uuid in self._objdb: + yield obj_uuid + + def __contains__(self, obj_uuid): + if obj_uuid in self._objdb: + return True + else: + return False + + def load(self, domain_objs): + """ load content from hsds summary json """ + for obj_uuid in domain_objs: + obj_json = domain_objs[obj_uuid] + self.__set_item__(obj_uuid, obj_json) + + def reload(self): + """ re-initialize objdb """ + self.log.info(f"objdb.reload {self._http_conn.domain}") + self._objdb = {} + self._loadtime = {} + obj_uuids = set() + obj_uuids.add(self._http_conn.root_uuid) + while obj_uuids: + obj_uuid = obj_uuids.pop() + obj_json = self.fetch(obj_uuid) + self.__set_item__(obj_uuid, obj_json) + + if "links" in obj_json: + # add ids for any hard-links to our search if + # not in the db already + links = obj_json["links"] + for title in links: + self.log.debug(f"got link: {title}") + link = links[title] + if "class" 
not in link:
+                        self.log.error(f"expected to find class key in {link}")
+                        continue
+                    if link['class'] != 'H5L_TYPE_HARD':
+                        continue  # only care about hard links
+                    if "id" not in link:
+                        self.log.error(f"expected to find id key in {link}")
+                        continue
+                    link_id = link['id']
+                    if link_id in self._objdb:
+                        # we've already fetched this object
+                        continue
+                    self.log.debug(f"adding hardlink id: {link_id}")
+                    obj_uuids.add(link_id)
+
+        self.log.info(f"objdb.reload complete, {len(self._objdb)} objects loaded")
+
+    def get_bypath(self, parent_uuid, h5path, follow=False, getlink=False):
+        """ Return obj_json for the given link path starting from parent_uuid """
+        self.log.debug(f"get_bypath(parent_uuid: {parent_uuid}), h5path: {h5path}")
+        if not parent_uuid.startswith("g-"):
+            self.log.error("get_bypath - expected parent_uuid to be a group id")
+            raise TypeError()
+        if parent_uuid not in self._objdb:
+            self.log.warning("get_bypath - parent_uuid not found")
+            raise KeyError(f"parent_uuid: {parent_uuid} not found")
+
+        obj_id = parent_uuid
+        obj_json = self._objdb[obj_id]
+        searched_ids = set([obj_id])  # ids visited so far, used to detect circular paths
+
+        link_names = h5path.split('/')
+        self.log.debug(f"link_names: {link_names}")
+        for link_name in link_names:
+            if not link_name:
+                continue
+            link_tgt = None
+            self.log.debug(f"link_name: {link_name}")
+            if not obj_id:
+                break
+            if not obj_id.startswith("g-"):
+                self.log.error(f"get_bypath, {link_name} is not a group")
+                raise KeyError(h5path)
+            if 'links' not in obj_json:
+                self.log.error(f"expected to find links key in: {obj_json}")
+                raise KeyError(h5path)
+            links = obj_json['links']
+            self.log.debug(f"links: {links}")
+            if link_name not in links:
+                self.log.warning(f"link: {link_name} not found in {obj_id}")
+                self.log.debug(f"links: {links}")
+                raise KeyError(h5path)
+            link_tgt = links[link_name]
+            self.log.debug(f"link_tgt: {link_tgt}")
+            link_class = link_tgt['class']
+            obj_id = None
+            obj_json = None
+            if link_class == 'H5L_TYPE_HARD':
+                # hard link
+                obj_id = link_tgt['id']
+                if obj_id in searched_ids:
+                    self.log.warning(f"circular reference using path: {h5path}")
+                    raise KeyError(h5path)
+                if obj_id not in self._objdb:
+                    # TBD - fetch from the server in case this object has not
+                    # been loaded yet?
+ self.log.warning(f"id: {obj_id} not found") + obj_json = None + else: + searched_ids.add(obj_id) + obj_json = self._objdb[obj_id] + elif link_class == 'H5L_TYPE_SOFT': + if not follow: + continue + # soft link + slink_path = link_tgt['h5path'] + if not slink_path: + self.log.warning(f"id: {obj_id} has null h5path for link: {link_name}") + raise KeyError(h5path) + if slink_path.startswith('/'): + slink_id = self._http_conn.root_uuid + else: + slink_id = obj_id + # recursive call + try: + obj_json = self.get_bypath(slink_id, slink_path) + except KeyError: + self.log.warning(f"Unable to find object in softpath: {slink_path}") + continue + obj_id = obj_json['id'] + elif link_class == 'H5L_TYPE_EXTERNAL': + if not follow: + continue + # tbd + self.log.error("external link not supported") + else: + self.log.error(f"link type: {link_class} not supported") + + if getlink: + if not link_tgt: + self.log.warning("get_bypath link at {h5path} not found") + raise KeyError(h5path) + self.log.info(f"get_bypath link at {h5path} found link: {link_tgt}") + return link_tgt + else: + if not obj_id: + self.log.warning(f"get_bypath {h5path} not found") + raise KeyError(h5path) + self.log.info(f"get_bypath link at {h5path} found target: {obj_id}") + return obj_json + + def set_link(self, group_uuid, title, link_json, replace=False): + """ create/update the given link """ + if not group_uuid.startswith("g-"): + raise TypeError("objdb.set_link - expected a group identifier") + if title.find('/') != -1: + raise KeyError("objdb.setlink - link title can not be nested") + obj_json = self.__getitem__(group_uuid) + links = obj_json["links"] + if title in links and replace: + # TBD: hsds update to for link replacement? + self.del_link(group_uuid, title) + # make a http put + req = f"/groups/{group_uuid}/links/{title}" + self._http_conn.PUT(req, body=link_json) # create the link + link_json['created'] = time.time() + links[title] = link_json + + def del_link(self, group_uuid, title): + if title.find('/') != -1: + raise KeyError("objdb.del_link - link title can not be nested") + obj_json = self.__getitem__(group_uuid) + links = obj_json["links"] + # tbd - validate link_json? + if title in links: + req = f"/groups/{group_uuid}/links/{title}" + rsp = self._http_conn.DELETE(req) + if rsp.status_code != 200: + raise IOError(rsp.status_code, f"failed to delete link: {title}") + # ok - so delete our cached copy + del links[title] + else: + self.log.warning(f"title: {title} not found in objdb for id {id}") + + def make_obj(self, parent_uuid, title, type_json=None, shape=None, cpl=None, track_order=None, maxdims=None): + """ create a new object + If type_json and shape_json are none - create a group + If type_json and shape_json - create a dataset + If type_json and not shape_json - create a datatype + """ + cfg = config.get_config() # pulls in state from a .hscfg file (if found). 
+
+        if track_order is None:
+            track_order = cfg.track_order
+        link_json = {}
+        if parent_uuid and title:
+            if title.find('/') != -1:
+                raise KeyError("link title can not be nested")
+            if parent_uuid not in self._objdb:
+                raise KeyError(f"parent_uuid: {parent_uuid} not found")
+
+            link_json["name"] = title
+
+        body = {}
+        if link_json:
+            body["link"] = link_json
+
+        if type_json:
+            body['type'] = type_json
+            if shape is not None:
+                body['shape'] = shape
+                if maxdims:
+                    body['maxdims'] = maxdims
+                req = "/datasets"
+            else:
+                req = "/datatypes"
+        else:
+            if shape:
+                raise KeyError("shape set, but no type")
+            req = "/groups"
+
+        if track_order:
+            if not cpl:
+                cpl = {}
+            cpl['CreateOrder'] = 1
+        if cpl:
+            body['creationProperties'] = cpl
+
+        # self.log.debug(f"create group with body: {body}")
+        rsp = self._http_conn.POST(req, body=body)
+        self.log.info(f"got status code: {rsp.status_code} for POST req: {req}")
+
+        if rsp.status_code not in (200, 201):
+            raise IOError(f"req: {req} failed with status: {rsp.status_code}")
+
+        obj_json = rsp.json()
+        # mixin creation props if set
+        if cpl:
+            obj_json['creationProperties'] = cpl
+        obj_uuid = obj_json['id']
+        self.__set_item__(obj_uuid, obj_json)  # update group db
+        if link_json:
+            # tweak link_json to look like a link entry on objdb
+            link_json['class'] = 'H5L_TYPE_HARD'
+            link_json['created'] = time.time()
+            link_json['id'] = obj_uuid
+            del link_json['name']
+            self.set_link(parent_uuid, title, link_json)
+
+        return obj_uuid
+
+    def set_attr(self, obj_uuid, name, attr_json):
+        """ create or update an attribute """
+        obj_json = self.__getitem__(obj_uuid)
+        attrs = obj_json["attributes"]
+        params = {}
+        if name in attrs:
+            self.log.debug(f"replacing attr {name} of {obj_uuid}")
+            params['replace'] = 1
+
+        collection = get_collection(obj_uuid)
+        req = f"/{collection}/{obj_uuid}/attributes/{name}"
+        rsp = self._http_conn.PUT(req, body=attr_json, params=params)
+
+        if rsp.status_code not in (200, 201):
+            self.log.error(f"got {rsp.status_code} for put req: {req}")
+            raise RuntimeError(f"Unexpected error on put request {req}: {rsp.status_code}")
+        self.log.info(f"got {rsp.status_code} for req: {req}")
+        attr_json['created'] = time.time()
+        attrs[name] = attr_json
+
+    def del_attr(self, obj_uuid, name):
+        """ delete the given attribute """
+        obj_json = self.__getitem__(obj_uuid)
+        attrs = obj_json["attributes"]
+        if name not in attrs:
+            self.log.warning(f"attr {name} of {obj_uuid} not found for delete")
+            raise KeyError("Unable to delete attribute (can't locate attribute)")
+
+        collection = get_collection(obj_uuid)
+        req = f"/{collection}/{obj_uuid}/attributes/{name}"
+        rsp = self._http_conn.DELETE(req)
+
+        if rsp.status_code != 200:
+            self.log.error(f"got {rsp.status_code} for delete req: {req}")
+            raise RuntimeError(f"Unexpected error on delete request {req}: {rsp.status_code}")
+        # remove from the objdb
+        del attrs[name]
+
+    def resize(self, dset_uuid, dims):
+        """ update the shape of the dataset """
+        # send the request to the server
+        body = {"shape": dims}
+        req = f"/datasets/{dset_uuid}/shape"
+        rsp = self._http_conn.PUT(req, body=body)
+        if rsp.status_code not in (200, 201):
+            msg = "unable to resize dataset shape"
+            raise IOError(rsp.status_code, msg)
+        # TBD Have HSDS return updated shape in response to avoid
+        # this GET request
+        self.fetch(dset_uuid)
diff --git a/h5pyd/_hl/objectid.py b/h5pyd/_hl/objectid.py
index 173f6d57..60368f71 100644
--- a/h5pyd/_hl/objectid.py
+++ b/h5pyd/_hl/objectid.py
@@ -12,9 +12,9 @@
 from __future__ import absolute_import
 
 from datetime import datetime
-import json
 import pytz
 import time
+# import weakref
 
 from .h5type import createDataType
 
@@ -26,16 +26,63 @@ def parse_lastmodified(datestr):
         dt = datetime.strptime(
             datestr, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=pytz.UTC)
     else:
-        # if the time is an int or float, interpet as seconds since epoch
+        # if the time is an int or float, interpret as seconds since epoch
         dt = datetime.fromtimestamp(time.time())
     return dt
 
 
+def isUUID(name):
+    # return True if name looks like an object id
+    # There are some additional checks we could add to reduce false positives
+    # (like checking for hyphens in the right places)
+    if isinstance(name, str) and len(name) >= 38:
+        if name.startswith("groups/") or name.startswith("g-"):
+            return True
+        elif name.startswith("datatypes/") or name.startswith("t-"):
+            return True
+        elif name.startswith("datasets/") or name.startswith("d-"):
+            return True
+        else:
+            return False
+    else:
+        return False
+
+
+def get_UUID(name):
+    """ return just the uuid part if the ref starts with 'groups/', 'datasets/' etc. """
+    if not isUUID(name):
+        raise IOError(f"expected a uuid, but got: {name}")
+    if name.startswith("groups/"):
+        obj_uuid = name[len("groups/"):]
+    elif name.startswith("datatypes/"):
+        obj_uuid = name[len("datatypes/"):]
+    elif name.startswith("datasets/"):
+        obj_uuid = name[len("datasets/"):]
+    else:
+        obj_uuid = name
+    return obj_uuid
+
+
+def get_class_for_uuid(uuid):
+    """ Return class based on uuid """
+    if not uuid:
+        return None
+    obj_uuid = get_UUID(uuid)
+    if obj_uuid.startswith("g-"):
+        return GroupID
+    elif obj_uuid.startswith("d-"):
+        return DatasetID
+    elif obj_uuid.startswith("t-"):
+        return TypeID
+    else:
+        raise TypeError(f"unexpected uuid string: {obj_uuid}")
+
+
 class ObjectID:
     """
-        Uniquely identifies an h5serv resource
+        Uniquely identifies a resource
     """
 
     @property
@@ -52,17 +99,31 @@ def __hash__(self):
 
     @property
     def domain(self):
         """ domain for this obj """
-        return self.http_conn.domain
+        return self._http_conn.domain
 
     @property
     def obj_json(self):
         """json representation of the object"""
-        return self._obj_json
+        objdb = self._http_conn.objdb
+        obj_json = objdb[self.uuid]
+        return obj_json
 
     @property
     def modified(self):
         """last modified timestamp"""
-        return self._modified
+        obj_json = self.obj_json
+
+        last_modified = obj_json['lastModified']
+        """Turn last modified datetime string into a datetime object."""
+        if isinstance(last_modified, str):
+            # format: 2016-06-30T06:17:16.563536Z
+            # format: "2016-08-04T06:44:04Z"
+            dt = datetime.strptime(last_modified, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=pytz.UTC)
+        else:
+            # if the time is an int or float, interpret as seconds since epoch
+            dt = datetime.fromtimestamp(time.time())
+
+        return dt
 
     @property
     def http_conn(self):
@@ -72,50 +133,135 @@ def http_conn(self):
 
     @property
     def collection_type(self):
         """ Return collection type based on uuid """
-        if self._uuid.startswith("g-"):
+        if self.uuid.startswith("g-"):
             collection_type = "groups"
-        elif self._uuid.startswith("t-"):
+        elif self.uuid.startswith("t-"):
             collection_type = "datatypes"
-        elif self._uuid.startswith("d-"):
+        elif self.uuid.startswith("d-"):
             collection_type = "datasets"
         else:
-            raise IOError(f"Unexpected uuid: {self._uuid}")
+            raise IOError(f"Unexpected uuid: {self.uuid}")
         return collection_type
 
-    def __init__(self, parent, item, http_conn=None, **kwds):
+    @property
+    def cpl(self):
+        # return creation property list
+        if 'creationProperties' in self.obj_json:
+            cpl =
self.obj_json['creationProperties'] + else: + cpl = {} + return cpl - """Create a new objectId. - """ - parent_id = None - if parent is not None: - if isinstance(parent, ObjectID): - parent_id = parent + @property + def track_order(self): + """ Return the track_order state """ + track_order = None + cpl = self.cpl + if "CreateOrder" in cpl: + createOrder = cpl["CreateOrder"] + if not createOrder or createOrder == "0": + track_order = False else: - # assume we were passed a Group/Dataset/datatype - parent_id = parent.id + track_order = True + return track_order + + def get(self, obj_uuid): + """ Return id obj for given uuid """ + obj_class = get_class_for_uuid(obj_uuid) + if obj_class is GroupID: + obj = GroupID(obj_uuid, http_conn=self._http_conn) + elif obj_class is TypeID: + obj = TypeID(obj_uuid, http_conn=self._http_conn) + elif obj_class is DatasetID: + obj = DatasetID(obj_uuid, http_conn=self._http_conn) + else: + raise TypeError(f"Unexpected type: {obj_uuid}") - if type(item) is not dict: - raise IOError("Unexpected Error") + return obj - if "id" not in item: - raise IOError("Unexpected Error") + @property + def attrs(self): + obj_json = self.obj_json + if "attributes" not in obj_json: + raise IOError(f"expected to find attributes key in obj_json for {self._uuid}") + return obj_json['attributes'] + + def set_attr(self, name, attr): + """ Create the given attribute """ + objdb = self._http_conn.objdb + objdb.set_attr(self._uuid, name, attr) + + def get_attr(self, name): + """ Return the given attribute """ + attrs = self.attrs + if name not in attrs: + raise KeyError(f"Unable to get attribute (can't locate attribute: '{name}'") + attr = attrs[name] + + return attr + + def del_attr(self, name): + """ Delete the named attribute """ + objdb = self._http_conn.objdb + objdb.del_attr(self._uuid, name) + + def has_attr(self, name): + """ Test if an attribute name exists """ + attrs = self.attrs + if name in attrs: + return True + else: + return False - self._uuid = item['id'] + @property + def attr_count(self): + """ Get the number of attributes """ + attrs = self.attrs + return len(attrs) + + def get_attr_names(self, track_order=None): + """ Get a list of attribute names """ + attrs = self.attrs + if track_order is None: + track_order = self.track_order + + # convert to a list of dicts + attr_list = [] + for title in attrs: + attr_json = attrs[title] + item = {} + item['title'] = title + item['created'] = attr_json['created'] + attr_list.append(item) + + if track_order: + attr_list.sort(key=lambda d: d['created']) + else: + attr_list.sort(key=lambda d: d['title']) + names = [x['title'] for x in attr_list] + return names - self._modified = parse_lastmodified(item['lastModified']) + def __init__(self, obj_uuid, http_conn=None): - self._obj_json = item + """Create a new objectId. 
+ """ + self._uuid = get_UUID(obj_uuid) if http_conn is not None: - self._http_conn = http_conn - elif parent_id is not None and parent_id.http_conn is not None: - self._http_conn = parent_id.http_conn + # use a weakref here so we don't keep a potentially large + # objdb in memory accidentally + self._http_conn = http_conn # weakref.ref(http_conn) else: raise IOError("Expected parent to have http connector") + objdb = http_conn.objdb + + if self._uuid not in objdb: + objdb.fetch(self._uuid) # will throw IOError if not found + def __eq__(self, other): if isinstance(other, self.__class__): - return self._uuid == other._uuid + return self._uuid == other.uuid else: return False @@ -124,95 +270,81 @@ def __ne__(self, other): def refresh(self): """ get the latest obj_json data from server """ - - # will need to get JSON from server - req = f"/{self.collection_type}/{self.id}" - # make server request - rsp = self._http_conn.GET(req) - if rsp.status_code != 200: - raise IOError(f"refresh request got status: {rsp.satus_code}") - item = json.loads(rsp.text) - - self._obj_json = item - self._modified = parse_lastmodified(item['lastModified']) - - objdb = self.http_conn._objdb - if objdb and self.id in objdb: - # delete any cached data from objdb so that gets will reflect server state - del objdb[self.id] + objdb = self._http_conn.objdb + objdb.fetch(self.uuid) def close(self): """Remove handles to id. """ - self._old_uuid = self._uuid # for debugging + self._old_uuid = self.uuid # for debugging self._uuid = 0 - self._obj_json = None self._http_conn = None def __bool__(self): - return bool(self._uuid) + return bool(self.uuid) def __del__(self): """ cleanup """ self.close() + def __repr__(self): + class_name = self.__class__.__name__ + if self._uuid: + r = f"<{class_name}({self._uuid})>" + else: + r = f"" + + return r + class TypeID(ObjectID): @property def type_json(self): - return self.obj_json['type'] + obj_json = self.obj_json + return obj_json['type'] def get_type(self): - type_json = self._obj_json["type"] + obj_json = self.obj_json + type_json = obj_json["type"] dtype = createDataType(type_json) return dtype - @property - def tcpl_json(self): - if 'creationProperties' in self._obj_json: - tcpl = self._obj_json['creationProperties'] - else: - tcpl = {} - return tcpl - - def __init__(self, parent, item, **kwds): + def __init__(self, obj_id, http_conn=None): """Create a new TypeID. 
""" - ObjectID.__init__(self, parent, item, **kwds) + if get_class_for_uuid(obj_id) != TypeID: + raise IOError(f"unexpected id for TypeID: {obj_id}") - if self.collection_type != "datatypes": - raise IOError(f"Unexpected collection_type: {self._collection_type}") + super().__init__(obj_id, http_conn=http_conn) class DatasetID(ObjectID): @property def type_json(self): - return self._obj_json['type'] + obj_json = self.obj_json + return obj_json['type'] @property def shape_json(self): - return self._obj_json['shape'] + obj_json = self.obj_json + return obj_json['shape'] def get_type(self): - type_json = self._obj_json["type"] + type_json = self.type_json dtype = createDataType(type_json) return dtype @property - def dcpl_json(self): - if 'creationProperties' in self._obj_json: - dcpl = self._obj_json['creationProperties'] - else: - dcpl = {} - return dcpl + def type_class(self): + return self.type_json['class'] @property def rank(self): rank = 0 - shape = self._obj_json['shape'] + shape = self.shape_json if shape['class'] == 'H5S_SIMPLE': dims = shape['dims'] rank = len(dims) @@ -221,13 +353,13 @@ def rank(self): @property def layout(self): layout = None - - if 'layout' in self.obj_json: - layout = self.obj_json['layout'] + obj_json = self.obj_json + if 'layout' in obj_json: + layout = obj_json['layout'] else: - dcpl = self.dcpl_json - if dcpl and 'layout' in dcpl: - layout = dcpl['layout'] + cpl = self.cpl + if 'layout' in cpl: + layout = cpl['layout'] return layout @@ -243,31 +375,126 @@ def chunks(self): return chunks - def __init__(self, parent, item, **kwds): + def __init__(self, obj_id, http_conn=None): """Create a new DatasetID. """ + if get_class_for_uuid(obj_id) != DatasetID: + raise IOError(f"unexpected id for DatasetID: {obj_id}") + super().__init__(obj_id, http_conn=http_conn) + + def getVerboseInfo(self): + req = f"/datasets/{self._uuid}" + params = {'verbose': 1} + rsp = self._http_conn.GET(req, params=params) + if rsp.status_code != 200: + raise RuntimeError(f"get status: {rsp.status_code} for {req}") + rsp_json = rsp.json() + return rsp_json - ObjectID.__init__(self, parent, item, **kwds) - - if self.collection_type != "datasets": - raise IOError(f"Unexpected collection_type: {self._collection_type}") + def resize(self, dims): + """ update the shape of the dataset """ + # send the request to the server + objdb = self._http_conn.objdb + objdb.resize(self._uuid, dims) class GroupID(ObjectID): - def __init__(self, parent, item, http_conn=None, **kwds): + def __init__(self, obj_id, http_conn=None): """Create a new GroupID. 
""" + if get_class_for_uuid(obj_id) != GroupID: + raise IOError(f"unexpected id for GroupIID: {obj_id}") - ObjectID.__init__(self, parent, item, http_conn=http_conn, **kwds) + super().__init__(obj_id, http_conn=http_conn) - if self.collection_type != "groups": - raise IOError(f"Unexpected collection_type: {self._collection_type}") + @property + def links(self): + obj_json = self.obj_json + if "links" not in obj_json: + raise IOError(f"expected to find links key in obj_json for {self._uuid}") + return obj_json['links'] + + def make_obj(self, title, type_json=None, shape=None, cpl=None, track_order=None, maxdims=None): + obj_json = self.obj_json + if title: + links = obj_json['links'] + if title in links: + raise IOError("Unable to create object (name already exists)") + objdb = self._http_conn.objdb + kwds = {} + + if shape is not None: + kwds['shape'] = shape + if type_json: + kwds['type_json'] = type_json + if cpl: + kwds['cpl'] = cpl + if track_order: + kwds['track_order'] = track_order + if maxdims: + kwds['maxdims'] = maxdims + obj_uuid = objdb.make_obj(self._uuid, title, **kwds) + obj_id = self.get(obj_uuid) + return obj_id + + def get_link(self, title): + """ return link json given it's title """ + links = self.links + if title not in links: + raise KeyError(f"link {title} not found") + link_json = links[title] + return link_json + + def set_link(self, title, link_json, replace=False): + """ set the given link """ + links = self.links + if not replace and title in links: + raise IOError("Unable to create link (name already exists)") + objdb = self._http_conn.objdb + + objdb.set_link(self.uuid, title, link_json, replace=replace) + + def del_link(self, title): + """ delete the given link """ + links = self.links + if title not in links: + # not found + raise KeyError(f"link '{title}' not found") + objdb = self._http_conn.objdb + objdb.del_link(self.uuid, title) @property - def gcpl_json(self): - if 'creationProperties' in self._obj_json: - gcpl = self._obj_json['creationProperties'] + def link_count(self): + """ return number of links """ + links = self.links + return len(links) + + def get_link_titles(self, track_order=None): + links = self.links + if track_order is None: + track_order = self.track_order + + # convert to a list of dicts + link_list = [] + for title in links: + link_json = links[title] + item = {} + item['title'] = title + item['created'] = link_json['created'] + link_list.append(item) + + if track_order: + link_list.sort(key=lambda d: d['created']) else: - gcpl = {} - return gcpl + link_list.sort(key=lambda d: d['title']) + titles = [x['title'] for x in link_list] + return titles + + def has_link(self, title): + """ Test if a link name exists """ + links = self.links + if title in links: + return True + else: + return False diff --git a/h5pyd/_hl/selections.py b/h5pyd/_hl/selections.py index c053f287..ffba7d15 100644 --- a/h5pyd/_hl/selections.py +++ b/h5pyd/_hl/selections.py @@ -104,7 +104,7 @@ def select(obj, args): int(a) except Exception: use_fancy = True - if use_fancy: + if use_fancy and hasattr(obj, "shape"): sel = FancySelection(obj.shape) sel[args] return sel @@ -144,7 +144,7 @@ class Selection(object): """ def __init__(self, shape, *args, **kwds): - """ Create a selection. Shape may be None if spaceid is given. """ + """ Create a selection. 
""" shape = tuple(shape) self._shape = shape diff --git a/h5pyd/_hl/table.py b/h5pyd/_hl/table.py index 040c871b..fdd365af 100644 --- a/h5pyd/_hl/table.py +++ b/h5pyd/_hl/table.py @@ -85,15 +85,14 @@ class Table(Dataset): def __init__(self, bind, track_order=None): """ Create a new Table object by binding to a low-level DatasetID. """ - if not isinstance(bind, DatasetID): raise ValueError(f"{bind} is not a DatasetID") - Dataset.__init__(self, bind, track_order=track_order) + super().__init__(bind, track_order=track_order) - if len(self._dtype) < 1: + if len(self.dtype) < 1: raise ValueError("Table type must be compound") - if len(self._shape) > 1: + if len(self.shape) > 1: raise ValueError("Table must be one-dimensional") @property @@ -107,13 +106,14 @@ def colnames(self): @property def nrows(self): - return self._shape[0] + shape_json = self.id.shape_json + return shape_json['dims'][0] def read(self, start=None, stop=None, step=None, field=None, out=None): if start is None: start = 0 if stop is None: - stop = self._shape[0] + stop = self.nrows if step is None: step = 1 arr = self[start:stop:step] @@ -164,10 +164,10 @@ def readtime_dtype(basetype, names): if not start: start = 0 if not stop: - stop = self._shape[0] + stop = self.nrows else: start = 0 - stop = self._shape[0] + stop = self.nrows selection_arg = slice(start, stop) selection = sel.select(self, selection_arg) @@ -280,10 +280,10 @@ def update_where(self, condition, value, start=None, stop=None, step=None, limit if not start: start = 0 if not stop: - stop = self._shape[0] + stop = self.nrows else: start = 0 - stop = self._shape[0] + stop = self.nrows selection_arg = slice(start, stop) selection = sel.select(self, selection_arg) @@ -391,7 +391,7 @@ def append(self, rows): numrows = val.shape[0] - req = "/datasets/" + self.id.uuid + "/value" + req = f"/datasets/{self.id.uuid}/value" params = {} body = {} @@ -413,8 +413,9 @@ def append(self, rows): body['value'] = val body['append'] = numrows - self.PUT(req, body=body, format=format, params=params) + rsp = self.id.http_conn.PUT(req, body=body, format=format, params=params) + if rsp.status_code != 200: + raise IOError(rsp.status_code, "table append failed") # if we get here, the request was successful, adjust the shape - total_rows = self._shape[0] + numrows - self._shape = (total_rows,) + # TBD... 
diff --git a/test/hl/test_attribute.py b/test/hl/test_attribute.py index 4577bea3..eb3d03a1 100644 --- a/test/hl/test_attribute.py +++ b/test/hl/test_attribute.py @@ -118,177 +118,6 @@ def test_create(self): # close file f.close() - def test_create_multiple(self): - if config.get('use_h5py') or self.hsds_version() < "0.9.0": - return - - filename = self.getFileName("create_attribute_multiple") - print("filename:", filename) - f = h5py.File(filename, 'w') - - g1 = f.create_group('g1') - - num_attrs = 10 - # No shape or dtype specified - names = ['attr' + str(i) for i in range(num_attrs)] - values = [np.arange(50)] * num_attrs - g1.attrs.create(names, values) - - for i in range(num_attrs): - self.assertTrue(names[i] in g1.attrs) - self.assertTrue(np.array_equal(g1.attrs[names[i]], values[i])) - - # Test replacing existing attributes - new_values = [np.arange(100)] * num_attrs - g1.attrs.create(names, new_values) - - for i in range(num_attrs): - self.assertTrue(names[i] in g1.attrs) - self.assertTrue(np.array_equal(g1.attrs[names[i]], new_values[i])) - - # Test creating attributes with shape and dtype specified - names = ['attr' + str(i) for i in range(num_attrs, 2 * num_attrs)] - values = [np.arange(i + 1) for i in range(num_attrs)] - dtypes = [np.int32] * num_attrs - shapes = [(i + 1,) for i in range(num_attrs)] - g1.attrs.create(names, values, shapes, dtypes) - - for i in range(num_attrs): - self.assertTrue(names[i] in g1.attrs) - self.assertTrue(np.array_equal(g1.attrs[names[i]], values[i])) - self.assertEqual(g1.attrs[names[i]].dtype, dtypes[i]) - self.assertEqual(g1.attrs[names[i]].shape, shapes[i]) - - def test_get_multiple(self): - if config.get('use_h5py') or self.hsds_version() < "0.9.0": - return - - filename = self.getFileName("get_attribute_multiple") - print("filename:", filename) - f = h5py.File(filename, 'w') - - # create attributes - num_attrs = 10 - g1 = f.create_group('g1') - names = ['attr' + str(i) for i in range(num_attrs)] - values = [np.arange(50) for i in range(num_attrs)] - - for i in range(10): - g1.attrs[names[i]] = values[i] - - # get all attributes - values_out = g1.attrs.get_attributes() - - self.assertEqual(len(values_out), 10) - for i in range(10): - self.assertTrue(names[i] in values_out) - self.assertTrue(np.array_equal(values_out[names[i]], values[i])) - - # get attributes from cache - values_out = g1.attrs.get_attributes() - self.assertEqual(len(values_out), 10) - for i in range(10): - self.assertTrue(names[i] in values_out) - self.assertTrue(np.array_equal(values_out[names[i]], values[i])) - - # get attributes that match the pattern 'attr5' - pattern = "attr5" - values_out = g1.attrs.get_attributes(pattern=pattern) - - self.assertTrue("attr5" in values_out) - self.assertTrue(np.array_equal(values_out["attr5"], values[5])) - - # get only attributes that match the pattern 'att*' - g1.attrs['new_attr'] = np.arange(100) - pattern = "att*" - values_out = g1.attrs.get_attributes(pattern=pattern) - - self.assertEqual(len(values_out), 10) - - for i in range(10): - self.assertTrue(names[i] in values_out) - self.assertTrue(np.array_equal(values_out[names[i]], values[i])) - - # get the first five attributes - limit = 5 - values_out = g1.attrs.get_attributes(limit=limit) - - self.assertEqual(len(values_out), 5) - - for i in range(5): - self.assertTrue(names[i] in values_out) - self.assertTrue(np.array_equal(values_out[names[i]], values[i])) - - # get all attributes after 'attr4 - marker = "attr4" - values_out = g1.attrs.get_attributes(marker=marker, limit=limit) - - 
self.assertEqual(len(values_out), 5) - - for i in range(6, 10): - self.assertTrue(names[i] in values_out) - self.assertTrue(np.array_equal(values_out[names[i]], values[i])) - - # get set of attributes by name - names = ['attr5', 'attr7', 'attr9'] - - values_out = g1.attrs.get_attributes(names=names) - - self.assertEqual(len(values_out), 3) - - for name in names: - self.assertTrue(name in values_out) - i = int(name[4]) - self.assertTrue(np.array_equal(values_out[name], values[i])) - - def test_delete_multiple(self): - if config.get('use_h5py') or self.hsds_version() < "0.9.0": - return - - filename = self.getFileName("delete_attribute_multiple") - print("filename:", filename) - f = h5py.File(filename, 'w') - - # create attributes - num_attrs = 10 - g1 = f.create_group('g1') - names = ['attr' + str(i) for i in range(num_attrs)] - values = [np.arange(50) for i in range(num_attrs)] - - for i in range(10): - g1.attrs[names[i]] = values[i] - - # delete the first five attributes - del g1.attrs[names[0:5]] - - # check that the first five attributes are gone - for i in range(5): - self.assertFalse(names[i] in g1.attrs) - - # check that the last five attributes are still there - for i in range(5, 10): - self.assertTrue(names[i] in g1.attrs) - self.assertTrue(np.array_equal(g1.attrs[names[i]], values[i])) - - # delete single attribute - del g1.attrs[names[5]] - - self.assertFalse(names[5] in g1.attrs) - - for i in range(6, 10): - self.assertTrue(names[i] in g1.attrs) - self.assertTrue(np.array_equal(g1.attrs[names[i]], values[i])) - - # delete attributes with name that must be URL-encoded - names = ['attr with spaces', 'attr%', 'unicode八attr'] - for name in names: - g1.attrs[name] = np.arange(100) - - del g1.attrs[names] - - for name in names: - self.assertTrue(name not in g1.attrs) - class TestTrackOrder(TestCase): diff --git a/test/hl/test_file.py b/test/hl/test_file.py index 3e67a6d3..5a864c6c 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -264,7 +264,8 @@ def test_auth(self): self.assertEqual(f.filename, filename) self.assertEqual(f.name, "/") self.assertTrue(f.id.id is not None) - self.assertEqual(len(f.keys()), 2) + print("f.keys:", list(f.keys())) + self.assertEqual(len(list(f.keys())), 2) if h5py.__name__ == "h5py": return # no ACLs in h5py diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 581af8a7..9b83abf4 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -23,6 +23,16 @@ class TestGroup(TestCase): + def test_del(self): + filename = self.getFileName("test_del") + print("filename:", filename) + f = h5py.File(filename, 'w') + self.assertTrue('/' in f) + f.create_group("deadcat") + del f["deadcat"] + f.close() + f = h5py.File(filename) + def test_create(self): # create main test file filename = self.getFileName("create_group") @@ -121,11 +131,13 @@ def test_create(self): # create a external hardlink r['myexternallink'] = h5py.ExternalLink(link_target_filename, "somepath") + # test getclass g1_class = r.get('g1', getclass=True) self.assertEqual(g1_class, h5py.Group) linkee_class = r.get('mysoftlink', getclass=True) self.assertEqual(linkee_class, h5py.Group) + print(f"test {r}.get, getclass and getlink are true") link_class = r.get('mysoftlink', getclass=True, getlink=True) self.assertEqual(link_class, h5py.SoftLink) softlink = r.get('mysoftlink', getlink=True) @@ -201,7 +213,7 @@ def test_create(self): if h5py.__name__ == "h5pyd": # for h5pyd we should be able to retrieve the anon group - anon_group = f.getObjByUuid(anon_group_id) + anon_group = 
f[anon_group_id] self.assertEqual(anon_group_id, anon_group.id.id) # can also get anon group using groups/ key anon_group = f[f"groups/{anon_group_id}"] @@ -306,244 +318,6 @@ def get_count(grp): f.close() - def test_link_multi_removal(self): - # create a file for use a link target - if h5py.__name__ == "h5py": - return # multilink is for h5pyd only - filename = self.getFileName("test_link_multi_removal") - print(f"filename: {filename}") - - f = h5py.File(filename, 'w') - g1 = f.create_group("g1") - g1_clone = f["g1"] - # create multiple subgroups - names = ["subgroup" + str(i) for i in range(10)] - subgrps = [] - for name in names: - subgrps.append(g1.create_group(name)) - - self.assertEqual(len(g1), 10) - - # Remove first 5 subgroups - del g1[names[0:5]] - - self.assertEqual(len(g1), 5) - self.assertEqual(len(g1_clone), 5) - - for name in names[0:5]: - self.assertFalse(name in g1) - self.assertFalse(name in g1_clone) - - for name in names[5:]: - self.assertTrue(name in g1) - self.assertTrue(name in g1_clone) - - # delete links with names that must be URL-encoded - names = ['link with spaces', 'link%', 'unicode八link'] - - for name in names: - g1[name] = g1 - - del g1[names] - - for name in names: - self.assertTrue(name not in g1) - - f.close() - - def test_link_multi_create(self): - if h5py.__name__ == "h5py": - return # multi create h5pyd only feature - filename = self.getFileName("test_link_multi_create") - print(f"filename: {filename}") - - f = h5py.File(filename, 'w') - g1 = f.create_group("g1") - - # Create 10 soft links - num_links = 10 - names = ["link" + str(i) for i in range(num_links)] - links = [] - - for name in names: - new_link = h5py.SoftLink("dummy_path_" + str(name)) - links.append(new_link) - - g1[names] = links - - self.assertEqual(len(g1), num_links) - - for i in range(num_links): - name = names[i] - self.assertTrue(name in g1) - self.assertEqual(g1.get(name, getlink=True).path, links[i].path) - - # Create soft and hard links - names = ["link" + str(i) for i in range(num_links, 2 * num_links)] - links = [] - - for i in range(num_links, 2 * num_links): - if i % 2 == 0: - new_link = h5py.SoftLink("dummy_path_" + str(i)) - else: - # Hard link to g1 - new_link = g1 - - links.append(new_link) - - g1[names] = links - - self.assertEqual(len(g1), num_links * 2) - - for i in range(num_links, 2 * num_links): - name = "link" + str(i) - self.assertTrue(name in g1) - - if i % 2 == 0: - link = g1.get(name, getlink=True) - self.assertEqual(link.path, links[i % num_links].path) - else: - g1_clone = g1.get(name) - self.assertEqual(len(g1_clone), len(g1)) - self.assertEqual(g1_clone.id.id, g1.id.id) - - # Create external links - - names = ["link" + str(i) for i in range(num_links * 2, num_links * 3)] - links = [] - - for i in range(num_links * 2, num_links * 3): - filename = "dummy_filename_" + str(i) - path = "dummy_path_" + str(i) - new_link = h5py.ExternalLink(filename=filename, path=path) - links.append(new_link) - - g1[names] = links - - self.assertEqual(len(g1), num_links * 3) - - for i in range(num_links * 2, num_links * 3): - name = "link" + str(i) - self.assertTrue(name in g1) - - link = g1.get(name, getlink=True) - self.assertEqual(link.path, links[i % num_links]._path) - self.assertEqual(link.filename, links[i % num_links]._filename) - - def test_link_get_multi(self): - filename = self.getFileName("test_link_get_multi") - print(f"filename: {filename}") - if h5py.__name__ == "h5py": - return # no multi link for h5py - - f = h5py.File(filename, 'w') - g1 = f.create_group("g1") - 
- # Create subgroups - g2 = g1.create_group("g2") - g3 = g2.create_group("g3") - - # Create links in each group - - num_links = 20 - names = ["link" + str(i) for i in range(num_links)] - - for name in names: - g1[name] = g1 - g2[name] = g2 - g3[name] = g3 - - # Get all links from g1 only - links_out = g1.get(None, getlink=True) - - self.assertEqual(len(links_out), num_links + 1) - - for name in names: - self.assertTrue(name in links_out) - link = links_out[name] - self.assertEqual(link.id, g1.id.uuid) - - # Get all links from g1 and subgroups - links_out = g1.get(None, getlink=True, follow_links=True) - - # 3 groups containing links - self.assertEqual(len(links_out), 3) - - for group_id in [g1.id.uuid, g2.id.uuid, g3.id.uuid]: - self.assertTrue(group_id in links_out) - links = links_out[group_id] - - if group_id == g3.id.uuid: - self.assertEqual(len(links), num_links) - else: - self.assertEqual(len(links), num_links + 1) - - for name in names: - self.assertTrue(name in links) - link = links[name] - self.assertEqual(link.id, group_id) - - # Make sure cache does not erroneously return recursive links - links_out = g1.get(None, getlink=True) - self.assertEqual(len(links_out), num_links + 1) - - # Return only 5 links from group - - links_out = g1.get(None, getlink=True, limit=5) - self.assertEqual(len(links_out), 5) - - self.assertTrue("g2" in links_out) - for name in sorted(names)[0:4]: - self.assertTrue(name in links_out) - link = links_out[name] - self.assertEqual(link.id, g1.id.uuid) - - # Return next 5 links via marker - links_out = g1.get(None, getlink=True, limit=5, marker=sorted(names)[3]) - - self.assertEqual(len(links_out), 5) - - for name in sorted(names)[4:9]: - self.assertTrue(name in links_out) - link = links_out[name] - self.assertEqual(link.id, g1.id.uuid) - - # Return all links in g1 besides g2 - links_out = g1.get(None, getlink=True, pattern="link*") - self.assertEqual(len(links_out), 20) - - for name in names: - if name.startswith("link1"): - self.assertTrue(name in links_out) - link = links_out[name] - self.assertEqual(link.id, g1.id.uuid) - - # Return all links in g1/g2/g3 except for the group links - links_out = g1.get(None, getlink=True, follow_links=True, pattern="link*") - self.assertEqual(len(links_out), 3) - - for group_id in [g1.id.uuid, g2.id.uuid, g3.id.uuid]: - self.assertTrue(group_id in links_out) - links = links_out[group_id] - - self.assertEqual(len(links), num_links) - - for name in names: - self.assertTrue(name in links) - link = links[name] - self.assertEqual(link.id, group_id) - - # Retrieve a set of links by name - names = ["link" + str(i) for i in range(5, 15)] - links_out = g1.get(names, getlink=True) - - self.assertEqual(len(links_out), 10) - - for name in names: - self.assertTrue(name in links_out) - link = links_out[name] - self.assertEqual(link.id, g1.id.uuid) - class TestTrackOrder(TestCase): titles = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten") @@ -578,11 +352,9 @@ def test_track_order(self): if h5py.__name__ == "h5pyd": # test with get and track_order=False - links = g.get(None, getlink=True, track_order=False) + g = f.get('order', track_order=False) ref = sorted(self.titles) - self.assertEqual(list(links), ref) - self.assertEqual(list(links), ref) - + self.assertEqual(list(g.keys()), ref) # re-opening the file should retain the track_order setting with h5py.File(filename) as f: g = f['order'] diff --git a/test/hl/test_table.py b/test/hl/test_table.py index 818e6257..0c57b5ab 100644 --- a/test/hl/test_table.py 
+++ b/test/hl/test_table.py @@ -95,6 +95,7 @@ def test_query_table(self): table.append(data) self.assertEqual(table.nrows, len(data)) + self.assertEqual(table.shape, (len(data),)) for indx in range(len(data)): row = table[indx] From bd2a54b08b050b57c79a676cb026443e73fe1ddb Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 21 Jan 2025 11:19:08 +0800 Subject: [PATCH 16/32] added closed_group test --- test/hl/test_group.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 9b83abf4..8ab5df01 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -54,7 +54,12 @@ def test_create(self): self.assertTrue(g1.id.id != r.id.id) self.assertEqual(g1.name, "/g1") + g1_file = g1.file + print("g1_file.name:", g1_file.filename) + self.assertEqual(g1.file.filename, filename) + r.create_group("g1/g1.1") + print(g1_file.filename) g1_1 = r["g1/g1.1"] self.assertEqual(g1_1.name, "/g1/g1.1") self.assertEqual(len(r), 1) @@ -142,7 +147,7 @@ def test_create(self): self.assertEqual(link_class, h5py.SoftLink) softlink = r.get('mysoftlink', getlink=True) self.assertEqual(softlink.path, '/g1/g1.1') - + """ linkee_class = r.get('myexternallink', getclass=True) link_class = r.get('myexternallink', getclass=True, getlink=True) self.assertEqual(link_class, h5py.ExternalLink) @@ -150,17 +155,22 @@ def test_create(self): self.assertEqual(external_link.path, 'somepath') external_link_filename = external_link.filename self.assertTrue(external_link_filename.find('link_target') > -1) + """ links = r.items() got_external_link = False + print("looping through links") for link in links: + print("link:", link[0]) title = link[0] obj = link[1] if title == 'myexternallink': + print("title is myexternallink") self.assertTrue(obj is not None) self.assertEqual(len(obj), 0) self.assertTrue(obj.file.filename != filename) got_external_link = True + print("got_external_link") self.assertTrue(got_external_link) @@ -243,6 +253,23 @@ def test_nested_create(self): f.close() + def test_closed_group(self): + def get_group_ref(filename): + tmp_filename = self.getFileName("tmp_file") + + with h5py.File(tmp_filename, 'w') as f: + g1 = f.create_group("g1") + self.assertTrue(isinstance(g1, h5py.Group)) + + return g1 + + filename = self.getFileName("test_closed_group") + print("filename:", filename) + + grp = get_group_ref(filename) + print(grp) + self.assertFalse(grp) + def test_external_links(self): # create a file for use a link target linked_filename = self.getFileName("linked_file") From dd3fe0a352e5cc6588de60b15fed967ac8b20b90 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 21 Jan 2025 11:20:52 +0800 Subject: [PATCH 17/32] cleaned up debug print statements --- test/hl/test_group.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 8ab5df01..115422ec 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -260,14 +260,13 @@ def get_group_ref(filename): with h5py.File(tmp_filename, 'w') as f: g1 = f.create_group("g1") self.assertTrue(isinstance(g1, h5py.Group)) + self.assertTrue(g1) return g1 filename = self.getFileName("test_closed_group") - print("filename:", filename) grp = get_group_ref(filename) - print(grp) self.assertFalse(grp) def test_external_links(self): From 62ecf1d6d4017322c550e97377e8eea579cae9cb Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 21 Jan 2025 11:25:28 +0800 Subject: [PATCH 18/32] more cleanup --- test/hl/test_group.py | 9 
+++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 115422ec..61e5f61b 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -254,9 +254,8 @@ def test_nested_create(self): f.close() def test_closed_group(self): - def get_group_ref(filename): - tmp_filename = self.getFileName("tmp_file") - + def get_group_ref(): + tmp_filename = self.getFileName("test_closed_group") with h5py.File(tmp_filename, 'w') as f: g1 = f.create_group("g1") self.assertTrue(isinstance(g1, h5py.Group)) @@ -264,9 +263,7 @@ def get_group_ref(filename): return g1 - filename = self.getFileName("test_closed_group") - - grp = get_group_ref(filename) + grp = get_group_ref() self.assertFalse(grp) def test_external_links(self): From 6f450897396d7473271be2d40b8666779788acc9 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 21 Jan 2025 13:02:58 +0800 Subject: [PATCH 19/32] add weakref for closed objs --- h5pyd/_hl/base.py | 6 ++-- h5pyd/_hl/files.py | 33 ++++++++--------- h5pyd/_hl/group.py | 7 +++- h5pyd/_hl/httpconn.py | 7 ++++ h5pyd/_hl/objdb.py | 32 ++++++++++------- h5pyd/_hl/objectid.py | 82 +++++++++++++++++++++++++++++-------------- test/hl/test_group.py | 27 ++++++-------- 7 files changed, 116 insertions(+), 78 deletions(-) diff --git a/h5pyd/_hl/base.py b/h5pyd/_hl/base.py index b6c1fac1..f8484bec 100644 --- a/h5pyd/_hl/base.py +++ b/h5pyd/_hl/base.py @@ -21,7 +21,7 @@ from collections.abc import ( Mapping, MutableMapping, KeysView, ValuesView, ItemsView ) -from .objectid import GroupID, ObjectID +from .objectid import FileID, ObjectID from .h5type import Reference, check_dtype, special_dtype numpy_integer_types = (np.int8, np.uint8, np.int16, np.int16, np.int32, np.uint32, np.int64, np.uint64) @@ -796,9 +796,9 @@ def file(self): from .files import File http_conn = self._id.http_conn root_uuid = http_conn.root_uuid - groupid = GroupID(root_uuid, http_conn=http_conn) + fileid = FileID(root_uuid, http_conn=http_conn) - return File(groupid) + return File(fileid) @property def name(self): diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index f15c141e..da5fa1e4 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -17,7 +17,7 @@ import pathlib import time -from .objectid import GroupID +from .objectid import FileID from .group import Group from .httpconn import HttpConn from .. import config @@ -335,7 +335,8 @@ def __init__( timeout Timeout value in seconds """ - groupid = None + + fileid = None dn_ids = [] root_json = None cfg = config.get_config() # pulls in state from a .hscfg file (if found). @@ -349,8 +350,8 @@ def __init__( # if we're passed a GroupId as domain, just initialize the file object # with that. This will be faster and enable the File object to share the same http connection. 
# no_endpoint_info = endpoint is None and username is None and password is None - if isinstance(domain, GroupID): - groupid = domain + if isinstance(domain, FileID): + fileid = domain else: if not isinstance(domain, str): raise IOError(400, "expected a str or GroupID object for domain") @@ -538,10 +539,10 @@ def __init__( else: objdb.reload() - groupid = GroupID(root_uuid, http_conn=http_conn) + fileid = FileID(root_uuid, http_conn=http_conn) # end else - self._id = groupid + self._id = fileid self._verboseInfo = None # additional state we'll get when requested self._verboseUpdated = None # when the verbose data was fetched self._lastScan = None # when summary stats where last updated by server @@ -575,7 +576,7 @@ def __init__( else: self._version = None - Group.__init__(self, self._id, track_order=track_order) + super().__init__(self._id, track_order=track_order) def _getVerboseInfo(self): now = time.time() @@ -792,6 +793,10 @@ def run_scan(self): def flush(self): """Tells the service to complete any pending updates to permanent storage""" + if self.mode == 'r': + # read-only, no need to flush + return + self.log.debug("flush") self.log.info("sending PUT flush request") req = "/" @@ -818,18 +823,8 @@ def close(self, flush=None): # this will close the socket of the http_conn singleton self.log.debug(f"close, mode: {self.mode}") - if flush is None: - # set flush to true if this is a direct connect and file - # is writable - if self.mode == "r+" and self._id._http_conn._hsds: - flush = True - else: - flush = False - # do a PUT flush if this file is writable and the server is HSDS and flush is set - if flush: - self.flush() - if self._id._http_conn: - self._id._http_conn.close() + self.flush() + self._id.close() def __enter__(self): diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index dcc0fd46..267d9eae 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -109,7 +109,12 @@ def _get_bypath(self, h5path, create=False, track_order=None): except IOError: # unable to find external link raise KeyError(f"Unable to open domain: {external_domain}") - return f[external_path] + # save reference to the fileid object so the returned object + # doesn't get closed on return + self.id.http_conn.add_external_ref(f.id) + obj = f[external_path] + obj.id._http_conn = f.id._http_conn + return obj else: raise IOError(f"Unexpected link_class: {link_class}") elif create: diff --git a/h5pyd/_hl/httpconn.py b/h5pyd/_hl/httpconn.py index f803258f..1502351e 100644 --- a/h5pyd/_hl/httpconn.py +++ b/h5pyd/_hl/httpconn.py @@ -280,6 +280,7 @@ def __init__( self._api_key = api_key self._s = None # Sessions self._server_info = None + self._external_refs = [] self._logger = logger if logger is None: @@ -778,6 +779,12 @@ def session(self): s = self._s return s + def add_external_ref(self, fid): + # this is used by the group class to keep references to external links open + if fid.__class__.__name__ != "FileID": + raise TypeError("add_external_ref, expected FileID type") + self._external_refs.append(fid) + def close(self): if self._s: self._s.close() diff --git a/h5pyd/_hl/objdb.py b/h5pyd/_hl/objdb.py index 599ea46d..e1b9347d 100644 --- a/h5pyd/_hl/objdb.py +++ b/h5pyd/_hl/objdb.py @@ -13,7 +13,7 @@ from __future__ import absolute_import import time -# import weakref +import weakref from .. 
import config @@ -33,12 +33,20 @@ def get_collection(uuid): class ObjDB(): """ Domain level object map """ def __init__(self, http_conn, use_cache=True): - self._http_conn = http_conn # weakref.ref(http_conn) + self._http_conn = weakref.ref(http_conn) self._objdb = {} self._loadtime = {} self._use_cache = use_cache self.log = http_conn.logging + @property + def http_conn(self): + # access weark ref + conn = self._http_conn() + if conn is None: + raise RuntimeError("http connection has been garbage collected") + return conn + def fetch(self, obj_uuid): """ get obj_json for given obj_uuid from the server """ @@ -61,7 +69,7 @@ def fetch(self, obj_uuid): if collection_type == "groups": # get links as well params["include_links"] = 1 - rsp = self._http_conn.GET(req, params=params) + rsp = self.http_conn.GET(req, params=params) if rsp.status_code in (404, 410): self.log.warning(f"obj: {obj_uuid} not found") return None @@ -126,11 +134,11 @@ def load(self, domain_objs): def reload(self): """ re-initialize objdb """ - self.log.info(f"objdb.reload {self._http_conn.domain}") + self.log.info(f"objdb.reload {self.http_conn.domain}") self._objdb = {} self._loadtime = {} obj_uuids = set() - obj_uuids.add(self._http_conn.root_uuid) + obj_uuids.add(self.http_conn.root_uuid) while obj_uuids: obj_uuid = obj_uuids.pop() obj_json = self.fetch(obj_uuid) @@ -223,7 +231,7 @@ def get_bypath(self, parent_uuid, h5path, follow=False, getlink=False): self.log.warning(f"id: {obj_id} has null h5path for link: {link_name}") raise KeyError(h5path) if slink_path.startswith('/'): - slink_id = self._http_conn.root_uuid + slink_id = self.http_conn.root_uuid else: slink_id = obj_id # recursive call @@ -267,7 +275,7 @@ def set_link(self, group_uuid, title, link_json, replace=False): self.del_link(group_uuid, title) # make a http put req = f"/groups/{group_uuid}/links/{title}" - self._http_conn.PUT(req, body=link_json) # create the link + self.http_conn.PUT(req, body=link_json) # create the link link_json['created'] = time.time() links[title] = link_json @@ -279,7 +287,7 @@ def del_link(self, group_uuid, title): # tbd - validate link_json? 
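The weak-reference pattern that ObjDB (and ObjectID below) now uses can be summarized with a small standalone sketch; the class and attribute names here are illustrative only:

    import weakref

    class ConnHolder:
        def __init__(self, conn):
            # hold only a weak reference; the owning File keeps the strong one
            self._conn = weakref.ref(conn)

        @property
        def conn(self):
            # dereference on each access and fail loudly if the connection
            # has already been garbage collected
            c = self._conn()
            if c is None:
                raise RuntimeError("connection has been garbage collected")
            return c

Because only the FileID holds a strong reference, dropping the File (or leaving its `with` block) is enough to invalidate any group or dataset handles that outlived it, rather than silently pinning the connection in memory.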
if title in links: req = f"/groups/{group_uuid}/links/{title}" - rsp = self._http_conn.DELETE(req) + rsp = self.http_conn.DELETE(req) if rsp.status_code != 200: raise IOError(rsp.status_code, f"failed to delete link: {title}") # ok - so delete our cached copy @@ -332,7 +340,7 @@ def make_obj(self, parent_uuid, title, type_json=None, shape=None, cpl=None, tra body['creationProperties'] = cpl # self.log.debug(f"create group with body: {body}") - rsp = self._http_conn.POST(req, body=body) + rsp = self.http_conn.POST(req, body=body) self.log.info(f"got status code: {rsp.status_code} for POST req: {req}") if rsp.status_code not in (200, 201): @@ -365,7 +373,7 @@ def set_attr(self, obj_uuid, name, attr_json): collection = get_collection(obj_uuid) req = f"/{collection}/{obj_uuid}/attributes/{name}" - rsp = self._http_conn.PUT(req, body=attr_json, params=params) + rsp = self.http_conn.PUT(req, body=attr_json, params=params) if rsp.status_code not in (200, 201): self.log.error(f"got {rsp.status_code} for put req: {req}") @@ -384,7 +392,7 @@ def del_attr(self, obj_uuid, name): collection = get_collection(obj_uuid) req = f"/{collection}/{obj_uuid}/attributes/{name}" - rsp = self._http_conn.DELETE(req) + rsp = self.http_conn.DELETE(req) if rsp.status_code != 200: self.log.error(f"got {rsp.status_code} for delete req: {req}") @@ -397,7 +405,7 @@ def resize(self, dset_uuid, dims): # send the request to the server body = {"shape": dims} req = f"/datasets/{dset_uuid}/shape" - rsp = self._http_conn.PUT(req, body=body) + rsp = self.http_conn.PUT(req, body=body) if rsp.status_code not in (200, 201): msg = "unable to resize dataset shape, error" raise IOError(rsp.status_code, msg) diff --git a/h5pyd/_hl/objectid.py b/h5pyd/_hl/objectid.py index 60368f71..35711463 100644 --- a/h5pyd/_hl/objectid.py +++ b/h5pyd/_hl/objectid.py @@ -14,7 +14,7 @@ from datetime import datetime import pytz import time -# import weakref +import weakref from .h5type import createDataType @@ -99,12 +99,12 @@ def __hash__(self): @property def domain(self): """ domain for this obj """ - return self._http_conn.domain + return self.http_conn.domain @property def obj_json(self): """json representation of the object""" - objdb = self._http_conn.objdb + objdb = self.http_conn.objdb obj_json = objdb[self.uuid] return obj_json @@ -127,8 +127,20 @@ def modified(self): @property def http_conn(self): - """ http connector """ - return self._http_conn + # access weak ref + if isinstance(self._http_conn, weakref.ReferenceType): + conn = self._http_conn() + if conn is None: + raise RuntimeError("http connection has been garbage collected") + else: + return self._http_conn + return conn + + @property + def objdb(self): + # get ref to ObjDB instance + http_conn = self.http_conn + return http_conn.objdb @property def collection_type(self): @@ -169,11 +181,11 @@ def get(self, obj_uuid): """ Return id obj for given uuid """ obj_class = get_class_for_uuid(obj_uuid) if obj_class is GroupID: - obj = GroupID(obj_uuid, http_conn=self._http_conn) + obj = GroupID(obj_uuid, http_conn=self.http_conn) elif obj_class is TypeID: - obj = TypeID(obj_uuid, http_conn=self._http_conn) + obj = TypeID(obj_uuid, http_conn=self.http_conn) elif obj_class is DatasetID: - obj = DatasetID(obj_uuid, http_conn=self._http_conn) + obj = DatasetID(obj_uuid, http_conn=self.http_conn) else: raise TypeError(f"Unexpected type: {obj_uuid}") @@ -188,8 +200,7 @@ def attrs(self): def set_attr(self, name, attr): """ Create the given attribute """ - objdb = self._http_conn.objdb - 
objdb.set_attr(self._uuid, name, attr) + self.objdb.set_attr(self._uuid, name, attr) def get_attr(self, name): """ Return the given attribute """ @@ -202,8 +213,7 @@ def get_attr(self, name): def del_attr(self, name): """ Delete the named attribute """ - objdb = self._http_conn.objdb - objdb.del_attr(self._uuid, name) + self.objdb.del_attr(self._uuid, name) def has_attr(self, name): """ Test if an attribute name exists """ @@ -247,17 +257,15 @@ def __init__(self, obj_uuid, http_conn=None): """ self._uuid = get_UUID(obj_uuid) - if http_conn is not None: + if http_conn: # use a weakref here so we don't keep a potentially large # objdb in memory accidentally - self._http_conn = http_conn # weakref.ref(http_conn) + self._http_conn = weakref.ref(http_conn) else: raise IOError("Expected parent to have http connector") - objdb = http_conn.objdb - - if self._uuid not in objdb: - objdb.fetch(self._uuid) # will throw IOError if not found + if self._uuid not in self.objdb: + self.objdb.fetch(self._uuid) # will throw IOError if not found def __eq__(self, other): if isinstance(other, self.__class__): @@ -270,8 +278,7 @@ def __ne__(self, other): def refresh(self): """ get the latest obj_json data from server """ - objdb = self._http_conn.objdb - objdb.fetch(self.uuid) + self.objdb.fetch(self.uuid) def close(self): """Remove handles to id. @@ -281,7 +288,8 @@ def close(self): self._http_conn = None def __bool__(self): - return bool(self.uuid) + """ Return true if the weak ref to http_conn is still valid """ + return bool(self._http_conn()) def __del__(self): """ cleanup """ @@ -289,7 +297,7 @@ def __del__(self): def __repr__(self): class_name = self.__class__.__name__ - if self._uuid: + if self._uuid and self._http_conn(): r = f"<{class_name}({self._uuid})>" else: r = f"" @@ -385,7 +393,7 @@ def __init__(self, obj_id, http_conn=None): def getVerboseInfo(self): req = f"/datasets/{self._uuid}" params = {'verbose': 1} - rsp = self._http_conn.GET(req, params=params) + rsp = self.http_conn.GET(req, params=params) if rsp.status_code != 200: raise RuntimeError(f"get status: {rsp.status_code} for {req}") rsp_json = rsp.json() @@ -394,7 +402,7 @@ def getVerboseInfo(self): def resize(self, dims): """ update the shape of the dataset """ # send the request to the server - objdb = self._http_conn.objdb + objdb = self.http_conn.objdb objdb.resize(self._uuid, dims) @@ -421,7 +429,7 @@ def make_obj(self, title, type_json=None, shape=None, cpl=None, track_order=None links = obj_json['links'] if title in links: raise IOError("Unable to create object (name already exists)") - objdb = self._http_conn.objdb + objdb = self.http_conn.objdb kwds = {} if shape is not None: @@ -451,7 +459,7 @@ def set_link(self, title, link_json, replace=False): links = self.links if not replace and title in links: raise IOError("Unable to create link (name already exists)") - objdb = self._http_conn.objdb + objdb = self.http_conn.objdb objdb.set_link(self.uuid, title, link_json, replace=replace) @@ -461,7 +469,7 @@ def del_link(self, title): if title not in links: # not found raise KeyError(f"link '{title}' not found") - objdb = self._http_conn.objdb + objdb = self.http_conn.objdb objdb.del_link(self.uuid, title) @property @@ -498,3 +506,23 @@ def has_link(self, title): return True else: return False + + +class FileID(GroupID): + + def __init__(self, root_uuid, http_conn=None): + super().__init__(root_uuid, http_conn=http_conn) + self._file_conn = http_conn # keep a strong ref here + + def __bool__(self): + if self._file_conn: + return True + 
else: + return False + + def close(self): + """Remove handles to id. + """ + + self._file_conn = None + super().close() diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 61e5f61b..70f2a13a 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -55,11 +55,9 @@ def test_create(self): self.assertEqual(g1.name, "/g1") g1_file = g1.file - print("g1_file.name:", g1_file.filename) - self.assertEqual(g1.file.filename, filename) + self.assertEqual(g1_file.filename, filename) r.create_group("g1/g1.1") - print(g1_file.filename) g1_1 = r["g1/g1.1"] self.assertEqual(g1_1.name, "/g1/g1.1") self.assertEqual(len(r), 1) @@ -142,12 +140,11 @@ def test_create(self): self.assertEqual(g1_class, h5py.Group) linkee_class = r.get('mysoftlink', getclass=True) self.assertEqual(linkee_class, h5py.Group) - print(f"test {r}.get, getclass and getlink are true") link_class = r.get('mysoftlink', getclass=True, getlink=True) self.assertEqual(link_class, h5py.SoftLink) softlink = r.get('mysoftlink', getlink=True) self.assertEqual(softlink.path, '/g1/g1.1') - """ + linkee_class = r.get('myexternallink', getclass=True) link_class = r.get('myexternallink', getclass=True, getlink=True) self.assertEqual(link_class, h5py.ExternalLink) @@ -155,22 +152,17 @@ def test_create(self): self.assertEqual(external_link.path, 'somepath') external_link_filename = external_link.filename self.assertTrue(external_link_filename.find('link_target') > -1) - """ links = r.items() got_external_link = False - print("looping through links") for link in links: - print("link:", link[0]) title = link[0] obj = link[1] if title == 'myexternallink': - print("title is myexternallink") self.assertTrue(obj is not None) self.assertEqual(len(obj), 0) self.assertTrue(obj.file.filename != filename) got_external_link = True - print("got_external_link") self.assertTrue(got_external_link) @@ -254,16 +246,19 @@ def test_nested_create(self): f.close() def test_closed_group(self): - def get_group_ref(): + def get_group_ref(f): tmp_filename = self.getFileName("test_closed_group") - with h5py.File(tmp_filename, 'w') as f: - g1 = f.create_group("g1") + with h5py.File(tmp_filename, 'w') as g: + g1 = g.create_group("g1") self.assertTrue(isinstance(g1, h5py.Group)) self.assertTrue(g1) + return g1 - return g1 - - grp = get_group_ref() + filename = self.getFileName("tmp_file") + f = h5py.File(filename, 'w') + grp = get_group_ref(f) + # grp should be closed + self.assertFalse(grp is None) self.assertFalse(grp) def test_external_links(self): From 712bcdb085df06d9ba7192f7ad34f9c888334871 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 22 Jan 2025 16:23:36 +0800 Subject: [PATCH 20/32] updates for folders and dimscales --- h5pyd/_hl/dims.py | 308 ++++---------------------------- h5pyd/_hl/folders.py | 13 +- h5pyd/_hl/objectid.py | 8 + h5pyd/h5ds.py | 366 ++++++++++++++++++++++++++++++++++----- test/hl/test_dimscale.py | 3 +- 5 files changed, 379 insertions(+), 319 deletions(-) diff --git a/h5pyd/_hl/dims.py b/h5pyd/_hl/dims.py index f3616e12..3ba69cc4 100644 --- a/h5pyd/_hl/dims.py +++ b/h5pyd/_hl/dims.py @@ -13,17 +13,7 @@ from __future__ import absolute_import from . import base from .dataset import Dataset -from .objectid import DatasetID - - -def _getAttrValue(objid, attr_name): - """ helper function to get an attribute value. - Return None if attribute is not found, - else return attr_json['value'] """ - if objid.has_attr(attr_name): - attr_json = objid.get_attr(attr_name) - return attr_json['value'] - return None +from .. 
import h5ds class DimensionProxy(base.CommonStateObject): @@ -32,44 +22,16 @@ class DimensionProxy(base.CommonStateObject): @property def label(self): ''' Get the dimension scale label ''' - label_values = _getAttrValue(self._id, 'DIMENSION_LABELS') - - if label_values: - return '' - - if self._dimension >= len(label_values): - # label get request out of range - return '' - - return label_values[self._dimension] + return h5ds.get_label(self._id, self._dimension) @label.setter def label(self, val): - label_name = 'DIMENSION_LABELS' - if self._id.has_attr(label_name): - labels = self._id.get_attr(label_name) - else: - rank = self._id.rank - labels = { - 'shape': { - 'class': 'H5S_SIMPLE', - 'dims': [rank] - }, - 'type': { - 'class': 'H5T_STRING', - 'charSet': 'H5T_CSET_UTF8', - 'length': 'H5T_VARIABLE', - 'strPad': 'H5T_STR_NULLTERM' - }, - 'value': ['' for n in range(rank)] - } - labels['value'][self._dimension] = val - self._id.set_attr(label_name, labels) - - def __init__(self, id_, dimension): - if not isinstance(id_, DatasetID): - raise TypeError(f"expected DatasetID, but got: {type(id_)}") - self._id = id_ + h5ds.set_label(self._id, self._dimension, val) + + def __init__(self, dset, dimension): + if not isinstance(dset, Dataset): + raise TypeError(f"expected Dataset, but got: {type(dset)}") + self._id = dset.id self._dimension = int(dimension) def __hash__(self): @@ -83,211 +45,57 @@ def __iter__(self): yield k def __len__(self): - dimlist_values = _getAttrValue(self._id, 'DIMENSION_LIST') - if not dimlist_values: - return 0 - - if self._dimension >= len(dimlist_values): - # dimension scale len request out of range - return 0 - return len(dimlist_values[self._dimension]) + return h5ds.get_num_scales(self._id, self._dimension) def __getitem__(self, item): + """ get a dimension scale. + item can be an int in which case scale at that index will be returned + or item can be a str in which casee the scale ith that name will be returned """ - dimlist_values = _getAttrValue(self._id, 'DIMENSION_LIST') - if dimlist_values is None: - dimlist_attr_values = [] - - if self._dimension >= len(dimlist_attr_values): - # dimension scale len request out of range") - return None - - dimlist_values = dimlist_attr_values[self._dimension] - dset_scale_id = None if isinstance(item, int): - if item >= len(dimlist_values): - # no dimension scale - raise IndexError(f"No dimension scale found for index: {item}") - ref_id = dimlist_values[item] - if ref_id and not ref_id.startswith("datasets/"): - msg = f"unexpected ref_id: {ref_id}" - raise IOError(msg) - else: - dset_scale_id = self._id.get(ref_id) + scales = [] + h5ds.iterate(self._id, self._dimension, scales.append, 0) + return Dataset(scales[item]) else: - # Iterate through the dimension scales finding one with the - # correct name - for ref_id in dimlist_values: - if not ref_id: - continue - if not ref_id.startswith("datasets/"): - raise IOError(f"unexpected ref_id: {ref_id}") - dset_id = self._id.get(ref_id) - if item == _getAttrValue(dset_id, 'NAME'): - # found it! - dset_scale_id = dset_id - break - - if not dset_scale_id: - raise KeyError(f"No dimension scale with name '{item}' found'") - dscale = Dataset(dset_scale_id) - - return dscale + def f(dsid): + """ Iterate over scales to find a matching name """ + if h5ds.get_scale_name(dsid) == self._e(item): + return dsid + + res = h5ds.iterate(self._id, self._dimension, f, 0) + if res is None: + raise KeyError(item) + return Dataset(res) def attach_scale(self, dscale): ''' Attach a scale to this dimension. 
Provide the Dataset of the scale you would like to attach. ''' - dset = Dataset(self._id) - dscale_class = _getAttrValue(dscale.id, 'CLASS') - if dscale_class is None: - dset.dims.create_scale(dscale) - dscale_class = _getAttrValue(dscale.id, 'CLASS') - - if dscale_class != 'DIMENSION_SCALE': - raise RuntimeError(f"{dscale.name} is not a dimension scale") - - dset_class = _getAttrValue(self._id, 'CLASS') - if dset_class == 'DIMENSION_SCALE': - msg = f"{dset.name} cannot attach a dimension scale to a dimension scale" - raise RuntimeError(msg) - - # Create a DIMENSION_LIST attribute if needed - rank = self._id.rank - value = _getAttrValue(self._id, 'DIMENSION_LIST') - if value: - # delete and replace later - self._id.del_attr('DIMENSION_LIST') - else: - value = [list() for r in range(rank)] - - dimlist = { - 'creationProperties': { - 'nameCharEncoding': 'H5T_CSET_ASCII' - }, - 'shape': { - 'class': 'H5S_SIMPLE', - 'dims': [rank] - }, - 'type': { - 'base': { - 'base': 'H5T_STD_REF_OBJ', - 'class': 'H5T_REFERENCE' - }, - 'class': 'H5T_VLEN' - }, - 'value': value - } - - # Update the DIMENSION_LIST attribute with the object reference to the - # dimension scale - dimlist['value'][self._dimension].append('datasets/' + dscale.id.uuid) - self._id.set_attr('DIMENSION_list', dimlist) - - if dscale.id.has_attr('REFERENCE_LIST'): - old_reflist = dscale.id.get_attr('REFERENCE_LIST') - else: - old_reflist = { - 'creationProperties': { - 'nameCharEncoding': 'H5T_CSET_ASCII' - }, - 'shape': { - 'class': 'H5S_SIMPLE' - }, - 'type': { - 'class': 'H5T_COMPOUND', - 'fields': [ - { - 'name': 'dataset', - 'type': { - 'base': 'H5T_STD_REF_OBJ', - 'class': 'H5T_REFERENCE' - } - }, - { - 'name': 'index', - 'type': { - 'base': 'H5T_STD_I32LE', - 'class': 'H5T_INTEGER' - } - } - ] - } - } - - new_reflist = {} - new_reflist["type"] = old_reflist["type"] - new_reflist["shape"] = old_reflist["shape"] - if "value" in old_reflist: - reflist_value = old_reflist["value"] - if reflist_value is None: - reflist_value = [] - else: - reflist_value = [] - reflist_value.append(['datasets/' + dset.id.uuid, self._dimension]) - new_reflist["value"] = reflist_value - new_reflist["shape"]["dims"] = [len(reflist_value), ] - - # Update the REFERENCE_LIST attribute of the dimension scale - dscale.id.id.set_attr('REFERENCE_LIST', new_reflist) + h5ds.attach_scale(self._id, dscale.id, self._dimension) def detach_scale(self, dscale): ''' Remove a scale from this dimension. Provide the Dataset of the scale you would like to remove. 
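        For example, a minimal sketch assuming a scale dataset named "scale_x"
        has previously been attached:

            dset.dims[0].detach_scale(f["scale_x"])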
''' - if 'DIMENSION_LIST' not in self._id.attrs: - raise IOError("no DIMENSION_LIST attr in {dset._id}") - dimlist = self._id.get_attr('DIMENSION_LIST') - self._id.del_attr('DIMENSION_LIST') - - try: - ref = 'datasets/' + dscale.id.uuid - dimlist['value'][self._dimension].remove(ref) - except Exception as e: - # Restore the attribute's old value then raise the same - # exception - self._id.set_attr('DIMENSION_LIST', dimlist) - raise e - self._id.set_attr('DIMENSION_LIST', dimlist) - - if dscale.id.has_attr('REFERENCE_LIST'): - old_reflist = dscale.id.get_attr('REFERENCE_LIST') - else: - old_reflist = {} - - if "value" in old_reflist and len(old_reflist["value"]) > 0: - new_refs = list() - - remove = ['datasets/' + self._id.uuid, self._dimension] - for el in old_reflist['value']: - if remove[0] != el[0] and remove[1] != el[1]: - new_refs.append(el) - - new_reflist = {} - new_reflist["type"] = old_reflist["type"] - if len(new_refs) > 0: - new_reflist["value"] = new_refs - new_reflist["shape"] = [len(new_refs), ] - # tbd: replace = True - dscale.id.set_attr('REFERENCE_LIST', new_reflist) - else: - # Remove REFERENCE_LIST attribute if this dimension scale is - # not attached to any dataset - if old_reflist: - dscale.id.del_attr('REFERENCE_LIST') + h5ds.detach_scale(self._id, dscale.id, self._dimension) def items(self): ''' Get a list of (name, Dataset) pairs with all scales on this dimension. ''' + scale_ids = [] + + # H5DSiterate raises an error if there are no dimension scales, + # rather than iterating 0 times. + if len(self) > 0: + h5ds.iterate(self._id, self._dimension, scale_ids.append, 0) + scales = [] - num_scales = self.__len__() - for i in range(num_scales): - dscale = self.__getitem__(i) - dscale_name = _getAttrValue(dscale.id.id, 'NAME') - scales.append((dscale_name, dscale)) + for scale_id in scale_ids: + scale_name = h5ds.get_scale_name(scale_id) + scales.append(scale_name, Dataset(scale_id)) return scales def keys(self): @@ -344,46 +152,4 @@ def create_scale(self, dset, name=''): Provide the dataset and a name for the scale. 
''' - # CLASS attribute with the value 'DIMENSION_SCALE' - class_attr = { - 'creationProperties': { - 'nameCharEncoding': 'H5T_CSET_ASCII' - }, - 'shape': { - 'class': 'H5S_SCALAR' - }, - 'type': { - 'charSet': 'H5T_CSET_ASCII', - 'class': 'H5T_STRING', - 'length': 16, - 'strPad': 'H5T_STR_NULLTERM' - }, - 'value': 'DIMENSION_SCALE' - } - - # NAME attribute with dimension scale's name - if isinstance(name, bytes): - name = name.decode('ascii') - else: - name = name.encode('utf-8').decode('ascii') - - name_attr = { - 'creationProperties': { - 'nameCharEncoding': 'H5T_CSET_ASCII' - }, - 'shape': { - 'class': 'H5S_SCALAR' - }, - 'type': { - 'charSet': 'H5T_CSET_ASCII', - 'class': 'H5T_STRING', - 'length': len(name) + 1, - 'strPad': 'H5T_STR_NULLTERM' - }, - 'value': name - } - self._id.set_attr('CLASS', class_attr) - try: - self._id.set_attr('NAME', name_attr) - except Exception: - self._id.del_attr('CLASS') + dset.make_scale(name) diff --git a/h5pyd/_hl/folders.py b/h5pyd/_hl/folders.py index bfdfe672..61105f94 100644 --- a/h5pyd/_hl/folders.py +++ b/h5pyd/_hl/folders.py @@ -13,7 +13,6 @@ from __future__ import absolute_import import os.path as op -import json import time import logging from .httpconn import HttpConn @@ -224,10 +223,10 @@ def __init__( if rsp.status_code < 500: self.log.warning(f"folder put status_code: {rsp.status_code}") else: - self.log.error("status_code: {}".format(rsp.status_code)) + self.log.error(f"status_code: {rsp.status_code}") raise IOError(rsp.status_code, rsp.reason) - domain_json = json.loads(rsp.text) - self.log.info("domain_json: {}".format(domain_json)) + domain_json = rsp.json() + self.log.info(f"domain_json: {domain_json}") if "class" in domain_json: if domain_json["class"] != "folder": self.log.warning("Not a folder domain") @@ -258,7 +257,7 @@ def getACL(self, username): rsp = self._http_conn.GET(req) if rsp.status_code != 200: raise IOError(rsp.reason) - rsp_json = json.loads(rsp.text) + rsp_json = rsp.json() acl_json = rsp_json["acl"] return acl_json @@ -269,7 +268,7 @@ def getACLs(self): rsp = self._http_conn.GET(req) if rsp.status_code != 200: raise IOError(rsp.status_code, rsp.reason) - rsp_json = json.loads(rsp.text) + rsp_json = rsp.json() acls_json = rsp_json["acls"] return acls_json @@ -315,7 +314,7 @@ def _getSubdomains(self): rsp = self._http_conn.GET(req, params=params) if rsp.status_code != 200: raise IOError(rsp.status_code, rsp.reason) - rsp_json = json.loads(rsp.text) + rsp_json = rsp.json() if "domains" not in rsp_json: raise IOError(500, "Unexpected Error") domains = rsp_json["domains"] diff --git a/h5pyd/_hl/objectid.py b/h5pyd/_hl/objectid.py index 35711463..32aaaf65 100644 --- a/h5pyd/_hl/objectid.py +++ b/h5pyd/_hl/objectid.py @@ -211,6 +211,14 @@ def get_attr(self, name): return attr + def get_attr_value(self, name): + """ Return attribute value or None if not found """ + if self.has_attr(name): + attr_json = self.get_attr(name) + return attr_json["value"] + else: + return None + def del_attr(self, name): """ Delete the named attribute """ self.objdb.del_attr(self._uuid, name) diff --git a/h5pyd/h5ds.py b/h5pyd/h5ds.py index a7e08327..4ad728c5 100644 --- a/h5pyd/h5ds.py +++ b/h5pyd/h5ds.py @@ -9,28 +9,221 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. 
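The attribute bookkeeping this module takes over from dims.py is easiest to see through the high-level dims interface it now backs; a minimal sketch, with the domain and dataset names purely illustrative:

    import numpy as np
    import h5pyd

    with h5pyd.File("/home/myuser/scales_demo.h5", "w") as f:   # hypothetical domain
        dset = f.create_dataset("temperatures", (10, 10), dtype="f4")
        x = f.create_dataset("scale_x", data=np.arange(10) * 10.0)

        x.make_scale("x")               # mark scale_x as a dimension scale
        dset.dims[0].attach_scale(x)    # updates DIMENSION_LIST and REFERENCE_LIST
        dset.dims[0].label = "x"        # stored in the DIMENSION_LABELS attribute

        assert len(dset.dims[0]) == 1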
# ############################################################################## -import json from ._hl.objectid import DatasetID +def attach_scale(dset: DatasetID, dscale: DatasetID, idx: int): + """ Attach Dimension Scale dscale to Dimension idx of Dataset dset. """ -def _getAttributeJson(attr_name: str, dsetid: DatasetID) -> dict: - uuid = dsetid.id - objdb = dsetid.http_conn.getObjDb() - if objdb and uuid in objdb: - dset_json = objdb[uuid] - attrs_json = dset_json["attributes"] - return attrs_json.get(attr_name, dict()) + rank = dset.rank + if idx < 0: + raise ValueError("dimension must be non-negative") + if idx >= rank: + raise ValueError("invalid dimension") + + if not is_scale(dscale): + raise TypeError("f{dscale} is not a dimension scale") + + if is_scale(dset): + raise TypeError("cannot attach a dimension scale to a dimension scale") + + # Create a DIMENSION_LIST attribute if needed + + orig_dimlist = dset.getAttrValue('DIMENSION_LIST') + if orig_dimlist: + # delete and replace later + dset.del_attr('DIMENSION_LIST') + + value = [list() for _ in range(rank)] + + dimlist = { + 'creationProperties': { + 'nameCharEncoding': 'H5T_CSET_ASCII' + }, + 'shape': { + 'class': 'H5S_SIMPLE', + 'dims': [rank] + }, + 'type': { + 'base': { + 'base': 'H5T_STD_REF_OBJ', + 'class': 'H5T_REFERENCE' + }, + 'class': 'H5T_VLEN' + }, + 'value': value + } + + # Update the DIMENSION_LIST attribute with the object reference to the + # dimension scale + dimlist['value'][idx].append('datasets/' + dscale.uuid) + dset.set_attr('DIMENSION_list', dimlist) + + if dscale.has_attr('REFERENCE_LIST'): + old_reflist = dscale.get_attr('REFERENCE_LIST') else: - req = f"/datasets/{uuid}/attributes/{attr_name}" - rsp = dsetid.http_conn.GET(req) - if rsp.status_code == 200: - return json.loads(rsp.text) + old_reflist = { + 'creationProperties': { + 'nameCharEncoding': 'H5T_CSET_ASCII' + }, + 'shape': { + 'class': 'H5S_SIMPLE' + }, + 'type': { + 'class': 'H5T_COMPOUND', + 'fields': [ + { + 'name': 'dataset', + 'type': { + 'base': 'H5T_STD_REF_OBJ', + 'class': 'H5T_REFERENCE' + } + }, + { + 'name': 'index', + 'type': { + 'base': 'H5T_STD_I32LE', + 'class': 'H5T_INTEGER' + } + } + ] + } + } + + new_reflist = {} + new_reflist["type"] = old_reflist["type"] + new_reflist["shape"] = old_reflist["shape"] + if "value" in old_reflist: + reflist_value = old_reflist["value"] + if reflist_value is None: + reflist_value = [] + else: + reflist_value = [] + reflist_value.append(['datasets/' + dset.uuid, idx]) + new_reflist["value"] = reflist_value + new_reflist["shape"]["dims"] = [len(reflist_value), ] + + # Update the REFERENCE_LIST attribute of the dimension scale + dscale.set_attr('REFERENCE_LIST', new_reflist) + +def detach_scale(dset: DatasetID, dscale: DatasetID, idx: int): + """ Detach Dimension Scale dscale from the Dimension idx of Dataset dset. 
""" + + rank = dset.rank + if idx < 0: + raise ValueError("dimension must be non-negative") + if idx >= rank: + raise ValueError("invalid dimension") + + if not dset.has_attr('DIMENSION_LIST'): + raise IOError("no DIMENSION_LIST attr in {dset}") + dimlist = dset.get_attr('DIMENSION_LIST') + dset.del_attr('DIMENSION_LIST') + + try: + # TBD: use ref class + ref = 'datasets/' + dscale.uuid + dimlist['value'][idx].remove(ref) + except Exception as e: + # Restore the attribute's old value then raise the same + # exception + dset.set_attr('DIMENSION_LIST', dimlist) + raise e + dset.set_attr('DIMENSION_LIST', dimlist) + + if dscale.has_attr('REFERENCE_LIST'): + old_reflist = dscale.get_attr('REFERENCE_LIST') + else: + old_reflist = {} + + if "value" in old_reflist and len(old_reflist["value"]) > 0: + new_refs = list() + + remove = ['datasets/' + dset.uuid, idx] + for el in old_reflist['value']: + if remove[0] != el[0] and remove[1] != el[1]: + new_refs.append(el) + + new_reflist = {} + new_reflist["type"] = old_reflist["type"] + if len(new_refs) > 0: + new_reflist["value"] = new_refs + new_reflist["shape"] = [len(new_refs), ] + if dscale.has_attr('REFERENCE_LIST'): + dscale.del_attr('REFERENCE_LIST') + dscale.set_attr('REFERENCE_LIST', new_reflist) else: - return dict() + # Remove REFERENCE_LIST attribute if this dimension scale is + # not attached to any dataset + if old_reflist: + dscale.del_attr('REFERENCE_LIST') + +def get_label(dset: DatasetID, idx: int) -> str: + """ Read the label for Dimension idx of Dataset dset into buffer label. """ + + rank = dset.rank + if idx < 0: + raise ValueError("dimension must be non-negative") + if idx >= rank: + raise ValueError("invalid dimension") + + label_values = dset.get_attr('DIMENSION_LABELS') + + if not label_values: + return '' + + if idx >= len(label_values): + # label get request out of range + return '' + + return label_values[idx] + +def get_num_scales(dset: DatasetID, dim: int) -> int: + """ Determines how many Dimension Scales are attached to Dimension dim of Dataset dset. """ + + rank = dset.rank + if dim < 0: + raise ValueError("dimension must be non-negative") + if dim >= rank: + raise ValueError("invalid dimension") + + dimlist_values = dset.get_attr_value('DIMENSION_LIST') + if not dimlist_values: + return 0 + + if dim >= len(dimlist_values): + # dimension scale len request out of range + return 0 + return len(dimlist_values[dim]) + + +def get_scale_name(dscale: DatasetID) -> str: + """ Retrieves name of Dimension Scale dscale. """ + + return dscale.get_attr_value("NAME") + +def is_attached(dset: DatasetID, dscale: DatasetID, idx: int) -> bool: + """ Report if Dimension Scale dscale is currently attached to Dimension idx of Dataset dset. """ + rank = dset.rank + if idx < 0: + raise ValueError("dimension must be non-negative") + if idx >= rank: + raise ValueError("invalid dimension") + + if not is_scale(dscale) or is_scale(dset): + return False + if not dset.has_attr("DIMENSION_LIST"): + return False + dimlist = dset.get_attr("DIMENSION_LIST") + reflist = dscale.get_attr("REFERENCE_LIST") + try: + return ([f"datasets/{dset._uuid}", idx] in + reflist["value"] and f"datasets/{dscale._uuid}" in dimlist["value"][idx]) + except (KeyError, IndexError): + return False -def is_scale(dsetid: DatasetID) -> bool: - """True if HDF5 dataset is a Dimension Scale.""" +def is_scale(dset: DatasetID) -> bool: + """ Determines whether dset is a dimension scale. """ # This is the expected CLASS attribute's JSON... 
# { # 'creationProperties': { @@ -47,32 +240,127 @@ def is_scale(dsetid: DatasetID) -> bool: # }, # 'value': 'DIMENSION_SCALE' # } - class_json = _getAttributeJson("CLASS", dsetid) - try: - if class_json["value"] != "DIMENSION_SCALE": - return False - elif class_json["shape"]["class"] != "H5S_SCALAR": - return False - elif class_json["type"]["class"] != "H5T_STRING": - return False - elif class_json["type"]["strPad"] != "H5T_STR_NULLTERM": - return False - elif class_json["type"]["length"] != 16: - return False - except KeyError: + class_json = dset.get_attr("CLASS") + if class_json["value"] != "DIMENSION_SCALE": return False - + if 'creationProperties' not in class_json: + return False + cpl = class_json['creationProperties'] + if 'nameCharEncoding' not in cpl: + return False + if cpl['nameCharEncoding'] != 'H5T_CSET_ASCII': + return False + shape_json = class_json['shape'] + if shape_json.get('class') != 'H5S_SCALAR': + return False + type_json = class_json['type'] + if type_json.get('class') != 'H5T_STRING': + return False + if type_json.get('length') != 16: + return False + if type_json.get('charSet') != 'H5T_CSET_ASCII': + return False + if type_json.get('strPad') != 'H5T_STR_NULLTERM': + return False + return True +def set_label(dset: DatasetID, idx: int, label: str): + """ Set label for the Dimension idx of Dataset dset to the value label. """ -def is_attached(dsetid: DatasetID, dscaleid: DatasetID, idx: int) -> bool: - """True if Dimension Scale ``dscale`` is attached to Dataset ``dset`` at dimension ``idx``""" - if not is_scale(dscaleid) or is_scale(dsetid): - return False - dimlist = _getAttributeJson("DIMENSION_LIST", dsetid) - reflist = _getAttributeJson("REFERENCE_LIST", dscaleid) + rank = dset.rank + if idx < 0: + raise ValueError("dimension must be non-negative") + if idx >= rank: + raise ValueError("invalid dimension") + + label_name = 'DIMENSION_LABELS' + if dset.has_attr(label_name): + labels = dset.get_attr(label_name) + else: + labels = { + 'shape': { + 'class': 'H5S_SIMPLE', + 'dims': [rank] + }, + 'type': { + 'class': 'H5T_STRING', + 'charSet': 'H5T_CSET_UTF8', + 'length': 'H5T_VARIABLE', + 'strPad': 'H5T_STR_NULLTERM' + }, + 'value': ['' for n in range(rank)] + } + labels['value'][idx] = label + dset.set_attr(label_name, labels) + +def set_scale(dset: DatasetID, dimname: str): + """ Convert dataset dset to a dimension scale, with optional name dimname. """ + + # CLASS attribute with the value 'DIMENSION_SCALE' + class_attr = { + 'creationProperties': { + 'nameCharEncoding': 'H5T_CSET_ASCII' + }, + 'shape': { + 'class': 'H5S_SCALAR' + }, + 'type': { + 'charSet': 'H5T_CSET_ASCII', + 'class': 'H5T_STRING', + 'length': 16, + 'strPad': 'H5T_STR_NULLTERM' + }, + 'value': 'DIMENSION_SCALE' + } + + name_attr = { + 'creationProperties': { + 'nameCharEncoding': 'H5T_CSET_ASCII' + }, + 'shape': { + 'class': 'H5S_SCALAR' + }, + 'type': { + 'charSet': 'H5T_CSET_ASCII', + 'class': 'H5T_STRING', + 'length': len(dimname) + 1, + 'strPad': 'H5T_STR_NULLTERM' + }, + 'value': dimname + } + dset.set_attr('CLASS', class_attr) try: - return ([f"datasets/{dsetid.id}", idx] in - reflist["value"] and f"datasets/{dscaleid.id}" in dimlist["value"][idx]) - except (KeyError, IndexError): - return False + dset.set_attr('NAME', name_attr) + except Exception: + dset.del_attr('CLASS') + + +def iterate(dset: DatasetID, dim: int, callable: any, startidx: int=0) -> any: + """ Iterate a callable (function, method or callable object) over the members of a group. 
+ Your callable should have the signature: + + func(STRING name) => Result + Returning None continues iteration; returning anything else aborts iteration and returns that value. Keywords: + """ + + rank = dset.rank + if dim < 0: + raise ValueError("dimension must be non-negative") + if dim >= rank: + raise ValueError("invalid dimension") + + dimlist = dset.get_attr_value('DIMENSION_LIST') + if not dimlist: + return 0 + + if startidx >= len(dimlist): + # dimension scale len request out of range + return 0 + + idx = startidx + while idx < len(dimlist): + dscale_uuid = dimlist[idx] + callable(DatasetID(dscale_uuid)) + idx += 1 + diff --git a/test/hl/test_dimscale.py b/test/hl/test_dimscale.py index 0484c94b..02c20bb0 100644 --- a/test/hl/test_dimscale.py +++ b/test/hl/test_dimscale.py @@ -28,8 +28,7 @@ class TestDimensionScale(TestCase): def test_everything(self): """Everything related to dimension scales""" filename = self.getFileName('test_dimscale') - print('filename:', filename) - f = h5py.File(filename, 'w') + f = h5py.File(filename, 'w') dset = f.create_dataset('temperatures', (10, 10, 10), dtype='f') f.create_dataset('scale_x', data=np.arange(10) * 10e3) From 176301539474305e1fa089ac7dcbdb21e3dc4c6b Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 23 Jan 2025 14:08:06 +0800 Subject: [PATCH 21/32] reorganize lowlevel and highlevel files --- README.md | 2 +- h5pyd/__init__.py | 10 +- h5pyd/_hl/attrs.py | 6 +- h5pyd/_hl/base.py | 57 ++----- h5pyd/_hl/dataset.py | 7 +- h5pyd/_hl/datatype.py | 4 +- h5pyd/_hl/files.py | 4 +- h5pyd/_hl/folders.py | 2 +- h5pyd/_hl/group.py | 52 ++++-- h5pyd/_hl/requests_lambda.py | 291 --------------------------------- h5pyd/_hl/table.py | 8 +- h5pyd/h5ds.py | 45 ++--- h5pyd/{_hl => }/h5type.py | 30 +++- h5pyd/{_hl => }/h5type_test.py | 0 h5pyd/{_hl => }/httpconn.py | 19 +-- h5pyd/{_hl => }/objdb.py | 40 +++-- h5pyd/{_hl => }/objectid.py | 15 ++ h5pyd/{_hl => }/openid.py | 2 +- h5pyd/{_hl => }/serverinfo.py | 2 +- test/hl/test_attribute.py | 1 + test/hl/test_dataset_create.py | 21 ++- test/hl/test_dataset_objref.py | 45 +++-- test/hl/test_dimscale.py | 2 +- 23 files changed, 209 insertions(+), 456 deletions(-) delete mode 100644 h5pyd/_hl/requests_lambda.py rename h5pyd/{_hl => }/h5type.py (97%) rename h5pyd/{_hl => }/h5type_test.py (100%) rename h5pyd/{_hl => }/httpconn.py (97%) rename h5pyd/{_hl => }/objdb.py (94%) rename h5pyd/{_hl => }/objectid.py (97%) rename h5pyd/{_hl => }/openid.py (99%) rename h5pyd/{_hl => }/serverinfo.py (99%) diff --git a/README.md b/README.md index fee48c5c..4e969b12 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ variables are needed to be defined: To use "local" mode with S3, define these variables: * ``AWS_S3_GATEWAY`` - AWS S3 endpoint, e.g.: ``https://s3.us-west-2.amazonaws.com`` -* ``AWS_REGION`` - Region where the Lambda function is installed, e.g.: ``us-west-2`` +* ``AWS_REGION`` - Region where the S3 bucket is located, e.g.: ``us-west-2`` * ``AWS_SECRET_ACCESS_KEY`` - Your AWS secret access AWS_SECRET_ACCESS_KEY * ``AWS_ACCESS_KEY_ID`` - Your AWS access key ID diff --git a/h5pyd/__init__.py b/h5pyd/__init__.py index 28da0644..5312674c 100644 --- a/h5pyd/__init__.py +++ b/h5pyd/__init__.py @@ -14,10 +14,10 @@ from . 
import version from ._hl.base import Empty -from ._hl.h5type import special_dtype, Reference, RegionReference -from ._hl.h5type import vlen_dtype, string_dtype, enum_dtype -from ._hl.h5type import check_vlen_dtype, check_string_dtype, check_enum_dtype -from ._hl.h5type import check_opaque_dtype, check_ref_dtype, check_dtype +from .h5type import special_dtype, Reference, RegionReference +from .h5type import vlen_dtype, string_dtype, enum_dtype +from .h5type import check_vlen_dtype, check_string_dtype, check_enum_dtype +from .h5type import check_opaque_dtype, check_ref_dtype, check_dtype from ._hl.files import File, H5Image, is_hdf5 from ._hl.folders import Folder from ._hl.group import Group, SoftLink, ExternalLink, UserDefinedLink, HardLink @@ -25,7 +25,7 @@ from ._hl.table import Table from ._hl.datatype import Datatype from ._hl.attrs import AttributeManager -from ._hl.serverinfo import getServerInfo +from .serverinfo import getServerInfo from . import h5ds diff --git a/h5pyd/_hl/attrs.py b/h5pyd/_hl/attrs.py index fcaaae3f..6edd509d 100644 --- a/h5pyd/_hl/attrs.py +++ b/h5pyd/_hl/attrs.py @@ -25,7 +25,8 @@ from . import base from .base import jsonToArray, Empty from .datatype import Datatype -from .h5type import getTypeItem, createDataType, special_dtype, Reference +from ..objectid import get_class_for_uuid, GroupID, TypeID, DatasetID +from ..h5type import getTypeItem, createDataType, special_dtype, Reference class AttributeManager(base.MutableMappingHDF5, base.CommonStateObject): @@ -264,6 +265,9 @@ def create(self, name, value, shape=None, dtype=None): attr['shape'] = shape if value.dtype.kind != 'c': attr['value'] = self._bytesArrayToList(value) + elif isinstance(value, Reference): + # special case reference types + attr['value'] = value.tolist() else: # Special case: complex numbers special_dt = createDataType(type_json) diff --git a/h5pyd/_hl/base.py b/h5pyd/_hl/base.py index f8484bec..315e3540 100644 --- a/h5pyd/_hl/base.py +++ b/h5pyd/_hl/base.py @@ -21,8 +21,8 @@ from collections.abc import ( Mapping, MutableMapping, KeysView, ValuesView, ItemsView ) -from .objectid import FileID, ObjectID -from .h5type import Reference, check_dtype, special_dtype +from ..objectid import FileID, ObjectID +from ..h5type import Reference, check_dtype, special_dtype numpy_integer_types = (np.int8, np.uint8, np.int16, np.int16, np.int32, np.uint32, np.int64, np.uint64) numpy_float_types = (np.float16, np.float32, np.float64) @@ -43,28 +43,6 @@ def __exit__(self, a, b, c): pass -_phil = FakeLock() - -# Python alias for access from other modules -phil = _phil - - -def with_phil(func): - """ Locking decorator """ - """ - For h5yp source code compatiblity - jlr - """ - - import functools - - def wrapper(*args, **kwds): - with _phil: - return func(*args, **kwds) - - functools.update_wrapper(wrapper, func, ('__name__', '__doc__')) - return wrapper - - def find_item_type(data): """Find the item type of a simple object or collection of objects. 
@@ -591,7 +569,6 @@ class LinkCreationPropertyList(object): """ Represents a LinkCreationPropertyList """ - @with_phil def __init__(self, char_encoding=None): if char_encoding: if char_encoding not in ("CSET_ASCII", "CSET_UTF8"): @@ -600,7 +577,6 @@ def __init__(self, char_encoding=None): else: self._char_encoding = "CSET_ASCII" - @with_phil def __repr__(self): return "" @@ -614,7 +590,6 @@ class LinkAccessPropertyList(object): Represents a LinkAccessPropertyList """ - @with_phil def __repr__(self): return "" @@ -921,16 +896,14 @@ class ValuesViewHDF5(ValuesView): """ def __contains__(self, value): - with phil: - for key in self._mapping: - if value == self._mapping.get(key): - return True - return False + for key in self._mapping: + if value == self._mapping.get(key): + return True + return False def __iter__(self): - with phil: - for key in self._mapping: - yield self._mapping.get(key) + for key in self._mapping: + yield self._mapping.get(key) class ItemsViewHDF5(ItemsView): @@ -940,16 +913,14 @@ class ItemsViewHDF5(ItemsView): """ def __contains__(self, item): - with phil: - key, val = item - if key in self._mapping: - return val == self._mapping.get(key) - return False + key, val = item + if key in self._mapping: + return val == self._mapping.get(key) + return False def __iter__(self): - with phil: - for key in self._mapping: - yield (key, self._mapping.get(key)) + for key in self._mapping: + yield (key, self._mapping.get(key)) class MappingHDF5(Mapping): diff --git a/h5pyd/_hl/dataset.py b/h5pyd/_hl/dataset.py index 284bad33..9b74e389 100644 --- a/h5pyd/_hl/dataset.py +++ b/h5pyd/_hl/dataset.py @@ -23,13 +23,13 @@ from .base import HLObject, jsonToArray, bytesToArray, arrayToBytes from .base import Empty, guess_dtype -from .h5type import Reference, RegionReference +from ..h5type import Reference, RegionReference from .base import _decode -from .objectid import DatasetID +from ..objectid import DatasetID from . import filters from . import selections as sel from .datatype import Datatype -from .h5type import getTypeItem, check_dtype, special_dtype, getItemSize +from ..h5type import getTypeItem, check_dtype, special_dtype, getItemSize from .. import config _LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10)) @@ -1267,6 +1267,7 @@ def __getitem__(self, args, new_dtype=None): arr = jsonToArray(mshape, mtype, data) self.log.debug(f"jsontoArray returned: {arr}") + elif isinstance(selection, sel.PointSelection): format = "binary" # default binary body = {} diff --git a/h5pyd/_hl/datatype.py b/h5pyd/_hl/datatype.py index f25f7573..28eb9013 100644 --- a/h5pyd/_hl/datatype.py +++ b/h5pyd/_hl/datatype.py @@ -17,8 +17,8 @@ # from ..h5t import TypeID from .base import HLObject -from .objectid import TypeID -from .h5type import createDataType +from ..objectid import TypeID +from ..h5type import createDataType class Datatype(HLObject): diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index da5fa1e4..63863e61 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -17,9 +17,9 @@ import pathlib import time -from .objectid import FileID +from ..objectid import FileID from .group import Group -from .httpconn import HttpConn +from ..httpconn import HttpConn from .. import config VERBOSE_REFRESH_TIME = 1.0 # 1 second diff --git a/h5pyd/_hl/folders.py b/h5pyd/_hl/folders.py index 61105f94..eaacf149 100644 --- a/h5pyd/_hl/folders.py +++ b/h5pyd/_hl/folders.py @@ -15,7 +15,7 @@ import os.path as op import time import logging -from .httpconn import HttpConn +from ..httpconn import HttpConn from .. 
import config diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index 267d9eae..2848b761 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -17,13 +17,13 @@ import collections from .base import HLObject, MutableMappingHDF5, guess_dtype -from .objectid import TypeID, GroupID, DatasetID, isUUID -from .h5type import special_dtype +from ..objectid import ObjectID, TypeID, GroupID, DatasetID, isUUID +from ..h5type import special_dtype from . import dataset from .dataset import Dataset from .table import Table from .datatype import Datatype -from . import h5type +from .. import h5type def _h5parent(path): @@ -261,7 +261,15 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): dset = Dataset(dset_id) if base_name: - dset._name = f"{self._name}/{base_name}" + if parent_grp.name: + if parent_grp.name[-1] == '/': + dset._name = parent_grp.name + base_name + else: + dset._name = f"{parent_grp.name}/{base_name}" + else: + dset._name = None + else: + dset._name = None return dset @@ -416,25 +424,40 @@ def __getitem__(self, name, track_order=None): name = name.decode('utf-8') self.log.debug(f"group.__getitem__({name}, track_order={track_order})") - tgt_uuid = None + obj_id = None if isinstance(name, h5type.Reference): tgt = name.objref() # weak reference to ref object if tgt is not None: return tgt # ref'd object has not been deleted else: - tgt_uuid = name.id.id + tgt_uuid = name.uuid + obj_id = self.id.get(tgt_uuid) + elif isinstance(name, ObjectID): + obj_id = name elif isUUID(name): - tgt_uuid = name + # TBD: elements from dataset reference types are being returned as strings + # so interpret anything that looks like a UUID as ref pointer. + # Once bytesToArray is fixed to do the right thing, we can omit this check + obj_id = self.id.get(name) elif name == "/": # return root group - tgt_uuid = self.id.http_conn.root_uuid + root_uuid = self.id.http_conn.root_uuid + obj_id = self.id.get(root_uuid) else: pass # will do a path lookup - if tgt_uuid: - obj_id = self.id.get(tgt_uuid) + if obj_id: + # verify the object exists + objdb = self.id.http_conn.objdb + if obj_id.id not in objdb: + try: + objdb.fetch(obj_id.id) # will raise exception if + except IOError: + raise KeyError(f"Object {obj_id} does not exist") if isinstance(obj_id, GroupID): tgt = Group(obj_id) + if name == "/": + tgt._name = "/" elif isinstance(obj_id, DatasetID): if obj_id.rank == 1 and obj_id.type_class == 'H5T_COMPOUND': tgt = Table(obj_id) @@ -633,12 +656,9 @@ def __delitem__(self, name): """ Delete (unlink) an item from this group. 
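        With this change, passing an ObjectID deletes the object itself, while
        passing a name or path only removes the link; a minimal sketch (names
        are illustrative only):

            del grp["dset1"]           # unlinks the item named "dset1"
            del grp[other_dset.id]     # deletes the referenced object itself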
""" objdb = self.id.http_conn.objdb - if isUUID(name): - obj_id = op.basename(name) - if obj_id in objdb: - del objdb[obj_id] - else: - self.log.warning(f"expected to find obj_id: {obj_id} for delete") + if isinstance(name, ObjectID): + # delete the object, not the link + objdb.del_obj(name.id) else: parent_path = _h5parent(name) basename = _h5base(name) diff --git a/h5pyd/_hl/requests_lambda.py b/h5pyd/_hl/requests_lambda.py deleted file mode 100644 index 01a99880..00000000 --- a/h5pyd/_hl/requests_lambda.py +++ /dev/null @@ -1,291 +0,0 @@ -import json - -# rom .config import Config - -""" -get aiobotocore lambda client -""" - -LAMBDA_REQ_PREFIX = "http+lambda://" - -STATUS_REASONS = { - 200: "OK", - 201: "Created", - 202: "Accepted", - 204: "No Content", - 400: "Bad Request", - 401: "Unauthorized", - 403: "Forbidden", - 404: "Not Found", - 408: "Request Timeout", - 409: "Confict", - 410: "Gone", - 413: "Payload Too Large", - 500: "Internal Server Error", - 501: "Not Implemented", - 503: "Service Unavailable", - 504: "Gateway Timeout", - 507: "Insufficient Storage", -} - - -class HttpChunkIterator(object): - """ - Class to iterate through list of chunks of a http response - """ - - def __init__(self, data, chunk_size=1): - self._data = data - self._chunk_size = chunk_size - self._index = 0 - - def __iter__(self): - return self - - def __next__(self): - if self._data is None: - raise StopIteration() - if self._index >= len(self._data): - raise StopIteration() - num_bytes = len(self._data) - n = self._index - m = n + self._chunk_size - if m > num_bytes: - m = num_bytes - self._index = m - return self._data[n:m] - - -class LambdaResponse: - def __init__(self, lambda_rsp): - self._status_code = 500 - self._reason = "" - self._headers = {} - self._text = None - self._json = None - self._content_length = 0 - self._iter_index = 0 - if lambda_rsp and isinstance(lambda_rsp, dict): - if "StatusCode" in lambda_rsp: - lambda_status_code = lambda_rsp["StatusCode"] - - if lambda_status_code in (200, 201) and "Payload" in lambda_rsp: - payload = lambda_rsp["Payload"] - rsp_text = payload.read().decode("utf-8") - rsp_payload = json.loads(rsp_text) - if rsp_payload.get("isBase64Encoded"): - is_base64_encoded = True - else: - is_base64_encoded = False - - if "statusCode" in rsp_payload: - self._status_code = rsp_payload["statusCode"] - if "headers" in rsp_payload: - headers_text = rsp_payload["headers"] - - headers = json.loads(headers_text) - for k in headers: - v = headers[k] - self._headers[k] = v - if self._status_code in (200, 201) and "body" in rsp_payload: - body_text = rsp_payload["body"] - # set the json prop for a dict, - # otherwise just set the text prop - if isinstance(body_text, dict): - self._json = body_text - elif is_base64_encoded and body_text: - # convert hex encoded to bytes - self._text = bytes.fromhex(body_text) - else: - self._text = body_text - - else: - raise ValueError("lambda: unable to get payload") - else: - raise TypeError("lambda: expected dict response") - if self._status_code in STATUS_REASONS: - self._reason = STATUS_REASONS[self._status_code] - else: - self._reason = "Unexpected status code" - - @property - def status_code(self): - return self._status_code - - @property - def reason(self): - return self._reason - - @property - def text(self): - if self._text: - return self._text - elif self._json: - self._text = json.dumps(self._json) - return self._text - else: - return None - - def json(self): - if self._json: - return self._json - elif self._text: - self._json = 
json.loads(self._text) - return self._json - else: - return None - - def iter_content(self, chunk_size=1): - return HttpChunkIterator(self._text, chunk_size=chunk_size) - - @property - def headers(self): - return self._headers - - @property - def content_length(self): - if self._text: - return len(self._text) - elif self._json: - self._text = json.dumps(self._text) - return len(self._text) - else: - return 0 - - -class Session: - def __init__(self, timeout=10): - self.timeout = timeout - - def __enter__(self): - pass - - def __exit__(self): - pass - - def mount(self, protocol, adapter): - # TBD - # print(f"requests_lambda mount({protocol})") - pass - - def _invoke(self, req, method="GET", params=None, headers=None, data=None): - if not req: - msg = "no req" - raise ValueError(msg) - if not req.startswith(LAMBDA_REQ_PREFIX): - msg = f"Expected req to start with {LAMBDA_REQ_PREFIX}" - raise ValueError(msg) - if method not in ("GET", "PUT", "POST", "DELETE"): - msg = f"Unexpected method: {method}" - raise ValueError(msg) - if method in ("GET", "DELETE") and data: - msg = f"data not expected for method: {method}" - raise ValueError(msg) - - # Convert uri of the form: http+lambda://FUNC_NAME/REQ - # as: - # function_name = FUNC_NAME - # req_path = REQ - # params = {PARAMS} - s = req[len(LAMBDA_REQ_PREFIX):] # strip off protocol - index = s.find("/") - if index <= 0: - msg = "Unexpected request" - raise ValueError(msg) - function_name = s[:index] - if function_name.find("/") >= 0: - msg = f"unexpected lambda function name: {function_name}" - raise ValueError(msg) - index = s.find(function_name) - req_path = s[index + len(function_name):] - if not req_path: - msg = "no request path found" - raise ValueError(msg) - - # convert header values to string from bytes if needed - json_headers = {} - for k in headers: - v = headers[k] - if isinstance(v, bytes): - json_headers[k] = v.decode("utf-8") - else: - json_headers[k] = v - - req_json = { - "method": method, - "path": req_path, - "params": params, - "headers": json_headers, - "body": data, - } - - payload = json.dumps(req_json).encode("utf-8") - - import boto3 # import here so it's not a global dependency - from botocore.exceptions import ClientError - - # with boto3.client('lambda') - lambda_client = boto3.client("lambda") - try: - lambda_rsp = lambda_client.invoke( - FunctionName=function_name, - InvocationType="RequestResponse", - Payload=payload, - ) - except ClientError as ce: - if "Error" in ce.response and "Code" in ce.response["Error"]: - error_code = ce.response["Error"]["Code"] - else: - error_code = "Unknown Lambda error" - if error_code == "UnrecognizedClientException": - # this happens when the AWS access key not provided - error_code += " (are the AWS credentials valid?)" - raise ValueError(error_code) - rsp = LambdaResponse(lambda_rsp) - return rsp - - def get( - self, req, params=None, headers=None, stream=False, timeout=None, verify=None - ): - """ - Lambda GET request - - req should be in form: "http+lambda://function/path" - """ - if stream: - raise ValueError("stream not supported for Lambda") - rsp = self._invoke(req, params=params, headers=headers) - return rsp - - def put(self, req, params=None, headers=None, data=None, verify=None): - """ - Lambda PUT request - - req should be in form: "http+lambda://function/path" - """ - rsp = self._invoke(req, method="PUT", params=params, headers=headers, data=data) - return rsp - - def post(self, req, params=None, headers=None, data=None, verify=None): - """ - Lambda POST request - - req 
should be in form: "http+lambda://function/path" - """ - rsp = self._invoke( - req, method="POST", params=params, headers=headers, data=data - ) - return rsp - - def delete(self, req, params=None, headers=None, verify=None): - """ - Lambda DELETE request - - req should be in form: "http+lambda://function/path" - """ - rsp = self._invoke(req, method="DELETE", params=params, headers=headers) - return rsp - - def close(self): - # TBD - release any held resources - pass diff --git a/h5pyd/_hl/table.py b/h5pyd/_hl/table.py index fdd365af..d5e218d0 100644 --- a/h5pyd/_hl/table.py +++ b/h5pyd/_hl/table.py @@ -15,11 +15,11 @@ from .base import _decode from .base import bytesToArray from .dataset import Dataset -from .objectid import DatasetID +from ..objectid import DatasetID from . import selections as sel -from .h5type import Reference -from .h5type import check_dtype -from .h5type import getQueryDtype +from ..h5type import Reference +from ..h5type import check_dtype +from ..h5type import getQueryDtype class Cursor(): diff --git a/h5pyd/h5ds.py b/h5pyd/h5ds.py index 4ad728c5..bf456f92 100644 --- a/h5pyd/h5ds.py +++ b/h5pyd/h5ds.py @@ -11,6 +11,7 @@ ############################################################################## from ._hl.objectid import DatasetID + def attach_scale(dset: DatasetID, dscale: DatasetID, idx: int): """ Attach Dimension Scale dscale to Dimension idx of Dataset dset. """ @@ -19,20 +20,20 @@ def attach_scale(dset: DatasetID, dscale: DatasetID, idx: int): raise ValueError("dimension must be non-negative") if idx >= rank: raise ValueError("invalid dimension") - + if not is_scale(dscale): raise TypeError("f{dscale} is not a dimension scale") - + if is_scale(dset): raise TypeError("cannot attach a dimension scale to a dimension scale") - + # Create a DIMENSION_LIST attribute if needed - + orig_dimlist = dset.getAttrValue('DIMENSION_LIST') if orig_dimlist: # delete and replace later dset.del_attr('DIMENSION_LIST') - + value = [list() for _ in range(rank)] dimlist = { @@ -105,6 +106,7 @@ def attach_scale(dset: DatasetID, dscale: DatasetID, idx: int): # Update the REFERENCE_LIST attribute of the dimension scale dscale.set_attr('REFERENCE_LIST', new_reflist) + def detach_scale(dset: DatasetID, dscale: DatasetID, idx: int): """ Detach Dimension Scale dscale from the Dimension idx of Dataset dset. """ @@ -113,7 +115,7 @@ def detach_scale(dset: DatasetID, dscale: DatasetID, idx: int): raise ValueError("dimension must be non-negative") if idx >= rank: raise ValueError("invalid dimension") - + if not dset.has_attr('DIMENSION_LIST'): raise IOError("no DIMENSION_LIST attr in {dset}") dimlist = dset.get_attr('DIMENSION_LIST') @@ -157,6 +159,7 @@ def detach_scale(dset: DatasetID, dscale: DatasetID, idx: int): if old_reflist: dscale.del_attr('REFERENCE_LIST') + def get_label(dset: DatasetID, idx: int) -> str: """ Read the label for Dimension idx of Dataset dset into buffer label. """ @@ -165,7 +168,7 @@ def get_label(dset: DatasetID, idx: int) -> str: raise ValueError("dimension must be non-negative") if idx >= rank: raise ValueError("invalid dimension") - + label_values = dset.get_attr('DIMENSION_LABELS') if not label_values: @@ -177,6 +180,7 @@ def get_label(dset: DatasetID, idx: int) -> str: return label_values[idx] + def get_num_scales(dset: DatasetID, dim: int) -> int: """ Determines how many Dimension Scales are attached to Dimension dim of Dataset dset. 
""" @@ -185,7 +189,7 @@ def get_num_scales(dset: DatasetID, dim: int) -> int: raise ValueError("dimension must be non-negative") if dim >= rank: raise ValueError("invalid dimension") - + dimlist_values = dset.get_attr_value('DIMENSION_LIST') if not dimlist_values: return 0 @@ -198,9 +202,10 @@ def get_num_scales(dset: DatasetID, dim: int) -> int: def get_scale_name(dscale: DatasetID) -> str: """ Retrieves name of Dimension Scale dscale. """ - + return dscale.get_attr_value("NAME") + def is_attached(dset: DatasetID, dscale: DatasetID, idx: int) -> bool: """ Report if Dimension Scale dscale is currently attached to Dimension idx of Dataset dset. """ @@ -209,7 +214,7 @@ def is_attached(dset: DatasetID, dscale: DatasetID, idx: int) -> bool: raise ValueError("dimension must be non-negative") if idx >= rank: raise ValueError("invalid dimension") - + if not is_scale(dscale) or is_scale(dset): return False if not dset.has_attr("DIMENSION_LIST"): @@ -222,6 +227,7 @@ def is_attached(dset: DatasetID, dscale: DatasetID, idx: int) -> bool: except (KeyError, IndexError): return False + def is_scale(dset: DatasetID) -> bool: """ Determines whether dset is a dimension scale. """ # This is the expected CLASS attribute's JSON... @@ -262,9 +268,10 @@ def is_scale(dset: DatasetID) -> bool: return False if type_json.get('strPad') != 'H5T_STR_NULLTERM': return False - + return True + def set_label(dset: DatasetID, idx: int, label: str): """ Set label for the Dimension idx of Dataset dset to the value label. """ @@ -273,7 +280,7 @@ def set_label(dset: DatasetID, idx: int, label: str): raise ValueError("dimension must be non-negative") if idx >= rank: raise ValueError("invalid dimension") - + label_name = 'DIMENSION_LABELS' if dset.has_attr(label_name): labels = dset.get_attr(label_name) @@ -294,9 +301,10 @@ def set_label(dset: DatasetID, idx: int, label: str): labels['value'][idx] = label dset.set_attr(label_name, labels) + def set_scale(dset: DatasetID, dimname: str): """ Convert dataset dset to a dimension scale, with optional name dimname. """ - + # CLASS attribute with the value 'DIMENSION_SCALE' class_attr = { 'creationProperties': { @@ -336,8 +344,8 @@ def set_scale(dset: DatasetID, dimname: str): dset.del_attr('CLASS') -def iterate(dset: DatasetID, dim: int, callable: any, startidx: int=0) -> any: - """ Iterate a callable (function, method or callable object) over the members of a group. +def iterate(dset: DatasetID, dim: int, callable: any, startidx: int = 0) -> any: + """ Iterate a callable (function, method or callable object) over the members of a group. 
Your callable should have the signature: func(STRING name) => Result @@ -349,18 +357,17 @@ def iterate(dset: DatasetID, dim: int, callable: any, startidx: int=0) -> any: raise ValueError("dimension must be non-negative") if dim >= rank: raise ValueError("invalid dimension") - + dimlist = dset.get_attr_value('DIMENSION_LIST') if not dimlist: return 0 - + if startidx >= len(dimlist): # dimension scale len request out of range return 0 - + idx = startidx while idx < len(dimlist): dscale_uuid = dimlist[idx] callable(DatasetID(dscale_uuid)) idx += 1 - diff --git a/h5pyd/_hl/h5type.py b/h5pyd/h5type.py similarity index 97% rename from h5pyd/_hl/h5type.py rename to h5pyd/h5type.py index e48dc5c7..bbf1cf3c 100644 --- a/h5pyd/_hl/h5type.py +++ b/h5pyd/h5type.py @@ -58,9 +58,9 @@ class Reference(): Represents an HDF5 object reference """ @property - def id(self): + def uuid(self): """ Low-level identifier appropriate for this object """ - return self._id + return self._uuid @property def objref(self): @@ -70,22 +70,36 @@ def objref(self): def __init__(self, bind): """ Create a new reference by binding to a group/dataset/committed type """ - self._id = bind._id - self._objref = weakref.ref(bind) + if bind: + self._uuid = bind.id.uuid + self._objref = weakref.ref(bind) + else: + self._uuid = "" + self._objref = None def __repr__(self): - if not isinstance(self._id.id, str): + # TBD: for h5py compatiblity, this should return "", but + # we are using some hacks to pass refs as strings, so need this for now + if not isinstance(self._uuid, str): raise TypeError("Expected string id") - item = None + if not self._uuid: + item = "" + elif self._uuid.startswith("g-"): + item = f"groups/{self._uuid}" + elif self._uuid.startswith("t-"): + item = f"datatypes/{self._uuid}" + elif self._uuid.startswith("d-"): + item = f"datasets/{self._uuid}" + else: + raise ValueError("unexpected uuid: {self._uuid}") - collection_type = self._id.collection_type - item = f"{collection_type}/{self._id.id}" return item def tolist(self): return [self.__repr__(),] + class RegionReference(): """ diff --git a/h5pyd/_hl/h5type_test.py b/h5pyd/h5type_test.py similarity index 100% rename from h5pyd/_hl/h5type_test.py rename to h5pyd/h5type_test.py diff --git a/h5pyd/_hl/httpconn.py b/h5pyd/httpconn.py similarity index 97% rename from h5pyd/_hl/httpconn.py rename to h5pyd/httpconn.py index 1502351e..669cce68 100644 --- a/h5pyd/_hl/httpconn.py +++ b/h5pyd/httpconn.py @@ -26,8 +26,7 @@ from . import openid from .objdb import ObjDB -from .. import config -from . import requests_lambda +from . import config def eprint(*args, **kwargs): @@ -249,7 +248,6 @@ def json(self): class HttpConn: """ Some utility methods based on equivalents in base class. 
- TBD: Should refactor these to a common base class """ def __init__( @@ -276,7 +274,6 @@ def __init__( self._retries = retries self._timeout = timeout self._hsds = None - self._lambda = None self._api_key = api_key self._s = None # Sessions self._server_info = None @@ -301,12 +298,6 @@ def __init__( msg = "no endpoint set" raise ValueError(msg) - lambda_prefix = requests_lambda.LAMBDA_REQ_PREFIX - - if endpoint.startswith(lambda_prefix): - # save lambda function name - self._lambda = endpoint[len(lambda_prefix):] - elif endpoint.startswith("local"): # create a local hsds server # set the number of nodes @@ -548,10 +539,7 @@ def GET(self, req, format="json", params=None, headers=None): self._hsds.run() s = self.session - if self._lambda: - stream = False - else: - stream = True + stream = True # tbd - config for no streaming? rsp = s.get( self._endpoint + req, @@ -745,15 +733,12 @@ def session(self): retries = self._retries backoff_factor = 1 status_forcelist = (500, 502, 503, 504) - lambda_prefix = requests_lambda.LAMBDA_REQ_PREFIX if self._use_session: if self._s is None: if self._endpoint.startswith("http+unix://"): self.log.debug(f"create unixsocket session: {self._endpoint}") s = requests_unixsocket.Session() - elif self._endpoint.startswith(lambda_prefix): - s = requests_lambda.Session() else: # regular request session s = requests.Session() diff --git a/h5pyd/_hl/objdb.py b/h5pyd/objdb.py similarity index 94% rename from h5pyd/_hl/objdb.py rename to h5pyd/objdb.py index e1b9347d..d3c7ee0b 100644 --- a/h5pyd/_hl/objdb.py +++ b/h5pyd/objdb.py @@ -14,20 +14,8 @@ import time import weakref -from .. import config - - -def get_collection(uuid): - """ Return the collection type for the given obj uuid """ - - if uuid.startswith("g-"): - return "groups" - elif uuid.startswith("t-"): - return "datatypes" - elif uuid.startswith("d-"): - return "datasets" - else: - raise TypeError(f"unexpected uuid: {uuid}") +from . 
import config +from .objectid import get_collection class ObjDB(): @@ -108,8 +96,14 @@ def __getitem__(self, obj_uuid): def __delitem__(self, obj_uuid): if obj_uuid not in self._objdb: - self.log.warning(f"id: {obj_uuid} not found for deletion in objDB") - raise KeyError(obj_uuid) + print(f"{obj_uuid} not in objdb, fetching") + obj_json = self.fetch(obj_uuid) + if not obj_json: + self.log.warning(f"id: {obj_uuid} not found for deletion in objDB") + raise KeyError(obj_uuid) + collection = get_collection(obj_uuid) + req = f"/{collection}/{obj_uuid}" + self._http_conn.DELETE(req) del self._objdb[obj_uuid] del self._loadtime[obj_uuid] @@ -280,6 +274,8 @@ def set_link(self, group_uuid, title, link_json, replace=False): links[title] = link_json def del_link(self, group_uuid, title): + """ Delete the given link """ + if title.find('/') != -1: raise KeyError("objdb.del_link - link title can not be nested") obj_json = self.__getitem__(group_uuid) @@ -362,6 +358,18 @@ def make_obj(self, parent_uuid, title, type_json=None, shape=None, cpl=None, tra return obj_uuid + def del_obj(self, obj_uuid): + """ Delete the given object """ + collection = get_collection(obj_uuid) + req = f"/{collection}/{obj_uuid}" + + rsp = self.http_conn.DELETE(req) + if rsp.status_code != 200: + raise IOError(rsp.status_code, f"failed to delete object: {obj_uuid}") + # ok - so delete our cached copy + if obj_uuid in self._objdb: + del self._objdb[obj_uuid] + def set_attr(self, obj_uuid, name, attr_json): """ create update attribute """ obj_json = self.__getitem__(obj_uuid) diff --git a/h5pyd/_hl/objectid.py b/h5pyd/objectid.py similarity index 97% rename from h5pyd/_hl/objectid.py rename to h5pyd/objectid.py index 32aaaf65..d6319f15 100644 --- a/h5pyd/_hl/objectid.py +++ b/h5pyd/objectid.py @@ -64,6 +64,21 @@ def get_UUID(name): return obj_uuid +def get_collection(uuid): + """ Return the collection type for the given obj uuid """ + + obj_uuid = get_UUID(uuid) + + if obj_uuid.startswith("g-"): + return "groups" + elif obj_uuid.startswith("t-"): + return "datatypes" + elif obj_uuid.startswith("d-"): + return "datasets" + else: + raise TypeError(f"unexpected uuid: {uuid}") + + def get_class_for_uuid(uuid): """ Return class based on uuid """ if not uuid: diff --git a/h5pyd/_hl/openid.py b/h5pyd/openid.py similarity index 99% rename from h5pyd/_hl/openid.py rename to h5pyd/openid.py index e0eb0f07..bb59af54 100644 --- a/h5pyd/_hl/openid.py +++ b/h5pyd/openid.py @@ -29,7 +29,7 @@ def eprint(*args, **kwargs): # eprint("Unable to import google auth packages") -from .. import config as hsconfig +from . import config as hsconfig class OpenIDHandler(ABC): diff --git a/h5pyd/_hl/serverinfo.py b/h5pyd/serverinfo.py similarity index 99% rename from h5pyd/_hl/serverinfo.py rename to h5pyd/serverinfo.py index 10203cb2..c009708b 100644 --- a/h5pyd/_hl/serverinfo.py +++ b/h5pyd/serverinfo.py @@ -14,7 +14,7 @@ import time from .httpconn import HttpConn -from .. import config +from . 
import config def getServerInfo(endpoint=None, username=None, password=None, api_key=None, **kwds): diff --git a/test/hl/test_attribute.py b/test/hl/test_attribute.py index eb3d03a1..b88639cf 100644 --- a/test/hl/test_attribute.py +++ b/test/hl/test_attribute.py @@ -110,6 +110,7 @@ def test_create(self): refdt = h5py.special_dtype(ref=h5py.Reference) # create ref dtype g1.attrs.create('f1', g11_ref, dtype=refdt) # create attribute with ref to g1.1 ref = g1.attrs['f1'] # read back the attribute + print(f"ref: {ref} type: {type(ref)}") refobj = f[ref] # get the ref'd object self.assertTrue('name' in refobj.attrs) # should see the tag attribute diff --git a/test/hl/test_dataset_create.py b/test/hl/test_dataset_create.py index c68f5773..819b75ce 100644 --- a/test/hl/test_dataset_create.py +++ b/test/hl/test_dataset_create.py @@ -556,6 +556,11 @@ def validate_dset(dset): validate_dset(dset) dset_id = dset.id.id + if config.get("use_h5py"): + self.assertTrue(isinstance(dset_id, int)) + else: + self.assertTrue(isinstance(dset_id, str)) + if not config.get("use_h5py"): # Check dataset's last modified time self.assertTrue(isinstance(dset.modified, datetime)) @@ -572,20 +577,22 @@ def validate_dset(dset): self.assertEqual(num_links, 0) if not config.get("use_h5py"): # can get a reference to the dataset using the dataset id - uuid_ref = f"datasets/{dset_id}" - dset = f[uuid_ref] + + dset_ref = f.id.get(dset_id) + print("dset_ref:", dset_ref) + dset = f[dset_ref] validate_dset(dset) self.assertEqual(dset.id.id, dset_id) - # explictly delete dataset - del f[uuid_ref] + # explicitly delete dataset + del f[dset_ref] # should not be returned now try: - dset = f[uuid_ref] + dset = f[dset_ref] print(f"didn't expect to get: {dset}") - self.asertTrue(False) - except IOError: + self.assertTrue(False) + except KeyError: pass # expected f.close() diff --git a/test/hl/test_dataset_objref.py b/test/hl/test_dataset_objref.py index 37b9a74d..d79e9e00 100644 --- a/test/hl/test_dataset_objref.py +++ b/test/hl/test_dataset_objref.py @@ -51,6 +51,8 @@ def test_create(self): # get ref to g1/g1.1 from g2 g11ref = g2[g11_ref] + self.assertTrue(isinstance(g11ref, h5py.Group)) + # create subgroup /g1/g1.1/foo g11ref.create_group("foo") self.assertEqual(len(g11), 1) @@ -63,6 +65,7 @@ def test_create(self): d1_ref = d1.ref dt = h5py.special_dtype(ref=h5py.Reference) self.assertTrue(dt.metadata['ref'] is h5py.Reference) + ref = h5py.check_dtype(ref=dt) self.assertEqual(ref, h5py.Reference) @@ -75,16 +78,19 @@ def test_create(self): dset[1] = d1_ref a_ref = dset[0] - obj = f[a_ref] - if not config.get("use_h5py"): - self.assertEqual(obj.id.id, g11.id.id) # ref to g1.1 - self.assertEqual(obj.name, "/g1/g1.1") - b_ref = dset[1] - obj = f[b_ref] - if not config.get("use_h5py"): - self.assertEqual(obj.id.id, d1.id.id) # ref to d1 - self.assertEqual(obj.name, "/g2/d1") + a_obj = f[a_ref] + b_obj = f[b_ref] + if config.get("use_h5py"): + self.assertEqual(a_obj.name, "/g1/g1.1") + self.assertEqual(b_obj.name, "/g2/d1") + else: + # in h5pyd, paths aren't assigned when an object is + # fetched by reference + self.assertEqual(a_obj.name, None) + self.assertEqual(a_obj.id, g11.id) # ref to g1.1 + self.assertEqual(b_obj.name, None) + self.assertEqual(b_obj.id, d1.id) # ref to /g2/d1 # try the same thing using attributes ref_values = [g11_ref, d1_ref] @@ -101,14 +107,17 @@ def test_create(self): self.assertEqual(ref, h5py.Reference) a0_ref = attr[0] obj = f[a0_ref] - if not config.get("use_h5py"): + if config.get("use_h5py"): + 
self.assertEqual(obj.name, "/g1/g1.1") + else: self.assertEqual(obj.id.id, g11.id.id) # ref to g1.1 - self.assertEqual(obj.name, "/g1/g1.1") + a1_ref = attr[1] obj = f[a1_ref] - if not config.get("use_h5py"): + if config.get("use_h5py"): + self.assertEqual(obj.name, "/g2/d1") + else: self.assertEqual(obj.id.id, d1.id.id) # ref to d1 - self.assertEqual(obj.name, "/g2/d1") f.close() # try opening in read-mode @@ -122,15 +131,17 @@ def test_create(self): self.assertEqual(ref, h5py.Reference) a0_ref = attr[0] obj = f[a0_ref] - if not config.get("use_h5py"): + if config.get("use_h5py"): + self.assertEqual(obj.name, "/g1/g1.1") + else: self.assertEqual(obj.id.id, g11.id.id) # ref to g1.1 - self.assertEqual(obj.name, "/g1/g1.1") a1_ref = attr[1] obj = f[a1_ref] - if not config.get("use_h5py"): + if config.get("use_h5py"): + self.assertEqual(obj.name, "/g2/d1") + else: self.assertEqual(obj.id.id, d1.id.id) # ref to d1 - self.assertEqual(obj.name, "/g2/d1") def test_delete(self): filename = self.getFileName("objref_delete_test") diff --git a/test/hl/test_dimscale.py b/test/hl/test_dimscale.py index 02c20bb0..e280bc1e 100644 --- a/test/hl/test_dimscale.py +++ b/test/hl/test_dimscale.py @@ -28,7 +28,7 @@ class TestDimensionScale(TestCase): def test_everything(self): """Everything related to dimension scales""" filename = self.getFileName('test_dimscale') - f = h5py.File(filename, 'w') + f = h5py.File(filename, 'w') dset = f.create_dataset('temperatures', (10, 10, 10), dtype='f') f.create_dataset('scale_x', data=np.arange(10) * 10e3) From 1c48eaf1a9c46c895bb3843df30bc96996aeea68 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 24 Jan 2025 17:06:11 +0800 Subject: [PATCH 22/32] update for dimsacale support --- h5pyd/_apps/utillib.py | 2 +- h5pyd/_hl/dataset.py | 11 +++++- h5pyd/_hl/dims.py | 40 ++++++++++++------- h5pyd/_hl/files.py | 6 +-- h5pyd/_hl/group.py | 2 +- h5pyd/_hl/table.py | 30 +++++++++------ h5pyd/h5ds.py | 81 +++++++++++++++++++++++---------------- h5pyd/h5type.py | 1 - h5pyd/objdb.py | 21 +++++++++- h5pyd/objectid.py | 24 ++++++++++-- test/hl/test_attribute.py | 1 - test/hl/test_dimscale.py | 37 ++++++++++-------- test/hl/test_file.py | 1 - 13 files changed, 169 insertions(+), 88 deletions(-) diff --git a/h5pyd/_apps/utillib.py b/h5pyd/_apps/utillib.py index 576526f7..eb5a8086 100755 --- a/h5pyd/_apps/utillib.py +++ b/h5pyd/_apps/utillib.py @@ -162,7 +162,7 @@ def get_chunk_layout(dset): msg = "get_chunk_layout called on hdf5 dataset" logging.error(msg) raise IOError(msg) - dset_json = dset.id.dcpl_json + dset_json = dset.id.cpl if "layout" not in dset_json: msg = f"expect to find layout key in dset_json: {dset_json}" logging.error(msg) diff --git a/h5pyd/_hl/dataset.py b/h5pyd/_hl/dataset.py index 9b74e389..1ce1dee1 100644 --- a/h5pyd/_hl/dataset.py +++ b/h5pyd/_hl/dataset.py @@ -30,6 +30,7 @@ from . import selections as sel from .datatype import Datatype from ..h5type import getTypeItem, check_dtype, special_dtype, getItemSize +from .. import h5ds from .. import config _LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10)) @@ -1689,7 +1690,15 @@ def make_scale(self, name=""): You can optionally pass a name to associate with this scale. """ - self.dims.create_scale(self, name=name) + h5ds.set_scale(self._id, name) + + @property + def is_scale(self): + """Return ``True`` if this dataset is also a dimension scale. + + Return ``False`` otherwise. + """ + return h5ds.is_scale(self._id) """ Convert a list to a tuple, recursively. 
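(A rough usage sketch of the dimension-scale support added above in dataset.py and reworked in dims.py below. It is illustrative only: the domain path and dataset names are made up, and it assumes an HSDS endpoint is configured for h5pyd, e.g. via ~/.hscfg.)

    import numpy as np
    import h5pyd as h5py

    # open (or create) a domain on the configured HSDS server
    with h5py.File("/home/myuser/dimscale_example.h5", "w") as f:
        temps = f.create_dataset("temperatures", (10, 10), dtype="f4")
        x = f.create_dataset("scale_x", data=np.arange(10) * 10e3)

        # convert scale_x into a dimension scale, then attach it to dim 0
        x.make_scale("Simulation X (North) axis")
        assert x.is_scale
        temps.dims[0].attach_scale(x)
        temps.dims[0].label = "x"

        # scales can be looked up by index or by name on the DimensionProxy
        assert temps.dims[0][0].shape == (10,)
        assert "Simulation X (North) axis" in temps.dims[0]
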
diff --git a/h5pyd/_hl/dims.py b/h5pyd/_hl/dims.py index 3ba69cc4..fab22d57 100644 --- a/h5pyd/_hl/dims.py +++ b/h5pyd/_hl/dims.py @@ -13,6 +13,7 @@ from __future__ import absolute_import from . import base from .dataset import Dataset +from .. objectid import DatasetID from .. import h5ds @@ -29,9 +30,9 @@ def label(self, val): h5ds.set_label(self._id, self._dimension, val) def __init__(self, dset, dimension): - if not isinstance(dset, Dataset): + if not isinstance(dset, DatasetID): raise TypeError(f"expected Dataset, but got: {type(dset)}") - self._id = dset.id + self._id = dset self._dimension = int(dimension) def __hash__(self): @@ -52,26 +53,29 @@ def __getitem__(self, item): item can be an int in which case scale at that index will be returned or item can be a str in which casee the scale ith that name will be returned """ + scales = [] + h5ds.iterate(self._id, self._dimension, scales.append, 0) + if isinstance(item, int): - scales = [] - h5ds.iterate(self._id, self._dimension, scales.append, 0) + if item < 0 or item >= len(scales): + raise IndexError(f"{item} is out of range") return Dataset(scales[item]) - else: - def f(dsid): - """ Iterate over scales to find a matching name """ - if h5ds.get_scale_name(dsid) == self._e(item): - return dsid - res = h5ds.iterate(self._id, self._dimension, f, 0) - if res is None: - raise KeyError(item) - return Dataset(res) + else: + for dsid in scales: + if h5ds.get_scale_name(dsid) == item: + return Dataset(dsid) + raise KeyError(item) def attach_scale(self, dscale): ''' Attach a scale to this dimension. Provide the Dataset of the scale you would like to attach. ''' + if not isinstance(dscale, Dataset): + raise TypeError(f"attach_scale expected Dataset but got: {type(dscale)}") + if not h5ds.is_scale(dscale.id): + h5ds.set_scale(dscale.id) h5ds.attach_scale(self._id, dscale.id, self._dimension) def detach_scale(self, dscale): @@ -79,6 +83,9 @@ def detach_scale(self, dscale): Provide the Dataset of the scale you would like to remove. ''' + if not isinstance(dscale, Dataset): + raise TypeError(f"detach_scale expected Dataset but got: {type(dscale)}") + h5ds.detach_scale(self._id, dscale.id, self._dimension) def items(self): @@ -95,7 +102,7 @@ def items(self): scales = [] for scale_id in scale_ids: scale_name = h5ds.get_scale_name(scale_id) - scales.append(scale_name, Dataset(scale_id)) + scales.append((scale_name, Dataset(scale_id))) return scales def keys(self): @@ -123,6 +130,9 @@ class DimensionManager(base.MappingHDF5, base.CommonStateObject): def __init__(self, parent): ''' Private constructor. ''' + if not isinstance(parent, Dataset): + raise TypeError(f"attach_scale expected Dataset but got: {type(parent)}") + self._id = parent.id def __getitem__(self, index): @@ -151,5 +161,7 @@ def create_scale(self, dset, name=''): Provide the dataset and a name for the scale. 
''' + if not isinstance(dset, Dataset): + raise TypeError(f"create_scale expected Dataset but got: {type(dset)}") dset.make_scale(name) diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index 63863e61..64f5eccc 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -823,9 +823,9 @@ def close(self, flush=None): # this will close the socket of the http_conn singleton self.log.debug(f"close, mode: {self.mode}") - self.flush() - - self._id.close() + if self.id: + self.flush() + self._id.close() def __enter__(self): return self diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index 2848b761..ba3625e2 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -827,7 +827,7 @@ def visititems(self, func): # caller indicates to end iteration break visited[parent.id.uuid] = True - if parent.id.__class__ is GroupID: + if isinstance(parent.id, GroupID): # get group links for title in parent.id.get_link_titles(): link = parent.id.get_link(title) diff --git a/h5pyd/_hl/table.py b/h5pyd/_hl/table.py index d5e218d0..fec0fc71 100644 --- a/h5pyd/_hl/table.py +++ b/h5pyd/_hl/table.py @@ -201,18 +201,21 @@ def readtime_dtype(basetype, names): params["select"] = sel_param try: self.log.debug(f"params: {params}") - rsp = self.GET(req, params=params) - if isinstance(rsp, bytes): + rsp = self.id.http_conn.GET(req, params=params) + if rsp.status_code != 200: + raise IOError(rsp.status_code, "table read request failed") + if rsp.is_binary: # binary response - arr = bytesToArray(rsp, rsp_type, None) + arr = bytesToArray(rsp.text, rsp_type, None) count = len(arr) self.log.info(f"got {count} rows binary data") else: - values = rsp["value"] + rsp_json = rsp.json() + values = rsp_json["value"] count = len(values) - if "index" in rsp: + if "index" in rsp_json: # older server version that returns index as a seperate key - indices = rsp["index"] + indices = rsp_json["index"] if len(indices) != count: raise ValueError(f"expected {count} indicies, but got: {len(indices)}") else: @@ -298,15 +301,18 @@ def update_where(self, condition, value, start=None, stop=None, step=None, limit req = "/datasets/" + self.id.uuid + "/value" - rsp = self.PUT(req, body=value, format="json", params=params) + rsp = self.id.http_conn.PUT(req, body=value, format="json", params=params) + if rsp.status_code != 200: + raise IOError(rsp.status_code, "table update request failed") + rsp_json = rsp.json() indices = None arr = None - if "index" in rsp: - indices = rsp["index"] - elif "value" in rsp: + if "index" in rsp_json: + indices = rsp_json["index"] + elif "value" in rsp_json: # new-style return type - index is first element in each row indices = [] - for row in rsp["value"]: + for row in rsp_json["value"]: indices.append(row[0]) else: raise ValueError("unexpected response from PUT query") @@ -418,4 +424,4 @@ def append(self, rows): raise IOError(rsp.status_code, "table append failed") # if we get here, the request was successful, adjust the shape - # TBD... + self.id.shape_refresh() diff --git a/h5pyd/h5ds.py b/h5pyd/h5ds.py index bf456f92..a2419272 100644 --- a/h5pyd/h5ds.py +++ b/h5pyd/h5ds.py @@ -11,6 +11,10 @@ ############################################################################## from ._hl.objectid import DatasetID +DIMENSION_LIST = "DIMENSION_LIST" +REFERENCE_LIST = "REFERENCE_LIST" +DIMENSION_LABELS = "DIMENSION_LABELS" + def attach_scale(dset: DatasetID, dscale: DatasetID, idx: int): """ Attach Dimension Scale dscale to Dimension idx of Dataset dset. 
""" @@ -25,15 +29,15 @@ def attach_scale(dset: DatasetID, dscale: DatasetID, idx: int): raise TypeError("f{dscale} is not a dimension scale") if is_scale(dset): - raise TypeError("cannot attach a dimension scale to a dimension scale") + raise RuntimeError("cannot attach a dimension scale to a dimension scale") # Create a DIMENSION_LIST attribute if needed - orig_dimlist = dset.getAttrValue('DIMENSION_LIST') - if orig_dimlist: + if dset.has_attr(DIMENSION_LIST): # delete and replace later - dset.del_attr('DIMENSION_LIST') - + value = dset.get_attr_value(DIMENSION_LIST) + dset.del_attr(DIMENSION_LIST) + else: value = [list() for _ in range(rank)] dimlist = { @@ -57,10 +61,10 @@ def attach_scale(dset: DatasetID, dscale: DatasetID, idx: int): # Update the DIMENSION_LIST attribute with the object reference to the # dimension scale dimlist['value'][idx].append('datasets/' + dscale.uuid) - dset.set_attr('DIMENSION_list', dimlist) + dset.set_attr(DIMENSION_LIST, dimlist) - if dscale.has_attr('REFERENCE_LIST'): - old_reflist = dscale.get_attr('REFERENCE_LIST') + if dscale.has_attr(REFERENCE_LIST): + old_reflist = dscale.get_attr(REFERENCE_LIST) else: old_reflist = { 'creationProperties': { @@ -104,7 +108,7 @@ def attach_scale(dset: DatasetID, dscale: DatasetID, idx: int): new_reflist["shape"]["dims"] = [len(reflist_value), ] # Update the REFERENCE_LIST attribute of the dimension scale - dscale.set_attr('REFERENCE_LIST', new_reflist) + dscale.set_attr(REFERENCE_LIST, new_reflist) def detach_scale(dset: DatasetID, dscale: DatasetID, idx: int): @@ -116,10 +120,10 @@ def detach_scale(dset: DatasetID, dscale: DatasetID, idx: int): if idx >= rank: raise ValueError("invalid dimension") - if not dset.has_attr('DIMENSION_LIST'): + if not dset.has_attr(DIMENSION_LIST): raise IOError("no DIMENSION_LIST attr in {dset}") - dimlist = dset.get_attr('DIMENSION_LIST') - dset.del_attr('DIMENSION_LIST') + dimlist = dset.get_attr(DIMENSION_LIST) + dset.del_attr(DIMENSION_LIST) try: # TBD: use ref class @@ -128,12 +132,12 @@ def detach_scale(dset: DatasetID, dscale: DatasetID, idx: int): except Exception as e: # Restore the attribute's old value then raise the same # exception - dset.set_attr('DIMENSION_LIST', dimlist) + dset.set_attr(DIMENSION_LIST, dimlist) raise e - dset.set_attr('DIMENSION_LIST', dimlist) + dset.set_attr(DIMENSION_LIST, dimlist) - if dscale.has_attr('REFERENCE_LIST'): - old_reflist = dscale.get_attr('REFERENCE_LIST') + if dscale.has_attr(REFERENCE_LIST): + old_reflist = dscale.get_attr(REFERENCE_LIST) else: old_reflist = {} @@ -150,14 +154,14 @@ def detach_scale(dset: DatasetID, dscale: DatasetID, idx: int): if len(new_refs) > 0: new_reflist["value"] = new_refs new_reflist["shape"] = [len(new_refs), ] - if dscale.has_attr('REFERENCE_LIST'): - dscale.del_attr('REFERENCE_LIST') - dscale.set_attr('REFERENCE_LIST', new_reflist) + if dscale.has_attr(REFERENCE_LIST): + dscale.del_attr(REFERENCE_LIST) + dscale.set_attr(REFERENCE_LIST, new_reflist) else: # Remove REFERENCE_LIST attribute if this dimension scale is # not attached to any dataset if old_reflist: - dscale.del_attr('REFERENCE_LIST') + dscale.del_attr(REFERENCE_LIST) def get_label(dset: DatasetID, idx: int) -> str: @@ -169,7 +173,10 @@ def get_label(dset: DatasetID, idx: int) -> str: if idx >= rank: raise ValueError("invalid dimension") - label_values = dset.get_attr('DIMENSION_LABELS') + if not dset.has_attr(DIMENSION_LABELS): + return '' + + label_values = dset.get_attr_value(DIMENSION_LABELS) if not label_values: return '' @@ -189,8 +196,7 @@ 
def get_num_scales(dset: DatasetID, dim: int) -> int: raise ValueError("dimension must be non-negative") if dim >= rank: raise ValueError("invalid dimension") - - dimlist_values = dset.get_attr_value('DIMENSION_LIST') + dimlist_values = dset.get_attr_value(DIMENSION_LIST) if not dimlist_values: return 0 @@ -217,10 +223,12 @@ def is_attached(dset: DatasetID, dscale: DatasetID, idx: int) -> bool: if not is_scale(dscale) or is_scale(dset): return False - if not dset.has_attr("DIMENSION_LIST"): + if not dset.has_attr(DIMENSION_LIST): + return False + dimlist = dset.get_attr(DIMENSION_LIST) + if not dscale.has_attr(REFERENCE_LIST): return False - dimlist = dset.get_attr("DIMENSION_LIST") - reflist = dscale.get_attr("REFERENCE_LIST") + reflist = dscale.get_attr(REFERENCE_LIST) try: return ([f"datasets/{dset._uuid}", idx] in reflist["value"] and f"datasets/{dscale._uuid}" in dimlist["value"][idx]) @@ -246,6 +254,8 @@ def is_scale(dset: DatasetID) -> bool: # }, # 'value': 'DIMENSION_SCALE' # } + if not dset.has_attr("CLASS"): + return False class_json = dset.get_attr("CLASS") if class_json["value"] != "DIMENSION_SCALE": return False @@ -281,7 +291,7 @@ def set_label(dset: DatasetID, idx: int, label: str): if idx >= rank: raise ValueError("invalid dimension") - label_name = 'DIMENSION_LABELS' + label_name = DIMENSION_LABELS if dset.has_attr(label_name): labels = dset.get_attr(label_name) else: @@ -298,11 +308,11 @@ def set_label(dset: DatasetID, idx: int, label: str): }, 'value': ['' for n in range(rank)] } - labels['value'][idx] = label + labels['value'][idx] = label dset.set_attr(label_name, labels) -def set_scale(dset: DatasetID, dimname: str): +def set_scale(dset: DatasetID, dimname: str = ""): """ Convert dataset dset to a dimension scale, with optional name dimname. """ # CLASS attribute with the value 'DIMENSION_SCALE' @@ -361,13 +371,18 @@ def iterate(dset: DatasetID, dim: int, callable: any, startidx: int = 0) -> any: dimlist = dset.get_attr_value('DIMENSION_LIST') if not dimlist: return 0 + if len(dimlist) != rank: + raise ValueError(f"expected {rank} elements in dimlist, but got: {len(dimlist)}") + + scale_list = dimlist[dim] - if startidx >= len(dimlist): + if startidx >= len(scale_list): # dimension scale len request out of range return 0 idx = startidx - while idx < len(dimlist): - dscale_uuid = dimlist[idx] - callable(DatasetID(dscale_uuid)) + while idx < len(scale_list): + dscale_uuid = scale_list[idx] + dscale_id = dset.get(dscale_uuid) + callable(dscale_id) idx += 1 diff --git a/h5pyd/h5type.py b/h5pyd/h5type.py index bbf1cf3c..7cb0560d 100644 --- a/h5pyd/h5type.py +++ b/h5pyd/h5type.py @@ -99,7 +99,6 @@ def tolist(self): return [self.__repr__(),] - class RegionReference(): """ diff --git a/h5pyd/objdb.py b/h5pyd/objdb.py index d3c7ee0b..e4fcc7f7 100644 --- a/h5pyd/objdb.py +++ b/h5pyd/objdb.py @@ -96,7 +96,6 @@ def __getitem__(self, obj_uuid): def __delitem__(self, obj_uuid): if obj_uuid not in self._objdb: - print(f"{obj_uuid} not in objdb, fetching") obj_json = self.fetch(obj_uuid) if not obj_json: self.log.warning(f"id: {obj_uuid} not found for deletion in objDB") @@ -408,6 +407,24 @@ def del_attr(self, obj_uuid, name): # remove from the objdb del attrs[name] + def shape_refresh(self, dset_uuid): + """ Get the latest dataset shape """ + if dset_uuid not in self._objdb: + # just need to do a fetch... 
+ self.fetch(dset_uuid) + else: + obj_json = self._objdb[dset_uuid] + req = f"/datasets/{dset_uuid}/shape" + rsp = self.http_conn.GET(req) + if rsp.status_code != 200: + msg = "unable to get dataset shape" + raise IOError(rsp.status_code, msg) + rsp_json = rsp.json() + if "shape" not in rsp_json: + raise RuntimeError(f"Unexpected response for {req}") + shape_json = rsp_json['shape'] + obj_json['shape'] = shape_json + def resize(self, dset_uuid, dims): """ update the shape of the dataset """ # send the request to the server @@ -419,4 +436,4 @@ def resize(self, dset_uuid, dims): raise IOError(rsp.status_code, msg) # TBD Have HSDS return updated shape in response to avoid # this GET request - self.fetch(dset_uuid) + self.shape_refresh(dset_uuid) diff --git a/h5pyd/objectid.py b/h5pyd/objectid.py index d6319f15..f585214a 100644 --- a/h5pyd/objectid.py +++ b/h5pyd/objectid.py @@ -301,12 +301,12 @@ def __ne__(self, other): def refresh(self): """ get the latest obj_json data from server """ - self.objdb.fetch(self.uuid) + self.objdb.fetch(self._uuid) def close(self): """Remove handles to id. """ - self._old_uuid = self.uuid # for debugging + self._old_uuid = self._uuid # for debugging self._uuid = 0 self._http_conn = None @@ -428,6 +428,25 @@ def resize(self, dims): objdb = self.http_conn.objdb objdb.resize(self._uuid, dims) + def shape_refresh(self): + """ Get the current shape """ + if self.is_extensible(): + self.objdb.shape_refresh(self._uuid) + + def is_extensible(self): + """ Return True if the dataset is extensible """ + shape_json = self.shape_json + if "maxdims" not in shape_json: + return False + extensible = False + maxdims = shape_json['maxdims'] + for i in range(len(maxdims)): + # fi any dim is 0 or None, then extensible + extent = maxdims[i] + if extent == 0 or extent is None: + extensible = True + return extensible + class GroupID(ObjectID): @@ -514,7 +533,6 @@ def get_link_titles(self, track_order=None): item['title'] = title item['created'] = link_json['created'] link_list.append(item) - if track_order: link_list.sort(key=lambda d: d['created']) else: diff --git a/test/hl/test_attribute.py b/test/hl/test_attribute.py index b88639cf..eb3d03a1 100644 --- a/test/hl/test_attribute.py +++ b/test/hl/test_attribute.py @@ -110,7 +110,6 @@ def test_create(self): refdt = h5py.special_dtype(ref=h5py.Reference) # create ref dtype g1.attrs.create('f1', g11_ref, dtype=refdt) # create attribute with ref to g1.1 ref = g1.attrs['f1'] # read back the attribute - print(f"ref: {ref} type: {type(ref)}") refobj = f[ref] # get the ref'd object self.assertTrue('name' in refobj.attrs) # should see the tag attribute diff --git a/test/hl/test_dimscale.py b/test/hl/test_dimscale.py index e280bc1e..7ebd51da 100644 --- a/test/hl/test_dimscale.py +++ b/test/hl/test_dimscale.py @@ -18,7 +18,6 @@ if config.get("use_h5py"): import h5py - print("using h5py") else: import h5pyd as h5py @@ -44,15 +43,16 @@ def test_everything(self): self.assertIsInstance(d, h5py._hl.dims.DimensionProxy) # Create and name dimension scales - dset.dims.create_scale(f['scale_x'], 'Simulation X (North) axis') + f['scale_x'].make_scale('Simulation X (North) axis') + self.assertTrue(h5py.h5ds.is_scale(f['scale_x'].id)) - dset.dims.create_scale(f['scale_y'], 'Simulation Y (East) axis') + f['scale_y'].make_scale('Simulation Y (East) axis') self.assertTrue(h5py.h5ds.is_scale(f['scale_y'].id)) - dset.dims.create_scale(f['scale_z'], 'Simulation Z (Vertical) axis') + f['scale_z'].make_scale('Simulation Z (Vertical) axis') 
self.assertTrue(h5py.h5ds.is_scale(f['scale_z'].id)) # Try re-creating the last dimscale - dset.dims.create_scale(f['scale_z'], 'Simulation Z (Vertical) axis') + f['scale_z'].make_scale('Simulation Z (Vertical) axis') self.assertTrue(h5py.h5ds.is_scale(f['scale_z'].id)) # Attach a non-dimension scale (and in the process make it a dimension @@ -116,9 +116,16 @@ def test_everything(self): self.assertIsInstance(s[0], str) self.assertEqual(s[0], 'Simulation Z (Vertical) axis') + for s in dset.dims[0].items(): + self.assertIsInstance(s, tuple) + self.assertIsInstance(s[1], h5py.Dataset) + self.assertIsInstance(s[0], str) + title = 'Simulation X (North) axis' + self.assertTrue(title in dset.dims[0]) + self.assertIsInstance(dset.dims[0][0], h5py.Dataset) - self.assertIsInstance(dset.dims[0]['Simulation X (North) axis'], - h5py.Dataset) + + self.assertIsInstance(dset.dims[0]['Simulation X (North) axis'], h5py.Dataset) with self.assertRaises(IndexError): dset.dims[0][10] @@ -127,21 +134,17 @@ def test_everything(self): dset.dims[0]['foobar'] # Test dimension scale names - # TBD: why does this raise Unicode error for h5pyd? - if config.get("use_h5py"): - dset.dims.create_scale(f['scale_name'], '√') - else: - with self.assertRaises(UnicodeError): - dset.dims.create_scale(f['scale_name'], '√') + f['scale_name'].make_scale('√') with self.assertRaises((AttributeError, TypeError)): - dset.dims.create_scale(f['scale_name'], 67) + f['scale_name'].make_scale(67) f.close() # try opening file in read mode f = h5py.File(filename, 'r') dset = f['/temperatures'] + self.assertTrue(len(dset.dims), 3) labels = ('x', 'y', 'z') for i in range(3): @@ -152,7 +155,11 @@ def test_everything(self): else: self.assertEqual(len(dimscale), 1) scale = dimscale[0] - self.assertTrue(scale.name.endswith(labels[i])) + if config.get('use_h5py'): + self.assertTrue(scale.name.endswith(labels[i])) + else: + # in h5pyd, dimscales are anonymous + self.assertTrue(scale.name is None) self.assertEqual(scale.shape, (10,)) for s in dset.dims[2].items(): self.assertIsInstance(s, tuple) diff --git a/test/hl/test_file.py b/test/hl/test_file.py index 5a864c6c..c0790446 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -264,7 +264,6 @@ def test_auth(self): self.assertEqual(f.filename, filename) self.assertEqual(f.name, "/") self.assertTrue(f.id.id is not None) - print("f.keys:", list(f.keys())) self.assertEqual(len(list(f.keys())), 2) if h5py.__name__ == "h5py": From ea5a3cbd10f569f30ee5be4d941ea789427c7aaa Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 24 Jan 2025 17:12:43 +0800 Subject: [PATCH 23/32] fix for objectid import --- h5pyd/h5ds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h5pyd/h5ds.py b/h5pyd/h5ds.py index a2419272..97527487 100644 --- a/h5pyd/h5ds.py +++ b/h5pyd/h5ds.py @@ -9,7 +9,7 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. 
# ############################################################################## -from ._hl.objectid import DatasetID +from .objectid import DatasetID DIMENSION_LIST = "DIMENSION_LIST" REFERENCE_LIST = "REFERENCE_LIST" From 2a06a0d0e8797ee1a7c2dba701fda8d5afd9505d Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 24 Jan 2025 22:10:33 +0800 Subject: [PATCH 24/32] added config get method --- h5pyd/config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/h5pyd/config.py b/h5pyd/config.py index 57b66762..3f63dbf9 100755 --- a/h5pyd/config.py +++ b/h5pyd/config.py @@ -89,6 +89,14 @@ def __getitem__(self, name): return None return Config._cfg[name] + def get(self, name, default): + """ return config value for name or default if None """ + val = self.__getitem__(name) + if val is None: + return default + else: + return default + def __setitem__(self, name, obj): """ set config item """ Config._cfg[name] = obj From 6c9fe2410e264eb7bf823054cf21c8a1efc21f2a Mon Sep 17 00:00:00 2001 From: John Readey Date: Sat, 25 Jan 2025 17:05:05 +0800 Subject: [PATCH 25/32] no PUT on close by default --- h5pyd/_hl/files.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index 64f5eccc..440530d4 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -791,20 +791,24 @@ def run_scan(self): return - def flush(self): + def flush(self, checkpoint=False): """Tells the service to complete any pending updates to permanent storage""" if self.mode == 'r': # read-only, no need to flush return self.log.debug("flush") + # TBD: send any pending write requests + if not checkpoint: + return + self.log.info("sending PUT flush request") req = "/" body = {"flush": 1, "getdnids": 1} rsp = self.id.http_conn.PUT(req, body=body) self.log.debug(f"got status code: {rsp.status_code} from flush") if rsp.status_code != 200: - raise RuntimeError(f"got status code: {rsp.status_code} on flush") + raise RuntimeError(f"got status code: {rsp.status_code} on flush") rsp_json = rsp.json() if "dn_ids" in rsp_json: dn_ids = rsp_json["dn_ids"] @@ -818,7 +822,7 @@ def flush(self): raise IOError(500, "Unexpected Error") self.log.info("PUT flush complete") - def close(self, flush=None): + def close(self): """Clears reference to remote resource.""" # this will close the socket of the http_conn singleton From 1981d947b0b19a53a927ac9c52c29d233443922c Mon Sep 17 00:00:00 2001 From: John Readey Date: Sun, 26 Jan 2025 14:12:21 +0800 Subject: [PATCH 26/32] added ACLs class --- h5pyd/_apps/hsacl.py | 127 ++++++++----------- h5pyd/_hl/acls.py | 281 +++++++++++++++++++++++++++++++++++++++++ h5pyd/_hl/base.py | 45 ------- h5pyd/_hl/files.py | 84 +++++------- h5pyd/_hl/folders.py | 44 +------ h5pyd/config.py | 2 +- test/hl/test_file.py | 91 +++++++------ test/hl/test_folder.py | 27 ++-- 8 files changed, 428 insertions(+), 273 deletions(-) create mode 100644 h5pyd/_hl/acls.py diff --git a/h5pyd/_apps/hsacl.py b/h5pyd/_apps/hsacl.py index 0e23222a..e5b2850f 100755 --- a/h5pyd/_apps/hsacl.py +++ b/h5pyd/_apps/hsacl.py @@ -38,30 +38,6 @@ def abort(msg): sys.exit(-1) -# -# get given ACL, return None if not found -# -def getACL(f, username="default"): - try: - acl = f.getACL(username) - except IOError as ioe: - if ioe.errno == 403: - print("No permission to read ACL for this domain") - sys.exit(1) - elif ioe.errno == 401: - print("username/password needs to be provided") - sys.exit(1) - elif ioe.errno == 404 or not ioe.errno: - return None - else: - eprint(f"unexpected error: {ioe}") - 
sys.exit(1) - if acl and "domain" in acl: - # remove the domain key - del acl["domain"] - return acl - - # # Usage # @@ -195,16 +171,22 @@ def main(): else: abort(f"Unexpected error: {ioe}") + # + # get the acls + # + try: + acls = f.acls + except IOError as ioe: + if ioe.errno == 403: + username = cfg["hs_username"] + abort(f"User: {username} does not have permission to read ACL for this domain") + elif ioe.errno == 401: + abort("username/password needs to be provided") + else: + abort(f"Unexpected error: {ioe}") + # update/add ACL if permission flags have been set if perm: - default_acl = {'updateACL': False, - 'delete': False, - 'create': False, - 'read': False, - 'update': False, - 'readACL': False, - 'userName': 'default' - } # note: list.copy not supported in py2.7, copy by hand for now # update_names = usernames.copy() update_names = [] @@ -216,65 +198,58 @@ def main(): for username in update_names: # get user's ACL if it exist - acl = getACL(f, username=username) - if acl is None: - acl = default_acl.copy() - acl["userName"] = username - logging.info(f"updating acl to: {acl}") + if username not in acls: + acl = acls.readonly_acl() + acl.read = False + else: + acl = acls[username] + logging.info(f"updating acl for user: {username}") # mix in any permission changes for k in perm: - acl[k] = perm[k] + if k == "create": + acl.create = perm[k] + elif k == "read": + acl.read = perm[k] + elif k == "update": + acl.update = perm[k] + elif k == "delete": + acl.delete = perm[k] + elif k == "readACL": + acl.readACL = perm[k] + elif k == "updateACL": + acl.updateACL = perm[k] + else: + raise IOError(f"Unexpected permission: {k}") try: - f.putACL(acl) + logging.info(f"setting {username} acl to: {acl}") + acls[username] = acl except IOError as ioe: if ioe.errno in (401, 403): abort("access is not authorized") else: - abort("Unexpected error:", ioe) - # - # read the acls - # + abort(f"Unexpected error: {ioe}") + if len(usernames) == 0: # no usernames, dump all ACLs - try: - acls = f.getACLs() - except IOError as ioe: - if ioe.errno == 403: - username = cfg["hs_username"] - abort(f"User: {username} does not have permission to read ACL for this domain") - elif ioe.errno == 401: - abort("username/password needs to be provided") - else: - abort(f"Unexpected error: {ioe}") print("%015s %08s %08s %08s %08s %08s %08s " % fields) print("-" * 80) - for acl in acls: - vals = (acl["userName"], acl["create"], acl["read"], - acl["update"], acl["delete"], acl["readACL"], acl["updateACL"]) + for name in acls: + acl = acls[name] + vals = (name, acl.create, acl.read, + acl.update, acl.delete, acl.readACL, acl.updateACL) print("%015s %08s %08s %08s %08s %08s %08s " % vals) else: header_printed = False # don't print header until we have at least one ACL for username in usernames: - try: - acl = f.getACL(username) - if not header_printed: - print("%015s %08s %08s %08s %08s %08s %08s " % fields) - print("-" * 80) - header_printed = True - vals = (acl["userName"], acl["create"], acl["read"], - acl["update"], acl["delete"], acl["readACL"], acl["updateACL"]) - print("%015s %08s %08s %08s %08s %08s %08s " % vals) - except IOError as ioe: - if ioe.errno == 403: - this_user = cfg["hs_username"] - abort(f"User {this_user} does not have permission to read ACL for this domain") - elif ioe.errno == 401: - abort("username/password needs to be provided") - elif ioe.errno == 404: - abort(f"{username} not found") - else: - abort(f"Unexpected error: {ioe}") - + if username not in acls: + abort(f"{username} not found") + if not 
header_printed: + print("%015s %08s %08s %08s %08s %08s %08s " % fields) + print("-" * 80) + header_printed = True + vals = (username, acl.create, acl.read, acl.update, acl.delete, + acl.readACL, acl.updateACL) + print("%015s %08s %08s %08s %08s %08s %08s " % vals) f.close() diff --git a/h5pyd/_hl/acls.py b/h5pyd/_hl/acls.py new file mode 100644 index 00000000..b0321ced --- /dev/null +++ b/h5pyd/_hl/acls.py @@ -0,0 +1,281 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +""" + Implements high-level operations for ACLs. + + Provides the ACLManager class, available on file and folder objects + as .acls. +""" + +from __future__ import absolute_import + +ACL_KEYS = ("create", "read", "update", "delete", "readACL", "updateACL") + + +class ACL(): + def __init__(self, perm): + self._acl = perm + + @property + def create(self): + return True if self._acl['create'] else False + + @create.setter + def create(self, v): + self._acl['create'] = bool(v) + + @property + def read(self): + return True if self._acl['read'] else False + + @read.setter + def read(self, v): + self._acl['read'] = bool(v) + + @property + def update(self): + return True if self._acl['update'] else False + + @update.setter + def update(self, v): + self._acl['update'] = bool(v) + + @property + def delete(self): + return True if self._acl['delete'] else False + + @delete.setter + def delete(self, v): + self._acl['delete'] = bool(v) + + @property + def readACL(self): + return True if self._acl['readACL'] else False + + @readACL.setter + def readACL(self, v): + self._acl['readACL'] = bool(v) + + @property + def updateACL(self): + return True if self._acl['updateACL'] else False + + @updateACL.setter + def updateACL(self, v): + self._acl['updateACL'] = bool(v) + + def copy(self): + acl_copy = ACL(self._acl.copy()) + return acl_copy + + def __repr__(self): + # repr for full priv ACL: + # repr for read-only ACL: + perms = [] + perms.append('c') if self.create else perms.append('-') + perms.append('r') if self.read else perms.append('-') + perms.append('u') if self.update else perms.append('-') + perms.append('d') if self.delete else perms.append('-') + perms.append('e') if self.readACL else perms.append('-') + perms.append('p') if self.updateACL else perms.append('-') + return f"" + + +class ACLManager(): + + """ + Allows dictionary-style access to an Domain or Folder's ACLs. + + These are created exclusively by the library and are available as + a Python attribute at .attrs + + Like Group objects, attributes provide a minimal dictionary- + style interface. Anything which can be reasonably converted to a + Numpy array or Numpy scalar can be stored. + + Attributes are automatically created on assignment with the + syntax .attrs[name] = value, with the HDF5 type automatically + deduced from the value. Existing attributes are overwritten. + + To modify an existing attribute while preserving its type, use the + method modify(). 
To specify an attribute of a particular type and + shape, use create(). + """ + + def __init__(self, parent): + """ Private constructor. + """ + if hasattr(parent, "id"): + self._parent_type = "Domain" + self._http_conn = parent.id.http_conn + else: + # assume Folder + self._parent_type = "Folder" + self._http_conn = parent._http_conn + self._acls = None + + def refresh(self): + """ Fetch the current set of ACLs from the server """ + req = "/acls" + rsp = self._http_conn.GET(req) + if rsp.status_code != 200: + raise IOError(rsp.status_code, "Unable to get ACLs") + rsp_json = rsp.json() + acls = rsp_json["acls"] + # convert to a dict + self._acls = {} + + for acl_json in acls: + user_name = acl_json['userName'] + acl = {} + for k in ACL_KEYS: + if k in acl_json: + acl_bool = bool(acl_json[k]) + acl[k] = acl_bool + else: + acl[k] = False + self._acls[user_name] = acl + + def readonly_acl(self): + """ return a ACL with just read set to true """ + acl = { + 'updateACL': False, + 'delete': False, + 'create': False, + 'read': True, + 'update': False, + 'readACL': False, + } + return ACL(acl) + + def fullperm_acl(): + """ return a ACL with all flags set to True """ + acl = { + 'updateACL': True, + 'delete': True, + 'create': True, + 'read': True, + 'update': True, + 'readACL': True, + } + return ACL(acl) + + def __getitem__(self, name): + """ Read the value of an attribute. + """ + if isinstance(name, bytes): + name = name.decode("utf-8") + + if self._acls is None: + self.refresh() + + if name not in self._acls: + raise IOError(404, "Not Found") + + return ACL(self._acls[name]) + + def __setitem__(self, name, acl): + """ Set a new attribute, overwriting any existing attribute. + + The type and shape of the attribute are determined from the data. To + use a specific type or shape, or to preserve the type of an attribute, + use the methods create() and modify(). + """ + + if not isinstance(acl, ACL): + raise TypeError("expected ACL instance") + + if isinstance(name, bytes): + name = name.decode("utf-8") + + if not name or len(name.split()) != 1: + raise ValueError("name not valid") + + if self._acls is None: + self.refresh() + + req = "/acls/" + name + body = acl._acl + rsp = self._http_conn.PUT(req, body=body) + if rsp.status_code not in (200, 201): + raise IOError(rsp.status_code, "PUT ACL failed") + self.refresh() + + def create_ACL(self, c=False, r=False, u=False, d=False, e=False, p=False): + perm = {"create": c, "read": r, "update": u, "delete": d, "readACL": e, "updateACL": p} + acl = ACL(perm) + self.refresh() + return acl + + def __delitem__(self, name): + """ Delete an attribute (which must already exist). """ + if isinstance(name, bytes): + name = name.decode("utf-8") + + if not name or len(name.split()) != 1: + raise ValueError("name not valid") + + if self._acls is None: + self.refresh() + + if name not in self._acls: + raise IOError(404, "Not Found") + + req = "/acls/" + name + + # TBD: this action is not yet supported in HSDS, so expect an error... + rsp = self._http_conn.DELETE(req) + if rsp.status_code != 200: + raise IOError(rsp.status_code, "DELETE ACL failed") + self.refresh() + + def __len__(self): + """ Number of attributes attached to the object. """ + if self._acls is None: + self.refresh() + return len(self._acls) + + def __contains__(self, name): + """ Determine if an attribute exists, by name. 
""" + if isinstance(name, bytes): + name = name.decode("utf-8") + + if not name or len(name.split()) != 1: + raise ValueError("name not valid") + + if self._acls is None: + self.refresh() + + return True if name in self._acls else False + + def __repr__(self): + return f"" + + def _get_acl_names(self): + if self._acls is None: + self.refresh() + return self._acls.keys() + + def __iter__(self): + """ Iterate over the names of the ACLs. """ + # convert to a list of dicts + names = self._get_acl_names() + for name in names: + yield name + + def __reversed__(self): + """ Iterate over the names of attributes in reverse order. """ + names = self._get_acl_names() + for name in reversed(names): + yield name + # done diff --git a/h5pyd/_hl/base.py b/h5pyd/_hl/base.py index 315e3540..dc6526d8 100644 --- a/h5pyd/_hl/base.py +++ b/h5pyd/_hl/base.py @@ -718,51 +718,6 @@ def selection(self, ref): pass -class ACL(object): - - @property - def username(self): - return self._username - - @property - def create(self): - return self._create - - @property - def delete(self): - return self._delete - - @property - def read(self): - return self._read - - @property - def update(self): - return self._update - - @property - def readACL(self): - return self._readACL - - @property - def updateACL(self): - return self._updateACL - - """ - Proxy object which handles ACLs (access control list) - - """ - - def __init__(self): - self._username = None - self._create = True - self._delete = True - self._read = True - self._update = True - self._readACL = True - self._updateACL = True - - class HLObject(CommonStateObject): @property diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index 440530d4..23f80231 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -190,6 +190,12 @@ def attrs(self): return attrs.AttributeManager(self.id) + @property + def acls(self): + """ ACLs attached to this object """ + from . 
import acls + return acls.ACLManager(self) + @property def filename(self): """File name on disk""" @@ -511,23 +517,6 @@ def __init__( root_uuid = root_json["root"] - if mode == "a": - # for append, verify we have 'update' permission on the domain - # try first with getting the acl for the current user, then as default - for name in (username, "default"): - if not username: - continue - req = "/acls/" + name - rsp = http_conn.GET(req) - if rsp.status_code == 200: - rsp_json = rsp.json() - domain_acl = rsp_json["acl"] - if not domain_acl["update"]: - http_conn.close() - raise IOError(403, "Forbidden") - else: - break # don't check with "default" user in this case - if mode in ("w", "w-", "x", "a"): http_conn._mode = "r+" @@ -578,6 +567,30 @@ def __init__( super().__init__(self._id, track_order=track_order) + if mode == 'a': + # check that we have update permissions + try: + acls = self.acls + except IOError as ioe: + if ioe.errno == 403: + # no permission to read acl, trust update is ok + acls = None + else: + # some other error + raise + if acls: + for name in (username, "default"): + if not username: + continue + if username not in acls: + continue + acl = acls[username] + if acl.update: + break # ok to update + else: + self.close() + raise IOError(403, "Forbidden") + def _getVerboseInfo(self): now = time.time() if (self._verboseUpdated is None or now - self._verboseUpdated > VERBOSE_REFRESH_TIME): @@ -729,39 +742,6 @@ def compressors(self): compressors = [] return compressors - # override base implementation of ACL methods to use the domain rather than update root group - def getACL(self, username): - req = "/acls/" + username - rsp = self.id.http_conn.GET(req) - if rsp.status_code != 200: - raise IOError(rsp.status_code, "Unable to get ACL") - rsp_json = rsp.json() - acl_json = rsp_json["acl"] - return acl_json - - def getACLs(self): - req = "/acls" - rsp = self.id.http_conn.GET(req) - if rsp.status_code != 200: - raise IOError(rsp.status_code, "Unable to get ACL") - rsp_json = rsp.json() - acls_json = rsp_json["acls"] - return acls_json - - def putACL(self, acl): - if "userName" not in acl: - raise IOError(404, "ACL has no 'userName' key") - perm = {} - for k in ("create", "read", "update", "delete", "readACL", "updateACL"): - if k not in acl: - raise IOError(404, "Missing ACL field: {}".format(k)) - perm[k] = acl[k] - - req = "/acls/" + acl["userName"] - rsp = self.id.http_conn.PUT(req, body=perm) - if rsp.status_code not in (200, 201): - raise IOError(rsp.status_code, "Failed to create ACL") - def run_scan(self): MAX_WAIT = 10 self._getVerboseInfo() @@ -801,14 +781,14 @@ def flush(self, checkpoint=False): # TBD: send any pending write requests if not checkpoint: return - + self.log.info("sending PUT flush request") req = "/" body = {"flush": 1, "getdnids": 1} rsp = self.id.http_conn.PUT(req, body=body) self.log.debug(f"got status code: {rsp.status_code} from flush") if rsp.status_code != 200: - raise RuntimeError(f"got status code: {rsp.status_code} on flush") + raise RuntimeError(f"got status code: {rsp.status_code} on flush") rsp_json = rsp.json() if "dn_ids" in rsp_json: dn_ids = rsp_json["dn_ids"] diff --git a/h5pyd/_hl/folders.py b/h5pyd/_hl/folders.py index eaacf149..8fb79c67 100644 --- a/h5pyd/_hl/folders.py +++ b/h5pyd/_hl/folders.py @@ -250,45 +250,11 @@ def __init__( else: self._owner = None - def getACL(self, username): - if self._http_conn is None: - raise IOError(400, "folder is not open") - req = "/acls/" + username - rsp = self._http_conn.GET(req) - if rsp.status_code 
!= 200: - raise IOError(rsp.reason) - rsp_json = rsp.json() - acl_json = rsp_json["acl"] - return acl_json - - def getACLs(self): - if self._http_conn is None: - raise IOError(400, "folder is not open") - req = "/acls" - rsp = self._http_conn.GET(req) - if rsp.status_code != 200: - raise IOError(rsp.status_code, rsp.reason) - rsp_json = rsp.json() - acls_json = rsp_json["acls"] - return acls_json - - def putACL(self, acl): - if self._http_conn is None: - raise IOError(400, "folder is not open") - if self._http_conn.mode == "r": - raise IOError(400, "folder is open as read-onnly") - if "userName" not in acl: - raise IOError(404, "ACL has no 'userName' key") - perm = {} - for k in ("create", "read", "update", "delete", "readACL", "updateACL"): - if k not in acl: - raise IOError(404, "Missing ACL field: {}".format(k)) - perm[k] = acl[k] - - req = "/acls/" + acl["userName"] - rsp = self._http_conn.PUT(req, body=perm) - if rsp.status_code != 201: - raise IOError(rsp.status_code, rsp.reason) + @property + def acls(self): + """ ACLs attached to this object """ + from . import acls + return acls.ACLManager(self) def _getSubdomains(self): if self._http_conn is None: diff --git a/h5pyd/config.py b/h5pyd/config.py index 3f63dbf9..b7602ffd 100755 --- a/h5pyd/config.py +++ b/h5pyd/config.py @@ -96,7 +96,7 @@ def get(self, name, default): return default else: return default - + def __setitem__(self, name, obj): """ set config item """ Config._cfg[name] = obj diff --git a/test/hl/test_file.py b/test/hl/test_file.py index c0790446..c1010a7d 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -19,7 +19,6 @@ import h5pyd as h5py from common import ut, TestCase -from copy import copy import time import logging @@ -266,86 +265,96 @@ def test_auth(self): self.assertTrue(f.id.id is not None) self.assertEqual(len(list(f.keys())), 2) - if h5py.__name__ == "h5py": - return # no ACLs in h5py - # no explicit ACLs yet - file_acls = f.getACLs() - self.assertTrue(len(file_acls) >= 1) # Should have at least the test_user1 acl + self.assertTrue(len(f.acls) >= 1) # Should have at least the test_user1 acl username = f.owner - file_acl = f.getACL(username) + file_acl = f.acls[username] + self.assertEqual(str(file_acl), "") # default owner ACL should grant full permissions - acl_keys = ("create", "read", "update", "delete", "readACL", "updateACL") - # self.assertEqual(file_acl["userName"], "default") - for k in acl_keys: - self.assertEqual(file_acl[k], True) + self.assertTrue(file_acl.create) + self.assertTrue(file_acl.read) + self.assertTrue(file_acl.update) + self.assertTrue(file_acl.delete) + self.assertTrue(file_acl.readACL) + self.assertTrue(file_acl.updateACL) - try: - default_acl = f.getACL("default") - except IOError as ioe: - if ioe.errno == 404: - pass # expected + self.assertFalse("default" in f.acls) # create public-read ACL - default_acl = {} - for key in acl_keys: - if key == "read": - default_acl[key] = True - else: - default_acl[key] = False - default_acl["userName"] = "default" - f.putACL(default_acl) + default_acl = file_acl.copy() + default_acl.create = False + default_acl.update = False + default_acl.delete = False + default_acl.readACL = False + default_acl.updateACL = False + + self.assertEqual(str(default_acl), "") + + f.acls["default"] = default_acl + f.close() + user2_name = self.test_user2['name'] + user2_password = self.test_user2['password'] - # ooen with test_user2 should succeed for read mode + # open with test_user2 should succeed for read mode try: - f = h5py.File(filename, 'r', 
username=self.test_user2["name"], password=self.test_user2["password"]) - f.close() + f = h5py.File(filename, 'r', username=user2_name, password=user2_password) except IOError: self.assertTrue(False) + # user2 does not have read ACL permission + try: + len(f.acls) + self.assertTrue(False) + except IOError as ioe: + if ioe.errno == 403: + pass # expected + else: + self.assertTrue(False) + + f.close() + # test_user2 has read access, but opening in write mode should fail try: - f = h5py.File(filename, 'w', username=self.test_user2["name"], password=self.test_user2["password"]) + f = h5py.File(filename, 'w', username=user2_name, password=user2_password) self.assertFalse(True) # expect exception for hsds except IOError as ioe: self.assertEqual(ioe.errno, 403) # user is not authorized # append mode w/ test_user2 try: - f = h5py.File(filename, 'a', username=self.test_user2["name"], password=self.test_user2["password"]) + f = h5py.File(filename, 'a', username=user2_name, password=user2_password) self.assertFalse(True) # expected exception except IOError as ioe: self.assertEqual(ioe.errno, 403) # Forbidden - f = h5py.File(filename, 'a') # open for append with original username - # add an acl for test_user2 that has only read/update access - user2_acl = copy(default_acl) - user2_acl["userName"] = self.test_user2["name"] - user2_acl["read"] = True # allow read access - user2_acl["update"] = True - user2_acl["readACL"] = True - f.putACL(user2_acl) + # updating an acl as user2 should not be allowed + user2_acl = default_acl.copy() + user2_acl.update = True # flip update and readACL to True + user2_acl.readACL = True + f = h5py.File(filename, 'a') # open for append with original username + f.acls[user2_name] = user2_acl # add an acl for test_user2 f.close() - # ooen with test_user2 should succeed for read mode + # open with test_user2 should succeed for append mode try: - f = h5py.File(filename, 'r', username=self.test_user2["name"], password=self.test_user2["password"]) + f = h5py.File(filename, 'a', username=user2_name, password=user2_password) except IOError: self.assertTrue(False) + f.close() # test_user2 opening in write mode should still fail try: - f = h5py.File(filename, 'w', username=self.test_user2["name"], password=self.test_user2["password"]) + f = h5py.File(filename, 'w', username=user2_name, password=user2_password) self.assertFalse(True) # expected exception except IOError as ioe: self.assertEqual(ioe.errno, 403) # user is not authorized # append mode w/ test_user2 try: - f = h5py.File(filename, 'a', username=self.test_user2["name"], password=self.test_user2["password"]) + f = h5py.File(filename, 'a', username=user2_name, password=user2_password) except IOError: self.assertTrue(False) # shouldn't get here diff --git a/test/hl/test_folder.py b/test/hl/test_folder.py index f9e7c3b3..346fdee4 100644 --- a/test/hl/test_folder.py +++ b/test/hl/test_folder.py @@ -55,13 +55,14 @@ def test_list(self): self.assertEqual(dir_parent[:-1], op.dirname(folder_name[:-1])) # get ACL for dir - dir_acl = d.getACL(self.test_user1["name"]) - self.assertEqual(len(dir_acl.keys()), 7) - for k in dir_acl.keys(): - self.assertTrue(dir_acl[k]) - - dir_acls = d.getACLs() - self.assertTrue(isinstance(dir_acls, list)) + user1_name = self.test_user1['name'] + dir_acl = d.acls[user1_name] + self.assertTrue(dir_acl.create) + self.assertTrue(dir_acl.read) + self.assertTrue(dir_acl.update) + self.assertTrue(dir_acl.delete) + self.assertTrue(dir_acl.readACL) + self.assertTrue(dir_acl.updateACL) count = len(d) 
self.assertTrue(count > 1) @@ -107,18 +108,6 @@ def test_list(self): dir_parent = d.parent self.assertEqual(dir_parent[:-1], op.dirname(folder_name[:-1])) - # get ACL for dir - dir_acl = d.getACL(self.test_user1["name"]) - self.assertEqual(len(dir_acl.keys()), 7) - for k in dir_acl.keys(): - self.assertTrue(dir_acl[k]) - - dir_acls = d.getACLs() - self.assertTrue(isinstance(dir_acls, list)) - - count = len(d) - self.assertTrue(count > 1) - test_domain_found = False i = 0 From 4549ab19db0520e7b06a6a0c4e0540764b7a7d2b Mon Sep 17 00:00:00 2001 From: John Readey Date: Sun, 26 Jan 2025 14:57:02 +0800 Subject: [PATCH 27/32] update comments for ACL class --- h5pyd/_hl/acls.py | 70 +++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/h5pyd/_hl/acls.py b/h5pyd/_hl/acls.py index b0321ced..ebccf752 100644 --- a/h5pyd/_hl/acls.py +++ b/h5pyd/_hl/acls.py @@ -96,20 +96,16 @@ class ACLManager(): """ Allows dictionary-style access to an Domain or Folder's ACLs. - These are created exclusively by the library and are available as - a Python attribute at .attrs + Like Group objects, acls provide a minimal dictionary- + style interface, keyed by username. Each ACL consists of a set + of permission flags for 'create', 'read', 'update', 'delete', + 'readACL', and 'updateACL'. - Like Group objects, attributes provide a minimal dictionary- - style interface. Anything which can be reasonably converted to a - Numpy array or Numpy scalar can be stored. + To modify an existing ACL, fetch it, set the desired permission + flags, and then set the ACL. - Attributes are automatically created on assignment with the - syntax .attrs[name] = value, with the HDF5 type automatically - deduced from the value. Existing attributes are overwritten. - - To modify an existing attribute while preserving its type, use the - method modify(). To specify an attribute of a particular type and - shape, use create(). + To create a new ACL, get an ACL instance with the acls.create_acl method, + modify it as desired, then set the ACL using the desired username. """ def __init__(self, parent): @@ -146,32 +142,14 @@ def refresh(self): acl[k] = False self._acls[user_name] = acl - def readonly_acl(self): - """ return a ACL with just read set to true """ - acl = { - 'updateACL': False, - 'delete': False, - 'create': False, - 'read': True, - 'update': False, - 'readACL': False, - } - return ACL(acl) - - def fullperm_acl(): - """ return a ACL with all flags set to True """ - acl = { - 'updateACL': True, - 'delete': True, - 'create': True, - 'read': True, - 'update': True, - 'readACL': True, - } - return ACL(acl) + def create_acl(self, c=False, r=False, u=False, d=False, e=False, p=False): + """ return an ACL with the given flag settings""" + perm = {"create": c, "read": r, "update": u, "delete": d, "readACL": e, "updateACL": p} + acl = ACL(perm) + return acl def __getitem__(self, name): - """ Read the value of an attribute. + """ Get the ACL for the given username. """ if isinstance(name, bytes): name = name.decode("utf-8") @@ -185,11 +163,7 @@ def __getitem__(self, name): return ACL(self._acls[name]) def __setitem__(self, name, acl): - """ Set a new attribute, overwriting any existing attribute. - - The type and shape of the attribute are determined from the data. To - use a specific type or shape, or to preserve the type of an attribute, - use the methods create() and modify(). + """ Set an ACL, overwriting any existing ACL.
""" if not isinstance(acl, ACL): @@ -211,14 +185,8 @@ def __setitem__(self, name, acl): raise IOError(rsp.status_code, "PUT ACL failed") self.refresh() - def create_ACL(self, c=False, r=False, u=False, d=False, e=False, p=False): - perm = {"create": c, "read": r, "update": u, "delete": d, "readACL": e, "updateACL": p} - acl = ACL(perm) - self.refresh() - return acl - def __delitem__(self, name): - """ Delete an attribute (which must already exist). """ + """ Delete an ACL (which must already exist). """ if isinstance(name, bytes): name = name.decode("utf-8") @@ -240,13 +208,13 @@ def __delitem__(self, name): self.refresh() def __len__(self): - """ Number of attributes attached to the object. """ + """ Number of ACLs attached to the domain or folder. """ if self._acls is None: self.refresh() return len(self._acls) def __contains__(self, name): - """ Determine if an attribute exists, by name. """ + """ Determine if an ACL exists, by name. """ if isinstance(name, bytes): name = name.decode("utf-8") @@ -274,7 +242,7 @@ def __iter__(self): yield name def __reversed__(self): - """ Iterate over the names of attributes in reverse order. """ + """ Iterate over the names of ACLs in reverse order. """ names = self._get_acl_names() for name in reversed(names): yield name From b42b559ceb17974d76d9a8d08aa0aa8e3f75480a Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 27 Jan 2025 13:02:38 +0800 Subject: [PATCH 28/32] added limits to objdb --- h5pyd/_hl/base.py | 2 +- h5pyd/_hl/files.py | 34 ++++++++---- h5pyd/_hl/group.py | 23 +++------ h5pyd/httpconn.py | 7 +-- h5pyd/objdb.py | 125 ++++++++++++++++++++++++++++----------------- h5pyd/objectid.py | 81 +++++++++++++++++------------ 6 files changed, 162 insertions(+), 110 deletions(-) diff --git a/h5pyd/_hl/base.py b/h5pyd/_hl/base.py index dc6526d8..af2e73dd 100644 --- a/h5pyd/_hl/base.py +++ b/h5pyd/_hl/base.py @@ -794,7 +794,7 @@ def __init__(self, oid, track_order=None): raise TypeError(f"unexpected type for HLObject.__init__: {type(oid)}") self._id = oid self.log = self._id.http_conn.logging - if self.id.uuid == self.id.http_conn.root_uuid: + if self.id.is_root: # set the name as the root group self._name = "/" else: diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index 23f80231..67607c3d 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -285,6 +285,8 @@ def __init__( api_key=None, use_session=True, use_cache=True, + cache_limit=0, + cache_expire_time=0, swmr=False, libver=None, logger=None, @@ -317,8 +319,14 @@ def __init__( use_session maintain http connect between calls use_cache - save attribute and links values rather than retreiving from server each time they are accessed. + save attribute and links values rather than retrieving from server each time they are accessed. Set to False if the storage content is expected to change due to another application + cache_limit + If use_cache is True, the max number of metadata objects to hold in cache. + If cache_limit is 0, the number is unlimited + cache_expire_time + Amount of time in seconds to hold object in metadata cache before refreshing. 
+ If 0, items will be held in cache indefinitely swmr For compatibility with h5py - has the effect of overriding use_cache so that metadata will always be synchronized with the server @@ -421,8 +429,13 @@ def __init__( elif "hs_bucket" in cfg: bucket = cfg["hs_bucket"] - if swmr: - use_cache = False # disable metadata caching in swmr mode + if use_cache: + if cache_limit > 0: + max_objects = cache_limit + else: + max_objects = None + else: + max_objects = 0 http_conn = HttpConn( domain, @@ -433,7 +446,8 @@ def __init__( mode=mode, api_key=api_key, use_session=use_session, - use_cache=use_cache, + expire_time=cache_expire_time, + max_objects=max_objects, logger=logger, retries=retries, timeout=timeout, @@ -442,10 +456,12 @@ def __init__( # try to do a GET from the domain req = "/" params = {"getdnids": 1} # return dn ids if available - - if use_cache and mode == "r": - params["getobjs"] = "T" - params["include_attrs"] = "T" + if max_objects is None or max_objects > 0: + # get object meta objects + # TBD: have hsds support a max limit of objects to return + params["getobjs"] = 1 + params["include_attrs"] = 1 + params["include_links"] = 1 if bucket: params["bucket"] = bucket @@ -525,8 +541,6 @@ def __init__( if "domain_objs" in root_json: domain_objs = root_json["domain_objs"] objdb.load(domain_objs) - else: - objdb.reload() fileid = FileID(root_uuid, http_conn=http_conn) # end else diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index ba3625e2..e688b9cf 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -62,14 +62,12 @@ def _get_bypath(self, h5path, create=False, track_order=None): if h5path == "/": # return root group - root_uuid = self.id.http_conn.root_uuid - root_id = self.id.get(root_uuid) # create a GroupID object + root_id = self.id.get_root() # create a GroupID object root_grp = Group(root_id, track_order=track_order) return root_grp elif h5path[0] == '/': # absolute path - start with root - root_uuid = self.id.http_conn.root_uuid - parent_id = self.id.get(root_uuid) + parent_id = self.id.get_root() parent_name = "/" else: # relative path - start with this object @@ -441,19 +439,11 @@ def __getitem__(self, name, track_order=None): obj_id = self.id.get(name) elif name == "/": # return root group - root_uuid = self.id.http_conn.root_uuid - obj_id = self.id.get(root_uuid) + obj_id = self.id.get_root() else: pass # will do a path lookup if obj_id: - # verify the object exists - objdb = self.id.http_conn.objdb - if obj_id.id not in objdb: - try: - objdb.fetch(obj_id.id) # will raise exception if - except IOError: - raise KeyError(f"Object {obj_id} does not exist") if isinstance(obj_id, GroupID): tgt = Group(obj_id) if name == "/": @@ -466,7 +456,7 @@ def __getitem__(self, name, track_order=None): elif isinstance(obj_id, TypeID): tgt = Datatype(obj_id) else: - raise IOError("Unexpected Error - ObjectID type: " + obj_id.__class__.__name__) + raise IOError(f"Unexpected Error - ObjectID type: {obj_id.__class__.__name__}") return tgt # get item by h5path @@ -654,11 +644,12 @@ def __setitem__(self, name, obj): def __delitem__(self, name): """ Delete (unlink) an item from this group. 
""" - objdb = self.id.http_conn.objdb if isinstance(name, ObjectID): # delete the object, not the link - objdb.del_obj(name.id) + if name.is_root: + IOError("The root group can not be deleted") + name.delete_object() else: parent_path = _h5parent(name) basename = _h5base(name) diff --git a/h5pyd/httpconn.py b/h5pyd/httpconn.py index 669cce68..ee3f57ee 100644 --- a/h5pyd/httpconn.py +++ b/h5pyd/httpconn.py @@ -260,7 +260,8 @@ def __init__( api_key=None, mode="a", use_session=True, - use_cache=True, + expire_time=1.0, + max_objects=None, logger=None, retries=3, timeout=DEFAULT_TIMEOUT, @@ -285,7 +286,7 @@ def __init__( else: self.log = logging.getLogger(logger) msg = f"HttpConn.init(domain: {domain_name} use_session: {use_session} " - msg += f"use_cache: {use_cache} retries: {retries}" + msg += f"expire_time: {expire_time:6.2f} sec retries: {retries}" self.log.debug(msg) if self._timeout != DEFAULT_TIMEOUT: @@ -403,7 +404,7 @@ def __init__( else: self.log.error(f"Unknown openid provider: {provider}") - self._objdb = ObjDB(self, use_cache=use_cache) + self._objdb = ObjDB(self, expire_time=expire_time, max_objects=max_objects) def __del__(self): if self._hsds: diff --git a/h5pyd/objdb.py b/h5pyd/objdb.py index e4fcc7f7..b21893a3 100644 --- a/h5pyd/objdb.py +++ b/h5pyd/objdb.py @@ -18,13 +18,25 @@ from .objectid import get_collection +class PendingItem(): + def __init__(self, obj_uuid, action, name, data): + self._uuid = obj_uuid + self._action = action + self._name = name + self._data = data + self._load_time = time.time() + + class ObjDB(): """ Domain level object map """ - def __init__(self, http_conn, use_cache=True): + def __init__(self, http_conn, expire_time=0.0, max_objects=None): self._http_conn = weakref.ref(http_conn) self._objdb = {} - self._loadtime = {} - self._use_cache = use_cache + self._load_times = {} + self._pending = [] + self._missing_uuids = set() + self._expire_time = expire_time + self._max_objects = max_objects self.log = http_conn.logging @property @@ -35,11 +47,31 @@ def http_conn(self): raise RuntimeError("http connection has been garbage collected") return conn + def _is_expired(self, obj_uuid): + """ Get expired state of the uuid. 
Return: + None - if the item is not loaded + True - if the item is loaded but expired + False - if the item is loaded but not expired + """ + if obj_uuid not in self._load_times: + return None + if self._expire_time > 0.0: + age = time.time() - self._load_times[obj_uuid] + + return age > self._expire_time + else: + return False + def fetch(self, obj_uuid): """ get obj_json for given obj_uuid from the server """ self.log.debug(f"ObjDB.fetch({obj_uuid})") + if obj_uuid in self._missing_uuids: + msg = f"returning None for fetch since object {obj_uuid} is in missing_uuids set" + self.log.warning(msg) + return None + if obj_uuid.startswith("g-"): collection_type = "groups" elif obj_uuid.startswith("t-"): @@ -60,18 +92,20 @@ def fetch(self, obj_uuid): rsp = self.http_conn.GET(req, params=params) if rsp.status_code in (404, 410): self.log.warning(f"obj: {obj_uuid} not found") + self._missing_uuids.add(obj_uuid) return None elif rsp.status_code != 200: raise IOError(f"Unexpected error on request {req}: {rsp.status_code}") obj_json = rsp.json() - self.__set_item__(obj_uuid, obj_json) return obj_json - def __set_item__(self, obj_uuid, obj_json): + def __setitem__(self, obj_uuid, obj_json): """ set the obj_json in the db with obj_uuid as the key """ - + if self._max_objects is not None: + if len(self._objdb) >= self._max_objects: + # over limit, skip set + return discard_keys = ('root', 'id', 'attributeCount', 'linkCount', 'hrefs', 'domain', 'bucket') - # tbd: should bucket be supported? Not being returned in GET request for k in discard_keys: if k in obj_json: del obj_json[k] @@ -83,18 +117,33 @@ def __set_item__(self, obj_uuid, obj_json): # assign or replace current object self._objdb[obj_uuid] = obj_json - self._loadtime[obj_uuid] = time.time() - - return obj_json + self._load_times[obj_uuid] = time.time() def __getitem__(self, obj_uuid): - if obj_uuid not in self._objdb: + """ get item from objdb, fetching from server if necessary """ + + if self._is_expired(obj_uuid) in (None, True): + # fetch latest json + obj_json = self.fetch(obj_uuid) + if obj_json is not None: + self.__setitem__(obj_uuid, obj_json) + else: + obj_json = self._objdb[obj_uuid] + + if obj_json is None: self.log.warning(f"id: {obj_uuid} not found in objDB") raise KeyError(obj_uuid) - obj_json = self._objdb[obj_uuid] return obj_json + def free(self, obj_uuid): + """ free from objdb """ + if obj_uuid in self._objdb: + del self._objdb[obj_uuid] + if obj_uuid in self._load_times: + del self._load_times[obj_uuid] + def __delitem__(self, obj_uuid): + """ delete object frm server and free from objdb""" if obj_uuid not in self._objdb: obj_json = self.fetch(obj_uuid) if not obj_json: @@ -102,9 +151,8 @@ def __delitem__(self, obj_uuid): raise KeyError(obj_uuid) collection = get_collection(obj_uuid) req = f"/{collection}/{obj_uuid}" - self._http_conn.DELETE(req) - del self._objdb[obj_uuid] - del self._loadtime[obj_uuid] + self.http_conn.DELETE(req) + self.free(obj_uuid) def __len__(self): return len(self._objdb) @@ -123,19 +171,22 @@ def load(self, domain_objs): """ load content from hsds summary json """ for obj_uuid in domain_objs: obj_json = domain_objs[obj_uuid] - self.__set_item__(obj_uuid, obj_json) + self.__setitem__(obj_uuid, obj_json) - def reload(self): + def reload(self, load_all=False): """ re-initialize objdb """ self.log.info(f"objdb.reload {self.http_conn.domain}") self._objdb = {} self._loadtime = {} obj_uuids = set() obj_uuids.add(self.http_conn.root_uuid) + if not load_all: + return + while obj_uuids: obj_uuid = 
obj_uuids.pop() obj_json = self.fetch(obj_uuid) - self.__set_item__(obj_uuid, obj_json) + self.__setitem__(obj_uuid, obj_json) if "links" in obj_json: # add ids for any hard-links to our search if @@ -167,12 +218,13 @@ def get_bypath(self, parent_uuid, h5path, follow=False, getlink=False): if not parent_uuid.startswith("g-"): self.log.error("get_bypath - expected parent_uuid to be a group id") raise TypeError() - if parent_uuid not in self._objdb: + + obj_json = self.__getitem__(parent_uuid) + if obj_json is None: self.log.warning("get_bypath - parent_uuid not found") raise KeyError("parent_uuid: {parent_uuid} not found") obj_id = parent_uuid - obj_json = self._objdb[obj_id] searched_ids = set(obj_id) link_names = h5path.split('/') @@ -207,38 +259,17 @@ def get_bypath(self, parent_uuid, h5path, follow=False, getlink=False): if obj_id in searched_ids: self.log.warning(f"circular reference using path: {h5path}") raise KeyError(h5path) - if obj_id not in self._objdb: - # TBD - fetch from the server in case this object has not - # been loaded yet? + obj_json = self.__getitem__(obj_id) + if not obj_json: self.log.warning(f"id: {obj_id} not found") obj_json = None else: searched_ids.add(obj_id) obj_json = self._objdb[obj_id] elif link_class == 'H5L_TYPE_SOFT': - if not follow: - continue - # soft link - slink_path = link_tgt['h5path'] - if not slink_path: - self.log.warning(f"id: {obj_id} has null h5path for link: {link_name}") - raise KeyError(h5path) - if slink_path.startswith('/'): - slink_id = self.http_conn.root_uuid - else: - slink_id = obj_id - # recursive call - try: - obj_json = self.get_bypath(slink_id, slink_path) - except KeyError: - self.log.warning(f"Unable to find object in softpath: {slink_path}") - continue - obj_id = obj_json['id'] + self.log.warning("objdb.get_bypath can't follow soft links") elif link_class == 'H5L_TYPE_EXTERNAL': - if not follow: - continue - # tbd - self.log.error("external link not supported") + self.log.warning("objdb.get_bypath can't follow external links") else: self.log.error(f"link type: {link_class} not supported") @@ -305,7 +336,7 @@ def make_obj(self, parent_uuid, title, type_json=None, shape=None, cpl=None, tra if title.find('/') != -1: raise KeyError("link title can not be nested") if parent_uuid not in self._objdb: - raise KeyError(f"parent_uuid: {parent_uuid} not found") + self.log.warning(f"make_obj: {parent_uuid} not in objdb") link_json["name"] = title @@ -346,7 +377,7 @@ def make_obj(self, parent_uuid, title, type_json=None, shape=None, cpl=None, tra if cpl: obj_json['creationProperties'] = cpl obj_uuid = obj_json['id'] - self.__set_item__(obj_uuid, obj_json) # update group db + self.__setitem__(obj_uuid, obj_json) # update group db if link_json: # tweak link_json to look like a link entry on objdb link_json['class'] = 'H5L_TYPE_HARD' diff --git a/h5pyd/objectid.py b/h5pyd/objectid.py index f585214a..ab587b47 100644 --- a/h5pyd/objectid.py +++ b/h5pyd/objectid.py @@ -116,11 +116,27 @@ def domain(self): """ domain for this obj """ return self.http_conn.domain + @property + def http_conn(self): + # access weak ref + if isinstance(self._http_conn, weakref.ReferenceType): + conn = self._http_conn() + if conn is None: + raise RuntimeError("http connection has been garbage collected") + else: + return self._http_conn + return conn + + @property + def objdb(self): + # get ref to ObjDB instance + http_conn = self.http_conn + return http_conn.objdb + @property def obj_json(self): """json representation of the object""" - objdb = self.http_conn.objdb 
- obj_json = objdb[self.uuid] + obj_json = self.objdb[self.uuid] return obj_json @property @@ -140,23 +156,6 @@ def modified(self): return dt - @property - def http_conn(self): - # access weak ref - if isinstance(self._http_conn, weakref.ReferenceType): - conn = self._http_conn() - if conn is None: - raise RuntimeError("http connection has been garbage collected") - else: - return self._http_conn - return conn - - @property - def objdb(self): - # get ref to ObjDB instance - http_conn = self.http_conn - return http_conn.objdb - @property def collection_type(self): """ Return collection type based on uuid """ @@ -206,6 +205,19 @@ def get(self, obj_uuid): return obj + def get_root(self): + """ Return root id """ + root_uuid = self.http_conn.root_uuid + return GroupID(root_uuid, http_conn=self.http_conn) + + @property + def is_root(self): + """ Return True if this is the root group id """ + if self._uuid == self.http_conn.root_uuid: + return True + else: + return False + @property def attrs(self): obj_json = self.obj_json @@ -275,9 +287,8 @@ def get_attr_names(self, track_order=None): return names def __init__(self, obj_uuid, http_conn=None): + """Create a new objectId """ - """Create a new objectId. - """ self._uuid = get_UUID(obj_uuid) if http_conn: @@ -287,8 +298,10 @@ def __init__(self, obj_uuid, http_conn=None): else: raise IOError("Expected parent to have http connector") - if self._uuid not in self.objdb: - self.objdb.fetch(self._uuid) # will throw IOError if not found + obj_json = self.objdb[self._uuid] # we're raise KeyError if not found + if not obj_json: + # should be not None if KeyError wasn't thrown + raise RuntimeError("Unexpected error") def __eq__(self, other): if isinstance(other, self.__class__): @@ -301,11 +314,16 @@ def __ne__(self, other): def refresh(self): """ get the latest obj_json data from server """ + self.objdb.free(self._uuid) self.objdb.fetch(self._uuid) + def delete_object(self): + """ delete the given object on the server """ + + del self.objdb[self._uuid] + def close(self): - """Remove handles to id. 
- """ + """Remove handles to id """ self._old_uuid = self._uuid # for debugging self._uuid = 0 self._http_conn = None @@ -425,8 +443,7 @@ def getVerboseInfo(self): def resize(self, dims): """ update the shape of the dataset """ # send the request to the server - objdb = self.http_conn.objdb - objdb.resize(self._uuid, dims) + self.objdb.resize(self._uuid, dims) def shape_refresh(self): """ Get the current shape """ @@ -471,7 +488,6 @@ def make_obj(self, title, type_json=None, shape=None, cpl=None, track_order=None links = obj_json['links'] if title in links: raise IOError("Unable to create object (name already exists)") - objdb = self.http_conn.objdb kwds = {} if shape is not None: @@ -484,7 +500,7 @@ def make_obj(self, title, type_json=None, shape=None, cpl=None, track_order=None kwds['track_order'] = track_order if maxdims: kwds['maxdims'] = maxdims - obj_uuid = objdb.make_obj(self._uuid, title, **kwds) + obj_uuid = self.objdb.make_obj(self._uuid, title, **kwds) obj_id = self.get(obj_uuid) return obj_id @@ -501,9 +517,8 @@ def set_link(self, title, link_json, replace=False): links = self.links if not replace and title in links: raise IOError("Unable to create link (name already exists)") - objdb = self.http_conn.objdb - objdb.set_link(self.uuid, title, link_json, replace=replace) + self.objdb.set_link(self.uuid, title, link_json, replace=replace) def del_link(self, title): """ delete the given link """ @@ -511,8 +526,7 @@ def del_link(self, title): if title not in links: # not found raise KeyError(f"link '{title}' not found") - objdb = self.http_conn.objdb - objdb.del_link(self.uuid, title) + self.objdb.del_link(self.uuid, title) @property def link_count(self): @@ -521,6 +535,7 @@ def link_count(self): return len(links) def get_link_titles(self, track_order=None): + links = self.links if track_order is None: track_order = self.track_order From 2cb2e306a0404235935be687048825413fad2c4c Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 28 Jan 2025 14:52:03 +0800 Subject: [PATCH 29/32] fist pass at delayed updates --- h5pyd/_hl/files.py | 9 ++- h5pyd/httpconn.py | 4 +- h5pyd/objdb.py | 161 +++++++++++++++++++++++++++++++------- h5pyd/objectid.py | 8 ++ test/hl/test_attribute.py | 12 +-- test/hl/test_file.py | 5 +- test/hl/test_group.py | 8 +- 7 files changed, 165 insertions(+), 42 deletions(-) diff --git a/h5pyd/_hl/files.py b/h5pyd/_hl/files.py index 67607c3d..40c355a4 100644 --- a/h5pyd/_hl/files.py +++ b/h5pyd/_hl/files.py @@ -287,6 +287,7 @@ def __init__( use_cache=True, cache_limit=0, cache_expire_time=0, + max_age=1.0, swmr=False, libver=None, logger=None, @@ -327,6 +328,9 @@ def __init__( cache_expire_time Amount of time in seconds to hold object in metadata cache before refreshing. If 0, items will be held in cache indefinitely + max_time + Amount of of time in seconds to hold a dirty object in the metadata cache before writing. 
+ If 0, items will be written immediately swmr For compatibility with h5py - has the effect of overriding use_cache so that metadata will always be synchronized with the server @@ -448,6 +452,7 @@ def __init__( use_session=use_session, expire_time=cache_expire_time, max_objects=max_objects, + max_age=max_age, logger=logger, retries=retries, timeout=timeout, @@ -792,7 +797,9 @@ def flush(self, checkpoint=False): return self.log.debug("flush") - # TBD: send any pending write requests + # send any pending write requests + self.id.flush() + if not checkpoint: return diff --git a/h5pyd/httpconn.py b/h5pyd/httpconn.py index ee3f57ee..e7303e23 100644 --- a/h5pyd/httpconn.py +++ b/h5pyd/httpconn.py @@ -262,6 +262,7 @@ def __init__( use_session=True, expire_time=1.0, max_objects=None, + max_age=1.0, logger=None, retries=3, timeout=DEFAULT_TIMEOUT, @@ -404,7 +405,8 @@ def __init__( else: self.log.error(f"Unknown openid provider: {provider}") - self._objdb = ObjDB(self, expire_time=expire_time, max_objects=max_objects) + kwds = {"expire_time": expire_time, "max_objects": max_objects, "max_age": max_age} + self._objdb = ObjDB(self, **kwds) def __del__(self): if self._hsds: diff --git a/h5pyd/objdb.py b/h5pyd/objdb.py index b21893a3..9939a479 100644 --- a/h5pyd/objdb.py +++ b/h5pyd/objdb.py @@ -18,30 +18,29 @@ from .objectid import get_collection -class PendingItem(): - def __init__(self, obj_uuid, action, name, data): - self._uuid = obj_uuid - self._action = action - self._name = name - self._data = data - self._load_time = time.time() - - class ObjDB(): """ Domain level object map """ - def __init__(self, http_conn, expire_time=0.0, max_objects=None): + def __init__(self, http_conn, expire_time=0.0, max_age=0.0, max_objects=None): self._http_conn = weakref.ref(http_conn) self._objdb = {} self._load_times = {} - self._pending = [] + if max_age > 0.0: + self._pending = {} + else: + self._pending = None self._missing_uuids = set() self._expire_time = expire_time + self._max_age = max_age self._max_objects = max_objects self.log = http_conn.logging + if http_conn.mode == 'r': + self._read_only = True + else: + self._read_only = False @property def http_conn(self): - # access weark ref + # access weak ref conn = self._http_conn() if conn is None: raise RuntimeError("http connection has been garbage collected") @@ -62,6 +61,60 @@ def _is_expired(self, obj_uuid): else: return False + def _flush_pending(self): + # self._pending[obj_uuid] = {"links": {}, "attrs": {}} + if not self._pending: + self.log.debug("flush_pending - no pending objects") + return + + # flush attributes + obj_ids = {} + for obj_uuid in self._pending: + pending_attrs = self._pending[obj_uuid]['attrs'] + if pending_attrs: + if obj_uuid not in obj_ids: + obj_ids[obj_uuid] = {} + obj_id = obj_ids[obj_uuid] + obj_id["attributes"] = pending_attrs + + if obj_ids: + body = {"obj_ids": obj_ids} + root_uuid = self.http_conn.root_uuid + req = f"/groups/{root_uuid}/attributes" + rsp = self.http_conn.PUT(req, body=body) + if rsp.status_code not in (200, 201): + raise IOError(rsp.status_code, "Failed to update attributes") + else: + # clear items from pending queue + for obj_id in obj_ids: + self._pending[obj_id]['attrs'] = {} + + # flush links + obj_ids = {} + for obj_uuid in self._pending: + pending_links = self._pending[obj_uuid]['links'] + if pending_links: + if obj_uuid not in obj_ids: + obj_ids[obj_uuid] = {} + obj_id = obj_ids[obj_uuid] + obj_id["links"] = pending_links + + if obj_ids: + body = {"grp_ids": obj_ids} + root_uuid = 
self.http_conn.root_uuid + req = f"/groups/{root_uuid}/links" + rsp = self.http_conn.PUT(req, body=body) + if rsp.status_code not in (200, 201): + raise IOError(rsp.status_code, "Failed to update links") + else: + # clear items from pending queue + for obj_id in obj_ids: + self._pending[obj_id]['links'] = {} + + def flush(self): + """ commit all pending items """ + self._flush_pending() + def fetch(self, obj_uuid): """ get obj_json for given obj_uuid from the server """ @@ -144,6 +197,9 @@ def free(self, obj_uuid): def __delitem__(self, obj_uuid): """ delete object frm server and free from objdb""" + if self._read_only: + raise IOError("no write intent on domain") + if obj_uuid not in self._objdb: obj_json = self.fetch(obj_uuid) if not obj_json: @@ -286,21 +342,35 @@ def get_bypath(self, parent_uuid, h5path, follow=False, getlink=False): self.log.info(f"get_bypath link at {h5path} found target: {obj_id}") return obj_json + def _get_pending(self, obj_uuid): + """ get pending items """ + if obj_uuid not in self._pending: + self._pending[obj_uuid] = {"links": {}, "attrs": {}} + return self._pending[obj_uuid] + def set_link(self, group_uuid, title, link_json, replace=False): """ create/update the given link """ if not group_uuid.startswith("g-"): raise TypeError("objdb.set_link - expected a group identifier") if title.find('/') != -1: raise KeyError("objdb.setlink - link title can not be nested") + if self._read_only: + raise IOError("no write intent on domain") obj_json = self.__getitem__(group_uuid) links = obj_json["links"] + link_json['created'] = time.time() + if title in links and replace: # TBD: hsds update to for link replacement? self.del_link(group_uuid, title) - # make a http put - req = f"/groups/{group_uuid}/links/{title}" - self.http_conn.PUT(req, body=link_json) # create the link - link_json['created'] = time.time() + if self._max_age > 0.0: + pending_links = self._get_pending(group_uuid)["links"] + pending_links[title] = link_json + else: + # do a PUT immediately + # make a http put + req = f"/groups/{group_uuid}/links/{title}" + self.http_conn.PUT(req, body=link_json) # create the link links[title] = link_json def del_link(self, group_uuid, title): @@ -308,10 +378,22 @@ def del_link(self, group_uuid, title): if title.find('/') != -1: raise KeyError("objdb.del_link - link title can not be nested") + if self._read_only: + raise IOError("no write intent on domain") obj_json = self.__getitem__(group_uuid) links = obj_json["links"] # tbd - validate link_json? + if self._max_age > 0.0: + pending_links = self._get_pending(group_uuid)["links"] + if title in pending_links: + del pending_links[title] + if title in links: + pending_links = self._get_pending(group_uuid)["links"] + if title in pending_links: + del pending_links[title] + + # TBD - support deferred delete req = f"/groups/{group_uuid}/links/{title}" rsp = self.http_conn.DELETE(req) if rsp.status_code != 200: @@ -327,6 +409,8 @@ def make_obj(self, parent_uuid, title, type_json=None, shape=None, cpl=None, tra If type_json and shape_json - create a dataset If type_json and not shape_json - create a datatype """ + if self._read_only: + raise IOError("no write intent on domain") cfg = config.get_config() # pulls in state from a .hscfg file (if found). 
if track_order is None: @@ -390,6 +474,8 @@ def make_obj(self, parent_uuid, title, type_json=None, shape=None, cpl=None, tra def del_obj(self, obj_uuid): """ Delete the given object """ + if self._read_only: + raise IOError("no write intent on domain") collection = get_collection(obj_uuid) req = f"/{collection}/{obj_uuid}" @@ -402,32 +488,47 @@ def del_obj(self, obj_uuid): def set_attr(self, obj_uuid, name, attr_json): """ create update attribute """ + if self._read_only: + raise IOError("no write intent on domain") obj_json = self.__getitem__(obj_uuid) attrs = obj_json["attributes"] - params = {} - if name in attrs: - self.log.debug(f"replacing attr {name} of {obj_uuid}") - params['replace'] = 1 - - collection = get_collection(obj_uuid) - req = f"/{collection}/{obj_uuid}/attributes/{name}" - rsp = self.http_conn.PUT(req, body=attr_json, params=params) - - if rsp.status_code not in (200, 201): - self.log.error(f"got {rsp.status_code} for put req: {req}") - raise RuntimeError(f"Unexpected error on put request {req}: {rsp.status_code}") - self.log.info(f"got {rsp.status_code} for req: {req}") attr_json['created'] = time.time() + + if self._max_age > 0.0: + pending_links = self._get_pending(obj_uuid)["attrs"] + pending_links[name] = attr_json + else: + # do a PUT immediately + + params = {} + if name in attrs: + self.log.debug(f"replacing attr {name} of {obj_uuid}") + params['replace'] = 1 + collection = get_collection(obj_uuid) + req = f"/{collection}/{obj_uuid}/attributes/{name}" + rsp = self.http_conn.PUT(req, body=attr_json, params=params) + + if rsp.status_code not in (200, 201): + self.log.error(f"got {rsp.status_code} for put req: {req}") + raise RuntimeError(f"Unexpected error on put request {req}: {rsp.status_code}") + self.log.info(f"got {rsp.status_code} for req: {req}") attrs[name] = attr_json def del_attr(self, obj_uuid, name): """ delete the given attribute """ + if self._read_only: + raise IOError("no write intent on domain") obj_json = self.__getitem__(obj_uuid) attrs = obj_json["attributes"] if name not in attrs: self.log.warning(f"attr {name} of {obj_uuid} not found for delete") raise KeyError("Unable to delete attribute (can't locate attribute)") + if self._max_age > 0.0: + pending_attrs = self._get_pending(obj_uuid)["attrs"] + if name in pending_attrs: + del pending_attrs[name] + # tbd - support deferred deletion collection = get_collection(obj_uuid) req = f"/{collection}/{obj_uuid}/attributes/{name}" rsp = self.http_conn.DELETE(req) @@ -458,6 +559,8 @@ def shape_refresh(self, dset_uuid): def resize(self, dset_uuid, dims): """ update the shape of the dataset """ + if self._read_only: + raise IOError("no write intent on domain") # send the request to the server body = {"shape": dims} req = f"/datasets/{dset_uuid}/shape" diff --git a/h5pyd/objectid.py b/h5pyd/objectid.py index ab587b47..5292f28a 100644 --- a/h5pyd/objectid.py +++ b/h5pyd/objectid.py @@ -576,9 +576,17 @@ def __bool__(self): else: return False + def flush(self): + """ commit any pending write requests """ + if not self._http_conn: + pass # connection closed + else: + self.objdb.flush() + def close(self): """Remove handles to id. 
""" + self.flush() self._file_conn = None super().close() diff --git a/test/hl/test_attribute.py b/test/hl/test_attribute.py index eb3d03a1..5cc79b50 100644 --- a/test/hl/test_attribute.py +++ b/test/hl/test_attribute.py @@ -134,7 +134,9 @@ def fill_attrs(self, obj): def test_track_order(self): filename = self.getFileName("test_test_track_order_attribute") print(f"filename: {filename}") - with h5py.File(filename, 'w') as f: + # use max_age as 0 because pending writes messes up the tracking order + # TBD: find work-around for this + with h5py.File(filename, 'w', max_age=0.0) as f: grp1 = f.create_group('grp1', track_order=True) self.fill_attrs(grp1) self.assertEqual(list(grp1.attrs), list(self.titles)) @@ -157,7 +159,7 @@ def test_track_order_cfg(self): filename = self.getFileName("test_test_track_order_attribute") print(f"filename: {filename}") cfg = h5py.get_config() - with h5py.File(filename, 'w') as f: + with h5py.File(filename, 'w', max_age=0.0) as f: cfg.track_order = True grp1 = f.create_group('grp1') dset1 = f.create_dataset('dset1', data=[42,]) @@ -176,7 +178,7 @@ def test_track_order_cfg(self): def test_no_track_order(self): filename = self.getFileName("test_test_no_track_order_attribute") print(f"filename: {filename}") - f = h5py.File(filename, 'w') + f = h5py.File(filename, 'w', max_age=0.0) g1 = f.create_group('test') # name alphanumeric self.fill_attrs(g1) self.assertEqual(list(g1.attrs), sorted(list(self.titles))) @@ -184,7 +186,7 @@ def test_no_track_order(self): def test_track_order_overwrite_delete(self): filename = self.getFileName("test_test_track_order_overwrite_delete") print(f"filename: {filename}") - f = h5py.File(filename, 'w') + f = h5py.File(filename, 'w', max_age=0.0) g1 = f.create_group("g1", track_order=True) # creation order self.fill_attrs(g1) @@ -205,7 +207,7 @@ def test_track_order_not_inherited(self): """ filename = self.getFileName("test_test_track_order_not_inherited") print(f"filename: {filename}") - f = h5py.File(filename, 'w', track_order=True) + f = h5py.File(filename, 'w', track_order=True, max_age=0.0) g1 = f.create_group('test') self.fill_attrs(g1) diff --git a/test/hl/test_file.py b/test/hl/test_file.py index c1010a7d..1843b9ff 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -166,6 +166,7 @@ def test_create(self): logging.info("waiting on scan update") ts = time.time() while not f.last_scan: + print("waiting on summary data to be compiled...") time.sleep(0.1) elapsed = time.time() - ts if elapsed > 90: @@ -412,7 +413,7 @@ def test_track_order(self): filename = self.getFileName("test_track_order_file") print(f"filename: {filename}") # write file using creation order - with h5py.File(filename, 'w', track_order=True) as f: + with h5py.File(filename, 'w', track_order=True, max_age=0.0) as f: self.populate(f) self.assertEqual(list(f), list(self.titles)) self.assertEqual(list(f.attrs), list(self.titles)) @@ -428,7 +429,7 @@ def test_cfg_track_order(self): # write file using creation order cfg = h5py.get_config() cfg.track_order = True - with h5py.File(filename, 'w') as f: + with h5py.File(filename, 'w', max_age=0.0) as f: self.populate(f) self.assertEqual(list(f), list(self.titles)) self.assertEqual(list(f.attrs), list(self.titles)) diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 70f2a13a..affe751b 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -357,7 +357,7 @@ def populate_attrs(self, d): def test_track_order(self): filename = self.getFileName("test_track_order_group") print(f"filename: {filename}") 
- with h5py.File(filename, 'w') as f: + with h5py.File(filename, 'w', max_age=0.0) as f: g = f.create_group('order', track_order=True) # creation order self.populate(g) @@ -388,7 +388,7 @@ def test_track_order_cfg(self): filename = self.getFileName("test_track_order_cfg_group") print(f"filename: {filename}") cfg = h5py.get_config() - with h5py.File(filename, 'w') as f: + with h5py.File(filename, 'w', max_age=0.0) as f: cfg.track_order = True # creation order g = f.create_group('order') cfg.track_order = False # reset @@ -432,7 +432,7 @@ def test_get_dataset_track_order(self): if h5py.__name__ == "h5py": return # h5py does not support track_order on group.get() - with h5py.File(filename, 'w') as f: + with h5py.File(filename, 'w', max_age=0.0) as f: g = f.create_group('order') dset = g.create_dataset('dset', (10,), dtype='i4') dset2 = g.create_dataset('dset2', (10,), dtype='i4') @@ -454,7 +454,7 @@ def test_get_group_track_order(self): if h5py.__name__ == "h5py": return # h5py does not support track_order on group.get() - with h5py.File(filename, 'w') as f: + with h5py.File(filename, 'w', max_age=0.0) as f: g = f.create_group('order') g._track_order = True # create subgroup and populate it with links From 7b5c2c4755815db1e3ef4154550a238e10594b36 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 28 Jan 2025 15:16:43 +0800 Subject: [PATCH 30/32] fix h5py compat errors --- test/hl/test_attribute.py | 30 +++++++++++++++++++++++++----- test/hl/test_file.py | 12 ++++++++++-- test/hl/test_group.py | 25 ++++++++++++++++++++----- 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/test/hl/test_attribute.py b/test/hl/test_attribute.py index 5cc79b50..07f3e92c 100644 --- a/test/hl/test_attribute.py +++ b/test/hl/test_attribute.py @@ -136,7 +136,11 @@ def test_track_order(self): print(f"filename: {filename}") # use max_age as 0 because pending writes messes up the tracking order # TBD: find work-around for this - with h5py.File(filename, 'w', max_age=0.0) as f: + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + with h5py.File(filename, 'w', **kwds) as f: grp1 = f.create_group('grp1', track_order=True) self.fill_attrs(grp1) self.assertEqual(list(grp1.attrs), list(self.titles)) @@ -159,7 +163,11 @@ def test_track_order_cfg(self): filename = self.getFileName("test_test_track_order_attribute") print(f"filename: {filename}") cfg = h5py.get_config() - with h5py.File(filename, 'w', max_age=0.0) as f: + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + with h5py.File(filename, 'w', **kwds) as f: cfg.track_order = True grp1 = f.create_group('grp1') dset1 = f.create_dataset('dset1', data=[42,]) @@ -178,7 +186,11 @@ def test_track_order_cfg(self): def test_no_track_order(self): filename = self.getFileName("test_test_no_track_order_attribute") print(f"filename: {filename}") - f = h5py.File(filename, 'w', max_age=0.0) + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + f = h5py.File(filename, 'w', **kwds) g1 = f.create_group('test') # name alphanumeric self.fill_attrs(g1) self.assertEqual(list(g1.attrs), sorted(list(self.titles))) @@ -186,7 +198,11 @@ def test_no_track_order(self): def test_track_order_overwrite_delete(self): filename = self.getFileName("test_test_track_order_overwrite_delete") print(f"filename: {filename}") - f = h5py.File(filename, 'w', max_age=0.0) + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + f = h5py.File(filename, 'w', **kwds) g1 = f.create_group("g1", track_order=True) # 
creation order self.fill_attrs(g1) @@ -207,7 +223,11 @@ def test_track_order_not_inherited(self): """ filename = self.getFileName("test_test_track_order_not_inherited") print(f"filename: {filename}") - f = h5py.File(filename, 'w', track_order=True, max_age=0.0) + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + f = h5py.File(filename, 'w', track_order=True, **kwds) g1 = f.create_group('test') self.fill_attrs(g1) diff --git a/test/hl/test_file.py b/test/hl/test_file.py index 1843b9ff..e0b840e8 100644 --- a/test/hl/test_file.py +++ b/test/hl/test_file.py @@ -413,7 +413,11 @@ def test_track_order(self): filename = self.getFileName("test_track_order_file") print(f"filename: {filename}") # write file using creation order - with h5py.File(filename, 'w', track_order=True, max_age=0.0) as f: + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + with h5py.File(filename, 'w', track_order=True, **kwds) as f: self.populate(f) self.assertEqual(list(f), list(self.titles)) self.assertEqual(list(f.attrs), list(self.titles)) @@ -429,7 +433,11 @@ def test_cfg_track_order(self): # write file using creation order cfg = h5py.get_config() cfg.track_order = True - with h5py.File(filename, 'w', max_age=0.0) as f: + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + with h5py.File(filename, 'w', **kwds) as f: self.populate(f) self.assertEqual(list(f), list(self.titles)) self.assertEqual(list(f.attrs), list(self.titles)) diff --git a/test/hl/test_group.py b/test/hl/test_group.py index affe751b..276240c0 100644 --- a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -357,7 +357,11 @@ def populate_attrs(self, d): def test_track_order(self): filename = self.getFileName("test_track_order_group") print(f"filename: {filename}") - with h5py.File(filename, 'w', max_age=0.0) as f: + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + with h5py.File(filename, 'w', **kwds) as f: g = f.create_group('order', track_order=True) # creation order self.populate(g) @@ -388,7 +392,11 @@ def test_track_order_cfg(self): filename = self.getFileName("test_track_order_cfg_group") print(f"filename: {filename}") cfg = h5py.get_config() - with h5py.File(filename, 'w', max_age=0.0) as f: + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + with h5py.File(filename, 'w', **kwds) as f: cfg.track_order = True # creation order g = f.create_group('order') cfg.track_order = False # reset @@ -431,8 +439,11 @@ def test_get_dataset_track_order(self): print(f"filename: {filename}") if h5py.__name__ == "h5py": return # h5py does not support track_order on group.get() - - with h5py.File(filename, 'w', max_age=0.0) as f: + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + with h5py.File(filename, 'w', **kwds) as f: g = f.create_group('order') dset = g.create_dataset('dset', (10,), dtype='i4') dset2 = g.create_dataset('dset2', (10,), dtype='i4') @@ -454,7 +465,11 @@ def test_get_group_track_order(self): if h5py.__name__ == "h5py": return # h5py does not support track_order on group.get() - with h5py.File(filename, 'w', max_age=0.0) as f: + if config.get('use_h5py'): + kwds = {} + else: + kwds = {"max_age": 0.0} + with h5py.File(filename, 'w', **kwds) as f: g = f.create_group('order') g._track_order = True # create subgroup and populate it with links From a9ca5f649f184380cb246f2a7fb06adf4b5940a7 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 28 Jan 2025 15:49:40 +0800 Subject: [PATCH 31/32] flush pending 
when max items exceeded --- h5pyd/objdb.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/h5pyd/objdb.py b/h5pyd/objdb.py index 9939a479..083f95b2 100644 --- a/h5pyd/objdb.py +++ b/h5pyd/objdb.py @@ -17,6 +17,7 @@ from . import config from .objectid import get_collection +MAX_PENDING_ITEMS = 500 # TBD: make this a config? class ObjDB(): """ Domain level object map """ @@ -28,6 +29,7 @@ def __init__(self, http_conn, expire_time=0.0, max_age=0.0, max_objects=None): self._pending = {} else: self._pending = None + self._pending_count = 0 self._missing_uuids = set() self._expire_time = expire_time self._max_age = max_age @@ -62,7 +64,7 @@ def _is_expired(self, obj_uuid): return False def _flush_pending(self): - # self._pending[obj_uuid] = {"links": {}, "attrs": {}} + """ commit any pending updates""" if not self._pending: self.log.debug("flush_pending - no pending objects") return @@ -111,6 +113,8 @@ def _flush_pending(self): for obj_id in obj_ids: self._pending[obj_id]['links'] = {} + self._pending_count = 0 + def flush(self): """ commit all pending items """ self._flush_pending() @@ -366,6 +370,9 @@ def set_link(self, group_uuid, title, link_json, replace=False): if self._max_age > 0.0: pending_links = self._get_pending(group_uuid)["links"] pending_links[title] = link_json + self._pending_count += 1 + if self._pending_count > MAX_PENDING_ITEMS: + self._flush_pending() else: # do a PUT immediately # make a http put @@ -497,6 +504,9 @@ def set_attr(self, obj_uuid, name, attr_json): if self._max_age > 0.0: pending_links = self._get_pending(obj_uuid)["attrs"] pending_links[name] = attr_json + self._pending_count += 1 + if self._pending_count > MAX_PENDING_ITEMS: + self._flush_pending() else: # do a PUT immediately From a42ee3b7e45b8fd1f6cc0337931d7257ba71ea2f Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 28 Jan 2025 15:52:15 +0800 Subject: [PATCH 32/32] fix flake8 error --- h5pyd/objdb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/h5pyd/objdb.py b/h5pyd/objdb.py index 083f95b2..b1393c8e 100644 --- a/h5pyd/objdb.py +++ b/h5pyd/objdb.py @@ -19,6 +19,7 @@ MAX_PENDING_ITEMS = 500 # TBD: make this a config? + class ObjDB(): """ Domain level object map """ def __init__(self, http_conn, expire_time=0.0, max_age=0.0, max_objects=None):
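For reference, a minimal usage sketch of the dictionary-style ACL interface added by the patches above. The domain path and usernames are placeholders only; the calls shown (File.acls, acls.create_acl, and the mapping-style operators) are the ones introduced in acls.py and files.py.

import h5pyd

# open a domain with append intent (path and credentials are hypothetical)
f = h5pyd.File("/home/test_user1/example.h5", "a")

owner_acl = f.acls[f.owner]      # fetch the owner's ACL
print(owner_acl.read, owner_acl.update)

# build a public-read ACL: only the read flag is set
public_acl = f.acls.create_acl(r=True)
f.acls["default"] = public_acl   # register it under the "default" username

print(len(f.acls))               # number of ACLs on the domain
print("default" in f.acls)       # membership test by username
for name in f.acls:              # iterate over usernames with an ACL
    print(name)

f.close()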
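Similarly, a sketch of how the metadata caching and delayed-write options added in these patches might be combined. The domain path and the particular parameter values are illustrative assumptions, not values taken from the patches (other than max_age, which defaults to 1.0 second).

import h5pyd

f = h5pyd.File("/home/test_user1/example.h5", "a",
               use_cache=True,
               cache_limit=1000,        # hold at most 1000 metadata objects in the cache
               cache_expire_time=5.0,   # refresh cached objects older than 5 seconds
               max_age=1.0)             # buffer dirty attributes/links up to 1 second before writing

grp = f.create_group("g1")
grp.attrs["author"] = "test_user1"      # may sit in the pending queue rather than being PUT immediately

f.flush()                               # push any pending attribute/link writes to the server
f.close()                               # close() also flushes pending writes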