diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b56a134cd..3b2523ca3 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -68,7 +68,7 @@ jobs:
         - os: ubuntu-latest
           python-version: 3.7
           allow-failure: false
-          test-case: check-only
+          test-case: check-all
         # documentation build
         - os: ubuntu-latest
           python-version: 3.7
diff --git a/CHANGES.rst b/CHANGES.rst
index a45725689..55192afd6 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -12,6 +12,23 @@ Changes
 Changes:
 --------
+- Support `CWL` ``InlineJavascriptRequirement`` for `Process` deployment to allow successful schema validation.
+- Support `CWL` ``Directory`` type references (resolves `#466 <https://github.com/crim-ca/weaver/issues/466>`_).
+  Those references correspond to `WPS` and `OGC API - Processes` ``href``
+  using the ``Content-Type: application/directory`` Media-Type and must have a trailing slash (``/``) character.
+- Support `S3` file or directory references using *Access Point*, *Virtual-hosted–style* and *Outposts* URLs
+  (see AWS documentation
+  `Methods for accessing a bucket <https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-bucket-intro.html>`_).
+- Apply more validation rules against expected `S3` file or directory reference formats.
+- Update documentation regarding handling of `S3` references (more formats supported) and ``Directory`` type references.
+- Support ``weaver.wps_output_context`` setting and ``X-WPS-Output-Context`` request header resolution in combination
+  with `S3` bucket location employed for storing `Job` outputs.
+- Nest every complex `Job` output (whether stored on local `WPS` outputs or on `S3`, and whether the output is
+  of ``File`` or ``Directory`` type) under its corresponding output ID collected from the `Process` definition to avoid
+  potential name conflicts in storage location, especially in the case of multiple output IDs that could be aggregated
+  with various files and listing of directory contents.
+- Allow ``colander.SchemaNode`` (with extensions for `OpenAPI` schema converters) to provide the validation ``pattern``
+  field directly with a compiled ``re.Pattern`` object.
 - Support `CWL` definition for ``cwltool:CUDARequirement`` to request the use of a GPU, including support for using
   Docker with a GPU (resolves `#104 <https://github.com/crim-ca/weaver/issues/104>`_).
 - Support `CWL` definition for ``NetworkAccess`` to indicate whether a process requires outgoing IPv4/IPv6 network
diff --git a/Makefile b/Makefile
index e8a649798..439dd9d48 100644
--- a/Makefile
+++ b/Makefile
@@ -469,7 +469,7 @@ CHECKS := $(addprefix check-, $(CHECKS))
 # items that should not install python dev packages should be added here instead
 # they must provide their own target/only + with dependency install variants
 CHECKS_NO_PY := css md
-CHECKS_NO_PY := $(addprefix fix-, $(CHECKS_NO_PY))
+CHECKS_NO_PY := $(addprefix check-, $(CHECKS_NO_PY))
 CHECKS_ALL := $(CHECKS) $(CHECKS_NO_PY)

 $(CHECKS): check-%: install-dev check-%-only
@@ -482,7 +482,7 @@ mkdir-reports:
 check: check-all  ## alias for 'check-all' target

 .PHONY: check-only
-check-only: $(addsuffix -only, $(CHECKS))
+check-only: $(addsuffix -only, $(CHECKS_ALL))

 .PHONY: check-all
 check-all: install-dev $(CHECKS_ALL)  ## check all code linters
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
index 48712687d..e497b5c13 100644
--- a/docs/_static/custom.css
+++ b/docs/_static/custom.css
@@ -48,3 +48,22 @@ div[class^="highlight"] {
     max-width: 100%;
     overflow: visible;
 }
+
+/* add missing border when a row spans more than one line */
+.rst-content table.docutils td:first-child,
+.rst-content table.docutils th:first-child,
+.rst-content table.field-list td:first-child,
+.rst-content table.field-list th:first-child,
+.wy-table td:first-child,
+.wy-table th:first-child {
+    border-left-width: 1px;
+    border-right-width: 1px;
+}
+
+/* avoid mismatching background color in
+   table rows that span multiple lines, due to
+   alternating colors on individual odd/even rows
+ */
+#table-file-type-handling tr.row-even > td[rowspan] {
+    background-color: revert;
+}
diff --git a/docs/examples/directory-listing-s3.json b/docs/examples/directory-listing-s3.json
new file mode 100644
index 000000000..f65cdd630
--- /dev/null
+++ b/docs/examples/directory-listing-s3.json
@@ -0,0 +1,39 @@
+{
+  "ResponseMetadata": {
+    "RequestId": "vpiM5RBkJ3O68CnD5fO42d887Jh49Cf8bhA6nw7ZTHIuGRVccDQM",
+    "HTTPStatusCode": 200,
+    "HTTPHeaders": {
+      "x-amzn-requestid": "vpiM5RBkJ3O68CnD5fO42d887Jh49Cf8bhA6nw7ZTHIuGRVccDQM"
+    },
+    "RetryAttempts": 0
+  },
+  "IsTruncated": false,
+  "Contents": [
+    {
+      "Key": "dir/file.txt",
+      "LastModified": "2022-11-01T04:25:42+00:00",
+      "ETag": "\"17404a596cbd0d1e6c7d23fcd845ab82\"",
+      "Size": 4,
+      "StorageClass": "STANDARD"
+    },
+    {
+      "Key": "dir/sub/file.txt",
+      "LastModified": "2022-11-01T04:25:42+00:00",
+      "ETag": "\"17404a596cbd0d1e6c7d23fcd845ab82\"",
+      "Size": 4,
+      "StorageClass": "STANDARD"
+    },
+    {
+      "Key": "dir/sub/nested/file.txt",
+      "LastModified": "2022-11-01T04:25:42+00:00",
+      "ETag": "\"17404a596cbd0d1e6c7d23fcd845ab82\"",
+      "Size": 4,
+      "StorageClass": "STANDARD"
+    }
+  ],
+  "Name": "wps-process-test-bucket",
+  "Prefix": "dir/",
+  "MaxKeys": 1000,
+  "EncodingType": "url",
+  "KeyCount": 3
+}
diff --git a/docs/examples/directory-listing.html b/docs/examples/directory-listing.html
new file mode 100644
index 000000000..39ae1dcfe
--- /dev/null
+++ b/docs/examples/directory-listing.html
@@ -0,0 +1,28 @@
+<!DOCTYPE html>
+<html>
+<body>
+<h1>Index of /dir/</h1>
+<hr>
+<table>
+<tr>
+<th>Content</th>
+<th>Modified</th>
+</tr>
+<tr>
+<td><a href="README">README</a></td>
+<td>2022-10-31 23:48</td>
+</tr>
+<tr>
+<td><a href="dir/">dir/</a></td>
+<td>2022-10-31 23:48</td>
+</tr>
+<tr>
+<td><a href="dir/file.txt">dir/file.txt</a></td>
+<td>2022-10-31 23:48</td>
+</tr>
+</table>
+<hr>
+</body>
+</html>
+ + diff --git a/docs/examples/directory-listing.json b/docs/examples/directory-listing.json new file mode 100644 index 000000000..6d9edf943 --- /dev/null +++ b/docs/examples/directory-listing.json @@ -0,0 +1,5 @@ +[ + "https://example.com/base/dir/README.md", + "https://example.com/base/dir/nested/image.png", + "https://example.com/base/dir/nested/data.csv" +] diff --git a/docs/source/appendix.rst b/docs/source/appendix.rst index fd38ff3dd..98030d1d3 100644 --- a/docs/source/appendix.rst +++ b/docs/source/appendix.rst @@ -19,11 +19,11 @@ Glossary queries in the context of :term:`EOImage` inputs. Application Package - General term that refers to *"what and how the :term:`Process` will execute"*. Application Packages provide - the core details about the execution methodology of the underlying operation the :term:`Process` provides, and - are therefore always contained within a :term:`Process` definition. This is more specifically represented - by a :term:`CWL` specification in the case of `Weaver` implementation, but could technically be defined by - another similar approach. See :ref:`Application Package` section for all relevant details. + General term that refers to *"what and how to execute"* the :term:`Process`. Application Packages provide the + core details about the execution methodology of the underlying operation that defines the :term:`Process`, and + are therefore always contained within a :ref:`Process Description `. This is more specifically + represented by a :term:`CWL` specification in the case of `Weaver` implementation, but could technically be + defined by another similar approach. See the :ref:`Application Package` section for all relevant details. AWS Amazon Web Services diff --git a/docs/source/cli.rst b/docs/source/cli.rst index ea8a19075..030d52cd2 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -161,7 +161,7 @@ A :term:`Workflow` of multiple :term:`Process` references (possibly of distinct .. note:: Content definitions for :term:`CWL` :ref:`application-package` and/or the literal :term:`Process` body can be submitted using either a local file reference, an URL, or a literal string formatted as :term:`JSON` - or :temr:`YAML`. With the :ref:`Python Interface `, the definition can also be provided + or :term:`YAML`. With the :ref:`Python Interface `, the definition can also be provided with a :class:`dict` directly. Below is a sample :term:`Process` deployment using a basic Python script wrapped in a :term:`Docker` image to ensure diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index a0eb894f7..27a974fe5 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -77,21 +77,21 @@ they are optional and which default value or operation is applied in each situat schema ``definitions`` section. The configuration setting is available to override this endpoint by another static URL location where the corresponding schemas can be found if desired. -.. versionadded:: 4.0.0 +.. versionadded:: 4.0 - | ``weaver.cwl_euid = `` [:class:`int`, *experimental*] | (default: ``None``, auto-resolved by :term:`CWL` with effective machine user) | | Define the effective machine user ID to be used for running the :term:`Application Package`. -.. versionadded:: 1.9.0 +.. versionadded:: 1.9 - | ``weaver.cwl_egid = `` [:class:`int`, *experimental*] | (default: ``None``, auto-resolved by :term:`CWL` with the group of the effective user) | | Define the effective machine group ID to be used for running the :term:`Application Package`. 
-.. versionadded:: 1.9.0
+.. versionadded:: 1.9

 - | ``weaver.wps = true|false`` [:class:`bool`-like]
   | (default: ``true``)
@@ -117,7 +117,7 @@ they are optional and which default value or operation is applied in each situat
   | The *path* variant **SHOULD** start with ``/`` for appropriate concatenation with ``weaver.url``, although this
     is not strictly enforced.

-- | ``weaver.wps_output_s3_bucket = ``
+- | ``weaver.wps_output_s3_bucket = ``
   | (default: ``None``)
   |
   | AWS S3 bucket where to store WPS outputs. Used in conjunction with ``weaver.wps_output_s3_region``.
@@ -126,22 +126,22 @@ they are optional and which default value or operation is applied in each situat
     to that location. If no bucket is specified, the outputs fall back to using the location specified
     by ``weaver.wps_output_dir``.

-.. versionadded:: 1.13.0
+.. versionadded:: 1.13
 .. seealso::
-    `Configuration of AWS S3 Buckets`_
+    :ref:`conf_s3_buckets`

-- | ``weaver.wps_output_s3_region = ``
-  | (default: ``None``)
+- | ``weaver.wps_output_s3_region = ``
+  | (default: ``None``, any :term:`S3` |region| amongst :data:`mypy_boto3_s3.literals.RegionName`)
   |
   | AWS S3 region to employ for storing WPS outputs. Used in conjunction with ``weaver.wps_output_s3_bucket``.
   |
-  | When this parameter is defined as well as ``weaver.wps_output_s3_bucket``, it is employed to define which `S3`
+  | When this parameter is defined as well as ``weaver.wps_output_s3_bucket``, it is employed to define which :term:`S3`
    bucket to write output files to. If not defined but ``weaver.wps_output_s3_bucket`` is specified, `Weaver` attempts to
-    retrieve the region from the profile defined in `AWS` configuration files or environment variables.
+    retrieve the region from the profile defined in :term:`AWS` configuration files or environment variables.

-.. versionadded:: 1.13.0
+.. versionadded:: 1.13
 .. seealso::
-    `Configuration of AWS S3 Buckets`_
+    :ref:`conf_s3_buckets`

 - | ``weaver.wps_output_dir = ``
   | (default: *path* ``/tmp``)
@@ -153,7 +153,7 @@ they are optional and which default value or operation is applied in each situat
     with the :term:`Job` ID.
   | This directory should be mapped to `Weaver`'s :term:`WPS` output URL to serve them externally as needed.

-.. versionchanged:: 4.3.0
+.. versionchanged:: 4.3
    The output directory could be nested under a *contextual directory* if requested during :term:`Job` submission.
    See :ref:`exec_output_location` and below ``weaver.wps_output_context`` parameter for more details.
@@ -166,7 +166,13 @@ they are optional and which default value or operation is applied in each situat
     When not defined, ``X-WPS-Output-Context`` header can still take effect, but omitting it will store results
     directly under ``weaver.wps_output_dir`` instead of default *context* location.

-.. versionadded:: 4.3.0
+.. versionadded:: 4.3
+
+.. versionchanged:: 4.27
+    Nesting of the *context* directory from ``X-WPS-Output-Context`` or ``weaver.wps_output_context`` will
+    also take effect when storing :term:`Job` results on :term:`S3`, when ``weaver.wps_output_s3_bucket``
+    and ``weaver.wps_output_s3_region`` are also defined. Previous versions applied the *context* directory
+    only for local storage using the other :term:`WPS` output settings.

 .. seealso::
    See :ref:`exec_output_location` for more details about this feature and implications of this setting.
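As an illustrative sketch only (the bucket, region, context and identifiers below are hypothetical placeholders,
not values defined by `Weaver`), the combination of these settings is expected to produce nested :term:`Job`
output references of the following form:

.. code-block:: python

    settings = {
        "weaver.wps_output_s3_bucket": "my-wps-results",   # hypothetical bucket
        "weaver.wps_output_s3_region": "ca-central-1",     # hypothetical region
    }
    context = "user-123"     # from 'X-WPS-Output-Context' header or 'weaver.wps_output_context'
    wps_uuid = "<WPS-UUID>"  # internal WPS execution identifier of the Job
    output_id = "output"     # output ID collected from the Process description

    # nested location: {context}/{WPS-UUID}/{output-id}/...
    s3_ref = (
        f"https://s3.{settings['weaver.wps_output_s3_region']}.amazonaws.com/"
        f"{settings['weaver.wps_output_s3_bucket']}/"
        f"{context}/{wps_uuid}/{output_id}/result.txt"
    )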
@@ -229,7 +235,7 @@ they are optional and which default value or operation is applied in each situat
     completion if an email was provided in the :ref:`Execute ` request body
     (see also: :ref:`Email Notification`).

-.. versionadded:: 4.15.0
+.. versionadded:: 4.15

 - | ``weaver.exec_sync_max_wait = `` [:class:`int`, seconds]
   | (default: ``20``)
@@ -239,7 +245,7 @@ they are optional and which default value or operation is applied in each situat
   | See :ref:`proc_exec_mode` for more details on the feature and how to employ it.
   | Ensure `Celery`_ worker is configured as specified below.

-.. versionadded:: 4.15.0
+.. versionadded:: 4.15

 - | ``weaver.quote_sync_max_wait = `` [:class:`int`, seconds]
   | (default: ``20``)
@@ -269,51 +275,62 @@ they are optional and which default value or operation is applied in each situat
 Configuration of AWS S3 Buckets
 =======================================

-Any `AWS` `S3` bucket accessed by `Weaver` needs to be accessible by the application, whether it is to fetch input
-files or to store output results. This can require from the server administrator to specify credentials by one of
-reference |aws-cred-support|_ to provide necessary role and/or permissions. See also reference |aws-config|_ which
-list various options that will be considered when working with `S3` buckets.
-
-.. |aws-cred-support| replace:: supported methodologies
-.. _aws-cred-support: `aws-credentials`_
+Any :term:`AWS` :term:`S3` |bucket| provided to `Weaver` needs to be accessible by the application, whether it is to
+fetch input files or to store output results. This can require the server administrator to specify credentials
+using one of the supported |aws-credentials|_ methodologies to provide the necessary role and/or permissions. See also
+reference |aws-config|_, which lists various options that will be considered when working with :term:`S3` buckets.

-Note that `Weaver` expects the |aws-config|_ to define a *default profile* from which the AWS
-client can infer which *region* it needs to connect to. The `S3` bucket to store files should be defined by
-``weaver.wps_output_s3_bucket`` setting as presented in the previous section.
+Note that `Weaver` expects the |aws-config|_ to define a *default profile* from which the :term:`AWS`
+client can infer which |region| it needs to connect to. The :term:`S3` bucket to store files should be
+defined with the ``weaver.wps_output_s3_bucket`` setting as presented in the previous section.

-The `S3` file references for input and output in `Weaver` are expected to be formatted as:
+The :term:`S3` file and directory references for input and output in `Weaver` are expected to be formatted as one of
+the methods described in |aws_s3_bucket_access|_ (more details about supported formats in :ref:`aws_s3_ref`).
+The easiest and most common approach is to use a reference using the ``s3://`` scheme as follows:

 .. code-block:: text

     s3://<bucket>/<file-key>

-This implicitly tells `Weaver` to employ the `S3` bucket it was configured with as well as the automatically retrieved
-region from the `AWS` server configuration.
+This implicitly tells `Weaver` to employ the specified :term:`S3` bucket it was configured with as well as the
+automatically retrieved location (using the region from the *default profile*) in the |aws-config|_ of the application.

-Alternatively, the reference can be provided with the more explicit `AWS` `S3` link such as:
+Alternatively, the reference can be provided as input more explicitly with any of the supported :ref:`aws_s3_ref`.
+For example, the :term:`AWS` :term:`S3` link could be specified as follows.

 .. code-block:: text

-    https://s3.[region-name.]amazonaws.com/<bucket>/<file-key>
+    https://s3.{Region}.amazonaws.com/{Bucket}/{file-key}

-In this situation, `Weaver` will parse it as equivalent to the prior shorthand reference format, as long as the `AWS`
-server configuration matches with all associated details from the HTTP URL variant. If this is not the case, `Weaver`
-will still attempt to fetch the file as *standard* HTTP reference, but read access should be granted accordingly to the
-corresponding bucket and file such that `Weaver` can access it.
+In this situation, `Weaver` will parse it as equivalent to the prior shorthand ``s3://`` reference format, by
+substituting any appropriate details retrieved from the |aws-config|_ as needed to form the above HTTP URL variant.
+For example, an alternative |region| from the default could be specified. After resolution, `Weaver`
+will still attempt to fetch the file as a *standard* HTTP reference by following the relevant |aws_s3_bucket_access|_.
+In each case, read access should be granted on the corresponding bucket, files and/or directories such
+that `Weaver` can stage them locally. For produced outputs, write access must be granted.

-Finally, in the above references, ``file-key`` is used as *anything after* the bucket name. In other words, this value
-can contain any amount of ``/`` separators and details. For example, `Weaver` will store process output results to `S3`
-using ``file-key`` as a combination of ``<WPS-UUID>/<output>.<ext>``, therefore forming the full job result file
-references as:
+In the above references, ``file-key`` is used as *anything after* the |bucket| name. In other words, this
+value can contain any amount of ``/`` separators and path elements. For example, if ``weaver.wps_output_s3_bucket`` is
+defined in the configuration, `Weaver` will store process output results to :term:`S3` using ``file-key`` as a
+combination of ``{WPS-UUID}/{output-id.ext}``, therefore forming the full :term:`Job` result file references as:

 .. code-block:: text

-    https://s3.<region-name>.amazonaws.com/<bucket>/<WPS-UUID>/<output>.<ext>
+    https://s3.{Region}.amazonaws.com/{Bucket}/{WPS-UUID}/{output-id.ext}

+    Region ::= weaver.wps_output_s3_region
+    Bucket ::= weaver.wps_output_s3_bucket

 .. note::
-    Value of ``WPS-UUID`` can be retrieved from `Weaver` internal job storage from :meth:`weaver.datatypes.Job.wps_id`.
-    It refers to the process execution identifier that accomplished the WPS request to run the `Application Package`.
+    Value of ``WPS-UUID`` can be retrieved from `Weaver` internal :term:`Job` storage
+    via :meth:`weaver.datatypes.Job.wps_id`. It refers to the :ref:`Process Execution ` identifier
+    that accomplished the :term:`WPS` request to run the :term:`Application Package`.
+
+.. note::
+    The value of ``file-key`` also applies for :ref:`cwl-dir` references.
+
+.. |region| replace:: *Region*
+.. |bucket| replace:: *Bucket*

 .. _conf_data_sources:
@@ -358,7 +375,7 @@ Please refer to `wps_processes.yml.example`_ for explicit format, keywords suppo
     the remote resource at that point in time, and will not update if the reference changes. On the other hand, their
     listing and description offering will not require the remote service to be available at all times until execution.

-.. versionadded:: 1.14.0
+.. versionadded:: 1.14

     When references are specified using ``providers`` section instead of ``processes``, the registration
     only saves the remote WPS provider endpoint to dynamically populate WPS processes on demand.
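To summarize, the supported :term:`S3` reference formats include variations such as the following
(a sketch only; every bucket, region, account and access point name is an illustrative placeholder):

.. code-block:: python

    s3_refs = [
        # shorthand, bucket and region resolved from the AWS configuration
        "s3://my-bucket/dir/file.txt",
        # path-style and virtual-hosted-style HTTP URL variants
        "https://s3.ca-central-1.amazonaws.com/my-bucket/dir/file.txt",
        "https://my-bucket.s3.ca-central-1.amazonaws.com/dir/sub-dir/",  # directory (trailing '/')
        # access point URL variant
        "https://my-access-point-123456789012.s3-accesspoint.ca-central-1.amazonaws.com/dir/file.txt",
    ]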
@@ -389,7 +406,7 @@ Please refer to `wps_processes.yml.example`_ for explicit format, keywords suppo
 Configuration of CWL Processes
 =======================================

-.. versionadded:: 4.19.0
+.. versionadded:: 4.19

 Although `Weaver` supports :ref:`Deployment ` and dynamic management of :term:`Process` definitions while the
 web application is running, it is sometimes more convenient for service providers to offer a set of predefined
 processes
@@ -442,7 +459,7 @@ in an identical definition as if it was :ref:`Deployed ` using :
 Configuration of Request Options
 =======================================

-.. versionadded:: 1.8.0
+.. versionadded:: 1.8

 It is possible to define :term:`Request Options` that consist of additional arguments that will be passed down
 to :func:`weaver.utils.request_extra`, which essentially calls a traditional request using the :mod:`requests`
 module, but
@@ -485,7 +502,7 @@ etc. on a per-request basis, leave other requests unaffected and generally more
 Configuration of File Vault
 =======================================

-.. versionadded:: 4.9.0
+.. versionadded:: 4.9

 Configuration of the :term:`Vault` is required in order to obtain access to its functionalities and to
 enable its :term:`API` endpoints. This feature is notably employed to push local files to a remote `Weaver`
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 1323e1e05..08b4a3917 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -76,7 +76,7 @@ Please refer to :ref:`Configuration` and :ref:`Running` sections for following s
 Database Migration
 =====================

-.. versionadded:: 4.3.0
+.. versionadded:: 4.3

 Previous versions of `Weaver` did not require any specific version of `MongoDB`_. Features were working using
 version as early as ``mongo==3.4`` if not even older.
diff --git a/docs/source/package.rst b/docs/source/package.rst
index 2a6239003..b112ca9d3 100644
--- a/docs/source/package.rst
+++ b/docs/source/package.rst
@@ -8,7 +8,7 @@ Application Package

 .. contents::
     :local:
-    :depth: 2
+    :depth: 3

 The :term:`Application Package` defines the internal script definition and configuration that will be executed by
 a :term:`Process`. This package is based on |CWL|_ (:term:`CWL`). Using the extensive |cwl-spec|_ as backbone for
@@ -156,7 +156,7 @@ whenever required for launching new :term:`Job` executions.
     according to their needs. To resolve such cases, the |update-token-req|_ request or an entire re-deployment
     of the :term:`Process` could be accomplished, whichever is more convenient for them.

-.. versionadded:: 4.5.0
+.. versionadded:: 4.5
     Specification and handling of the ``X-Auth-Docker`` header for providing an authentication token.

 CWL Workflow
@@ -358,39 +358,141 @@ the :ref:`Deploy ` request body with any of the following variat
 Inputs/Outputs Type
 -----------------------

-In the :term:`CWL` context, the ``type`` field indicates the type of :term:`I/O`. Available types are presented in the
-|cwl-io-type|_ portion of the specification.
+In the :term:`CWL` context, the ``type`` field indicates the type of :term:`I/O`.
+Available types are presented in the |cwl-io-type|_ portion of the :term:`CWL` specification.

+.. _warn-any:
 .. warning::
-    `Weaver` has two unsupported :term:`CWL` ``type``, namely ``Any`` and ``Directory``. This limitation is
-    **intentional** as :term:`WPS` does not offer equivalents. Furthermore, both of these types make the process
-    description too ambiguous. For instance, most processes expect remote file references, and providing a
For instance, most processes expect remote file references, and providing a - ``Directory`` doesn't indicate an explicit reference to which files to retrieve during stage-in operation of - a :term:`Job` execution. - + `Weaver` does not support :term:`CWL` ``type: Any``. This limitation is **intentional** in order to guarantee + proper resolution of :term:`CWL` types to their corresponding :term:`WPS` definitions. Furthermore, the ``Any`` + type would make the :term:`Process` description too ambiguous. + +Type Correspondance +~~~~~~~~~~~~~~~~~~~~ + +A summary of applicable types is presented below. + +Those :term:`CWL` types can be mapped to :term:`WPS` and/or :term:`OAS` contexts in order to obtain corresponding +:term:`I/O` definitions. However, not every type exists in each of those contexts. Therefore, some types will +necessarily be simplified or converted to their best corresponding match when exact mapping cannot be accomplished. +The simplification of types can happen when converting in any direction +(:term:`CWL` |nbsp| |<=>| |nbsp| :term:`WPS` |nbsp| |<=>| |nbsp| :term:`OAS`). +It all depends on which definitions that were provided are the more specific. For example, a :term:`WPS` ``dateTime`` +will be simplified to a generic :term:`CWL` ``string``, and into an :term:`OAS` ``string`` with ``format: "date-time"``. +In this example, it would be important to provide the :term:`WPS` or :term:`OAS` definitions if the *date-time* portion +was critical, since it could not be inferred only from :term:`CWL` ``string``. + +Further details regarding handling methods or important considerations for +specific types will be presented in :ref:`cwl-type` and :ref:`cwl-dir` sections. + ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| :term:`CWL` ``type`` | :term:`WPS` data type | :term:`OAS` type | Description | +| | and sub-type :sup:`(1)` | | | ++======================+=========================+========================+============================================+ +| ``Any`` | |na| | |na| | Not supported. See :ref:`note `. | ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| ``null`` | |na| | |na| | Cannot be used by itself. |br| | +| | | | Represents optional :term:`I/O` when | +| | | | combined with other types :sup:`(2)`. | ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| ``boolean`` | ``Literal`` |br| | ``boolean`` | Binary value. | +| | (``bool``, ``boolean``) | | | ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| ``int``, | ``Literal`` |br| | ``integer``, | Numeric whole value. |br| | +| ``long`` | (``int``, ``integer``, | ``number`` |br| | Unless when explicit conversion between | +| | ``long``, | (format: ``int32``, | contextes can accomplished, the generic | +| | ``positiveInteger``, | ``int64``) :sup:`(3)` | ``integer`` will be employed. | +| | ``nonNegativeInteger``) | | | ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| ``float``, | ``Literal`` |br| | ``number`` |br| | Numeric floating-point value. 
| +| ``double`` | (``float``, ``double``, | (format: ``float``, | By default, ``float`` is used unless more | +| | ``scale``, ``angle``) | ``double``) :sup:`(3)` | explicit context conversion can be | +| | | | accomplished :sup:`(4)`. | ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| ``string`` | ``Literal`` |br| | ``string`` |br| | Generic string. Default employed if | +| | (``string``, ``date``, | (format: ``date``, | nothing more specific is resolved. |br| | +| | ``time``, ``dateTime``, | ``time``, | | +| | ``anyURI``) | ``datetime``, | This type can be used to represent any | +| | | ``date-time``, | :ref:`File Reference ` | +| | | ``full-date``, | as plain URL string without resolution. | +| | | ``uri``, ``url``, | | +| | | etc.) :sup:`(5)` | | ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| |na| | ``BoundingBox`` | :term:`JSON` | Only partial support available. |br| | +| | | :sup:`(6)` | See :ref:`note `. | ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| ``File`` | ``Complex`` | :term:`JSON` | :ref:`File Reference ` | +| | | :sup:`(6)` | with Media-Type validation and staging | +| | | | according to the applicable scheme. | ++----------------------+-------------------------+------------------------+--------------------------------------------+ +| ``Directory`` | ``Complex`` | :term:`JSON` | :ref:`Directory Reference ` | +| | | :sup:`(6)` | handled as nested ``Files`` to stage. | ++----------------------+-------------------------+------------------------+--------------------------------------------+ + +| :sup:`(1)` Resolution method according to critical fields defined in :ref:`cwl-type`. +| :sup:`(2)` More details in :ref:`oas_basic_types` and :ref:`cwl-array-null-values` sections. +| :sup:`(3)` Number is used in combination with ``format`` to find best match between integer and floating point values. + If not provided, it defaults to ``float`` to handle both cases. +| :sup:`(4)` The ``float`` name is employed loosely to represent any *floating-point* value rather than + *single-precision* (16-bits). Its internal representation is *double-precision* (32-bits) given that the + implementation is in Python. +| :sup:`(5)` Because ``string`` is the default, any ``format`` and ``pattern`` can be specified. + More specific types with these items can help apply additional validation, although not strictly enforced. +| :sup:`(6)` Specific schema required as described in :ref:`oas_json_types`. + +.. _cwl-type: + +Type Resolution +~~~~~~~~~~~~~~~ In the :term:`WPS` context, three data types exist, namely ``Literal``, ``BoundingBox`` and ``Complex`` data. .. _bbox-note: .. note:: As of the current version of `Weaver`, :term:`WPS` data type ``BoundingBox`` is not completely supported. - The schema definition exists in :term:`WPS` context but is not handled by any :term:`CWL` type conversion yet. - This feature is reflected by issue `#51 `_. + The schema definition exists in :term:`WPS` and :term:`OAS` contexts but is not handled by any :term:`CWL` type + conversion yet. This feature is reflected by issue `#51 `_. It is possible to use a ``Literal`` data of type ``string`` corresponding to :term:`WKT` [#]_, [#]_ in the meantime. .. [#] |wkt-example|_ .. 
.. [#] |wkt-format|_

-As presented in the example of the previous section, :term:`I/O` in the :term:`WPS` context does not require an explicit
-indication of the type from one of ``Literal``, ``BoundingBox`` and ``Complex`` data. Instead, :term:`WPS` type is
+As presented in previous examples, :term:`I/O` in the :term:`WPS` context does not require an explicit indication of
+which data type from one of ``Literal``, ``BoundingBox`` and ``Complex`` to apply. Instead, :term:`WPS` type can be
 inferred using the matched API schema of the I/O. For instance, ``Complex`` I/O (e.g.: file reference) requires the
 ``formats`` field to distinguish it from a plain ``string``. Therefore, specifying either ``format`` in :term:`CWL` or
 ``formats`` in :term:`WPS` immediately provides all needed information for `Weaver` to understand that this I/O is
-expected to be a file reference. A combination of ``bbox`` and ``crs`` fields would otherwise indicate a ``BoundingBox``
-I/O (see :ref:`note <bbox-note>`). If none of the two previous schemas are matched, the I/O type resolution falls back
-to ``Literal`` data of ``string`` type. To employ another primitive data type such as ``Integer``, an explicit
-indication needs to be provided as follows.
+expected to be a file reference.
+
+.. code-block:: json
+    :caption: WPS Complex Data Type
+    :linenos:
+
+    {
+      "id": "input",
+      "formats": [
+        {"mediaType": "application/json", "default": true}
+      ]
+    }
+
+A combination of ``supportedCRS`` objects providing ``crs`` references would
+otherwise indicate a ``BoundingBox`` :term:`I/O` (see :ref:`note <bbox-note>`).
+
+.. code-block:: json
+    :caption: WPS BoundingBox Data Type
+    :linenos:
+
+    {
+      "id": "input",
+      "supportedCRS": [
+        {"crs": "http://www.opengis.net/def/crs/OGC/1.3/CRS84", "default": true}
+      ]
+    }
+
+If neither of the two previous schemas is matched, the :term:`I/O` type resolution falls back
+to ``Literal`` data of ``string`` type. To employ another primitive data type such as ``Integer``,
+an explicit indication needs to be provided as follows.

 .. code-block:: json
     :caption: WPS Literal Data Type
@@ -404,16 +506,79 @@ indication needs to be provided as follows.
     }

 Obviously, the equivalent :term:`CWL` definition is simpler in this case (i.e.: only ``type: int`` is required).
-It is therefore *recommended* to take advantage of `Weaver`'s merging strategy in this case by providing only the
-details through the :term:`CWL` definition and have the corresponding :term:`WPS` I/O type automatically deduced by
+It is therefore *recommended* to take advantage of `Weaver`'s merging strategy during
+:ref:`Process Deployment ` in this case by providing only the details through
+the :term:`CWL` definition and have the corresponding :term:`WPS` I/O type automatically deduced by
 the generated process. If desired, ``literalDataDomains`` can still be explicitly provided as above to ensure that
 it gets parsed as intended type.

+.. versionadded:: 4.16
+
 With more recent versions of `Weaver`, it is also possible to employ :term:`OpenAPI` schema definitions provided
 in the :term:`WPS` I/O to specify the explicit structure that applies to ``Literal``, ``BoundingBox`` and ``Complex``
 data types. When :term:`OpenAPI` schema are detected, they are also considered in the merging strategy along with
 other specifications provided in :term:`CWL` and :term:`WPS` contexts. More details about the :term:`OAS` context are
-provided in :ref:`OpenAPI Schema` section.
+provided in the :ref:`oas_io_schema` section.

+.. _dir_ref_type:
.. _cwl-dir:

Directory Type
~~~~~~~~~~~~~~

.. versionchanged:: 4.27
    Support of :term:`CWL` ``type: Directory`` added to `Weaver`.

In order to map a ``Directory`` to the underlying :term:`WPS` :term:`Process` that does not natively offer this
type of reference, a ``Complex`` "*pseudo-file*" using Media-Type ``application/directory`` is employed. For further
validation that a ``Directory`` is properly parsed by `Weaver`, provided URL references must also end with a trailing
slash (``/``) character.

.. warning::
    Note that, when using the ``Directory`` type, very little format or content validation can be accomplished for
    individual files contained in that directory. The contents must therefore match the definitions expected by the
    application receiving it. No explicit validation is accomplished by `Weaver` to ensure that expected contents
    are available.

When a ``Directory`` type is specified in the :term:`Process` definition, and
a :ref:`File Reference <file_ref_types>` is provided during :ref:`Execution `, the reference
pointed to as a ``Directory`` must provide a listing of files. Those files can either be relative to the ``Directory``
or other absolute :ref:`File Reference <file_ref_types>` locations. The applicable scheme to stage those files will
be applied as needed based on resolved references. It is therefore possible to mix URL schemes between the listed
references. For example, a ``Directory`` listing as :term:`JSON` obtained from a ``https://`` endpoint could provide
multiple ``File`` locations from ``s3://`` buckets to stage for :ref:`Process Execution `.

The following ``Directory`` listing formats are supported.

.. table::
    :class: code-table
    :align: center
    :widths: 70,30

    +------------------------------------------------------------+------------------------------------------------------+
    | Listing Format                                             | Description                                          |
    +============================================================+======================================================+
    | .. literalinclude:: ../examples/directory-listing.html    | A file index where each reference to be staged       |
    |     :caption: HTML File Index                              | should be contained in a ``<a>`` tag.                |
    |     :language: html                                        |                                                      |
    |                                                            | The structure can be contained in a ``<table>``,     |
    |                                                            | an HTML list (``<ol>``, ``<ul>``), or any other      |
    |                                                            | structure where ``<a>`` references can be located.   |
    +------------------------------------------------------------+------------------------------------------------------+
    | .. literalinclude:: ../examples/directory-listing.json    | A :term:`JSON` array of plain file reference         |
    |     :caption: JSON File Listing                            | strings to be staged.                                |
    |     :language: json                                        |                                                      |
    +------------------------------------------------------------+------------------------------------------------------+
    | .. literalinclude:: ../examples/directory-listing-s3.json | An :term:`S3` bucket contents listing of the         |
    |     :caption: S3 Bucket Listing                            | object keys to be staged, as returned by             |
    |     :language: json                                        | the :term:`AWS` :term:`S3` client.                   |
    +------------------------------------------------------------+------------------------------------------------------+
" if include_table_format else "") + for href in dir_files + ] + dir_refs = "\n" + "\n".join(ref_files) + dir_base = directory_path if directory_path.startswith("/") else f"/{directory_path}" + dir_base = dir_base if dir_base.endswith("/") else f"{dir_base}/" + dir_title = f"

Index of {dir_base}

" if include_dir_heading else "" + if include_table_format: + dir_pre = "
" if include_table_format else "") + + ("
" if include_table_format and include_code_format else "") +
+        f"{href}\t\t\t" +
+        ("
" if include_table_format and include_code_format else "") + + ("
" if include_table_format and include_modified_date else "") + + ( + f"{str(datetime.fromtimestamp(os.stat(os.path.join(local_directory, href)).st_mtime)).rsplit(':', 1)[0]}" + if include_modified_date else "" + ) + + ("
" + dir_post = "
" + if include_modified_date: + dir_mid = "ContentModified" + else: + dir_mid = "Content" + else: + dir_pre = "
" if include_code_format else ""
+        dir_mid = ""
+        dir_post = "
" if include_code_format else "" + dir_sep = "
" if include_separators else "" + dir_html = inspect.cleandoc(f""" + + + {dir_title} + {dir_sep if dir_title else ""} + {dir_pre} + {dir_mid} + {dir_refs} + {dir_post} + {dir_sep} + + + """) + return dir_html + + +def mocked_file_server(directory, # type: str + url, # type: str + settings, # type: SettingsType + *, # force named keyword arguments after + mock_get=True, # type: bool + mock_head=True, # type: bool + mock_browse_index=False, # type: bool + headers_override=None, # type: Optional[AnyHeadersContainer] + requests_mock=None, # type: Optional[responses.RequestsMock] + **directory_listing_kwargs, # type: bool + ): # type: (...) -> responses.RequestsMock """ Mocks a file server endpoint hosting some local directory files. @@ -809,13 +909,15 @@ def mocked_file_server(directory, # type: str refer to distinct endpoints that will not cause conflicting request patching configurations. .. seealso:: - For WPS output directory/endpoint, consider using :func:`mocked_wps_output` instead. + - For WPS output directory/endpoint, consider using :func:`mocked_wps_output` instead. + - For applicable directory listing arguments, see :func:`mocked_dir_listing`. :param directory: Path of the directory to mock as file server resources. :param url: HTTP URL to mock as file server endpoint. :param settings: Application settings to retrieve requests options. :param mock_get: Whether to mock HTTP GET methods received on WPS output URL. :param mock_head: Whether to mock HTTP HEAD methods received on WPS output URL. + :param mock_browse_index: Whether to mock an ``index.html`` with file listing when requests point to a directory. :param headers_override: Override specified headers in produced response. :param requests_mock: Previously defined request mock instance to extend with new definitions. :return: Mocked response that would normally be obtained by a file server hosting WPS output directory. @@ -828,7 +930,7 @@ def mocked_file_server(directory, # type: str ) def request_callback(request): - # type: (AnyRequestType) -> Tuple[int, Dict[str, str], str] + # type: (AnyRequestType) -> Tuple[int, HeadersType, BodyType] """ Operation called when the file-server URL is matched against incoming requests that have been mocked. 
""" @@ -842,10 +944,10 @@ def request_callback(request): mime_type, encoding = mimetypes.guess_type(file_path) headers.update({ "Server": "mocked_wps_output", - "Date": str(datetime.datetime.utcnow()), + "Date": str(datetime.utcnow()), "Content-Type": mime_type or ContentType.TEXT_PLAIN, "Content-Encoding": encoding or "", - "Last-Modified": str(datetime.datetime.fromtimestamp(os.stat(file_path).st_mtime)) + "Last-Modified": str(datetime.fromtimestamp(os.stat(file_path).st_mtime)) }) if request.method == "HEAD": headers.pop("Content-Length", None) @@ -860,18 +962,45 @@ def request_callback(request): return 405, {}, "" return 404, {}, "" + def browse_callback(request): + # type: (AnyRequestType) -> Tuple[int, HeadersType, BodyType] + if mock_head and request.method == "HEAD": + return 200, {"Content-Type": ContentType.TEXT_HTML}, "" + elif mock_get and request.method == "GET": + dir_path = request.url.replace(url, directory, 1).split(directory, 1)[-1] + if dir_path.endswith("index.html"): + dir_path = dir_path.rsplit("/", 1)[0] + dir_list = os.path.join(directory, dir_path.lstrip("/")) + dir_html = mocked_dir_listing(dir_list, dir_path, **directory_listing_kwargs) + headers = {"Content-Type": ContentType.TEXT_HTML, "Content-Length": str(len(dir_html))} + return 200, headers, dir_html + return 405, {}, "" + mock_req = requests_mock or responses.RequestsMock(assert_all_requests_are_fired=False) - any_file_url = re.compile(fr"{url}/[\w\-_/.]+") # match any sub-directory/file structure + if mock_browse_index: + # match any sub-directory/file structure, except ones ending with dir/index + any_file_url = re.compile(fr"^{url}/[\w\-_/.]+(? Union[responses.RequestsMock, MockPatch] @@ -891,13 +1020,21 @@ def mocked_wps_output(settings, # type: SettingsType :param settings: Application settings to retrieve WPS output configuration. :param mock_get: Whether to mock HTTP GET methods received on WPS output URL. :param mock_head: Whether to mock HTTP HEAD methods received on WPS output URL. + :param mock_browse_index: Whether to mock an ``index.html`` with file listing when requests point to a directory. :param headers_override: Override specified headers in produced response. :param requests_mock: Previously defined request mock instance to extend with new definitions. :return: Mocked response that would normally be obtained by a file server hosting WPS output directory. """ wps_url = get_wps_output_url(settings) wps_dir = get_wps_output_dir(settings) - return mocked_file_server(wps_dir, wps_url, settings, mock_get, mock_head, headers_override, requests_mock) + return mocked_file_server( + wps_dir, wps_url, settings, + mock_get=mock_get, + mock_head=mock_head, + mock_browse_index=mock_browse_index, + headers_override=headers_override, + requests_mock=requests_mock, + ) def mocked_execute_celery(celery_task="weaver.processes.execution.execute_process", func_execute_task=None): @@ -1020,65 +1157,150 @@ def mocked_process_package(): ) -def mocked_aws_credentials(test_func): - # type: (Callable[[...], Any]) -> Callable +def mocked_aws_config(_func=null, # type: Optional[Callable[[..., *Any], Any]] + *, # force named keyword arguments after + default_region=MOCK_AWS_REGION, # type: RegionName + ): # type: (...) -> Callable[[..., *Any], Any] """ - Mocked AWS Credentials for :py:mod:`moto`. + Mocked AWS configuration and credentials for :mod:`moto` and :mod:`boto3`. When using this fixture, ensures that if other mocks fail, at least credentials should be invalid to avoid mistakenly overriding real bucket files. 
+ + .. seealso:: + - :func:`mocked_aws_s3` + - :func:`mocked_aws_s3_bucket_test_file` """ - def wrapped(*args, **kwargs): - with mock.patch.dict(os.environ, { - "AWS_ACCESS_KEY_ID": "testing", - "AWS_SECRET_ACCESS_KEY": "testing", - "AWS_SECURITY_TOKEN": "testing", - "AWS_SESSION_TOKEN": "testing" - }): - return test_func(*args, **kwargs) - return wrapped + from weaver.utils import validate_s3 as real_validate_s3 + + def mock_validate_s3(*, region, bucket): + # type: (Any, str, str) -> None + if region == MOCK_AWS_REGION: + region = AWS_S3_REGIONS[0] # any valid for temporarily passing check + real_validate_s3(region=region, bucket=bucket) + + def decorator(test_func): + # type: (Callable[[..., *Any], Any]) -> Callable[[..., *Any], Any] + @functools.wraps(test_func) + def wrapped(*args, **kwargs): + # type: (*Any, **Any) -> Any + + with contextlib.ExitStack() as stack: + stack.enter_context(mock.patch.dict(os.environ, { + "AWS_DEFAULT_REGION": default_region, + "AWS_ACCESS_KEY_ID": "testing", + "AWS_SECRET_ACCESS_KEY": "testing", + "AWS_SECURITY_TOKEN": "testing", + "AWS_SESSION_TOKEN": "testing" + })) + stack.enter_context(mock.patch("weaver.utils.validate_s3", side_effect=mock_validate_s3)) + stack.enter_context(mock.patch("weaver.wps.utils.validate_s3", side_effect=mock_validate_s3)) + return test_func(*args, **kwargs) + return wrapped + if _func is not null and callable(_func): + return decorator(_func) + return decorator def mocked_aws_s3(test_func): - # type: (Callable[[...], Any]) -> Callable + # type: (Callable[[..., *Any], Any]) -> Callable[[..., *Any], Any] """ - Mocked AWS S3 bucket for :py:mod:`boto3` over mocked AWS credentials using :py:mod:`moto`. + Mocked AWS S3 for :mod:`boto3` over mocked AWS credentials using :mod:`moto`. - .. warning:: - Make sure to employ the same :py:data:`MOCK_AWS_REGION` otherwise mock will not work and S3 operations will - attempt writing to real bucket. + .. seealso:: + - :func:`mocked_aws_config` + - :func:`mocked_aws_s3_bucket_test_file` """ + @functools.wraps(test_func) def wrapped(*args, **kwargs): + # type: (*Any, **Any) -> Any with moto.mock_s3(): return test_func(*args, **kwargs) return wrapped -def mocked_aws_s3_bucket_test_file(bucket_name, file_name, file_content="mock"): - # type: (str, str, str) -> str +@overload +def setup_aws_s3_bucket(__func=null, *, region=MOCK_AWS_REGION, bucket="", client=True): + # type: (Any, Any, BucketLocationConstraintType, str, Literal[True]) -> S3Client + ... + + +@overload +def setup_aws_s3_bucket(__func=null, *, region=MOCK_AWS_REGION, bucket="", client=False): + # type: (Any, Any, BucketLocationConstraintType, str, Literal[False]) -> Callable[[...], Any] + ... + + +def setup_aws_s3_bucket(__func=null, # type: Optional[Callable[[..., *Any], Any]] + *, # force named keyword arguments after + region=MOCK_AWS_REGION, # type: BucketLocationConstraintType + bucket="", # type: str + client=False, # type: bool + ): # type: (...) 
-> Union[Callable[[...], Any], S3Client]
+    import boto3
+    from botocore.exceptions import ClientError
+
+    def setup():
+        s3 = boto3.client("s3", region_name=region)  # type: S3Client
+        s3_location = {"LocationConstraint": region}  # type: CreateBucketConfigurationTypeDef
+        try:
+            s3.create_bucket(Bucket=bucket, CreateBucketConfiguration=s3_location)
+        except ClientError as exc:
+            if exc.response["Error"]["Code"] not in ["BucketAlreadyExists", "BucketAlreadyOwnedByYou"]:
+                raise
+        return s3
+
+    if client:
+        return setup()
+
+    def decorate(func):
+        # type: (Callable[[...], Any]) -> Callable[[...], Any]
+        @functools.wraps(func)
+        def wrapped(*args, **kwargs):
+            setup()
+            return func(*args, **kwargs)
+        return wrapped
+
+    if callable(__func):  # without () call
+        return decorate(__func)
+    return decorate  # with parameters
+
+
-def mocked_aws_s3_bucket_test_file(bucket_name, file_name, file_content="mock"):
-    # type: (str, str, str) -> str
+def mocked_aws_s3_bucket_test_file(bucket_name,                 # type: str
+                                   file_name,                   # type: str
+                                   file_content="mock",         # type: str
+                                   s3_region=MOCK_AWS_REGION,   # type: BucketLocationConstraintType
+                                   s3_scheme="s3",              # type: S3Scheme
+                                   ):                           # type: (...) -> str
     """
-    Mock a test file as if retrieved from an AWS-S3 bucket reference.
+    Mock a test file as if retrieved from an :term:`AWS` :term:`S3` bucket reference.

     Generates a test file reference from dummy data that will be uploaded to the specified S3 bucket name using the
     provided file key. The S3 interface employed is completely dependent on the wrapping context. For instance,
     calling this function with the :func:`mocked_aws_s3` decorator will effectively employ the mocked S3 interface.

+    .. note::
+        Any applicable :term:`AWS` :term:`S3` mock should have been applied before calling this function.
+        This function does not itself configure the mocking mechanism.
+
+    .. warning::
+        Make sure to employ the same :paramref:`s3_region` across calls when referring to the
+        same :paramref:`bucket_name`. Otherwise, the mock could fail and S3 operations could be attempted
+        towards real S3 bucket locations.
+
    ..
seealso:: + - :func:`mocked_aws_config` - :func:`mocked_aws_s3` """ - import boto3 - if not MOCK_AWS_REGION: - s3 = boto3.client("s3") - s3.create_bucket(Bucket=bucket_name) - else: - s3 = boto3.client("s3", region_name=MOCK_AWS_REGION) - s3_location = {"LocationConstraint": MOCK_AWS_REGION} - s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=s3_location) + s3 = setup_aws_s3_bucket(region=s3_region, bucket=bucket_name, client=True) with tempfile.NamedTemporaryFile(mode="w") as tmp_file: tmp_file.write(file_content) tmp_file.flush() s3.upload_file(Bucket=bucket_name, Filename=tmp_file.name, Key=file_name) - return f"s3://{bucket_name}/{file_name}" + s3_prefix = "" + if s3_scheme == "https": + s3_prefix = f"s3.{s3_region}.amazonaws.com/" + return f"{s3_scheme}://{s3_prefix}{bucket_name}/{file_name}" def mocked_http_file(test_func): @@ -1093,12 +1315,14 @@ def mocked_http_file(test_func): - :func:`mocked_reference_test_file` """ def mocked_file_request(file_reference, file_outdir, **kwargs): + # type: (str, str, **Any) -> str if file_reference and file_reference.startswith(MOCK_HTTP_REF): file_reference = file_reference.replace(MOCK_HTTP_REF, "") file_path = fetch_file(file_reference, file_outdir, **kwargs) return file_path def wrapped(*args, **kwargs): + # type: (*Any, **Any) -> Any with mock.patch("weaver.processes.wps_package.fetch_file", side_effect=mocked_file_request): return test_func(*args, **kwargs) return wrapped @@ -1130,3 +1354,64 @@ def mocked_reference_test_file(file_name_or_path, href_type, file_content="mock" path = f"{href_prefix}{path}" href_type = None if "://" in path else href_type return f"{href_type}://{path}" if href_type else path + + +def setup_test_file_hierarchy(test_paths, test_root_dir, test_data="data"): + # type: (Iterable[Union[Path, Tuple[Path, Optional[Path]]]], Path, str) -> List[Path] + """ + Creates all requested files and directories, either directly or as system links to another file or directory. + + All files and directories should be relative paths, which will be created under :paramref:`test_root_dir`. + + Directory creations are requested by terminating the path with a ``/``. Similarly, directory links should have + a ``/`` character at the end. Mismatching file/directory link source and corresponding target is not explicitly + prohibited, just could yield unexpected outcomes. + + Any nested file or directory definition that contains any missing parent directories on the file system will + have their complete parent directories hierarchy created immediately. If alternate link definitions are needed, + they should be specified beforehand. Files and directories paths are resolved and created in the specified order. + + When any link is specified (using a tuple of source to target paths), the target location that it refers to must + exist before the file or directory link creation is attempted. If the target is ``None`` or empty, it will be + considered a plain file or directory reference as if provided directly instead of the tuple form. + + :param test_paths: Paths to files, directories, or links to a file or directory to be generated. + :param test_root_dir: Base directory under which to create all requested elements. + :param test_data: Data written to created files when applicable. + :returns: Exhaustive listing of files and directories hierarchy under the root directory. 
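    A minimal usage sketch (the paths, root directory and contents are only illustrative):

    .. code-block:: python

        listing = setup_test_file_hierarchy(
            [
                "dir/file.txt",                # plain file ('dir/' created automatically)
                "dir/sub/",                    # plain directory
                ("link.txt", "dir/file.txt"),  # file link to an existing target
                ("link-dir/", "dir/sub/"),     # directory link to an existing target
            ],
            "/tmp/test-root",
        )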
+ """ + for test_item in test_paths: + file_path, link_target = test_item if isinstance(test_item, tuple) else (test_item, None) + dir_path, file_path = os.path.split(file_path) + # resolve paths + if link_target: + link_target = os.path.join(test_root_dir, link_target) + if dir_path: + dir_path = os.path.join(test_root_dir, dir_path) + if file_path: + file_path = os.path.join(dir_path, file_path) + else: + file_path = os.path.join(test_root_dir, file_path) + # create file reference + if file_path: + if dir_path: + os.makedirs(dir_path, exist_ok=True) + if link_target: + parent_dir = os.path.split(file_path)[0] + os.makedirs(parent_dir, exist_ok=True) + os.symlink(link_target, file_path) + else: + with open(file_path, mode="w", encoding="utf-8") as f: + f.write(test_data) + # create dir reference + elif link_target and dir_path: + parent_dir = os.path.split(dir_path)[0] + os.makedirs(parent_dir, exist_ok=True) + os.symlink(link_target.rstrip("/"), dir_path, target_is_directory=True) + elif dir_path: + os.makedirs(dir_path, exist_ok=True) + listing = [] + for path, dirs, files in os.walk(test_root_dir): + listing.extend((os.path.join(path, dir_path) + "/" for dir_path in dirs)) + listing.extend((os.path.join(path, file_name) for file_name in files)) + return sorted(listing) diff --git a/tests/wps_restapi/test_jobs.py b/tests/wps_restapi/test_jobs.py index fd3d56094..e2f82a087 100644 --- a/tests/wps_restapi/test_jobs.py +++ b/tests/wps_restapi/test_jobs.py @@ -1291,7 +1291,7 @@ def test_get_job_invalid_uuid(self): """ Test handling of invalid UUID reference to search job. - .. versionchanged:: 4.6.0 + .. versionchanged:: 4.6 Jobs must explicitly use an :class:`uuid.UUID` object to search. Any value provided in path parameter that does not correspond to such definition raises a bad request. """ diff --git a/tests/wps_restapi/test_processes.py b/tests/wps_restapi/test_processes.py index eb57464ad..439dcdf67 100644 --- a/tests/wps_restapi/test_processes.py +++ b/tests/wps_restapi/test_processes.py @@ -1956,7 +1956,7 @@ def test_execute_process_missing_required_params(self): """ Validate execution against missing parameters. - .. versionchanged:: 4.15.0 + .. versionchanged:: 4.15 Multiple parameters are not **required** anymore because the alternative with ``Prefer`` header for :term:`OGC API - Processes` compliance is permitted. When the values are specified through, they should still be validated to provide relevant error details to the user. 
diff --git a/weaver/cli.py b/weaver/cli.py index 603ba4ad0..bfe189ed5 100644 --- a/weaver/cli.py +++ b/weaver/cli.py @@ -12,6 +12,7 @@ from urllib.parse import urlparse import yaml +from pyramid.httpexceptions import HTTPNotImplemented from requests.auth import AuthBase, HTTPBasicAuth from requests.structures import CaseInsensitiveDict from webob.headers import ResponseHeaders @@ -34,13 +35,15 @@ from weaver.sort import Sort, SortMethods from weaver.status import JOB_STATUS_CATEGORIES, Status, StatusCategory, map_status from weaver.utils import ( + OutputMethod, copy_doc, - fetch_file, + fetch_reference, fully_qualified_name, get_any_id, get_any_value, get_file_headers, get_header, + get_sane_name, import_target, load_file, null, @@ -903,7 +906,7 @@ def _parse_inputs(inputs): return OperationResult(False, f"Failed inputs parsing with error: [{exc!s}].", inputs) return values - def _update_files(self, inputs, url=None): + def _upload_files(self, inputs, url=None): # type: (ExecutionInputsMap, Optional[str]) -> Union[Tuple[ExecutionInputsMap, HeadersType], OperationResult] """ Replaces local file paths by references uploaded to the :term:`Vault`. @@ -933,6 +936,12 @@ def _update_files(self, inputs, url=None): continue if href.startswith("file://"): href = href[7:] + if os.path.isdir(href): + return OperationResult( + message=f"Cannot upload local directory to vault: [{file}]. Aborting operation.", + title="Directory upload not implemented.", + code=HTTPNotImplemented.code, + ) if not os.path.isfile(href): # Case for remote files (ex. http links) if "://" not in href: LOGGER.warning( @@ -1041,7 +1050,7 @@ def execute(self, values = self._parse_inputs(inputs) if isinstance(values, OperationResult): return values - result = self._update_files(values, url=base) + result = self._upload_files(values, url=base) if isinstance(result, OperationResult): return result values, auth_headers = result @@ -1141,6 +1150,12 @@ def upload(self, scheme = file_path.split("://", 1)[0] return OperationResult(False, "Scheme not supported for local file reference.", {"file_scheme": scheme}) file_path = os.path.abspath(os.path.expanduser(file_path)) + if os.path.isdir(file_path): + return OperationResult( + message=f"Cannot upload local directory to vault: [{file_path}]. 
Aborting operation.",
+                title="Directory upload not implemented.",
+                code=HTTPNotImplemented.code,
+            )
         if not os.path.isfile(file_path):
             return OperationResult(False, "Resolved local file reference does not exist.", {"file_path": file_path})
         LOGGER.debug("Processing file for vault upload: [%s]", file_path)
@@ -1438,18 +1453,24 @@ def _download_references(self, outputs, out_links, out_dir, job_id, auth=None):
         # download outputs from body content
         LOGGER.debug("%s outputs in results content.", "Processing" if len(outputs) else "No")
         for output, value in outputs.items():
+            # nest each output under its own directory to avoid conflicting names
+            # in case of many files across outputs that do not guarantee uniqueness
+            out_id = get_sane_name(output, min_len=1, assert_invalid=False)
+            out_path = os.path.join(out_dir, out_id)
             is_list = True
             if not isinstance(value, list):
                 value = [value]
                 is_list = False
             for i, item in enumerate(value):
                 if "href" in item:
-                    file_path = fetch_file(item["href"], out_dir, link=False, auth=auth)
+                    os.makedirs(out_path, exist_ok=True)
+                    ref_path = fetch_reference(item["href"], out_path, auth=auth,
+                                               out_method=OutputMethod.COPY, out_listing=False)
                     if is_list:
-                        outputs[output][i]["path"] = file_path
+                        outputs[output][i]["path"] = ref_path
                         outputs[output][i]["source"] = "body"
                     else:
-                        outputs[output]["path"] = file_path
+                        outputs[output]["path"] = ref_path
                         outputs[output]["source"] = "body"

         # download links from headers
@@ -1462,8 +1483,9 @@ def _download_references(self, outputs, out_links, out_dir, job_id, auth=None):
             rel = params["rel"][0].split(".")
             output = rel[0]
             is_array = len(rel) > 1 and str.isnumeric(rel[1])
-            file_path = fetch_file(href, out_dir, link=False, auth=auth)
-            value = {"href": href, "type": ctype, "path": file_path, "source": "link"}
+            ref_path = fetch_reference(href, out_dir, auth=auth,
+                                       out_method=OutputMethod.COPY, out_listing=False)
+            value = {"href": href, "type": ctype, "path": ref_path, "source": "link"}
             if output in outputs:
                 if isinstance(outputs[output], dict):  # in case 'rel=". 1``) can be specified using semicolon (``;``) separated values
-    after a single input ID. Note that this is not the same as an single-value array-like input, which should
+    after a single input ID. Note that this is not the same as a single-value array-like input, which should
     use comma (``,``) separated values instead.
     The type of an element-wise item of this input can also be provided (i.e.: ``multiInput:int=1;2;3``).
     Alternatively, the same input ID can be repeated over many ``-I`` options each providing an element of the
diff --git a/weaver/exceptions.py b/weaver/exceptions.py
index 07019603d..e022c5511 100644
--- a/weaver/exceptions.py
+++ b/weaver/exceptions.py
@@ -37,7 +37,7 @@ if TYPE_CHECKING:
     from typing import Any, Callable, Type, Union

-    from weaver.typedefs import AnyCallableWrapped, ReturnValue
+    from weaver.typedefs import AnyCallableWrapped, Return

 class WeaverException(Exception):
@@ -102,7 +102,7 @@ class ServiceNotAccessible(HTTPForbidden, OWSAccessForbidden, ServiceException):
 class ServiceNotFound(HTTPNotFound, OWSNotFound, ServiceException):
     """
-    Error related to non existent service definition.
+    Error related to non-existent service definition.

     Error indicating that an OWS service could not be read from the
     storage backend by an instance of :class:`weaver.store.ServiceStore`.
@@ -136,7 +136,7 @@ class ProcessNotAccessible(HTTPForbidden, OWSAccessForbidden, ProcessException): class ProcessNotFound(HTTPNotFound, OWSNotFound, ProcessException): """ - Error related to a non existent process definition. + Error related to a non-existent process definition. Error indicating that a local WPS process could not be read from the storage backend by an instance of :class:`weaver.store.ProcessStore`. @@ -170,7 +170,7 @@ class JobException(WeaverException): class JobNotFound(HTTPNotFound, OWSNotFound, JobException): """ - Error related to a non existing job definition. + Error related to a non-existent job definition. Error indicating that a job could not be read from the storage backend by an instance of :class:`weaver.store.JobStore`. @@ -264,7 +264,7 @@ class PackageExecutionError(HTTPInternalServerError, OWSNoApplicableCode, Packag class PackageNotFound(HTTPNotFound, OWSNotFound, PackageException): """ - Error related to a non existent package definition. + Error related to a non-existent package definition. Error indicating that an instance of :class:`weaver.processes.wps_package.WpsPackage` could not properly retrieve the package definition using provided references. @@ -273,7 +273,7 @@ class PackageNotFound(HTTPNotFound, OWSNotFound, PackageException): class PayloadNotFound(HTTPNotFound, OWSNotFound, PackageException): """ - Error related to a non existent deployment payload definition. + Error related to a non-existent deployment payload definition. Error indicating that an instance of :class:`weaver.processes.wps_package.WpsPackage` could not properly retrieve the package deploy payload using provided references. @@ -289,7 +289,7 @@ class QuoteException(WeaverException): class QuoteNotFound(HTTPNotFound, OWSNotFound, QuoteException): """ - Error related to a non existent quote definition. + Error related to a non-existent quote definition. Error indicating that a quote could not be read from the storage backend by an instance of :class:`weaver.store.QuoteStore`. @@ -329,7 +329,7 @@ class BillException(WeaverException): class BillNotFound(HTTPNotFound, OWSNotFound, BillException): """ - Error related to a non existent bill definition. + Error related to a non-existent bill definition. Error indicating that a bill could not be read from the storage backend by an instance of :class:`weaver.store.BillStore`. @@ -363,7 +363,7 @@ class VaultFileException(WeaverException): class VaultFileNotFound(HTTPNotFound, OWSNotFound, VaultFileException): """ - Error related to a non existent vault file definition. + Error related to a non-existent vault file definition. Error indicating that a vault file could not be read from the storage backend by an instance of :class:`weaver.store.VaultStore`. @@ -404,7 +404,7 @@ def handle_known_exceptions(function): The decorator simply returns the known exception such that :func:`weaver.tweens.ows_response_tween` can later handle it appropriately. Exceptions derived from :exc:`weaver.owsexceptions.OWSException` are employed since - they themselves have base references to :mod:`pywps.exceptions` classes that the service can understand. + they, themselves, already have base references to :mod:`pywps.exceptions` classes that the service can understand. .. 
warning:: In :mod:`pywps`, ``HTTPException`` refers to :exc:`werkzeug.exceptions.HTTPException` while in @@ -414,7 +414,7 @@ def handle_known_exceptions(function): @functools.wraps(function) def wrapped(*_, **__): - # type: (Any, Any) -> Union[ReturnValue, OWSException] + # type: (Any, Any) -> Union[Return, OWSException] try: return function(*_, **__) except (WeaverException, OWSException, HTTPException) as exc: @@ -466,10 +466,10 @@ def log_unhandled_exceptions(logger=LOGGER, message="Unhandled exception occurre known_exceptions = tuple(known_exceptions) def wrap(function): - # type: (Callable[[Any, Any], ReturnValue]) -> Callable + # type: (Callable[[Any, Any], Return]) -> Callable @functools.wraps(function) def call(*args, **kwargs): - # type: (Any, Any) -> ReturnValue + # type: (Any, Any) -> Return try: # handle input arguments that are extended by various pyramid operations if is_request: diff --git a/weaver/formats.py b/weaver/formats.py index 85b367c93..8a9eae651 100644 --- a/weaver/formats.py +++ b/weaver/formats.py @@ -1,3 +1,4 @@ +import datetime import json import logging import os @@ -19,7 +20,7 @@ if TYPE_CHECKING: from typing import Any, Dict, List, Optional, Tuple, Union - from weaver.base import PropertyDataType + from weaver.base import PropertyDataTypeT from weaver.typedefs import JSON, AnyRequestType LOGGER = logging.getLogger(__name__) @@ -43,6 +44,7 @@ class ContentType(Constants): "/" [x- | "."] ["+" suffix] *[";" parameter=value] """ + APP_DIR = "application/directory" APP_CWL = "application/cwl" APP_CWL_JSON = "application/cwl+json" APP_CWL_YAML = "application/cwl+yaml" @@ -130,10 +132,10 @@ class OutputFormat(Constants): @classmethod def get(cls, # pylint: disable=W0221,W0237 # arguments differ/renamed - format_or_version, # type: Union[str, AnyOutputFormat, PropertyDataType] + format_or_version, # type: Union[str, AnyOutputFormat, PropertyDataTypeT] default=JSON, # type: AnyOutputFormat allow_version=True, # type: bool - ): # type: (...) -> Union[AnyOutputFormat, PropertyDataType] + ): # type: (...) -> Union[AnyOutputFormat, PropertyDataTypeT] """ Resolve the applicable output format. @@ -205,7 +207,8 @@ class SchemaRole(Constants): ContentType.APP_TAR_GZ: ".tar.gz", ContentType.APP_YAML: ".yml", ContentType.IMAGE_TIFF: ".tif", # common alternate to .tiff - ContentType.ANY: ".*", # any for glob + ContentType.ANY: ".*", # any for glob + ContentType.APP_DIR: "/", # force href to finish with explicit '/' to mark directory ContentType.APP_OCTET_STREAM: "", ContentType.APP_FORM: "", ContentType.MULTI_PART_FORM: "", @@ -441,6 +444,8 @@ def _handle_dot(_ext): fmt = _CONTENT_TYPE_FORMAT_MAPPING.get(mime_type) if fmt: + if not fmt.extension.startswith("."): + return fmt.extension return _handle_dot(fmt.extension) ext = _CONTENT_TYPE_EXTENSION_MAPPING.get(mime_type) if ext: @@ -463,11 +468,15 @@ def get_content_type(extension, charset=None, default=None): :param default: Default Content-Type to return if no extension is matched. :return: Matched or default Content-Type. 
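For reference, a quick sketch of the resolution behavior intended by the mappings above (illustrative expectations, assuming no charset is requested; the ``Directory`` Media-Type deliberately maps to a trailing slash rather than a dotted extension, which is why ``_handle_dot`` now returns non-dotted extensions unmodified):

    from weaver.formats import ContentType, get_content_type, get_extension

    # directory references resolve to '/' instead of a '.ext' suffix
    assert get_extension(ContentType.APP_DIR) == "/"

    # extensions are now resolved whether or not the leading dot is provided
    assert get_content_type(".json") == ContentType.APP_JSON
    assert get_content_type("json") == ContentType.APP_JSON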
""" + ctype = None if not extension: return default if not extension.startswith("."): - extension = f".{extension}" - ctype = _EXTENSION_CONTENT_TYPES_MAPPING.get(extension) + ctype = _EXTENSION_CONTENT_TYPES_MAPPING.get(extension) + if not ctype: + extension = f".{extension}" + if not ctype: + ctype = _EXTENSION_CONTENT_TYPES_MAPPING.get(extension) if not ctype: return default return add_content_type_charset(ctype, charset) @@ -718,6 +727,13 @@ def guess_target_format(request, default=ContentType.APP_JSON): return content_type +def json_default_handler(obj): + # type: (Any) -> Union[JSON, str, None] + if isinstance(obj, (datetime.date, datetime.datetime)): + return obj.isoformat() + raise TypeError(f"Type {type(obj)} not serializable.") + + def repr_json(data, force_string=True, ensure_ascii=False, indent=2, **kwargs): # type: (Any, bool, bool, Optional[int], **Any) -> Union[JSON, str, None] """ @@ -727,8 +743,11 @@ def repr_json(data, force_string=True, ensure_ascii=False, indent=2, **kwargs): """ if data is None: return None + default = kwargs.pop("default", None) + if default is None: + default = json_default_handler try: - data_str = json.dumps(data, indent=indent, ensure_ascii=ensure_ascii, **kwargs) + data_str = json.dumps(data, indent=indent, ensure_ascii=ensure_ascii, default=default, **kwargs) return data_str if force_string else data except Exception: # noqa: W0703 # nosec: B110 return str(data) diff --git a/weaver/owsexceptions.py b/weaver/owsexceptions.py index ec99b7316..be020588b 100644 --- a/weaver/owsexceptions.py +++ b/weaver/owsexceptions.py @@ -30,7 +30,6 @@ from zope.interface import implementer from weaver.formats import ContentType -from weaver.utils import clean_json_text_body from weaver.warning import MissingParameterWarning, UnsupportedOperationWarning if TYPE_CHECKING: @@ -112,6 +111,8 @@ def __repr__(self): @staticmethod def json_formatter(status, body, title, environ): # noqa # type: (str, str, str, SettingsType) -> JSON + from weaver.utils import clean_json_text_body + body = clean_json_text_body(body) # message/description code = int(status.split()[0]) # HTTP status code body = {"description": body, "code": title} # title is the string OGC 'code' diff --git a/weaver/processes/constants.py b/weaver/processes/constants.py index 79911edf7..447e502b7 100644 --- a/weaver/processes/constants.py +++ b/weaver/processes/constants.py @@ -86,6 +86,7 @@ class OpenSearchField(Constants): CWL_REQUIREMENT_CUDA = "cwltool:CUDARequirement" CWL_REQUIREMENT_ENV_VAR = "EnvVarRequirement" CWL_REQUIREMENT_INIT_WORKDIR = "InitialWorkDirRequirement" +CWL_REQUIREMENT_INLINE_JAVASCRIPT = "InlineJavascriptRequirement" CWL_REQUIREMENT_NETWORK_ACCESS = "NetworkAccess" CWL_REQUIREMENT_RESOURCE = "ResourceRequirement" CWL_REQUIREMENT_SCATTER = "ScatterFeatureRequirement" @@ -94,6 +95,7 @@ class OpenSearchField(Constants): CWL_REQUIREMENT_CUDA, CWL_REQUIREMENT_ENV_VAR, CWL_REQUIREMENT_INIT_WORKDIR, + CWL_REQUIREMENT_INLINE_JAVASCRIPT, CWL_REQUIREMENT_NETWORK_ACCESS, CWL_REQUIREMENT_RESOURCE, # FIXME: perform pre-check on job submit? 
(https://github.com/crim-ca/weaver/issues/138) CWL_REQUIREMENT_SCATTER, @@ -115,7 +117,9 @@ class OpenSearchField(Constants): PACKAGE_EXTENSIONS = frozenset(["yaml", "yml", "json", "cwl", "job"]) PACKAGE_SIMPLE_TYPES = frozenset(["string", "boolean", "float", "int", "integer", "long", "double"]) PACKAGE_LITERAL_TYPES = frozenset(PACKAGE_SIMPLE_TYPES | {"null", "Any"}) -PACKAGE_COMPLEX_TYPES = frozenset(["File"]) # FIXME: type "Directory" not supported +PACKAGE_FILE_TYPE = "File" +PACKAGE_DIRECTORY_TYPE = "Directory" +PACKAGE_COMPLEX_TYPES = frozenset([PACKAGE_FILE_TYPE, PACKAGE_DIRECTORY_TYPE]) PACKAGE_ENUM_BASE = "enum" PACKAGE_CUSTOM_TYPES = frozenset([PACKAGE_ENUM_BASE]) # can be anything, but support "enum" which is more common PACKAGE_ARRAY_BASE = "array" diff --git a/weaver/processes/convert.py b/weaver/processes/convert.py index 15dfcb0e2..21139d9e1 100644 --- a/weaver/processes/convert.py +++ b/weaver/processes/convert.py @@ -53,8 +53,11 @@ PACKAGE_ARRAY_ITEMS, PACKAGE_ARRAY_MAX_SIZE, PACKAGE_ARRAY_TYPES, + PACKAGE_COMPLEX_TYPES, PACKAGE_CUSTOM_TYPES, + PACKAGE_DIRECTORY_TYPE, PACKAGE_ENUM_BASE, + PACKAGE_FILE_TYPE, PACKAGE_LITERAL_TYPES, WPS_BOUNDINGBOX, WPS_COMPLEX, @@ -90,7 +93,7 @@ from weaver.wps_restapi import swagger_definitions as sd if TYPE_CHECKING: - from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union + from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union from urllib.parse import ParseResult from pywps.app import WPSRequest @@ -109,7 +112,6 @@ AnySettingsContainer, AnyValueType, CWL, - CWL_IO_BaseType, CWL_IO_ComplexType, CWL_IO_EnumSymbols, CWL_IO_FileValue, @@ -126,9 +128,10 @@ NotRequired, OpenAPISchema, OpenAPISchemaArray, + OpenAPISchemaKeyword, OpenAPISchemaObject, OpenAPISchemaProperty, - OpenAPISchemaKeyword, + OpenAPISchemaReference, TypedDict ) from weaver.wps_restapi.constants import JobInputsOutputsSchemaType @@ -213,7 +216,7 @@ "bool": bool, "boolean": bool, "file": unquote, - "File": unquote, + PACKAGE_FILE_TYPE: unquote, "float": float, "int": int, "integer": int, @@ -469,7 +472,7 @@ def get_io_type_category(io_info): io_info = copy.deepcopy(io_info) io_info.setdefault("name", "dontcare") io_def = get_cwl_io_type(io_info, strict=False) - return WPS_COMPLEX if io_def.type in [null, "File", "Directory"] else WPS_LITERAL + return WPS_COMPLEX if io_def.type in [null, PACKAGE_FILE_TYPE, PACKAGE_DIRECTORY_TYPE] else WPS_LITERAL io_fmt = get_field(io_info, "supported_formats", search_variations=True) return WPS_LITERAL if io_fmt is null else WPS_COMPLEX @@ -497,7 +500,7 @@ def _convert_any2cwl_io_complex(cwl_io, cwl_ns, wps_io, io_select): """ cwl_io_fmt = None cwl_io_ext = ContentType.ANY - cwl_io["type"] = "File" + cwl_io["type"] = PACKAGE_FILE_TYPE cwl_id = cwl_io["id"] # inputs are allowed to define multiple 'supported' formats @@ -788,7 +791,7 @@ def ogcapi2cwl_process(payload, reference): # if no CWL could be resolved, generate I/O from process io_ns = {} # type: Dict[str, str] for io_select in ["input", "output"]: - io_holder = f"{io_select}s" + io_holder = f"{io_select}s" # type: Literal["inputs", "outputs"] # noqa io_struct = copy.deepcopy(process_info.get(io_holder, {})) io_struct = normalize_ordered_io(io_struct) cwl_pkg[io_holder] = {} # type: Dict[str, CWL_IO_Type] @@ -811,57 +814,62 @@ def ogcapi2cwl_process(payload, reference): } } } - cwl_package.update(cwl_pkg) + cwl_package.update(cwl_pkg) # type: ignore payload_copy["executionUnit"] = [{"unit": cwl_package}] 
payload_copy["deploymentProfile"] = "http://www.opengis.net/profiles/eoc/ogcapiApplication" return cwl_package, payload_copy -def is_cwl_file_type(io_info): - # type: (CWL_IO_Type) -> bool +def is_cwl_complex_type(io_info, complex_types=PACKAGE_COMPLEX_TYPES): + # type: (CWL_IO_Type, Iterable[CWL_IO_ComplexType]) -> bool """ - Identifies if the provided `CWL` input/output corresponds to one, many or potentially a ``File`` type(s). + Identifies if the provided :term:`CWL` input/output corresponds to one, many or a potential `Complex` type(s). - When multiple distinct *atomic* types are allowed for a given I/O (e.g.: ``[string, File]``) and that one of them - is a ``File``, the result will be ``True`` even if other types are not ``Files``. - Potential ``File`` when other base type is ``"null"`` will also return ``True``. + When multiple distinct *atomic* types are allowed for a given I/O (e.g.: ``type: [string, File]``) and that one + of them is one of the considered `Complex` type, the result will be ``True`` even if other types are not `Complex`. + Similarly, optional `Complex` types combined with ``"null"`` will also return ``True``. + + :param io_info: I/O to verify for complex type. + :param complex_types: + Complex types to consider. + By default, any type between :term:`CWL` ``File`` and ``Directory`` are valid. + The operation can be limited to one or the other if needed to identify a specific one. """ io_type = io_info.get("type") if not io_type: raise ValueError(f"Missing CWL 'type' definition: [{io_info!s}]") if isinstance(io_type, str): - return io_type == "File" + return io_type in complex_types if isinstance(io_type, dict): if io_type["type"] == PACKAGE_ARRAY_BASE: - return io_type["items"] == "File" - return io_type["type"] == "File" + return io_type["items"] in complex_types + return io_type["type"] in complex_types if isinstance(io_type, list): - return any(typ == "File" or is_cwl_file_type({"type": typ}) for typ in io_type) + return any( + (isinstance(typ, str) and typ in complex_types) or + is_cwl_complex_type({"type": typ}, complex_types) + for typ in io_type + ) raise ValueError(f"Unknown parsing of CWL 'type' format ({type(io_type)!s}) [{io_type!s}] in [{io_info}]") -def is_cwl_array_type(io_info, strict=True): - # type: (CWL_IO_Type, bool) -> Tuple[bool, str, MODE, Optional[Union[Type[AnyValue], CWL_IO_EnumSymbols]]] +def parse_cwl_array_type(io_info, strict=True): + # type: (CWL_IO_Type, bool) -> CWLIODefinition """ - Verifies if the specified I/O corresponds to one of various CWL array type definitions. + Parses the specified I/O for one of the various potential CWL array definitions. :param io_info: :term:`CWL` I/O definition to parse. :param strict: Indicates if only pure :term:`CWL` definition is allowed, or allow implicit data-type conversions. - :returns: - ``tuple(is_array, io_type, io_mode, io_allow)`` where: - - ``is_array``: specifies if the I/O is of array type. - - ``io_type``: array element type if ``is_array`` is True, type of ``io_info`` otherwise. - - ``io_mode``: validation mode to be applied if sub-element requires it, defaults to ``MODE.NONE``. - - ``io_allow``: validation values to be applied if sub-element requires it, defaults to ``AnyValue``. + :returns: Updated :term:`CWL` I/O definition with applicable properties. :raises PackageTypeError: if the array element doesn't have the required values and valid format. 
""" # use mapping to allow sub-function updates - io_return = { - "array": False, - "allow": AnyValue, - "type": get_cwl_io_type_name(io_info["type"]), - "mode": MODE.NONE, - } + io_return = CWLIODefinition( + array=False, + symbols=AnyValue, + type=get_cwl_io_type_name(io_info["type"]), + mode=MODE.NONE, + ) def _update_if_sub_enum(_io_item): # type: (CWL_IO_Type) -> bool @@ -871,75 +879,94 @@ def _update_if_sub_enum(_io_item): Parameter ``io_item`` should correspond to field ``items`` of an array I/O definition. Simple pass-through if the array item is not an ``enum``. """ - _is_enum, _enum_type, _enum_mode, _enum_allow = is_cwl_enum_type({"type": _io_item}) # noqa: typing - if _is_enum: + _def = parse_cwl_enum_type({"type": _io_item}) + if _def.enum: LOGGER.debug("I/O [%s] parsed as 'array' with sub-item as 'enum'", io_info["name"]) - io_return["type"] = _enum_type - io_return["mode"] = _enum_mode - io_return["allow"] = _enum_allow # type: ignore - return _is_enum + io_return.enum = True + io_return.type = _def.type + io_return.mode = _def.mode + io_return.symbols = _def.symbols + return _def.enum # optional I/O could be an array of '["null", ""]' with "" being any of the formats parsed after # is it the literal representation instead of the shorthand with '?' if isinstance(io_info["type"], list) and any(sub_type == "null" for sub_type in io_info["type"]): # we can ignore the optional indication in this case because it doesn't impact following parsing - io_return["type"] = list(filter(lambda sub_type: sub_type != "null", io_info["type"]))[0] + io_return.type = list(filter(lambda sub_type: sub_type != "null", io_info["type"]))[0] # array type conversion when defined as '{"type": "array", "items": ""}' # validate against 'Hashable' instead of 'dict' since 'OrderedDict'/'CommentedMap' can fail 'isinstance()' if ( - not isinstance(io_return["type"], str) - and not isinstance(io_return["type"], Hashable) - and "items" in io_return["type"] - and "type" in io_return["type"] + not isinstance(io_return.type, str) + and not isinstance(io_return.type, Hashable) + and "items" in io_return.type + and "type" in io_return.type ): - io_type = dict(io_return["type"]) # make hashable to allow comparison + io_type = dict(io_return.type) # make hashable to allow comparison if io_type["type"] != PACKAGE_ARRAY_BASE: raise PackageTypeError(f"Unsupported I/O 'array' definition: '{io_info!r}'.") # parse enum in case we got an array of allowed symbols io_items = get_cwl_io_type_name(io_type["items"]) is_enum = _update_if_sub_enum(io_items) if not is_enum: - io_return["type"] = io_items - io_type = get_cwl_io_type_name(io_return["type"]) + io_return.type = io_items + io_type = get_cwl_io_type_name(io_return.type) if io_type not in PACKAGE_ARRAY_ITEMS: # includes Complex, so implicit literal-only check possible io_type = any2cwl_literal_datatype(io_type) if strict or io_type not in PACKAGE_ARRAY_ITEMS: raise PackageTypeError(f"Unsupported I/O 'array' definition: '{io_info!r}'.") - io_return["type"] = io_type + io_return.type = io_type LOGGER.debug("I/O [%s] parsed as 'array' with nested dict notation", io_info["name"]) - io_return["array"] = True + io_return.array = True # array type conversion when defined as string '[]' - elif isinstance(io_return["type"], str) and get_cwl_io_type_name(io_return["type"]) in PACKAGE_ARRAY_TYPES: - io_return["type"] = get_cwl_io_type_name(io_return["type"][:-2]) # remove '[]' - if io_return["type"] in PACKAGE_CUSTOM_TYPES: + elif isinstance(io_return.type, str) and 
get_cwl_io_type_name(io_return.type) in PACKAGE_ARRAY_TYPES: + io_return.type = get_cwl_io_type_name(io_return.type[:-2]) # remove '[]' + if io_return.type in PACKAGE_CUSTOM_TYPES: # parse 'enum[]' for array of allowed symbols, provide expected structure for sub-item parsing io_item = deepcopy(io_info) - io_item["type"] = io_return["type"] # override corrected type without '[]' + io_item["type"] = io_return.type # override corrected type without '[]' _update_if_sub_enum(io_item) - if io_return["type"] not in PACKAGE_ARRAY_ITEMS: + if io_return.type not in PACKAGE_ARRAY_ITEMS: raise PackageTypeError(f"Unsupported I/O 'array' definition: '{io_info!r}'.") LOGGER.debug("I/O [%s] parsed as 'array' with shorthand '[]' notation", io_info["name"]) - io_return["array"] = True - return io_return["array"], io_return["type"], io_return["mode"], io_return["allow"] + io_return.array = True + + # in case the I/O was not an array parsed with one of the above conditions, + # still check for enum to be consistent in the returned definition if one was provided + try: + _update_if_sub_enum(io_info) + except PackageTypeError: + pass return io_return -def is_cwl_enum_type(io_info): - # type: (CWL_IO_Type) -> Tuple[bool, str, int, Optional[CWL_IO_EnumSymbols]] +def parse_cwl_enum_type(io_info): + # type: (CWL_IO_Type) -> CWLIODefinition """ - Verifies if the specified I/O corresponds to a CWL enum definition. + Parses the specified I/O for potential CWL enum definition. - :returns: - ``tuple(is_enum, io_type, io_allow)`` where: - - ``is_enum``: specifies if the I/O is of enum type. - ``io_type``: enum base type if ``is_enum=True``, type of ``io_info`` otherwise. - - ``io_mode``: validation mode to be applied if input requires it, defaults to ``MODE.NONE``. - - ``io_allow``: validation values of the enum. + :returns: Updated :term:`CWL` I/O definition with applicable properties. :raises PackageTypeError: if the enum doesn't have the required parameters and valid format. """ io_type = get_cwl_io_type_name(io_info["type"]) - if not isinstance(io_type, dict) or "type" not in io_type or io_type["type"] not in PACKAGE_CUSTOM_TYPES: - return False, io_type, MODE.NONE, None + if not isinstance(io_type, dict) or "type" not in io_type: + io_def = CWLIODefinition( + type=io_type, + enum=False, + mode=MODE.NONE, + ) + return io_def + if isinstance(io_type, dict) and "type" in io_type and ( + isinstance(io_type["type"], str) and io_type["type"] not in PACKAGE_CUSTOM_TYPES or + isinstance(io_type["type"], list) + ): + io_type = io_type["type"] if isinstance(io_type["type"], str) else PACKAGE_ARRAY_BASE + io_def = CWLIODefinition( + type=io_type, + enum=False, + mode=MODE.NONE, + ) + return io_def if "symbols" not in io_type: raise PackageTypeError(f"Unsupported I/O 'enum' definition missing 'symbols': '{io_info!r}'.") @@ -963,7 +990,13 @@ def is_cwl_enum_type(io_info): f"Unsupported I/O 'enum' base type: `{type(first_allow)!s}`, from definition: `{io_info!r}`." ) - return True, io_type, MODE.SIMPLE, io_allow # allowed value validator mode must be set for input + io_def = CWLIODefinition( + type=io_type, # type: ignore + enum=True, + mode=MODE.SIMPLE, # allowed value validator mode must be set for input + symbols=io_allow, + ) + return io_def def get_cwl_io_type_name(io_type): @@ -980,22 +1013,95 @@ def get_cwl_io_type_name(io_type): @dataclass -class CWLIODefinition: +class CWLIODefinition(object): """ Utility :term:`CWL` I/O definition to contain metadata from parsing results. ..
seealso:: :func:`weaver.processes.convert.get_cwl_io_type` """ + + # provide dataclass conversions for 'tuple()', 'list()', 'dict()' + + def keys(self): + # type: () -> List[str] + fields = getattr(self, "__dataclass_fields__") + return list(fields) + + def __getitem__(self, key): + # type: (str) -> Any + return getattr(self, key) + + def __iter__(self): + # type: () -> Iterator[Any] + for key in self.keys(): + value = self[key] + yield value + + # --- FIELDS --- + name: str = "" + """ + Name (or identifier) of the I/O. + """ + type: "Union[CWL_IO_LiteralType, CWL_IO_ComplexType]" = None + """ + Type of the :term:`CWL` I/O. + + If :attr:`enum` is ``True``, represents the enum base type. + If :attr:`array` is ``True``, represents the item type. + """ + null: bool = False + """ + Indicates if the I/O is nullable. + + This is obtained from a type composed of ``"null"`` and something else, + or using the shorthand ``{type}?`` notation. + """ + min_occurs: int = 1 + """ + Minimum number of occurrences allowed. + + When :attr:`null` is ``True``, it is equal to ``0``. + Otherwise, it is greater than or equal to ``1``. + If greater than ``1``, :attr:`array` should be ``True``. + """ + max_occurs: int = 1 + """ + Maximum number of occurrences allowed. + + Applies only when :attr:`array` is ``True``. Otherwise, always equal to ``1``. + Can take the value :data:`PACKAGE_ARRAY_MAX_SIZE` to represent ``"unbounded"`` occurrences. + """ + array: bool = False + """ + Specifies if the I/O is of array type. + """ + enum: bool = False - symbols: "Union[CWL_IO_EnumSymbols, AnyValue]" = AnyValue + """ + Specifies if the I/O is of enum type. + """ + + symbols: "Union[CWL_IO_EnumSymbols, AnyValue, Type[AnyValue]]" = AnyValue + """ + Specifies the allowed values when the definition is marked as :attr:`enum`. + When not overridden by literal values, it uses the default :class:`AnyValue`. + """ + mode: MODE = MODE.NONE + """ + Validation mode to be applied if I/O requires it. + + Defaults to :attr:`MODE.NONE`. Indicates how strict the validation must be. + Usually applies when an enum must only allow a specific set of symbols. + Can also be used with Media-Types in more advanced validation use cases with :mod:`pywps`.
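Since ``keys``, ``__getitem__`` and ``__iter__`` are provided, the dataclass can be consumed directly by ``tuple()``, ``list()`` and ``dict()``; a small sketch of the expected conversions (illustrative, based on the field declarations above):

    from weaver.processes.convert import CWLIODefinition

    io_def = CWLIODefinition(name="data", type="File", null=True, min_occurs=0)

    # '__iter__' yields field values in declaration order: name, type, null, ...
    assert tuple(io_def)[:3] == ("data", "File", True)

    # 'keys' + '__getitem__' satisfy the mapping protocol used by 'dict()'
    as_dict = dict(io_def)
    assert as_dict["name"] == "data" and as_dict["min_occurs"] == 0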
+ """ def get_cwl_io_type(io_info, strict=True): @@ -1020,6 +1126,8 @@ def get_cwl_io_type(io_info, strict=True): """ io_type = get_cwl_io_type_name(io_info["type"]) is_null = False + io_mode = MODE.NONE + io_allow = AnyValue # parse multi-definition if isinstance(io_type, list): @@ -1038,19 +1146,19 @@ def get_cwl_io_type(io_info, strict=True): # check that many sub-type definitions all match same base type (no conflicting literals) io_type_many = set() io_base_type = None - for i, typ in enumerate(io_type): + for i, typ in enumerate(io_type, start=int(is_null)): typ = get_cwl_io_type_name(typ) io_name = io_info["name"] sub_type = {"type": typ, "name": f"{io_name}[{i}]"} # type: CWL_IO_Type - is_array, array_elem, _, _ = is_cwl_array_type(sub_type, strict=strict) - is_enum, enum_type, _, _ = is_cwl_enum_type(sub_type) + array_io_def = parse_cwl_array_type(sub_type, strict=strict) + enum_io_def = parse_cwl_enum_type(sub_type) # array base type more important than enum because later array conversion also handles allowed values - if is_array: + if array_io_def.array: io_base_type = typ # highest priority (can have sub-literal or sub-enum) - io_type_many.add(array_elem) - elif is_enum: - io_base_type = io_base_type if io_base_type is not None else enum_type # less priority - io_type_many.add(enum_type) + io_type_many.add(array_io_def.type) + elif enum_io_def.enum: + io_base_type = io_base_type if io_base_type is not None else enum_io_def.type # less priority + io_type_many.add(enum_io_def.type) else: io_base_type = io_base_type if io_base_type is not None else typ # less priority io_type_many.add(typ) # literal base type by itself (not array/enum) @@ -1067,27 +1175,35 @@ def get_cwl_io_type(io_info, strict=True): io_max_occurs = 1 # unless array after # convert array types - is_array, array_elem, io_mode, io_allow = is_cwl_array_type(io_info, strict=strict) - if is_array: + array_io_def = parse_cwl_array_type(io_info, strict=strict) + if array_io_def.array: LOGGER.debug("I/O parsed for 'array'") - io_type = array_elem + io_type = array_io_def.type io_max_occurs = PACKAGE_ARRAY_MAX_SIZE # convert enum types - is_enum, enum_type, enum_mode, enum_allow = is_cwl_enum_type(io_info) - if is_enum: - LOGGER.debug("I/O parsed for 'enum'") - io_type = enum_type - io_allow = enum_allow - io_mode = enum_mode + enum_io_def = parse_cwl_enum_type(io_info) + is_enum = False + if enum_io_def.enum: + LOGGER.debug("I/O parsed for 'enum' from base") + io_type = enum_io_def.type + io_allow = enum_io_def.symbols + io_mode = enum_io_def.mode + is_enum = True + elif array_io_def.enum: + LOGGER.debug("I/O parsed for 'enum' from array") + io_type = array_io_def.type + io_allow = array_io_def.symbols + io_mode = array_io_def.mode + is_enum = True # debug info for unhandled types conversion if not isinstance(io_type, str): - LOGGER.debug("is_array: [%s]", repr(is_array)) - LOGGER.debug("array_elem: [%s]", repr(array_elem)) - LOGGER.debug("is_enum: [%s]", repr(is_enum)) - LOGGER.debug("enum_type: [%s]", repr(enum_type)) - LOGGER.debug("enum_allow: [%s]", repr(enum_allow)) + LOGGER.debug("is_array: [%s]", repr(array_io_def.array)) + LOGGER.debug("array_elem: [%s]", repr(array_io_def.type)) + LOGGER.debug("is_enum: [%s]", repr(enum_io_def.enum)) + LOGGER.debug("enum_type: [%s]", repr(enum_io_def.type)) + LOGGER.debug("enum_allow: [%s]", repr(enum_io_def.symbols)) LOGGER.debug("io_info: [%s]", repr(io_info)) LOGGER.debug("io_type: [%s]", repr(io_type)) LOGGER.debug("type(io_type): [%s]", type(io_type)) @@ -1101,14 +1217,15 
@@ def get_cwl_io_type(io_info, strict=True): io_min_occurs = 0 is_null = True - io_type = any2cwl_literal_datatype(io_type) + if io_type not in PACKAGE_COMPLEX_TYPES: + io_type = any2cwl_literal_datatype(io_type) io_def = CWLIODefinition( name=io_name, type=io_type, null=is_null, min_occurs=io_min_occurs, max_occurs=io_max_occurs, - array=is_array, + array=array_io_def.array, enum=is_enum, symbols=io_allow, mode=io_mode, @@ -1200,12 +1317,15 @@ def cwl2wps_io(io_info, io_select): else: # we need to minimally add 1 format, otherwise empty list is evaluated as None by pywps # when "supported_formats" is None, the process's json property raises because of it cannot iterate formats - kw["supported_formats"] = [DEFAULT_FORMAT] + if io_def.type == PACKAGE_FILE_TYPE: + kw["supported_formats"] = [DEFAULT_FORMAT] + if io_def.type == PACKAGE_DIRECTORY_TYPE: + kw["supported_formats"] = [get_format(ContentType.APP_DIR)] kw["mode"] = MODE.NONE # don't validate anything as default is only raw text if is_output: - if io_def.type == "Directory": + if io_def.type == PACKAGE_DIRECTORY_TYPE: kw["as_reference"] = True - if io_def.type == "File": + if io_def.type == PACKAGE_FILE_TYPE: has_contents = io_info.get("contents") is not None kw["as_reference"] = not has_contents else: @@ -1247,14 +1367,14 @@ def _get_file_input(input_data): inputs = {} for input_id, input_value in data.items(): # single file - if isinstance(input_value, dict) and input_value.get("class") == "File": + if isinstance(input_value, dict) and input_value.get("class") == PACKAGE_FILE_TYPE: inputs[input_id] = _get_file_input(input_value) # single literal value elif isinstance(input_value, (str, int, float, bool)): inputs[input_id] = {"value": input_value} # multiple files elif isinstance(input_value, list) and all( - isinstance(val, dict) and val.get("class") == "File" for val in input_value + isinstance(val, dict) and val.get("class") == PACKAGE_FILE_TYPE for val in input_value ): inputs[input_id] = [_get_file_input(val) for val in input_value] # multiple literal values @@ -1461,7 +1581,7 @@ def repr2json_input_values(inputs): arr_val = str_val.split(";") convert = INPUT_VALUE_TYPE_MAPPING[map_typ] arr_val = [repr2json_input_params(val, convert) for val in arr_val] - if map_typ.capitalize() == "File": + if map_typ.capitalize() == PACKAGE_FILE_TYPE: val_key = "href" for val in arr_val: ref = val["data"] @@ -1711,7 +1831,7 @@ def json2oas_io_bbox(io_info, io_hint=null): "properties": { "crs": crs_schema, "bbox": { - "type": "array", + "type": PACKAGE_ARRAY_BASE, "items": "number", "oneOf": [ {"minItems": 4, "maxItems": 4}, @@ -1719,7 +1839,7 @@ def json2oas_io_bbox(io_info, io_hint=null): ] }, } - } + } # type: OpenAPISchemaObject if isinstance(io_hint, dict): if "$ref" in io_hint: item_schema["$id"] = io_hint["$ref"] @@ -1903,7 +2023,7 @@ def json2oas_io(io_info, io_hint=null): # because specified single-value/objects *MUST* be provided, optional can be represented only by zero-length array if isinstance(min_occurs, int) and (min_occurs == 0 or min_occurs > 1): io_schema = { - "type": "array", + "type": PACKAGE_ARRAY_BASE, "items": item_schema, "minItems": min_occurs, } @@ -1912,7 +2032,7 @@ def json2oas_io(io_info, io_hint=null): elif max_occurs == 1 or max_occurs is null: # assume unspecified is default=1 io_schema = item_schema else: - array_schema = {"type": "array", "items": item_schema} + array_schema = {"type": PACKAGE_ARRAY_BASE, "items": item_schema} if isinstance(min_occurs, int): array_schema["minItems"] = min_occurs if 
isinstance(max_occurs, int): @@ -2220,7 +2340,7 @@ def oas2json_io(io_info): if io_ctype and ContentType.APP_JSON in io_ctype: io_formats[0]["schema"] = io_info["$id"] if io_type is null or io_json is null: - LOGGER.debug("Unknown OpenAPI to JSON I/O resolution for schema: %s", repr_json(io_info)) + LOGGER.debug("Unknown OpenAPI to JSON I/O resolution for schema:\n%s", repr_json(io_info)) return null # default literal value can help resolve as last resort if specific type cannot be inferred @@ -2267,7 +2387,7 @@ def oas_resolve_remote(io_info): # Then update the first level of references that we can potentially work with to resolve conversion type. # No need to resolve more since this is guaranteed to be 'complex' type. # We must use the resolver right away in case the remote $ref are relative to the same root $ref. - for keyword in OAS_KEYWORD_TYPES: + for keyword in OAS_KEYWORD_TYPES: # type: Literal["oneOf", "anyOf", "allOf", "not"] if keyword in io_info: if isinstance(io_info[keyword], list): # all keywords except 'not' for i, schema in enumerate(list(io_info[keyword])): @@ -2276,7 +2396,8 @@ def oas_resolve_remote(io_info): schema["$id"] = ref_id io_info[keyword][i] = schema # noqa elif "$ref" in io_info[keyword]: # only 'not' keyword - ref_schema = io_info[keyword]["$ref"] + io_keyword = io_info[keyword] # type: OpenAPISchemaReference # noqa + ref_schema = io_keyword["$ref"] ref_id, schema = resolver.resolve(ref_schema) schema["$id"] = ref_id io_info[keyword] = schema diff --git a/weaver/processes/esgf_process.py b/weaver/processes/esgf_process.py index 1f0789afb..4917c4688 100644 --- a/weaver/processes/esgf_process.py +++ b/weaver/processes/esgf_process.py @@ -5,6 +5,7 @@ import cwt # noqa # package: esgf-compute-api +from weaver.processes.constants import PACKAGE_FILE_TYPE from weaver.processes.wps1_process import Wps1Process from weaver.status import Status from weaver.utils import fetch_file @@ -146,7 +147,7 @@ def _get_files_urls(workflow_inputs): files = [files] for cwl_file in files: - if not cwl_file["class"] == "File": + if not cwl_file["class"] == PACKAGE_FILE_TYPE: raise ValueError(f"Input named '{InputNames.FILES}' must have a class named 'File'") location = cwl_file["location"] if not location.startswith("http"): diff --git a/weaver/processes/execution.py b/weaver/processes/execution.py index 1244ebbe9..b212ddc26 100644 --- a/weaver/processes/execution.py +++ b/weaver/processes/execution.py @@ -343,7 +343,7 @@ def collect_statistics(process, settings=None, job=None, rss_start=None): if res_ref and isinstance(res_ref, str): if res_ref.startswith(f"/{job.id}"): # pseudo-relative reference out_dir = get_wps_output_dir(settings) - res_ref = os.path.join(out_dir, res_ref[1:]) + res_ref = os.path.join(out_dir, res_ref.lstrip("/")) if os.path.isfile(res_ref): res_stat = os.stat(res_ref) res_id = get_any_id(result) diff --git a/weaver/processes/opensearch.py b/weaver/processes/opensearch.py index 20d033e2f..e85ccbdc0 100644 --- a/weaver/processes/opensearch.py +++ b/weaver/processes/opensearch.py @@ -18,7 +18,7 @@ from weaver.utils import get_any_id, request_extra if TYPE_CHECKING: - from typing import Deque, Dict, Iterable, List, Optional, Tuple + from typing import Deque, Dict, Iterable, Iterator, List, Optional, Tuple from weaver.processes.convert import WPS_Input_Type, JSON_IO_Type from weaver.typedefs import AnySettingsContainer, DataSourceOpenSearch, JSON @@ -256,7 +256,7 @@ def _fetch_datatsets_from_alternates_links(self, alternate_links): return [] def 
_query_features_paginated(self, params): - # type: (JSON) -> Iterable[JSON, str] + # type: (JSON) -> Iterator[JSON, str] """ Iterates over paginated results until all features are retrieved. @@ -289,7 +289,7 @@ def _query_features_paginated(self, params): start_index += n_received_features def query_datasets(self, params, accept_schemes, accept_mime_types): - # type: (JSON, List[str], List[str]) -> Iterable[str] + # type: (JSON, List[str], List[str]) -> Iterator[str] """ Query the specified datasets. diff --git a/weaver/processes/utils.py b/weaver/processes/utils.py index 859b0a04e..bc5f4745d 100644 --- a/weaver/processes/utils.py +++ b/weaver/processes/utils.py @@ -84,6 +84,7 @@ FileSystemPathType, JSON, Literal, + ProcessDeployment, PyramidRequest, NotRequired, Number, @@ -182,7 +183,7 @@ def get_process_information(process_description): @log_unhandled_exceptions(logger=LOGGER, message="Unhandled error occurred during parsing of deploy payload.", is_request=False) def _check_deploy(payload): - # type: (JSON) -> JSON + # type: (JSON) -> Union[ProcessDeployment, CWL] """ Validate minimum deploy payload field requirements with exception handling. """ @@ -206,7 +207,7 @@ def _check_deploy(payload): message = f"Process deployment {io_type} definition is invalid." # try raising sub-schema to have specific reason d_io = io_schema(name=io_type).deserialize(p_io) - # Raise directly if we where not able to detect the cause, but there is something incorrectly dropped. + # Raise directly if we were unable to detect the cause, but there is something incorrectly dropped. # Only raise if indirect vs direct deserialize differ such that auto-resolved defaults omitted from # submitted process I/O or unknowns fields that were correctly ignored don't cause false-positive diffs. if r_io != d_io: @@ -1003,12 +1004,12 @@ def register_wps_processes_from_config(container, wps_processes_file_path=None): meaning they will be fetched on the provider each time a request refers to them, keeping their definition up-to-date with the remote server. - .. versionadded:: 1.14.0 + .. versionadded:: 1.14 When references are specified using ``providers`` section instead of ``processes``, the registration only saves the remote WPS provider endpoint to dynamically populate :term:`WPS` processes on demand. Previous behavior was to register each :term:`WPS` process individually with ID ``[service]_[process]``. - .. versionchanged:: 4.19.0 + .. versionchanged:: 4.19 Parameter position are inverted. If :paramref:`wps_processes_file_path` is explicitly provided, it is used directly without considering settings. Otherwise, automatically employ the definition in setting: ``weaver.wps_processes_file``. @@ -1131,7 +1132,7 @@ def register_cwl_processes_from_config(container): """ Load multiple :term:`CWL` definitions from a directory to register corresponding :term:`Process`. - .. versionadded:: 4.19.0 + .. versionadded:: 4.19 Each individual :term:`CWL` definition must fully describe a :term:`Process` by itself. Therefore, an ``id`` must be available in the file to indicate the target deployment reference. 
In case of conflict, the existing database diff --git a/weaver/processes/wps_package.py b/weaver/processes/wps_package.py index cffcb52e2..4ee5c2980 100644 --- a/weaver/processes/wps_package.py +++ b/weaver/processes/wps_package.py @@ -12,6 +12,7 @@ - :mod:`weaver.wps_restapi.api` conformance details """ +import copy import json import logging import os @@ -32,10 +33,12 @@ from cwltool.factory import Factory as CWLFactory, WorkflowStatus as CWLException from pyramid.httpexceptions import HTTPOk, HTTPServiceUnavailable from pywps import Process -from pywps.inout import BoundingBoxInput, ComplexInput, LiteralInput from pywps.inout.basic import SOURCE_TYPE -from pywps.inout.storage.file import FileStorageBuilder -from pywps.inout.storage.s3 import S3StorageBuilder +from pywps.inout.inputs import BoundingBoxInput, ComplexInput, LiteralInput +from pywps.inout.outputs import ComplexOutput +from pywps.inout.storage import STORE_TYPE, CachedStorage +from pywps.inout.storage.file import FileStorage, FileStorageBuilder +from pywps.inout.storage.s3 import S3Storage, S3StorageBuilder from requests.structures import CaseInsensitiveDict from weaver.config import WeaverConfiguration, WeaverFeature, get_weaver_configuration @@ -50,7 +53,7 @@ PackageTypeError, PayloadNotFound ) -from weaver.formats import ContentType, get_cwl_file_format, repr_json +from weaver.formats import ContentType, get_content_type, get_cwl_file_format, get_format, repr_json from weaver.processes import opensearch from weaver.processes.constants import ( CWL_REQUIREMENT_APP_BUILTIN, @@ -64,18 +67,22 @@ CWL_REQUIREMENT_ENV_VAR, CWL_REQUIREMENT_RESOURCE, CWL_REQUIREMENTS_SUPPORTED, + PACKAGE_COMPLEX_TYPES, + PACKAGE_DIRECTORY_TYPE, PACKAGE_EXTENSIONS, + PACKAGE_FILE_TYPE, WPS_INPUT, WPS_OUTPUT ) from weaver.processes.convert import ( + DEFAULT_FORMAT, cwl2wps_io, - is_cwl_array_type, json2wps_field, json2wps_io, merge_package_io, normalize_ordered_io, ogcapi2cwl_process, + parse_cwl_array_type, wps2json_io, xml_wps2cwl ) @@ -86,7 +93,11 @@ from weaver.store.base import StoreJobs, StoreProcesses from weaver.utils import ( SUPPORTED_FILE_SCHEMES, + OutputMethod, + adjust_directory_local, + adjust_file_local, bytes2str, + fetch_directory, fetch_file, fully_qualified_name, get_any_id, @@ -96,6 +107,7 @@ get_log_fmt, get_sane_name, get_settings, + list_directory_recursive, request_extra, setup_loggers ) @@ -111,11 +123,12 @@ from weaver.wps_restapi import swagger_definitions as sd if TYPE_CHECKING: - from typing import Any, Callable, Deque, Dict, List, Optional, Tuple, Type, Union + from typing import Any, AnyStr, Callable, Deque, Dict, List, Optional, Tuple, Type, Union from cwltool.factory import Callable as CWLFactoryCallable from cwltool.process import Process as ProcessCWL from owslib.wps import WPSExecution + from pywps.inout.formats import Format from pywps.response.execute import ExecuteResponse from weaver.datatype import Authentication, Job @@ -133,6 +146,7 @@ AnyValueType, CWL, CWL_AnyRequirements, + CWL_IO_ComplexType, CWL_Requirement, CWL_RequirementsDict, CWL_RequirementNames, @@ -145,6 +159,7 @@ JSON, Literal, Number, + Path, ValueType ) from weaver.wps.service import WorkerRequest @@ -864,6 +879,129 @@ def try_or_raise_package_error(call, reason): return process_offering +class DirectoryNestedStorage(CachedStorage): + """ + Generates a nested storage for a directory where each contained file will be managed by the storage. 
+ """ + + def __init__(self, storage): + # type: (Union[FileStorage, S3Storage]) -> None + """ + Initializes the storage. + + :param storage: Storage implementation that is employed for storing files in a directory-like structure. + """ + self.__dict__["_cache"] = {} + self.__dict__["storage"] = storage + super(DirectoryNestedStorage, self).__init__() + + def __getattr__(self, item): + # type: (str) -> Any + return getattr(self.storage, item) + + def __setattr__(self, key, value): + # type: (str, Any) -> None + """ + Setting a property on this storage applies it on the nested file storage. + """ + if key in self.__dict__: + object.__setattr__(self, key, value) + else: + setattr(self.storage, key, value) + + @property + def type(self): + # type: () -> STORE_TYPE + return STORE_TYPE.PATH if isinstance(self.storage, FileStorage) else STORE_TYPE.S3 + + def _patch_destination(self, destination): + # type: (str) -> str + destination = destination.lstrip("/") # avoid issues with prefix path join + # file storage already does the target-dir/output-dir join + # however, s3 storage does not... + if isinstance(self.storage, S3Storage): + return os.path.join(self.prefix, destination) + return destination + + def _do_store(self, output): + # type: (ComplexOutput) -> Tuple[STORE_TYPE, Path, str] + """ + Store all files contained in a directory recursively. + + .. note:: + This is called from :meth:`CachedStorage.store` only if not already in storage using cached output ID. + """ + root = output.file + if not os.path.isdir(root): + raise ValueError(f"Location is not a directory: [{root}]") + files = list_directory_recursive(root) + root = root.rstrip("/") + "/" + loc_path = self.location(output.identifier) + "/" # local directory or S3 location + url_path = self.url(output.identifier) + "/" # HTTP output or same S3 location + default_support = [DEFAULT_FORMAT] + [get_format(ctype) for ctype in [ContentType.ANY, ContentType.TEXT_PLAIN]] + for file in files: + out_file_path_rel = file.split(root, 1)[-1] + out_cache_key = self._patch_destination(os.path.join(str(output.uuid), out_file_path_rel)) + out_ext = os.path.splitext(out_file_path_rel)[-1] + out_ctype = get_content_type(out_ext) # attempt guessing more specific format + out_fmt = get_format(out_ctype) + out_fmts = default_support + ([out_fmt] if out_fmt else []) + out_file = ComplexOutput(out_cache_key, title=output.title, data_format=out_fmt, supported_formats=out_fmts) + out_file.file = file + out_file.uuid = output.uuid # forward base directory auto-generated when storing file + # create a copy in case the storage is used by many dirs, avoid concurrent read/write of distinct prefixes + dir_storage = copy.copy(self.storage) + if isinstance(dir_storage, S3Storage): + # patch S3 nested prefix under current directory + # S3 storage methods use only the file name to generate the bucket object key + # to preserve the nested output dir definition, it must be pushed as prefix + dir_storage.prefix = os.path.dirname(out_cache_key) + out_file.storage = dir_storage + out_type, out_path, out_url = dir_storage.store(out_file) + self._cache[out_cache_key] = (out_type, out_path, out_url) # propagate up for direct reference as needed + LOGGER.debug("Stored file [%s] for reference [%s] under [%s] directory located in [%s] for reference [%s].", + out_path, out_url, output.uuid, loc_path, url_path) + return self.type, loc_path, url_path + + def write(self, data, destination, data_format=None): + # type: (AnyStr, str, Optional[Format]) -> str + """ + Write data 
representing the directory itself or dispatch the call to the base storage for any other file contents. + + When the directory itself is targeted, upload an empty bucket object for the S3 base storage, or make the + directory structure for the base file storage. + """ + dest_patched = self._patch_destination(destination) + if destination != "" and not destination.endswith("/"): + return self.storage.write(data, dest_patched, data_format=data_format) + if isinstance(self.storage, FileStorage): + os.makedirs(self.storage.target, exist_ok=True) + return self.url(dest_patched) + if isinstance(self.storage, S3Storage): + path = dest_patched.rstrip("/") + "/" + args = { + "ContentLength": 0, + "ContentType": ContentType.APP_DIR, + } + # create a bucket object that represents the dir + return self.storage.uploadData("", path, args) + raise NotImplementedError + + def url(self, destination): + # type: (str) -> str + destination = self._patch_destination(destination) + if destination in ["/", ""]: + return self.storage.url("") + return self.storage.url(destination) + + def location(self, destination): + # type: (str) -> Path + destination = self._patch_destination(destination) + if destination in ["/", ""]: + return self.storage.location("") + return self.storage.location(destination) + + class WpsPackage(Process): def __init__(self, package=None, payload=None, **kw): @@ -1148,6 +1286,8 @@ def setup_runtime(self): cwl_outdir = os.path.join(wps_workdir, "cwltool_out_") res_req = get_application_requirement(self.package, CWL_REQUIREMENT_RESOURCE, default={}, validate=False) runtime_params = { + # provide name reference to inject the value in log entries by cwltool + "name": self.identifier, # force explicit staging if write needed (InitialWorkDirRequirement in CWL package) # protect input paths that can be re-used to avoid potential in-place modifications "no_read_only": False, @@ -1491,11 +1631,11 @@ def _handler(self, request, response): self.update_status(error_msg, self.percent, Status.FAILED) raise else: - self.update_status("Package complete.", PACKAGE_PROGRESS_DONE, Status.SUCCEEDED) + self.update_status("Package operations complete.", PACKAGE_PROGRESS_DONE, Status.SUCCEEDED) return self.response - def must_fetch(self, input_ref): - # type: (str) -> bool + def must_fetch(self, input_ref, input_type): + # type: (str, PACKAGE_COMPLEX_TYPES) -> bool """ Figures out if file reference should be fetched immediately for local execution. @@ -1503,7 +1643,8 @@ def must_fetch(self, input_ref): S3 are handled here to avoid error on remote WPS not supporting it. .. seealso:: - - :ref:`File Reference Types` + - :ref:`file_ref_types` + - :ref:`dir_ref_type` """ if self.remote_execution or self.package_type == ProcessType.WORKFLOW: return False @@ -1512,22 +1653,25 @@ def must_fetch(self, input_ref): if input_ref.startswith("s3://"): return True return False - return not os.path.isfile(input_ref) + if input_type == PACKAGE_FILE_TYPE: + return not os.path.isfile(input_ref) + # fetch if destination directory was created in advance but not yet populated with its contents + return not os.path.isdir(input_ref) or not os.listdir(input_ref) def make_inputs(self, wps_inputs, # type: Dict[str, Deque[WPS_Input_Type]] cwl_inputs_info, # type: Dict[str, CWL_Input_Type] ): # type: (...) -> Dict[str, ValueType] """ - Converts WPS input values to corresponding CWL input values for processing by CWL package instance. + Converts :term:`WPS` input values to corresponding :term:`CWL` input values for processing by the package.
- The WPS inputs must correspond to :mod:`pywps` definitions. - Multiple values are adapted to arrays as needed. - WPS ``Complex`` types (files) are converted to appropriate locations based on data or reference specification. + The :term:`WPS` inputs must correspond to :mod:`pywps` definitions. + Multiple values (repeated objects with corresponding IDs) are adapted to arrays as needed. + All :term:`WPS` `Complex` types are converted to appropriate locations based on data or reference specification. - :param wps_inputs: actual WPS inputs parsed from execution request - :param cwl_inputs_info: expected CWL input definitions for mapping - :return: CWL input values + :param wps_inputs: Actual :term:`WPS` inputs parsed from execution request. + :param cwl_inputs_info: Expected CWL input definitions for mapping. + :return: :term:`CWL` input values. """ cwl_inputs = {} for input_id in wps_inputs: @@ -1538,20 +1682,20 @@ def make_inputs(self, # process single occurrences input_i = input_occurs[0] # handle as reference/data - is_array, elem_type, _, _ = is_cwl_array_type(cwl_inputs_info[input_id]) - if isinstance(input_i, ComplexInput) or elem_type == "File": + io_def = parse_cwl_array_type(cwl_inputs_info[input_id]) + if isinstance(input_i, ComplexInput) or io_def.type in PACKAGE_COMPLEX_TYPES: # extend array data that allow max_occur > 1 # drop invalid inputs returned as None - if is_array: - input_href = [self.make_location_input(elem_type, input_def) for input_def in input_occurs] + if io_def.array: + input_href = [self.make_location_input(io_def.type, input_def) for input_def in input_occurs] input_href = [cwl_input for cwl_input in input_href if cwl_input is not None] else: - input_href = self.make_location_input(elem_type, input_i) + input_href = self.make_location_input(io_def.type, input_i) if input_href: cwl_inputs[input_id] = input_href elif isinstance(input_i, (LiteralInput, BoundingBoxInput)): # extend array data that allow max_occur > 1 - if is_array: + if io_def.array: input_data = [i.url if i.as_reference else i.data for i in input_occurs] else: input_data = input_i.url if input_i.as_reference else input_i.data @@ -1560,10 +1704,62 @@ def make_inputs(self, raise PackageTypeError(f"Undefined package input for execution: {type(input_i)}.") return cwl_inputs + def make_location_input_security_check(self, input_scheme, input_type, input_id, input_location, input_definition): + # type: (str, CWL_IO_ComplexType, str, str, ComplexInput) -> str + """ + Perform security access validation of the reference, and resolve it afterwards if accessible. + + Auto-map local file if possible to avoid useless download from current server. + Resolve :term:`Vault` reference with local file stored after decryption. + + :returns: Updated file location if any resolution occurred. + """ + if input_scheme == "vault": + if input_type != PACKAGE_FILE_TYPE: + raise PackageExecutionError( + f"Vault reference must be a file, but resolved [{input_type}] type " + f"instead for input [{input_id}] from location [{input_location}]." + ) + vault_id = bytes2str(urlparse(input_location).hostname) + input_url = get_vault_url(vault_id, self.settings) + resp = request_extra("HEAD", input_url, settings=self.settings, headers=self.auth) + if resp.status_code == 200: + self.logger.debug("Detected and validated remotely accessible reference [%s] " + "matching local Vault [%s]. 
Replacing URL reference for local access.", + input_location, input_url) + # pre-fetch by move and delete file from vault and decrypt it (as download would) + # to save transfer time/data from local file already available + auth = parse_vault_token(self.auth.get(sd.XAuthVaultFileHeader.name), unique=False) + file = get_authorized_file(vault_id, auth.get(vault_id), self.settings) + input_location = map_vault_location(input_url, self.settings) + input_location = decrypt_from_vault(file, input_location, + out_dir=input_definition.workdir, delete_encrypted=True) + self.logger.debug("Moved Vault file to temporary location: [%s]. " + "File not accessible from Vault endpoint anymore. " + "Location will be deleted after process execution.", + input_location) + else: + self.logger.error("Detected Vault file reference that is not accessible [%s] caused " + "by HTTP [%s] Detail:\n%s", input_location, + resp.status_code, repr_json(resp.text, indent=2)) + raise PackageAuthenticationError( + f"Input {input_id} with Vault reference [{vault_id}] is not accessible." + ) + else: + input_local_ref = map_wps_output_location(input_location, self.settings) + if input_local_ref: + resp = request_extra("HEAD", input_location, settings=self.settings, headers=self.auth) + if resp.status_code == 200: # if failed, following fetch will produce the appropriate HTTP error + self.logger.debug("Detected and validated remotely accessible reference [%s] " + "matching local WPS outputs [%s]. Skipping fetch using direct reference.", + input_location, input_local_ref) + input_location = input_local_ref + return input_location + def make_location_input(self, input_type, input_definition): - # type: (str, ComplexInput) -> Optional[JSON] + # type: (CWL_IO_ComplexType, ComplexInput) -> Optional[JSON] """ - Generates the JSON content required to specify a `CWL` ``File`` input definition from a location. + Generates the JSON content required to specify a `CWL` ``File`` or ``Directory`` input from a location. If the input reference corresponds to an HTTP URL that is detected as matching the local WPS output endpoint, implicitly convert the reference to the local WPS output directory to avoid useless download of available file. @@ -1576,28 +1772,38 @@ def make_location_input(self, input_type, input_definition): Any other variant of file reference will be fetched as applicable by the relevant schemes. + If the reference corresponds to a ``Directory``, all files that can be located in it will be fetched as + applicable by the relevant scheme of the reference. It is up to the remote location to provide listing + capabilities accordingly to view available files. + .. seealso:: - Documentation details of resolution based on schemes defined in :ref:`file_reference_types` section. + Documentation details of resolution based on schemes defined in :ref:`file_ref_types` section. """ # NOTE: # When running as EMS, must not call data/file methods if URL reference, otherwise contents # get fetched automatically by PyWPS objects. input_location = None - # cannot rely only on 'as_reference' as often it is not provided by the request although it's an href + input_id = input_definition.identifier + # cannot rely only on 'as_reference' as often it is not provided by the request, although it's an href if input_definition.as_reference: input_location = input_definition.url # FIXME: PyWPS bug # Calling 'file' method fetches it, and it is always called by the package itself - # during type validation if the MODE is anything else than disabled. 
+ # during type validation if the MODE is anything else than disabled (MODE.NONE). # MODE.SIMPLE is needed minimally to check MIME-TYPE of input against supported formats. # - https://github.com/geopython/pywps/issues/526 # - https://github.com/crim-ca/weaver/issues/91 # since href is already handled (pulled and staged locally), use it directly to avoid double fetch with CWL # validate using the internal '_file' instead of 'file' otherwise we trigger the fetch - # normally, file should be pulled an this check should fail + # normally, file should be pulled and this check should fail input_definition_file = input_definition._iohandler._file # noqa: W0212 if input_definition_file and os.path.isfile(input_definition_file): - input_location = input_definition_file + # Because storage handlers assume files, a directory (pseudo-file with trailing '/' unknown to PyWPS) + # could be mistakenly generated as an empty file. Wipe it in this case to ensure proper resolution. + if input_type == PACKAGE_DIRECTORY_TYPE and os.stat(input_definition_file).st_size == 0: + os.remove(input_definition_file) + else: + input_location = input_definition_file # if source type is data, we actually need to call 'data' (without fetch of remote file, already fetched) # value of 'file' in this case points to a local file path where the wanted link was dumped as raw data if input_definition.source_type == SOURCE_TYPE.DATA: @@ -1619,7 +1825,7 @@ def make_location_input(self, input_type, input_definition): # Patch with a combination of available detection methods to be safe: # - The 'file' attribute gets resolved to the process '{workdir}/input' temporary file. # This 'file' is instead named 'input_{uuid}' when it is actually resolved to real input href/data contents. - # The IO handler better reports 'None' in its internal '_file' attribute. + # The IO handler reports 'None' more reliably with its internal '_file' attribute. # - For even more robustness, verify that erroneous 'data' matches the 'default format'. # The media-type should match and 'default' argument should True since it resolve with '_default' argument. default_format_def = getattr(input_definition, "_default", None) @@ -1632,52 +1838,42 @@ def make_location_input(self, input_type, input_definition): any(default_format_def.get("mimeType") == fmt.mime_type and fmt.mime_type is not None for fmt in input_definition.supported_formats) ): - self.logger.debug("File input (%s) DROPPED. Detected default format as data.", input_definition.identifier) + self.logger.debug("%s input (%s) DROPPED. Detected default format as data.", input_type, input_id) return None - # auto-map local file if possible after security check - if input_scheme == "vault": - vault_id = bytes2str(urlparse(input_location).hostname) - input_url = get_vault_url(vault_id, self.settings) - resp = request_extra("HEAD", input_url, settings=self.settings, headers=self.auth) - if resp.status_code == 200: - self.logger.debug("Detected and validated remotely accessible reference [%s] " - "matching local Vault [%s]. 
Replacing URL reference for local access.", - input_location, input_url) - # pre-fetch by move and delete file from vault and decrypt it (as download would) - # to save transfer time/data from local file already available - auth = parse_vault_token(self.auth.get(sd.XAuthVaultFileHeader.name), unique=False) - file = get_authorized_file(vault_id, auth.get(vault_id), self.settings) - input_location = map_vault_location(input_url, self.settings) - input_location = decrypt_from_vault(file, input_location, - out_dir=input_definition.workdir, delete_encrypted=True) - self.logger.debug("Moved Vault file to temporary location: [%s]. " - "File not accessible from Vault endpoint anymore. " - "Location will be deleted after process execution.", - input_location) + input_location = self.make_location_input_security_check( + input_scheme, + input_type, + input_id, + input_location, + input_definition + ) + + if self.must_fetch(input_location, input_type): + self.logger.info("%s input (%s) ATTEMPT fetch: [%s]", input_type, input_id, input_location) + if input_type == PACKAGE_FILE_TYPE: + input_location = fetch_file(input_location, input_definition.workdir, + settings=self.settings, headers=self.auth) + elif input_type == PACKAGE_DIRECTORY_TYPE: + # Because a directory reference can contain multiple sub-dir definitions, + # avoid possible conflicts with other inputs by nesting them under the ID. + # This also ensures that each directory input can work with a clean staging directory. + out_dir = os.path.join(input_definition.workdir, input_definition.identifier) + locations = fetch_directory(input_location, out_dir, + settings=self.settings, headers=self.auth) + if not locations: + raise PackageExecutionError( + f"Directory reference resolution method for input [{input_id}] " + f"from location [{input_location}] did not produce any staged file." + ) + input_location = out_dir else: - self.logger.error("Detected Vault file reference that is not accessible [%s] caused " - "by HTTP [%s] Detail:\n%s", input_location, - resp.status_code, repr_json(resp.text, indent=2)) - raise PackageAuthenticationError( - f"Input {input_definition.identifier} with Vault reference [{vault_id}] is not accessible." + raise PackageExecutionError( + f"Unknown reference staging resolution method for [{input_type}] type " + f"specified for input [{input_id}] from location [{input_location}]." ) else: - input_local_ref = map_wps_output_location(input_location, self.settings) - if input_local_ref: - resp = request_extra("HEAD", input_location, settings=self.settings, headers=self.auth) - if resp.status_code == 200: # if failed, following fetch will produce the appropriate HTTP error - self.logger.debug("Detected and validated remotely accessible reference [%s] " - "matching local WPS outputs [%s]. 
Skipping fetch using direct reference.", - input_location, input_local_ref) - input_location = input_local_ref - - if self.must_fetch(input_location): - self.logger.info("File input (%s) ATTEMPT fetch: [%s]", input_definition.identifier, input_location) - input_location = fetch_file(input_location, input_definition.workdir, - settings=self.settings, headers=self.auth) - else: - self.logger.info("File input (%s) SKIPPED fetch: [%s]", input_definition.identifier, input_location) + self.logger.info("%s input (%s) SKIPPED fetch: [%s]", input_type, input_id, input_location) location = {"location": input_location, "class": input_type} if input_definition.data_format is not None and input_definition.data_format.mime_type: @@ -1692,7 +1888,6 @@ def make_outputs(self, cwl_result): Maps `CWL` result outputs to corresponding `WPS` outputs. """ for output_id in self.request.outputs: # iterate over original WPS outputs, extra such as logs are dropped - # TODO: adjust output for glob patterns (https://github.com/crim-ca/weaver/issues/24) if isinstance(cwl_result[output_id], list) and not isinstance(self.response.outputs[output_id], list): if len(cwl_result[output_id]) > 1: self.logger.warning( @@ -1734,8 +1929,13 @@ def make_location_output(self, cwl_result, output_id): - :func:`weaver.wps.load_pywps_config` """ s3_bucket = self.settings.get("weaver.wps_output_s3_bucket") - result_loc = cwl_result[output_id]["location"].replace("file://", "") + result_loc = cwl_result[output_id]["location"].replace("file://", "").rstrip("/") result_path = os.path.split(result_loc)[-1] + result_type = cwl_result[output_id].get("class", PACKAGE_FILE_TYPE) + result_is_dir = result_type == PACKAGE_DIRECTORY_TYPE + if result_is_dir and not result_path.endswith("/"): + result_path += "/" + result_loc += "/" # PyWPS internally sets a new FileStorage (default) inplace when generating the JSON definition of the output. # This is done such that the generated XML status document in WPS response can obtain the output URL location. @@ -1756,18 +1956,10 @@ def make_location_output(self, cwl_result, output_id): # - pywps.inout.outputs.ComplexOutput.storage.store() # But, setter "pywps.inout.basic.ComplexOutput.storage" doesn't override predefined 'storage'. # Therefore, preemptively override "ComplexOutput._storage" to whichever location according to use case. - if s3_bucket: - # when 'url' is directly enforced, 'ComplexOutput.json' will use it instead of 'file' from temp workdir - # override builder only here so that only results are uploaded to S3, and not XML status - # using this storage builder, other settings (bucket, region, etc.) are retrieved from PyWPS server config - self.response.outputs[output_id]._storage = S3StorageBuilder().build() # noqa: W0212 - self.response.outputs[output_id].storage.prefix = str(self.response.uuid) # job UUID - elif self.job.context: - storage = FileStorageBuilder().build() - storage.target = os.path.join(storage.target, self.job.context) - storage.output_url = os.path.join(storage.output_url, self.job.context) - os.makedirs(storage.target, exist_ok=True) # pywps handles UUID-dir creation, but not nested context-dir - self.response.outputs[output_id]._storage = storage # noqa: W0212 + # Override builder per output to allow distinct S3/LocalFile for it and XML status that should remain local. 
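+        # The resolution matrix effectively implemented by 'make_location_storage' below:
+        #   File      + local -> FileStorage
+        #   File      + S3    -> S3Storage
+        #   Directory + local -> DirectoryNestedStorage(FileStorage)
+        #   Directory + S3    -> DirectoryNestedStorage(S3Storage)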
+ storage_type = STORE_TYPE.S3 if s3_bucket else STORE_TYPE.PATH + storage = self.make_location_storage(storage_type, result_type) + self.response.outputs[output_id]._storage = storage # noqa: W0212 # pywps will resolve file paths for us using its WPS request UUID os.makedirs(self.workdir, exist_ok=True) @@ -1775,13 +1967,53 @@ def make_location_output(self, cwl_result, output_id): if os.path.realpath(result_loc) != os.path.realpath(result_wps): self.logger.info("Moving [%s]: [%s] -> [%s]", output_id, result_loc, result_wps) - shutil.move(result_loc, result_wps) + if result_is_dir: + adjust_directory_local(result_loc, self.workdir, OutputMethod.MOVE) + else: + adjust_file_local(result_loc, self.workdir, OutputMethod.MOVE) # params 'as_reference + file' triggers 'ComplexOutput.json' to map the WPS-output URL from the WPS workdir self.response.outputs[output_id].as_reference = True self.response.outputs[output_id].file = result_wps + # Since each output has its own storage already prefixed by '[Context/]JobID/', avoid JobID nesting another dir. + # Instead, let it create a dir matching the output ID to get '[Context/]JobID/OutputID/[file(s).ext]' + self.response.outputs[output_id].uuid = output_id self.logger.info("Resolved WPS output [%s] as file reference: [%s]", output_id, result_wps) + def make_location_storage(self, storage_type, location_type): + # type: (STORE_TYPE, PACKAGE_COMPLEX_TYPES) -> Union[FileStorage, S3Storage, DirectoryNestedStorage] + """ + Generates the relevant storage implementation with requested types and references. + + :param storage_type: Where to store the outputs. + :param location_type: Type of output as defined by CWL package type. + :return: Storage implementation. + """ + if location_type == PACKAGE_FILE_TYPE and storage_type == STORE_TYPE.PATH: + storage = FileStorageBuilder().build() + elif location_type == PACKAGE_FILE_TYPE and storage_type == STORE_TYPE.S3: + storage = S3StorageBuilder().build() + elif location_type == PACKAGE_DIRECTORY_TYPE and storage_type == STORE_TYPE.PATH: + storage = DirectoryNestedStorage(FileStorageBuilder().build()) + elif location_type == PACKAGE_DIRECTORY_TYPE and storage_type == STORE_TYPE.S3: + storage = DirectoryNestedStorage(S3StorageBuilder().build()) + else: + raise PackageExecutionError( + "Cannot resolve unknown location storage for " + f"(storage: {storage_type}, type: {location_type})." 
+ ) + + output_job_id = str(self.response.uuid) + output_prefix = os.path.join(self.job.context, output_job_id) if self.job.context else output_job_id + # pylint: disable=attribute-defined-outside-init # references to nested storage dynamically created + if storage_type == STORE_TYPE.S3: + storage.prefix = output_prefix + else: + storage.target = os.path.join(storage.target, output_prefix) + storage.output_url = os.path.join(storage.output_url, output_prefix) + os.makedirs(storage.target, exist_ok=True) # pywps handles Job UUID dir creation, but not nested dirs + return storage + def make_tool(self, toolpath_object, loading_context): # type: (CWL_ToolPathObject, LoadingContext) -> ProcessCWL from weaver.processes.wps_workflow import default_make_tool diff --git a/weaver/processes/wps_process_base.py b/weaver/processes/wps_process_base.py index 066a2f57c..585cb7180 100644 --- a/weaver/processes/wps_process_base.py +++ b/weaver/processes/wps_process_base.py @@ -12,11 +12,12 @@ from weaver.exceptions import PackageExecutionError from weaver.execute import ExecuteMode, ExecuteResponse, ExecuteTransmissionMode from weaver.formats import ContentType, repr_json -from weaver.processes.constants import OpenSearchField +from weaver.processes.constants import PACKAGE_DIRECTORY_TYPE, PACKAGE_FILE_TYPE, OpenSearchField from weaver.processes.utils import map_progress from weaver.status import JOB_STATUS_CATEGORIES, Status, StatusCategory, map_status from weaver.utils import ( - fetch_file, + OutputMethod, + fetch_reference, fully_qualified_name, get_any_id, get_any_message, @@ -291,26 +292,29 @@ def make_request(self, headers=headers, cookies=cookies, **kwargs) return response - def host_file(self, file_path): + def host_reference(self, reference): + # type: (str) -> str """ - Hosts an intermediate file between :term:`Workflow` steps for processes that require external or remote access. + Hosts an intermediate reference between :term:`Workflow` steps for processes that require remote access. - :param file_path: Intermediate file location (local path expected). - :return: Hosted temporary HTTP file location. + :param reference: Intermediate file or directory location (local path expected). + :return: Hosted temporary HTTP file or directory location. 
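+
+        Example, a minimal sketch assuming hypothetical ``weaver.wps_output_dir = /data/wpsoutputs``
+        and ``weaver.wps_output_url = https://weaver.example/wpsoutputs`` settings:
+
+        .. code-block:: python
+
+            # directory reference already under WPS outputs is reused as-is (trailing slash preserved)
+            self.host_reference("file:///data/wpsoutputs/step1/result/")
+            # -> "https://weaver.example/wpsoutputs/step1/result/"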
""" wps_out_url = get_wps_output_url(self.settings) wps_out_dir = get_wps_output_dir(self.settings) - file_path = os.path.realpath(file_path.replace("file://", "")) # in case CWL->WPS outputs link was made - if file_path.startswith(wps_out_dir): - file_href = file_path.replace(wps_out_dir, wps_out_url, 1) - LOGGER.debug("Hosting file [%s] skipped since already on WPS outputs as [%s]", file_path, file_href) + ref_path = os.path.realpath(reference.replace("file://", "")) # in case CWL->WPS outputs link was made + ref_path += "/" if reference.endswith("/") else "" + if reference.startswith(wps_out_dir): + ref_href = ref_path.replace(wps_out_dir, wps_out_url, 1) + LOGGER.debug("Hosting file [%s] skipped since already on WPS outputs as [%s]", reference, ref_href) else: tmp_out_dir = tempfile.mkdtemp(dir=wps_out_dir) - file_link = fetch_file(file_path, tmp_out_dir, self.settings, link=True) - file_href = file_link.replace(wps_out_dir, wps_out_url, 1) + ref_link = fetch_reference(ref_path, tmp_out_dir, out_listing=False, + settings=self.settings, out_method=OutputMethod.LINK) + ref_href = ref_link.replace(wps_out_dir, wps_out_url, 1) self.temp_staging.add(tmp_out_dir) - LOGGER.debug("Hosting file [%s] as [%s] on [%s]", file_path, file_link, file_href) - return file_href + LOGGER.debug("Hosting file [%s] as [%s] on [%s]", reference, ref_link, ref_href) + return ref_href def stage_results(self, results, expected_outputs, out_dir): # type: (JobResults, CWL_ExpectedOutputs, str) -> None @@ -360,19 +364,19 @@ def stage_results(self, results, expected_outputs, out_dir): # Because CWL expects the file to be in specified 'out_dir', make a link for it to be found # even though the file is stored in the full job output location instead (already staged by step). map_path = map_wps_output_location(value, self.settings) - as_link = False + out_method = OutputMethod.COPY if map_path: LOGGER.info("Detected result [%s] from [%s] as local reference to this instance. 
" "Skipping fetch and using local copy in output destination: [%s]", res_id, value, dst_path) LOGGER.debug("Mapped result [%s] to local reference: [%s]", value, map_path) src_path = map_path - as_link = True + out_method = OutputMethod.LINK else: LOGGER.info("Fetching result [%s] from [%s] to CWL output destination: [%s]", res_id, value, dst_path) src_path = value - fetch_file(src_path, cwl_out_dir, settings=self.settings, link=as_link) + fetch_reference(src_path, cwl_out_dir, out_method=out_method, settings=self.settings) def stage_inputs(self, workflow_inputs): # type: (CWL_WorkflowInputs) -> JobInputs @@ -386,6 +390,11 @@ def stage_inputs(self, workflow_inputs): for workflow_input_value_item in workflow_input_value: if isinstance(workflow_input_value_item, dict) and "location" in workflow_input_value_item: location = workflow_input_value_item["location"] + # if the location came from a collected output resolved by cwltool from a previous Workflow step + # obtained directory type does not contain the expected trailing slash for Weaver reference checks + input_class = workflow_input_value_item.get("class", PACKAGE_FILE_TYPE) + if input_class == PACKAGE_DIRECTORY_TYPE: + location = location.rstrip("/") + "/" execute_body_inputs.append({"id": workflow_input_key, "href": location}) else: execute_body_inputs.append({"id": workflow_input_key, "data": workflow_input_value_item}) @@ -398,7 +407,7 @@ def stage_inputs(self, workflow_inputs): exec_input["href"] = f"file{exec_href}" LOGGER.debug("OpenSearch intermediate input [%s] : [%s]", exec_input["id"], exec_input["href"]) elif exec_input["href"].startswith("file://"): - exec_input["href"] = self.host_file(exec_input["href"]) + exec_input["href"] = self.host_reference(exec_input["href"]) LOGGER.debug("Hosting intermediate input [%s] : [%s]", exec_input["id"], exec_input["href"]) return execute_body_inputs diff --git a/weaver/processes/wps_workflow.py b/weaver/processes/wps_workflow.py index d146325ce..8fe8c6d60 100644 --- a/weaver/processes/wps_workflow.py +++ b/weaver/processes/wps_workflow.py @@ -20,7 +20,7 @@ CWL_REQUIREMENT_APP_ESGF_CWT, CWL_REQUIREMENT_APP_WPS1 ) -from weaver.processes.convert import is_cwl_file_type +from weaver.processes.convert import is_cwl_complex_type from weaver.utils import get_settings from weaver.wps.utils import get_wps_output_dir @@ -188,8 +188,9 @@ def collect_output( step :term:`Process` outputs were generated locally. """ if "outputBinding" in schema and "glob" in schema["outputBinding"]: + # in case of Directory collection with '
/', use '.' because cwltool replaces it by the outdir glob = schema["outputBinding"]["glob"] - glob = os.path.split(glob)[-1] + glob = os.path.split(glob)[-1] or "." schema["outputBinding"]["glob"] = glob output = super(WpsWorkflow, self).collect_output( schema, @@ -218,22 +219,24 @@ def __init__(self, builder.pathmapper = self.pathmapper self.wps_process = wps_process # type: WpsProcessInterface - self.expected_outputs = {} # type: CWL_ExpectedOutputs # {id: file-pattern} + self.expected_outputs = {} # type: CWL_ExpectedOutputs # {id: glob-pattern} for output in expected_outputs: - # TODO Should we support something else? - if is_cwl_file_type(output): - # Expecting output to look like this - # output = {"id": "file:///tmp/random_path/process_name#output_id, - # "type": "File", - # "outputBinding": {"glob": output_name } - # } + if is_cwl_complex_type(output): output_id = shortname(output["id"]) - output_glob = output["outputBinding"]["glob"].split("/")[-1] - self.expected_outputs[output_id] = ( - output_id + "/" + output_glob - if self.wps_process.stage_output_id_nested else - output_glob - ) + glob_spec = output["outputBinding"]["glob"] + glob_list = isinstance(glob_spec, list) + out_globs = set() + # When applications run by themselves, their output glob could be very + # deeply nested to retrieve files under specific directory structures. + # However, as Workflow step, those outputs would already have been collected + # on the step output dir. The Workflow only needs the last part of the glob + # to collect the staged out files without the nested directory hierarchy. + for glob in glob_spec if glob_list else [glob_spec]: + # in case of Directory collection with '/', use '.' because cwltool replaces it by the outdir + out_glob = glob.split("/")[-1] or "." + out_glob = (output_id + "/" + out_glob) if self.wps_process.stage_output_id_nested else out_glob + out_globs.add(out_glob) + self.expected_outputs[output_id] = out_globs if glob_list else list(out_globs)[0] # pylint: disable=W0221,W0237 # naming using python like arguments def _execute(self, diff --git a/weaver/quotation/estimation.py b/weaver/quotation/estimation.py index bbb81a94f..9886e2299 100644 --- a/weaver/quotation/estimation.py +++ b/weaver/quotation/estimation.py @@ -107,7 +107,7 @@ def estimate_workflow_quote(quote, process): @app.task(bind=True) -def process_quote_estimator(task, quote_id): # noqa: E811 +def process_quote_estimator(task, quote_id): # type: (Task, AnyUUID) -> AnyQuoteStatus """ Estimate :term:`Quote` parameters for the :term:`Process` execution. 
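
A minimal, self-contained sketch of the step-output glob normalization introduced above in
``weaver/processes/wps_workflow.py`` (names are illustrative; ``shortname`` and the exact output
schema come from ``cwltool``): only the last glob segment is kept, since Workflow steps collect
files already staged in the step output directory, optionally nested under the output ID.

    def normalize_step_glob(glob_spec, output_id, nested=True):
        # keep only the last glob segment; a 'Directory' glob ending with '/'
        # degenerates to '.' because cwltool replaces it by the output dir
        glob_list = isinstance(glob_spec, list)
        out_globs = set()
        for glob in glob_spec if glob_list else [glob_spec]:
            out_glob = glob.split("/")[-1] or "."
            out_globs.add(f"{output_id}/{out_glob}" if nested else out_glob)
        return out_globs if glob_list else list(out_globs)[0]

    assert normalize_step_glob("results/nested/*.tif", "output") == "output/*.tif"
    assert normalize_step_glob("results/dir/", "output") == "output/."
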
diff --git a/weaver/typedefs.py b/weaver/typedefs.py index 61b441989..e769be5b6 100644 --- a/weaver/typedefs.py +++ b/weaver/typedefs.py @@ -13,7 +13,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type, TypeVar, Union import psutil - from typing_extensions import Literal, NotRequired, Protocol, TypeAlias, TypedDict + from typing_extensions import Literal, NotRequired, ParamSpec, Protocol, Required, TypeAlias, TypedDict if hasattr(os, "PathLike"): FileSystemPathType = Union[os.PathLike, str] @@ -33,7 +33,7 @@ pass if MemoryInfo is Any: if TypedDict is Dict: - MemoryInfo = Dict + MemoryInfo = Dict[str, int] else: MemoryInfo = TypedDict("MemoryInfo", { "rss": int, @@ -62,15 +62,20 @@ from webtest.response import TestResponse from werkzeug.wrappers import Request as WerkzeugRequest + from weaver.execute import AnyExecuteControlOption, AnyExecuteMode, AnyExecuteResponse, AnyExecuteTransmissionMode from weaver.processes.constants import CWL_RequirementNames from weaver.processes.wps_process_base import WpsProcessInterface from weaver.datatype import Process from weaver.status import AnyStatusType + from weaver.visibility import AnyVisibility - ReturnValue = TypeVar("ReturnValue") # alias to identify the same return value as a decorated/wrapped function + Path = Union[os.PathLike, str, bytes] + + Params = ParamSpec("Params") # use with 'Callable[Params, Return]', 'Params.args' and 'Params.kwargs' + Return = TypeVar("Return") # alias to identify the same return value as a decorated/wrapped function AnyCallable = TypeVar("AnyCallable", bound=Callable[..., Any]) # callable used for decorated/wrapped functions - AnyCallableWrapped = Callable[[..., Any], ReturnValue] - AnyCallableAnyArgs = Union[Callable[[], ReturnValue], Callable[[..., Any], ReturnValue]] + AnyCallableWrapped = Callable[Params, Return] + AnyCallableAnyArgs = Union[Callable[[], Return], Callable[[..., Any], Return]] # pylint: disable=C0103,invalid-name Number = Union[int, float] @@ -84,24 +89,27 @@ _JSON: TypeAlias = "JSON" _JsonObjectItemAlias: TypeAlias = "_JsonObjectItem" _JsonListItemAlias: TypeAlias = "_JsonListItem" - _JsonObjectItem = Dict[str, Union[_JSON, _JsonObjectItemAlias, _JsonListItemAlias]] - _JsonListItem = List[Union[AnyValueType, _JsonObjectItem, _JsonListItemAlias]] - _JsonItem = Union[AnyValueType, _JsonObjectItem, _JsonListItem, _JSON] - JSON = Union[Dict[str, _JsonItem], List[_JsonItem], AnyValueType] + _JsonObjectItem = Dict[str, Union[AnyValueType, _JSON, _JsonObjectItemAlias, _JsonListItemAlias]] + _JsonListItem = List[Union[AnyValueType, _JSON, _JsonObjectItem, _JsonListItemAlias]] + _JsonItem = Union[AnyValueType, _JSON, _JsonObjectItem, _JsonListItem] + JSON = Union[Dict[str, Union[_JSON, _JsonItem]], List[Union[_JSON, _JsonItem]], AnyValueType] Link = TypedDict("Link", { - "rel": str, "title": str, - "href": str, + "rel": Required[str], + "href": Required[str], "hreflang": NotRequired[str], "type": NotRequired[str], # IANA Media-Type }, total=False) Metadata = TypedDict("Metadata", { "title": str, "role": str, # URL + "href": str, + "hreflang": str, + "rel": str, "value": str, "lang": NotRequired[str], - "type": NotRequired[str], # FIXME: relevant? 
+ "type": NotRequired[str], }, total=False) LogLevelStr = Literal[ @@ -165,11 +173,6 @@ CWL_RequirementsDict = Dict[CWL_RequirementNames, Dict[str, str]] # {'': {: }} CWL_RequirementsList = List[CWL_Requirement] # [{'class': , : }] CWL_AnyRequirements = Union[CWL_RequirementsDict, CWL_RequirementsList] - # results from CWL execution - CWL_ResultFile = TypedDict("CWL_ResultFile", {"location": str}, total=False) - CWL_ResultValue = Union[AnyValueType, List[AnyValueType]] - CWL_ResultEntry = Union[Dict[str, CWL_ResultValue], CWL_ResultFile, List[CWL_ResultFile]] - CWL_Results = Dict[str, CWL_ResultEntry] CWL_Class = Literal["CommandLineTool", "ExpressionTool", "Workflow"] CWL_WorkflowStep = TypedDict("CWL_WorkflowStep", { "run": str, @@ -184,8 +187,8 @@ _CWL = "CWL" # type: TypeAlias CWL_Graph = List[_CWL] CWL = TypedDict("CWL", { - "cwlVersion": str, - "class": CWL_Class, + "cwlVersion": Required[str], + "class": Required[CWL_Class], "label": str, "doc": str, "id": NotRequired[str], @@ -218,40 +221,63 @@ except (AttributeError, ImportError, NameError): CWL_ToolPathObject = CWL - # CWL loading - CWL_WorkflowInputs = Dict[str, AnyValueType] # mapping of ID:value (any type) - CWL_ExpectedOutputs = Dict[str, AnyValueType] # mapping of ID:pattern (File only) - JobProcessDefinitionCallback = Callable[[str, Dict[str, str], Dict[str, Any]], WpsProcessInterface] - # CWL runtime - CWL_RuntimeLiteral = Union[str, float, int] + CWL_RuntimeLiteral = AnyValueType + CWL_RuntimeLiteralItem = Union[CWL_RuntimeLiteral, List[CWL_RuntimeLiteral]] CWL_RuntimeLiteralObject = TypedDict("CWL_RuntimeLiteralObject", { "id": str, - "value": CWL_RuntimeLiteral, + "value": CWL_RuntimeLiteralItem, }, total=False) CWL_RuntimeInputFile = TypedDict("CWL_RuntimeInputFile", { "id": NotRequired[str], - "class": str, - "location": str, + "class": Required[Literal["File"]], + "location": Required[str], "format": NotRequired[Optional[str]], - "basename": str, - "nameroot": str, - "nameext": str, + "basename": NotRequired[str], + "nameroot": NotRequired[str], + "nameext": NotRequired[str], }, total=False) CWL_RuntimeOutputFile = TypedDict("CWL_RuntimeOutputFile", { - "class": str, - "location": str, + "class": Required[Literal["File"]], + "location": Required[str], + "format": NotRequired[Optional[str]], + "basename": NotRequired[str], + "nameroot": NotRequired[str], + "nameext": NotRequired[str], + "checksum": NotRequired[str], + "size": NotRequired[int], + }, total=False) + CWL_RuntimeInputDirectory = TypedDict("CWL_RuntimeInputDirectory", { + "id": NotRequired[str], + "class": Required[Literal["Directory"]], + "location": Required[str], + "format": NotRequired[Optional[str]], + "nameroot": NotRequired[str], + "nameext": NotRequired[str], + "basename": NotRequired[str], + "listing": List[CWL_RuntimeInputFile], + }, total=False) + CWL_RuntimeOutputDirectory = TypedDict("CWL_RuntimeOutputDirectory", { + "class": Required[Literal["Directory"]], + "location": Required[str], "format": NotRequired[Optional[str]], - "basename": str, - "nameroot": str, - "nameext": str, + "basename": NotRequired[str], + "nameroot": NotRequired[str], + "nameext": NotRequired[str], "checksum": NotRequired[str], - "size": NotRequired[str], + "size": NotRequired[Literal[0]], + "listing": List[CWL_RuntimeOutputFile], }, total=False) - CWL_RuntimeInput = Union[CWL_RuntimeLiteral, CWL_RuntimeInputFile] + CWL_RuntimeInput = Union[CWL_RuntimeLiteralItem, CWL_RuntimeInputFile, CWL_RuntimeInputDirectory] CWL_RuntimeInputsMap = Dict[str, CWL_RuntimeInput] - 
CWL_RuntimeInputList = List[Union[CWL_RuntimeLiteralObject, CWL_RuntimeInputFile]] - CWL_RuntimeOutput = Union[CWL_RuntimeLiteral, CWL_RuntimeOutputFile] + CWL_RuntimeInputList = List[Union[CWL_RuntimeLiteralObject, CWL_RuntimeInputFile, CWL_RuntimeInputDirectory]] + CWL_RuntimeOutput = Union[CWL_RuntimeLiteral, CWL_RuntimeOutputFile, CWL_RuntimeOutputDirectory] + CWL_Results = Dict[str, CWL_RuntimeOutput] + + # CWL loading + CWL_WorkflowInputs = CWL_RuntimeInputsMap # mapping of ID:value (any type) + CWL_ExpectedOutputs = Dict[str, str] # mapping of ID:glob-pattern (File/Directory only) + JobProcessDefinitionCallback = Callable[[str, Dict[str, str], Dict[str, Any]], WpsProcessInterface] # OWSLib Execution # inputs of OWSLib are either a string (any literal type, bbox or complex file) @@ -334,6 +360,8 @@ JobValueFormat = TypedDict("JobValueFormat", { "mime_type": NotRequired[str], "media_type": NotRequired[str], + "mimeType": NotRequired[str], + "mediaType": NotRequired[str], "encoding": NotRequired[str], "schema": NotRequired[str], "extension": NotRequired[str], @@ -350,27 +378,31 @@ }, total=False) JobValueObject = Union[JobValueData, JobValueValue, JobValueFile] JobValueFileItem = TypedDict("JobValueFileItem", { - "id": str, - "href": Optional[str], - "format": Optional[JobValueFormat], + "id": Required[str], + "href": Required[str], + "format": NotRequired[JobValueFormat], }, total=False) JobValueDataItem = TypedDict("JobValueDataItem", { - "id": str, - "data": AnyValueType, + "id": Required[str], + "data": Required[AnyValueType], }, total=False) JobValueValueItem = TypedDict("JobValueValueItem", { - "id": str, - "value": AnyValueType, + "id": Required[str], + "value": Required[AnyValueType], }, total=False) - JobValueItem = Union[JobValueDataItem, JobValueFileItem] + JobValueItem = Union[JobValueDataItem, JobValueFileItem, JobValueValueItem] JobExpectItem = TypedDict("JobExpectItem", {"id": str}, total=True) - JobInputs = List[Union[JobValueItem, Dict[str, AnyValueType]]] - JobOutputs = List[Union[JobExpectItem, Dict[str, AnyValueType]]] + JobInputItem = Union[JobValueItem, Dict[str, AnyValueType]] + JobInputs = List[JobInputItem] + JobOutputItem = Union[JobExpectItem, Dict[str, AnyValueType]] + JobOutputs = List[JobOutputItem] JobResults = List[JobValueItem] - JobMonitorReference = Any # typically an URI of the remote job status or an execution object/handler + JobMonitorReference = Any # typically a URI of the remote job status or an execution object/handler - ExecutionInputsMap = Dict[str, JobValueObject] # when schema='weaver.processes.constants.ProcessSchema.OGC' - ExecutionInputsList = List[JobValueItem] # when schema='weaver.processes.constants.ProcessSchema.OLD' + # when schema='weaver.processes.constants.ProcessSchema.OGC' + ExecutionInputsMap = Dict[str, Union[JobValueObject, List[JobValueObject]]] + # when schema='weaver.processes.constants.ProcessSchema.OLD' + ExecutionInputsList = List[JobValueItem] ExecutionInputs = Union[ExecutionInputsList, ExecutionInputsMap] ExecutionOutputObject = TypedDict("ExecutionOutputObject", { @@ -378,7 +410,8 @@ }, total=False) ExecutionOutputItem = TypedDict("ExecutionOutputItem", { "id": str, - "transmissionMode": str + "transmissionMode": AnyExecuteTransmissionMode, + "format": NotRequired[JobValueFormat], }, total=False) ExecutionOutputsList = List[ExecutionOutputItem] ExecutionOutputsMap = Dict[str, ExecutionOutputObject] @@ -482,10 +515,11 @@ "properties": NotRequired[Dict[str, _OpenAPISchemaProperty]], "additionalProperties": 
NotRequired[Union[bool, Dict[str, Union[_OpenAPISchema, OpenAPISchemaReference]]]], }, total=False) - OpenAPISchemaObject = TypedDict("OpenAPISchemaObject", { + _OpenAPISchemaObject = TypedDict("_OpenAPISchemaObject", { "type": Literal["object"], "properties": Dict[str, OpenAPISchemaProperty], }, total=False) + OpenAPISchemaObject = Union[_OpenAPISchemaObject, OpenAPISchemaProperty] OpenAPISchemaArray = TypedDict("OpenAPISchemaArray", { "type": Literal["array"], "items": _OpenAPISchema, @@ -509,12 +543,12 @@ OpenAPISchemaNot, ] OpenAPISchema = Union[ + OpenAPISchemaMetadata, OpenAPISchemaObject, OpenAPISchemaArray, OpenAPISchemaKeyword, OpenAPISchemaProperty, OpenAPISchemaReference, - OpenAPISchemaMetadata, ] OpenAPISpecLicence = TypedDict("OpenAPISpecLicence", { "name": str, @@ -686,3 +720,91 @@ "responses": NotRequired[Dict[str, OpenAPISpecResponse]], # Swagger 2.0, OpenAPI 3.0: 'components/responses' "externalDocs": NotRequired[OpenAPISpecExternalDocs], }, total=True) + + FormatMediaType = TypedDict("FormatMediaType", { + "mediaType": Required[str], + "encoding": NotRequired[Optional[str]], + "schema": NotRequired[Union[str, OpenAPISchema]], + "default": NotRequired[bool], + }, total=False) + ProcessInputOutputItem = TypedDict("ProcessInputOutputItem", { + "id": str, + "title": NotRequired[str], + "description": NotRequired[str], + "keywords": NotRequired[List[str]], + "metadata": NotRequired[List[Metadata]], + "schema": NotRequired[OpenAPISchema], + "formats": NotRequired[List[FormatMediaType]], + "minOccurs": int, + "maxOccurs": Union[int, Literal["unbounded"]], + }, total=False) + ProcessInputOutputMap = Dict[str, ProcessInputOutputItem] + ProcessInputOutputList = List[ProcessInputOutputItem] + # Provide distinct types with mapping/listing representation of I/O to help annotation + # checkers resolve them more easily using less nested fields if specified explicitly + ProcessOfferingMapping = TypedDict("ProcessOfferingMapping", { + "id": Required[str], + "version": Optional[str], + "title": NotRequired[str], + "description": NotRequired[str], + "keywords": NotRequired[List[str]], + "metadata": NotRequired[List[Metadata]], + "inputs": Required[ProcessInputOutputMap], + "outputs": Required[ProcessInputOutputMap], + "jobControlOptions": List[AnyExecuteControlOption], + "outputTransmission": List[AnyExecuteControlOption], + "deploymentProfile": str, + "processDescriptionURL": NotRequired[str], + "processEndpointWPS1": NotRequired[str], + "executeEndpoint": NotRequired[str], + "links": List[Link], + "visibility": NotRequired[AnyVisibility], + }, total=False) + ProcessOfferingListing = TypedDict("ProcessOfferingListing", { + "id": Required[str], + "version": Optional[str], + "title": NotRequired[str], + "description": NotRequired[str], + "keywords": NotRequired[List[str]], + "metadata": NotRequired[List[Metadata]], + "inputs": Required[ProcessInputOutputList], + "outputs": Required[ProcessInputOutputList], + "jobControlOptions": List[AnyExecuteControlOption], + "outputTransmission": List[AnyExecuteControlOption], + "deploymentProfile": str, + "processDescriptionURL": NotRequired[str], + "processEndpointWPS1": NotRequired[str], + "executeEndpoint": NotRequired[str], + "links": List[Link], + "visibility": NotRequired[AnyVisibility], + }, total=False) + ProcessOffering = Union[ProcessOfferingMapping, ProcessOfferingListing] + ProcessDescriptionNestedMapping = TypedDict("ProcessDescriptionNestedMapping", { + "process": ProcessOfferingMapping, + }, total=False) + ProcessDescriptionNestedListing 
= TypedDict("ProcessDescriptionNestedListing", { + "process": ProcessOfferingListing, + }, total=False) + ProcessDescriptionNested = TypedDict("ProcessDescriptionNested", { + "process": ProcessOffering, + }, total=False) + ProcessDescriptionMapping = Union[ProcessOfferingMapping, ProcessDescriptionNestedMapping] + ProcessDescriptionListing = Union[ProcessOfferingListing, ProcessDescriptionNestedListing] + ProcessDescription = Union[ProcessDescriptionMapping, ProcessDescriptionListing] + + ExecutionUnitItem = TypedDict("ExecutionUnitItem", { + "unit": CWL + }, total=True) + ProcessDeployment = TypedDict("ProcessDeployment", { + "processDescription": ProcessDescription, + "executionUnit": List[Union[ExecutionUnitItem, Link]], + "immediateDeployment": NotRequired[bool], + "deploymentProfileName": str, + }, total=True) + + ProcessExecution = TypedDict("ProcessExecution", { + "mode": NotRequired[AnyExecuteMode], + "response": NotRequired[AnyExecuteResponse], + "inputs": Required[ExecutionInputs], + "outputs": Required[ExecutionOutputs], + }, total=False) diff --git a/weaver/utils.py b/weaver/utils.py index 25edbcd8d..1f16cf053 100644 --- a/weaver/utils.py +++ b/weaver/utils.py @@ -1,9 +1,9 @@ import difflib import errno +import fnmatch import functools import importlib.util import inspect -import json import logging import os import posixpath @@ -11,8 +11,10 @@ import shutil import sys import tempfile +import threading import time import warnings +from concurrent.futures import ALL_COMPLETED, CancelledError, ThreadPoolExecutor, as_completed, wait as wait_until from copy import deepcopy from datetime import datetime from distutils.version import LooseVersion @@ -26,8 +28,11 @@ import yaml from beaker.cache import cache_region, region_invalidate from beaker.exceptions import BeakerException +from botocore.config import Config as S3Config +from bs4 import BeautifulSoup from celery.app import Celery from jsonschema.validators import RefResolver as JsonSchemaRefResolver +from mypy_boto3_s3.literals import RegionName from pyramid.config import Configurator from pyramid.exceptions import ConfigurationError from pyramid.httpexceptions import ( @@ -50,9 +55,11 @@ from werkzeug.wrappers import Request as WerkzeugRequest from yaml.scanner import ScannerError -from weaver.base import Constants +import xml_util +from weaver.base import Constants, ExtendedEnum +from weaver.exceptions import WeaverException from weaver.execute import ExecuteControlOption, ExecuteMode -from weaver.formats import ContentType, get_content_type +from weaver.formats import ContentType, get_content_type, repr_json from weaver.status import map_status from weaver.warning import TimeZoneInfoAlreadySetWarning from weaver.xml_util import XML @@ -61,10 +68,12 @@ from types import FrameType from typing import ( Any, + AnyStr, Callable, Dict, List, Iterable, + Iterator, MutableMapping, NoReturn, Optional, @@ -73,13 +82,16 @@ Tuple, Union ) - from typing_extensions import TypeGuard + from typing_extensions import NotRequired, TypedDict, TypeGuard + + from mypy_boto3_s3.client import S3Client from weaver.execute import AnyExecuteControlOption, AnyExecuteMode from weaver.status import Status from weaver.typedefs import ( AnyCallable, AnyCallableAnyArgs, + AnyCookiesContainer, AnyKey, AnyHeadersContainer, AnySettingsContainer, @@ -96,11 +108,66 @@ Literal, OpenAPISchema, Number, - ReturnValue, + Params, + Path, + Return, SettingsType ) RetryCondition = Union[Type[Exception], Iterable[Type[Exception]], Callable[[Exception], bool]] + 
SchemeOptions = TypedDict("SchemeOptions", { + "file": Dict[str, JSON], + "http": Dict[str, JSON], # includes/duplicates HTTPS + "https": Dict[str, JSON], # includes/duplicates HTTP + "s3": Dict[str, JSON], + "vault": Dict[str, JSON], + }, total=True) + RequestOptions = TypedDict("RequestOptions", { + "timeout": NotRequired[int], + "connect_timeout": NotRequired[int], + "read_timeout": NotRequired[int], + "retry": NotRequired[int], + "retries": NotRequired[int], + "max_retries": NotRequired[int], + "backoff": NotRequired[Number], + "backoff_factor": NotRequired[Number], + "headers": NotRequired[AnyHeadersContainer], + "cookies": NotRequired[AnyCookiesContainer], + }, total=False) + + ResponseMetadata = TypedDict("ResponseMetadata", { + "RequestId": str, + "HTTPStatusCode": int, + "HTTPHeaders": HeadersType, + "RetryAttempts": int, + }, total=True) + S3FileContent = TypedDict("S3FileContent", { + "Key": str, + "LastModified": datetime, + "ETag": str, + "Size": int, + "StorageClass": Literal[ + "STANDARD", + "REDUCED_REDUNDANCY", + "GLACIER", + "STANDARD_IA", + "ONEZONE_IA", + "INTELLIGENT_TIERING", + "DEEP_ARCHIVE", + "OUTPOSTS", + "GLACIER_IR" + ], + }, total=True) + S3DirectoryListingResponse = TypedDict("S3DirectoryListingResponse", { + "ResponseMetadata": ResponseMetadata, + "IsTruncated": bool, + "Contents": List[S3FileContent], + "Name": str, # bucket + "Prefix": Optional[str], + "MaxKeys": int, + "KeyCount": int, + "EncodingType": Literal["url"], + }, total=True) OriginalClass = TypeVar("OriginalClass") ExtenderMixin = TypeVar("ExtenderMixin") @@ -108,6 +175,7 @@ class ExtendedClass(OriginalClass, ExtenderMixin): ... + LOGGER = logging.getLogger(__name__) SUPPORTED_FILE_SCHEMES = frozenset([ @@ -122,6 +190,45 @@ class ExtendedClass(OriginalClass, ExtenderMixin): FILE_NAME_QUOTE_PATTERN = re.compile(r"^\"?([\w\-.]+\.\w+)\"?$") # extension required, permissive extra quotes FILE_NAME_LOOSE_PATTERN = re.compile(r"^[\w\-.]+$") # no extension required +if sys.version_info >= (3, 7): + _LITERAL_VALUES_ATTRIBUTE = "__args__" +else: + _LITERAL_VALUES_ATTRIBUTE = "__values__" # pragma: no cover +AWS_S3_REGIONS = list(getattr(RegionName, _LITERAL_VALUES_ATTRIBUTE)) # type: List[RegionName] +AWS_S3_REGIONS_REGEX = "(" + "|".join(AWS_S3_REGIONS) + ")" +# https://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html +AWS_S3_ARN = "arn:aws:s3" +# https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html +# https://stackoverflow.com/questions/50480924/regex-for-s3-bucket-name +AWS_S3_BUCKET_NAME_PATTERN = re.compile( + r"^" + r"(?!(^xn--|.+-s3alias$))" # prefix/suffix disallowed by reserved AWS use for other bucket access point variations + # lowercase only allowed, in range (min=3, max=63) characters + r"[a-z0-9]" # only alphanumeric start + r"(?:(\.(?!\.))|[a-z0-9-]){1,61}" # alphanumeric with dash/dot allowed, but repeated dots disallowed + r"[a-z0-9]" # only alphanumeric end + r"$" +) +# Bucket ARN = +# - arn:aws:s3:{Region}:{AccountId}:accesspoint/{AccessPointName}[/file-key] +# - arn:aws:s3-outposts:{Region}:{AccountId}:outpost/{OutpostId}/bucket/{Bucket}[/file-key] +# - arn:aws:s3-outposts:{Region}:{AccountId}:outpost/{OutpostId}/accesspoint/{AccessPointName}[/file-key] +AWS_S3_BUCKET_ARN_PATTERN = re.compile( + r"^" + rf"(?P{AWS_S3_ARN}(?:-outposts)?):" + rf"(?P{AWS_S3_REGIONS_REGEX}):" + r"(?P[a-z0-9]+):" + r"(?Paccesspoint|outpost)/" + r"(?P[a-z0-9][a-z0-9-]+[a-z0-9])" + r"$" +) +AWS_S3_BUCKET_REFERENCE_PATTERN = re.compile( + r"^(?Ps3://)" + 
rf"(?P{AWS_S3_BUCKET_NAME_PATTERN.pattern[1:-1]}|{AWS_S3_BUCKET_ARN_PATTERN.pattern[1:-1]})" + r"(?P(?:/$|/[\w.-]+)+)" # sub-dir and file-key path, minimally only dir trailing slash + r"$" +) + class CaseInsensitive(str): __str = None @@ -146,7 +253,7 @@ def __eq__(self, other): return self.__str.casefold() == str(other).casefold() -NUMBER_PATTERN = re.compile(r"^(?P[+-]?[0-9]+[.]?[0-9]*([e][+-]?[0-9]+)?)\s*(?P.*)$") +NUMBER_PATTERN = re.compile(r"^(?P[+-]?[0-9]+[.]?[0-9]*(e[+-]?[0-9]+)?)\s*(?P.*)$") UNIT_SI_POWER_UP = [CaseInsensitive("k"), "M", "G", "T", "P", "E", "Z", "Y"] # allow upper 'K' often used UNIT_SI_POWER_DOWN = ["m", "µ", "n", "p", "f", "a", "z", "y"] UNIT_BIN_POWER = ["Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"] @@ -211,6 +318,10 @@ class SchemaRefResolver(JsonSchemaRefResolver): Reference resolver that supports both :term:`JSON` and :term:`YAML` files from a remote location. """ # only need to override the remote resolution to add YAML support + # init overload used to patch invalid typing definition + def __init__(self, base_uri, referrer, *_, **__): + # type: (str, OpenAPISchema, *Any, **Any) -> None + super(SchemaRefResolver, self).__init__(base_uri, referrer, *_, **__) # type: ignore def resolve_remote(self, uri): # type: (str) -> OpenAPISchema @@ -1345,7 +1456,7 @@ def get_no_cache_option(request_headers, request_options): def get_request_options(method, url, settings): - # type: (str, str, AnySettingsContainer) -> SettingsType + # type: (str, str, AnySettingsContainer) -> RequestOptions """ Obtains the *request options* corresponding to the request from the configuration file. @@ -1400,11 +1511,11 @@ def get_request_options(method, url, settings): def retry_on_condition(operation, # type: AnyCallableAnyArgs - *args, # type: Any + *args, # type: Params.args condition=Exception, # type: RetryCondition retries=1, # type: int - **kwargs, # type: Any - ): # type: (...) -> ReturnValue + **kwargs, # type: Params.kwargs + ): # type: (...) -> Return """ Retries the operation call up to the amount of specified retries if the condition is encountered. @@ -1465,7 +1576,7 @@ def retry_on_cache_error(func): """ @functools.wraps(func) def wrapped(*args, **kwargs): - # type: (*Any, **Any) -> ReturnValue + # type: (*Any, **Any) -> Return try: return func(*args, **kwargs) except BeakerException as exc: @@ -1515,7 +1626,7 @@ def request_extra(method, # type: AnyRequestMethod only_server_errors=True, # type: bool ssl_verify=None, # type: Optional[bool] settings=None, # type: Optional[AnySettingsContainer] - **request_kwargs, # type: Any + **request_kwargs, # type: Any # RequestOptions ): # type: (...) -> AnyResponseType """ Standard library :mod:`requests` with additional functional utilities. @@ -1675,18 +1786,21 @@ def request_extra(method, # type: AnyRequestMethod return err -def download_file_http(file_reference, file_outdir, settings=None, **request_kwargs): - # type: (str, str, Optional[AnySettingsContainer], **Any) -> str +def download_file_http(file_reference, file_outdir, settings=None, callback=None, **request_kwargs): + # type: (str, str, Optional[AnySettingsContainer], Optional[Callable[[str], None]], **Any) -> str """ Downloads the file referenced by an HTTP URL location. Respects :rfc:`2183`, :rfc:`5987` and :rfc:`6266` regarding ``Content-Disposition`` header handling to resolve - any preferred file name. This value is employed if it fulfill validation criteria. Otherwise, the name is extracted + any preferred file name. 
This value is employed if it fulfills validation criteria. Otherwise, the name is extracted from the last part of the URL path. :param file_reference: HTTP URL where the file is hosted. :param file_outdir: Output local directory path under which to place the downloaded file. :param settings: Additional request-related settings from the application configuration (notably request-options). + :param callback: + Function that gets called progressively with incoming chunks from downloaded file. + Can be used to monitor download progress or raise an exception to abort it. :param request_kwargs: Additional keywords to forward to request call (if needed). :return: Path of the local copy of the fetched file. :raises HTTPException: applicable HTTP-based exception if any unrecoverable problem occurred during fetch request. @@ -1695,7 +1809,7 @@ def download_file_http(file_reference, file_outdir, settings=None, **request_kwa LOGGER.debug("Fetch file resolved as remote URL reference.") request_kwargs.pop("stream", None) - resp = request_extra("get", file_reference, stream=True, retries=3, settings=settings, **request_kwargs) + resp = request_extra("GET", file_reference, stream=True, retries=3, settings=settings, **request_kwargs) if resp.status_code >= 400: # pragma: no cover # use method since response object does not derive from Exception, therefore cannot be raised directly @@ -1741,93 +1855,321 @@ def download_file_http(file_reference, file_outdir, settings=None, **request_kwa # Setting 'chunk_size=None' lets the request find a suitable size according to # available memory. Without this, it defaults to 1 which is extremely slow. for chunk in resp.iter_content(chunk_size=None): + if callback: + callback(chunk) file.write(chunk) return file_path -def fetch_file(file_reference, file_outdir, settings=None, link=None, move=False, **request_kwargs): - # type: (str, str, Optional[AnySettingsContainer], Optional[bool], bool, **Any) -> str +def validate_s3(*, region, bucket): + # type: (Any, str, str) -> None """ - Fetches a file from local path, AWS-S3 bucket or remote URL, and dumps it's content to the output directory. + Validate patterns and allowed values for :term:`AWS` :term:`S3` client configuration. + """ + if not re.match(AWS_S3_REGIONS_REGEX, region) or region not in AWS_S3_REGIONS: + raise ValueError(f"Invalid AWS S3 Region format or value for: [{region!s}]\n") + if not re.match(AWS_S3_BUCKET_NAME_PATTERN, bucket): + raise ValueError(f"Invalid AWS S3 Bucket format or value for: [{bucket!s}]\n") + LOGGER.debug("All valid AWS S3 parameters: [Region=%s, Bucket=%s]", region, bucket) + + +def resolve_s3_from_http(reference): + # type: (str) -> Tuple[str, RegionName] + """ + Resolve an HTTP URL reference pointing to an S3 Bucket into the shorthand URL notation with S3 scheme. + + The expected reference should be formatted with one of the following supported formats. + + .. code-block:: text + + # Path-style URL + https://s3.{Region}.amazonaws.com/{Bucket}/[{dirs}/][{file-key}] + + # Virtual-hosted–style URL + https://{Bucket}.s3.{Region}.amazonaws.com/[{dirs}/][{file-key}] + + # Access-Point-style URL + https://{AccessPointName}-{AccountId}.s3-accesspoint.{Region}.amazonaws.com/[{dirs}/][{file-key}] + + # Outposts-style URL + https://{AccessPointName}-{AccountId}.{outpostID}.s3-outposts.{Region}.amazonaws.com/[{dirs}/][{file-key}] + + .. 
seealso:: + References on formats: + + - https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html + - https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-bucket-intro.html + - https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-access-points.html + - https://docs.aws.amazon.com/AmazonS3/latest/userguide/S3onOutposts.html + + .. seealso:: + References on resolution: + + - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html + + :param reference: HTTP-S3 URL reference. + :return: Updated S3 reference and applicable S3 Region name. + """ + s3 = boto3.client("s3") # type: S3Client # created with default, environment, or ~/.aws/config + s3_url = s3.meta.endpoint_url # includes the region name, to be used to check if we must switch region + s3_region = s3.meta.region_name + try: + if not reference.startswith(s3_url): + LOGGER.warning( + "Detected HTTP reference to AWS S3 bucket [%s] that mismatches server region configuration [%s]. " + "Attempting to switch S3 region for proper resolution.", + reference, s3_region + ) + s3_parsed = urlparse(reference) + s3_host = s3_parsed.hostname + s3_path = s3_parsed.path + if ".s3-outposts." in s3_host: + # boto3 wants: + # Bucket ARN = + # - arn:aws:s3-outposts:{Region}:{AccountId}:outpost/{OutpostId}/bucket/{Bucket} + # - arn:aws:s3-outposts:{Region}:{AccountId}:outpost/{OutpostId}/accesspoint/{AccessPointName} + s3_outpost, s3_region = s3_host.split(".s3-outposts.", 1) + s3_access_point, s3_outpost_id = s3_outpost.rsplit(".", 1) + s3_access_name, s3_account = s3_access_point.rsplit("-", 1) + s3_region = s3_region.split(".amazonaws.com", 1)[0] + s3_ref = s3_path + s3_prefix = f"{AWS_S3_ARN}-outposts" + s3_arn = f"{s3_prefix}:{s3_region}:{s3_account}:outpost/{s3_outpost_id}/accesspoint/{s3_access_name}" + s3_reference = f"s3://{s3_arn}{s3_ref}" + elif ".s3-accesspoint." in s3_host: + # boto3 wants: + # Bucket ARN = arn:aws:s3:{Region}:{AccountId}:accesspoint/{AccessPointName} + s3_access_point, s3_region = s3_host.split(".s3-accesspoint.", 1) + s3_access_name, s3_account = s3_access_point.rsplit("-", 1) + s3_region = s3_region.split(".amazonaws.com", 1)[0] + s3_ref = s3_path + s3_arn = f"{AWS_S3_ARN}:{s3_region}:{s3_account}:accesspoint/{s3_access_name}" + s3_reference = f"s3://{s3_arn}{s3_ref}" + elif ".s3." in s3_host: + s3_bucket, s3_region = reference.split(".s3.", 1) + s3_region, s3_ref = s3_region.split(".amazonaws.com", 1) + s3_bucket = s3_bucket.rsplit("://", 1)[-1].strip("/") + s3_ref = s3_ref.lstrip("/") + s3_reference = f"s3://{s3_bucket}/{s3_ref}" + else: + s3_region, s3_ref = reference.split("https://s3.")[-1].split(".amazonaws.com") + s3_ref = s3_ref.lstrip("/") + s3_reference = f"s3://{s3_ref}" + else: + s3_ref = reference.replace(s3_url, "") + s3_ref = s3_ref.lstrip("/") + s3_reference = f"s3://{s3_ref}" + if not re.match(AWS_S3_BUCKET_REFERENCE_PATTERN, s3_reference) or not s3_ref: + raise ValueError("No S3 bucket, region or file/directory reference was " + f"found from input reference [{reference}].") + except (IndexError, TypeError, ValueError) as exc: + s3_valid_formats = [ + "https://s3.{Region}.amazonaws.com/{Bucket}/[{dirs}/][{file-key}]", + "https://{Bucket}.s3.{Region}.amazonaws.com/[{dirs}/][{file-key}]", + "https://{AccessPointName}-{AccountId}.s3-accesspoint.{Region}.amazonaws.com/[{dirs}/][{file-key}]", + "s3://{Bucket}/[{dirs}/][{file-key}] (**default region**)" # not parsed here, but show as valid option + ] + raise ValueError(f"Invalid AWS S3 reference format. 
Could not parse unknown: [{reference!s}]\n" + f"Available formats:\n{repr_json(s3_valid_formats, indent=2)}") from exc + LOGGER.debug("Adjusting HTTP reference to S3 URL style with resolved S3 Region:\n" + " Initial: [%s]\n" + " Updated: [%s]\n" + " Region: [%s]", + reference, s3_reference, s3_region) + return s3_reference, s3_region + + +def resolve_s3_reference(s3_reference): + # type: (str) -> Tuple[str, str, Optional[RegionName]] + """ + Resolve a reference of :term:`S3` scheme into the appropriate formats expected by :mod:`boto3`. + + :param s3_reference: Reference with ``s3://`` scheme with an ARN or literal Bucket/Object path. + :return: Tuple of resolved Bucket name, Object path and S3 Region. + """ + s3_ref = s3_reference[5:] + if s3_ref.startswith(AWS_S3_ARN): + s3_arn_match = re.match(AWS_S3_BUCKET_REFERENCE_PATTERN, s3_reference) + if not s3_arn_match: + raise ValueError( + f"Invalid AWS S3 ARN reference must have one of [accesspoint, outpost] target. " + f"None could be found in [{s3_reference}]." + ) + if s3_arn_match["type_name"] == "outpost": + parts = s3_arn_match["path"].split("/", 4) + bucket_name = s3_arn_match["bucket"] + "/".join(parts[:3]) + file_key = "/".join(parts[3:]) + elif s3_arn_match["type_name"] == "accesspoint": + bucket_name = s3_arn_match["bucket"] + file_key = s3_arn_match["path"] + else: + raise ValueError( + "Invalid AWS S3 ARN reference must have one of [accesspoint, outpost] target. " + f"None could be found in [{s3_reference}]." + ) + s3_region = s3_arn_match["region"] + else: + s3_region = None # default or predefined by caller + bucket_name, file_key = s3_ref.split("/", 1) + # files must always be relative without prefixed '/' + # directory should always contain the trailing '/' + if s3_reference.endswith("/"): + if not file_key.endswith("/"): + file_key += "/" + else: + file_key = file_key.lstrip("/") + return bucket_name, file_key, s3_region + + +def resolve_s3_http_options(**request_kwargs): + # type: (**Any) -> Dict[str, Union[S3Config, JSON]] + """ + Converts HTTP requests options to corresponding S3 configuration definitions. + + Resolved parameters will only preserve valid options that can be passed directly to :class:`botocore.client.S3` + when initialized with :func:`boto3.client` in combination with ``"s3"`` service. Valid HTTP requests options that + have been resolved will be nested under ``config`` with a :class:`S3Config` where applicable. + + :param request_kwargs: Request keywords to attempt mapping to S3 configuration. + :return: Resolved S3 client parameters. 
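+
+    Example, a minimal sketch with hypothetical values:
+
+    .. code-block:: python
+
+        params = resolve_s3_http_options(timeout=5, retries=2, headers={"User-Agent": "weaver"})
+        s3_client = boto3.client("s3", **params)  # resolved options nested under 'config' as S3Config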
+ """ + params = {} + cfg_kw = {} + if "timeout" in request_kwargs: + cfg_kw["connect_timeout"] = request_kwargs["timeout"] + cfg_kw["read_timeout"] = request_kwargs["timeout"] + if "connect_timeout" in request_kwargs: + cfg_kw["connect_timeout"] = request_kwargs["connect_timeout"] + if "read_timeout" in request_kwargs: + cfg_kw["read_timeout"] = request_kwargs["read_timeout"] + if "cert" in request_kwargs: + cfg_kw["client_cert"] = request_kwargs["cert"] # same combination of str or (str, str) accepted + if "verify" in request_kwargs: + params["verify"] = request_kwargs["verify"] # this is passed directly to the client rather than config + retries = request_kwargs.pop("retries", request_kwargs.pop("retry", request_kwargs.pop("max_retries", None))) + if retries is not None: + cfg_kw["retries"] = {"max_attempts": retries} + if "headers" in request_kwargs: + user_agent = get_header("User-Agent", request_kwargs["headers"]) + if user_agent: + cfg_kw["user_agent"] = user_agent + config = S3Config(**cfg_kw) + params["config"] = config + return params + + +def resolve_scheme_options(**kwargs): + # type: (**Any) -> Tuple[SchemeOptions, RequestOptions] + """ + Splits options into their relevant group by scheme prefix. + + Handled schemes are defined by :data:`SUPPORTED_FILE_SCHEMES`. + HTTP and HTTPS are grouped together and share the same options. + + :param kwargs: Keywords to categorise by scheme. + :returns: Categorised options by scheme and all other remaining keywords. + """ + options = {group: {} for group in SUPPORTED_FILE_SCHEMES} + keywords = {} + for opt, val in kwargs.items(): + if any(opt.startswith(scheme) for scheme in list(options)): + opt, key = opt.split("_", 1) + options[opt][key] = val + else: + keywords[opt] = val + options["http"].update(options.pop("https")) + options["https"] = options["http"] + return options, keywords + + +class OutputMethod(ExtendedEnum): + """ + Methodology employed to handle generation of a file or directory output that was fetched. + """ + AUTO = "auto" + LINK = "link" + MOVE = "move" + COPY = "copy" + + +def fetch_file(file_reference, # type: str + file_outdir, # type: str + *, # force named keyword arguments after + out_method=OutputMethod.AUTO, # type: OutputMethod + settings=None, # type: Optional[AnySettingsContainer] + callback=None, # type: Optional[Callable[[str], None]] + **option_kwargs, # type: Any # Union[SchemeOptions, RequestOptions] + ): # type: (...) -> str + """ + Fetches a file from local path, AWS-S3 bucket or remote URL, and dumps its content to the output directory. The output directory is expected to exist prior to this function call. The file reference scheme (protocol) determines from where to fetch the content. Output file name and extension will be the same as the original (after link resolution if applicable). Requests will consider ``weaver.request_options`` when using ``http(s)://`` scheme. + .. seealso:: + - :func:`fetch_reference` + - :func:`resolve_scheme_options` + - :func:`adjust_file_local` + - :func:`download_file_http` + :param file_reference: Local filesystem path (optionally prefixed with ``file://``), ``s3://`` bucket location or ``http(s)://`` remote URL file reference. Reference ``https://s3.[...]`` are also considered as ``s3://``. :param file_outdir: Output local directory path under which to place the fetched file. :param settings: Additional request-related settings from the application configuration (notably request-options). 
- :param link: - If ``True``, force generation of a symbolic link instead of hard copy, regardless if source is a file or link. - If ``False``, force hard copy of the file to destination, regardless if source is a file or link. - If ``None`` (default), resolve automatically as follows. - When the source is a symbolic link itself, the destination will also be a link. - When the source is a direct file reference, the destination will be a hard copy of the file. - Only applicable when the file reference is local. - :param move: - Move local file to the output directory instead of copying or linking it. - No effect if the output directory already contains the local file. - No effect if download must occurs for remote file. - :param request_kwargs: Additional keywords to forward to request call (if needed). + :param callback: + Function that gets called progressively with incoming chunks from downloaded file. + Only applicable when download occurs (remote file reference). + Can be used to monitor download progress or raise an exception to abort it. + :param out_method: + Method employed to handle the generation of the output file. + Only applicable when the file reference is local. Remote location always generates a local copy. + :param option_kwargs: + Additional keywords to forward to the relevant handling method by scheme. + Keywords should be defined as ``{scheme}_{option}`` with one of the known :data:`SUPPORTED_FILE_SCHEMES`. + If not prefixed by any scheme, the option will apply to all handling methods (if applicable). :return: Path of the local copy of the fetched file. :raises HTTPException: applicable HTTP-based exception if any occurred during the operation. :raises ValueError: when the reference scheme cannot be identified. """ + if file_reference.startswith("file://"): + file_reference = file_reference[7:] file_href = file_reference file_name = os.path.basename(os.path.realpath(file_reference)) # resolve any different name to use the original file_path = os.path.join(file_outdir, file_name) - if file_reference.startswith("file://"): - file_reference = file_reference[7:] - LOGGER.debug("Fetching file reference: [%s]", file_href) + LOGGER.debug("Fetching file reference: [%s] using options:\n%s", file_href, repr_json(option_kwargs)) + options, kwargs = resolve_scheme_options(**option_kwargs) if os.path.isfile(file_reference): LOGGER.debug("Fetch file resolved as local reference.") - if move and os.path.isfile(file_path): - LOGGER.debug("Reference [%s] cannot be moved to path [%s] (already exists)", file_href, file_path) - raise OSError("Cannot move file, already in output directory!") - if move: - shutil.move(os.path.realpath(file_reference), file_outdir) - # NOTE: - # If file is available locally and referenced as a system link, disabling 'follow_symlinks' - # creates a copy of the symlink instead of an extra hard-copy of the linked file. 
- elif os.path.islink(file_reference) and not os.path.isfile(file_path): - if link is True: - os.symlink(os.readlink(file_reference), file_path) - else: - shutil.copyfile(file_reference, file_path, follow_symlinks=link is False) - # otherwise copy the file if not already available - # expand directory of 'file_path' and full 'file_reference' to ensure many symlink don't result in same place - elif not os.path.isfile(file_path) or os.path.realpath(file_path) != os.path.realpath(file_reference): - if link is True: - os.symlink(file_reference, file_path) - else: - shutil.copyfile(file_reference, file_path) - else: - LOGGER.debug("Fetch file as local reference has no action to take, file already exists: [%s]", file_path) + adjust_file_local(file_href, file_outdir, out_method) elif file_reference.startswith("s3://"): LOGGER.debug("Fetch file resolved as S3 bucket reference.") - s3 = boto3.resource("s3") - bucket_name, file_key = file_reference[5:].split("/", 1) - bucket = s3.Bucket(bucket_name) - bucket.download_file(file_key, file_path) + s3_params = resolve_s3_http_options(**options["http"], **kwargs) + s3_region = options["s3"].pop("region_name", None) + bucket_name, file_key, s3_region_ref = resolve_s3_reference(file_reference) + if s3_region and s3_region_ref and s3_region != s3_region_ref: + raise ValueError("Invalid AWS S3 reference. " + f"Input region name [{s3_region}] mismatches reference region [{s3_region_ref}].") + s3_region = s3_region_ref or s3_region + s3_client = boto3.client("s3", region_name=s3_region, **s3_params) # type: S3Client + s3_client.download_file(bucket_name, file_key, file_path, Callback=callback) elif file_reference.startswith("http"): # pseudo-http URL referring to S3 bucket, try to redirect to above S3 handling method if applicable - if file_reference.startswith("https://s3."): - s3 = boto3.resource("s3") - # endpoint in the form: "https://s3.[region-name.]amazonaws.com//" - if not file_reference.startswith(s3.meta.endpoint_url): - LOGGER.warning("Detected HTTP file reference to AWS S3 bucket that mismatches server configuration. " - "Will consider it as plain HTTP with read access.") - else: - file_reference_s3 = file_reference.replace(s3.meta.endpoint_url, "") - file_ref_updated = f"s3://{file_reference_s3}" - LOGGER.debug("Adjusting file reference to S3 shorthand for further parsing:\n" - " Initial: [%s]\n" - " Updated: [%s]", file_reference, file_ref_updated) - return fetch_file(file_ref_updated, file_outdir, settings=settings, **request_kwargs) - file_path = download_file_http(file_reference, file_outdir, settings=settings, **request_kwargs) + if file_reference.startswith("https://s3.") or urlparse(file_reference).hostname.endswith(".amazonaws.com"): + LOGGER.debug("Detected HTTP-like S3 bucket file reference. 
Retrying file fetching with S3 reference.") + s3_ref, s3_region = resolve_s3_from_http(file_reference) + option_kwargs.pop("s3_region", None) + return fetch_file(s3_ref, file_outdir, settings=settings, s3_region_name=s3_region, **option_kwargs) + file_path = download_file_http( + file_reference, + file_outdir, + settings=settings, + callback=callback, + **options["http"], + **kwargs + ) else: scheme = file_reference.split("://") scheme = "" if len(scheme) < 2 else scheme[0] @@ -1841,6 +2183,748 @@ def fetch_file(file_reference, file_outdir, settings=None, link=None, move=False return file_path +def adjust_file_local(file_reference, file_outdir, out_method): + # type: (str, str, OutputMethod) -> None + """ + Adjusts the input file reference to the output location with the requested handling method. + + Handling Methods + ~~~~~~~~~~~~~~~~~~~~~~ + + - :attr:`OutputMethod.LINK`: + + Force generation of a symbolic link instead of hard copy, + regardless if source is directly a file or a link to one. + + - :attr:`OutputMethod.COPY`: + + Force hard copy of the file to destination, regardless if source is directly a file or a link to one. + + - :attr:`OutputMethod.MOVE`: + + Move the local file to the output directory instead of copying or linking it. + If the output directory already contains the local file, raises an :class:`OSError`. + + - :attr:`OutputMethod.AUTO` (default): + + Resolve conditionally as follows. + + * When the source is a symbolic link itself, the destination will also be a link. + * When the source is a direct file reference, the destination will be a hard copy of the file. + + :param file_reference: Original location of the file. + :param file_outdir: Target directory of the file. + :param out_method: Method employed to handle the generation of the output file. + """ + file_loc = os.path.realpath(file_reference) + file_name = os.path.basename(file_loc) # resolve any different name to use the original + file_path = os.path.join(file_outdir, file_name) + if out_method == OutputMethod.MOVE and os.path.isfile(file_path): + LOGGER.debug("Reference [%s] cannot be moved to path [%s] (already exists)", file_reference, file_path) + raise OSError("Cannot move file, already in output directory!") + if out_method == OutputMethod.MOVE: + shutil.move(file_loc, file_outdir) + if file_loc != file_reference and os.path.islink(file_reference): + os.remove(file_reference) + # NOTE: + # If file is available locally and referenced as a system link, disabling 'follow_symlinks' + # creates a copy of the symlink instead of an extra hard-copy of the linked file. + elif os.path.islink(file_reference) and not os.path.isfile(file_path): + if out_method == OutputMethod.LINK: + os.symlink(os.readlink(file_reference), file_path) + else: + shutil.copyfile(file_reference, file_path, follow_symlinks=out_method == OutputMethod.COPY) + # otherwise copy the file if not already available + # expand directory of 'file_path' and full 'file_reference' to ensure many symlink don't result in same place + elif not os.path.isfile(file_path) or os.path.realpath(file_path) != os.path.realpath(file_reference): + if out_method == OutputMethod.LINK: + os.symlink(file_reference, file_path) + else: + shutil.copyfile(file_reference, file_path) + else: + LOGGER.debug("File as local reference has no action to take, file already exists: [%s]", file_path) + + +def filter_directory_forbidden(listing): + # type: (Iterable[str]) -> Iterator[str] + """ + Filters out items that should always be removed from directory listing results. 
+ """ + is_in = frozenset({"..", "../", "./"}) + equal = frozenset({"."}) # because of file extensions, cannot check 'part in item' + for item in listing: + if any(part in item for part in is_in): + continue + if any(part == item for part in equal): + continue + yield item + + +class PathMatchingMethod(ExtendedEnum): + GLOB = "glob" + REGEX = "regex" + + +def filter_directory_patterns(listing, include, exclude, matcher): + # type: (Iterable[str], Optional[Iterable[str]], Optional[Iterable[str]], PathMatchingMethod) -> List[str] + """ + Filters a list of files according to a set of include/exclude patterns. + + If a file is matched against an include pattern, it will take precedence over matches on exclude patterns. + By default, any file that is not matched by an excluded pattern will remain in the resulting filtered set. + Include patterns are only intended to "add back" previously excluded matches. They are **NOT** for defining + "only desired items". Adding include patterns without exclude patterns is redundant, as all files would be + retained by default anyway. + + Patterns can use regular expression definitions or Unix shell-style wildcards. + The :paramref:`matcher` should be selected accordingly to provided patterns matching method. + Potential functions are :func:`re.match`, :func:`re.fullmatch`, :func:`fnmatch.fnmatch`, :func:`fnmatch.fnmatchcase` + Literal strings for exact matches are also valid. + + .. note:: + Provided patterns are applied directly without modifications. If the file listing contains different root + directories than patterns, such as if patterns are specified with relative paths, obtained results could + mismatch the intended behavior. Make sure to align paths accordingly for the expected filtering context. + + :param listing: Files to filter. + :param include: Any matching patterns for files that should be explicitly included. + :param exclude: Any matching patterns for files that should be excluded unless included. + :param matcher: Pattern matching method to evaluate if a file path matches include and exclude definitions. + :return: Filtered files. + """ + listing_include = include or [] + listing_exclude = exclude or [] + if listing_include or listing_exclude: + if matcher == PathMatchingMethod.REGEX: + def is_match(pattern, value): # type: (str, str) -> bool + return re.fullmatch(pattern, value) is not None + elif matcher == PathMatchingMethod.GLOB: + def is_match(pattern, value): # type: (str, str) -> bool + return fnmatch.fnmatchcase(value, pattern) + else: + raise ValueError(f"Unknown path pattern matching method: [{matcher}]") + filtered = [ + item for item in listing if ( + not any(is_match(re_excl, item) for re_excl in listing_exclude) + or any(is_match(re_incl, item) for re_incl in listing_include) + ) + ] + LOGGER.debug("Filtering directory listing\n" + " include: %s\n" + " exclude: %s\n" + " listing: %s\n" + " filtered: %s\n", + listing_include, listing_exclude, listing, filtered) + listing = filtered + return listing + + +def download_files_s3(location, # type: str + out_dir, # type: Path + include=None, # type: Optional[List[str]] + exclude=None, # type: Optional[List[str]] + matcher=PathMatchingMethod.GLOB, # type: PathMatchingMethod + settings=None, # type: Optional[SettingsType] + **option_kwargs, # type: Any # Union[SchemeOptions, RequestOptions] + ): # type: (...) -> List[str] + """ + Download all listed S3 files references under the output directory using the provided S3 bucket and client. 
+ + If nested directories are employed in the file paths, they will be downloaded with the same directory hierarchy + under the requested output directory. + + .. seealso:: + Filtering is subject to :func:`filter_directory_patterns` and :func:`filter_directory_forbidden`. + + :param location: S3 bucket location (with ``s3://`` scheme) targeted to retrieve files. + :param out_dir: Desired output location of downloaded files. + :param include: Any matching patterns for files that should be explicitly included. + :param exclude: Any matching patterns for files that should be excluded unless included. + :param matcher: Pattern matching method to evaluate if a file path matches include and exclude definitions. + :param settings: Additional request-related settings from the application configuration (notably request-options). + :param option_kwargs: + Additional keywords to forward to the relevant handling method by scheme. + Keywords should be defined as ``{scheme}_{option}`` with one of the known :data:`SUPPORTED_FILE_SCHEMES`. + If not prefixed by any scheme, the option will apply to all handling methods (if applicable). + :returns: Output locations of downloaded files. + """ + LOGGER.debug("Resolving S3 connection and options for directory listing.") + options, kwargs = resolve_scheme_options(**option_kwargs) + configs = get_request_options("GET", location, settings) + options["http"].update(**configs) + s3_params = resolve_s3_http_options(**options["http"], **kwargs) + s3_region = options["s3"].pop("region_name", None) + s3_client = boto3.client("s3", region_name=s3_region, **s3_params) # type: S3Client + bucket_name, dir_key = location[5:].split("/", 1) + base_url = s3_client.meta.endpoint_url.rstrip("/") + "/" + + # adjust patterns with full paths to ensure they still work with retrieved relative S3 keys + include = [incl.replace(base_url, "", 1) if incl.startswith(base_url) else incl for incl in include or []] + exclude = [excl.replace(base_url, "", 1) if excl.startswith(base_url) else excl for excl in exclude or []] + + LOGGER.debug("Resolved S3 Bucket [%s] and Region [%s] for download of files.", bucket_name, s3_region or "default") + s3_dir_resp = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=dir_key) + LOGGER.debug("Fetched S3 directory [%s] listing contents:\n%s", location, repr_json(s3_dir_resp)) + s3_files = (file["Key"] for file in s3_dir_resp["Contents"]) # definitions with relative paths (like patterns) + s3_files = (path for path in s3_files if not path.endswith("/")) + s3_files = filter_directory_forbidden(s3_files) + s3_files = filter_directory_patterns(s3_files, include, exclude, matcher) + s3_files = list(s3_files) + + # create directories in advance to avoid potential errors in case many workers try to generate the same one + base_url = base_url.rstrip("/") + sub_dirs = {os.path.split(path)[0] for path in s3_files if "://" not in path or path.startswith(base_url)} + sub_dirs = [os.path.join(out_dir, path.replace(base_url, "").lstrip("/")) for path in sub_dirs] + for _dir in reversed(sorted(sub_dirs)): + os.makedirs(_dir, exist_ok=True) + base_url += "/" + + LOGGER.debug("Starting fetch of individual S3 files from [%s]:\n%s", base_url, repr_json(s3_files)) + task_kill_event = threading.Event() # abort remaining tasks if set + + def _abort_callback(_chunk): # called progressively with downloaded chunks + # type: (AnyStr) -> None + if task_kill_event.is_set(): + raise CancelledError("Other failed download task triggered abort event.") + + def _download_file(_client, 
_bucket, _rel_file_path, _out_dir): + # type: (S3Client, str, str, str) -> str + if task_kill_event.is_set(): + raise CancelledError("Other failed download task triggered abort event.") + try: + _out_file = os.path.join(_out_dir, _rel_file_path) + _client.download_file(_bucket, _rel_file_path, _out_file, Callback=_abort_callback) + except Exception as exc: + _file_path = os.path.join(_client.meta.endpoint_url, _bucket, _rel_file_path) + LOGGER.error("Error raised in download worker for [%s]: [%s]", _file_path, exc, exc_info=exc) + task_kill_event.set() + raise + return _out_file + + max_workers = min(len(s3_files), 8) + if max_workers <= 0: + raise ValueError(f"No files specified for download from reference [{base_url}].") + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = ( + executor.submit(_download_file, s3_client, bucket_name, file_key, out_dir) + for file_key in s3_files + ) + for future in as_completed(futures): + if future.exception(): + other_futures = set(futures) - {future} + for o_future in other_futures: + o_future.cancel() + task_kill_event.set() + yield future.result() + # wait for any cleanup, must use set() because of https://github.com/python/cpython/issues/86104 + results, failures = wait_until(set(futures), return_when=ALL_COMPLETED) + if failures or any(not path for path in results): + raise WeaverException( + "Directory download failed due to at least one failing file download in listing: " + f"{[repr(exc.exception()) for exc in failures]}" + ) + + +def download_files_url(file_references, # type: Iterable[str] + out_dir, # type: Path + base_url, # type: str + include=None, # type: Optional[List[str]] + exclude=None, # type: Optional[List[str]] + matcher=PathMatchingMethod.GLOB, # type: PathMatchingMethod + settings=None, # type: Optional[SettingsType] + **option_kwargs, # type: Any # Union[SchemeOptions, RequestOptions] + ): # type: (...) -> Iterator[str] + """ + Download all listed files references under the output directory. + + If nested directories are employed in file paths, relative to :paramref:`base_url`, they will be downloaded + with the same directory hierarchy under the requested output directory. If the :paramref:`base_url` differs, + they will simply be downloaded at the root of the output directory. If any conflict occurs in such case, an + :class:`OSError` will be raised. + + .. seealso:: + Use :func:`download_files_s3` instead if all files share the same S3 bucket. + + :param file_references: Relative or full URL paths of the files to download. + :param out_dir: Desired output location of downloaded files. + :param base_url: + If full URL are specified, corresponding files will be retrieved using the appropriate scheme per file + allowing flexible data sources. Otherwise, any relative locations use this base URL to resolve the full + URL prior to downloading the file. + :param include: Any matching patterns for files that should be explicitly included. + :param exclude: Any matching patterns for files that should be excluded unless included. + :param matcher: Pattern matching method to evaluate if a file path matches include and exclude definitions. + :param settings: Additional request-related settings from the application configuration (notably request-options). + :param option_kwargs: + Additional keywords to forward to the relevant handling method by scheme. + Keywords should be defined as ``{scheme}_{option}`` with one of the known :data:`SUPPORTED_FILE_SCHEMES`. 
+ If not prefixed by any scheme, the option will apply to all handling methods (if applicable). + :returns: Output locations of downloaded files. + """ + LOGGER.debug("Starting file listing download from references:\n%s", repr_json(file_references)) + + # References could be coming from different base URL/scheme/host. + # The include/exclude patterns will have to match them exactly in the even they don't share the same base URL. + # However, in the event they have the same URL, patterns could refer to their relative path only to that URL. + # Adjust patterns accordingly to allow filter against forbidden/include/exclude with relative paths. + base_url = get_url_without_query(base_url).rstrip("/") + "/" + include = [incl.replace(base_url, "", 1) if incl.startswith(base_url) else incl for incl in include or []] + exclude = [excl.replace(base_url, "", 1) if excl.startswith(base_url) else excl for excl in exclude or []] + file_references = (path for path in file_references if not path.endswith("/")) + file_refs_relative = {path for path in file_references if path.startswith(base_url)} + file_refs_absolute = set(file_references) - file_refs_relative + file_refs_relative = {path.replace(base_url, "") for path in file_refs_relative} + file_refs_absolute = filter_directory_forbidden(file_refs_absolute) + file_refs_absolute = filter_directory_patterns(file_refs_absolute, include, exclude, matcher) + file_refs_relative = filter_directory_forbidden(file_refs_relative) + file_refs_relative = filter_directory_patterns(file_refs_relative, include, exclude, matcher) + file_refs_relative = {os.path.join(base_url, path) for path in file_refs_relative} + file_references = sorted(list(set(file_refs_relative) | set(file_refs_absolute))) + + # create directories in advance to avoid potential errors in case many workers try to generate the same one + base_url = base_url.rstrip("/") + sub_dirs = {os.path.split(path)[0] for path in file_references if "://" not in path or path.startswith(base_url)} + sub_dirs = [os.path.join(out_dir, path.replace(base_url, "").lstrip("/")) for path in sub_dirs] + for _dir in reversed(sorted(sub_dirs)): + os.makedirs(_dir, exist_ok=True) + base_url += "/" + + LOGGER.debug("Starting fetch of individual files from [%s]:\n%s", base_url, repr_json(file_references)) + task_kill_event = threading.Event() # abort remaining tasks if set + + def _abort_callback(_chunk): # called progressively with downloaded chunks + # type: (AnyStr) -> None + if task_kill_event.is_set(): + raise CancelledError("Other failed download task triggered abort event.") + + def _download_file(_file_path): + # type: (str) -> str + _file_parts = _file_path.split("://", 1) + if len(_file_parts) == 1: # relative, no scheme + if not base_url: + raise ValueError(f"Cannot download relative reference [{_file_path}] without a base URL.") + _file_path = _file_path.strip("/") + _out_file = os.path.join(out_dir, _file_path) + _file_path = os.path.join(base_url, _file_path) + elif base_url and _file_path.startswith(base_url): + _out_file = os.path.join(out_dir, _file_path.replace(base_url, "")) + else: + _out_file = os.path.join(out_dir, os.path.split(_file_path)[-1]) + _out_dir = os.path.split(_out_file)[0] + try: + return fetch_file(_file_path, _out_dir, settings=settings, callback=_abort_callback, **option_kwargs) + except Exception as exc: + LOGGER.error("Error raised in download worker for [%s]: [%s]", _file_path, exc, exc_info=exc) + task_kill_event.set() + raise + + max_workers = min(len(file_references), 8) + if 
max_workers <= 0: + msg_ref = f" from reference [{base_url}]" if base_url else "" + raise ValueError(f"No files specified for download{msg_ref}.") + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = ( + executor.submit(_download_file, file_key) + for file_key in file_references + ) + for future in as_completed(futures): + if future.exception(): + task_kill_event.set() + yield future.result() + + +def download_files_html(html_data, # type: str + out_dir, # type: Path + base_url, # type: str + include=None, # type: Optional[List[str]] + exclude=None, # type: Optional[List[str]] + matcher=PathMatchingMethod.GLOB, # type: PathMatchingMethod + settings=None, # type: Optional[AnySettingsContainer] + **option_kwargs, # type: Any # Union[SchemeOptions, RequestOptions] + ): # type: (...) -> Iterator[str] + """ + Downloads files retrieved from a directory listing provided as an index of plain HTML with file references. + + If the index itself provides directories that can be browsed down, the tree hierarchy will be downloaded + recursively by following links. In such case, links are ignored if they cannot be resolved as a nested index pages. + + Retrieval of file references from directory listing attempts to be as flexible as possible to the HTML response + format, by ignoring style tags and looking only for ```` references. Examples of different supported + format representations are presented at following locations: + + - https://anaconda.org/anaconda/python/files (raw listing with text code style and minimal file metadata) + - https://mirrors.edge.kernel.org/pub/ (listing within a formatted table with multiple other metadata fields) + + .. seealso:: + :func:`download_files_url` + + :param html_data: HTML data contents with files references to download. + :param out_dir: Desired output location of downloaded files. + :param base_url: + If full URL are specified, corresponding files will be retrieved using the appropriate scheme per file + allowing flexible data sources. Otherwise, any relative locations use this base URL to resolve the full + URL prior to downloading the file. + :param include: Any matching patterns for files that should be explicitly included. + :param exclude: Any matching patterns for files that should be excluded unless included. + :param matcher: Pattern matching method to evaluate if a file path matches include and exclude definitions. + :param settings: Additional request-related settings from the application configuration (notably request-options). + :param option_kwargs: + Additional keywords to forward to the relevant handling method by scheme. + Keywords should be defined as ``{scheme}_{option}`` with one of the known :data:`SUPPORTED_FILE_SCHEMES`. + If not prefixed by any scheme, the option will apply to all handling methods (if applicable). + :returns: Output locations of downloaded files. 
+ """ + options, kwargs = resolve_scheme_options(**option_kwargs) + + def _list_refs(_url, _data=None): + # type: (str, Optional[str]) -> Iterator[str] + if not _data: + _scheme = _url.split("://")[0] + _opts = options.get(_scheme, {}) # type: ignore + _resp = request_extra("GET", _url, settings=settings, **_opts, **kwargs) + ctype = get_header("Content-Type", _resp.headers, default=ContentType.TEXT_HTML) + if _resp.status_code != 200 or not any( + _type in ctype for _type in [ContentType.TEXT_HTML] + list(ContentType.ANY_XML) + ): + return [] + _data = _resp.text + _html = BeautifulSoup(_data, builder=xml_util.HTML_TREE_BUILDER) + _href = (_ref.get("href") for _ref in _html.find_all("a", recursive=True)) + _href = filter_directory_forbidden(_href) # preemptively remove forbidden items, avoid access/download attempts + for _ref in _href: + if not _ref.startswith(_url): + _ref = os.path.join(_url, _ref) + if not _ref.endswith("/"): + yield _ref + else: + for _sub_ref in _list_refs(_ref): + yield _sub_ref + + files = list(_list_refs(base_url, html_data)) + return download_files_url( + files, out_dir, base_url, + include=include, exclude=exclude, matcher=matcher, + settings=settings, **option_kwargs + ) + + +def adjust_directory_local(location, # type: Path + out_dir, # type: Path + out_method, # type: OutputMethod + include=None, # type: Optional[List[str]] + exclude=None, # type: Optional[List[str]] + matcher=PathMatchingMethod.GLOB, # type: PathMatchingMethod + ): # type: (...) -> List[Path] + """ + Adjusts the input directory reference to the output location with the requested handling method. + + Handling Methods + ~~~~~~~~~~~~~~~~~~~~~~ + + - Source location is the output directory: + + If the source location is exactly the same location as the output (after link resolution), nothing is applied, + unless filtered listing produces a different set of files. In that case, files to be excluded will be removed + from the file system. In other situations, below handling methods are considered. + + - :attr:`OutputMethod.LINK`: + + Force generation of the output directory as a symbolic link pointing to the original location, without any copy, + regardless if the source location is directly a directory or a link to one. + Not applicable if filtered listing does not match exactly the original source location listing. + In such case, resolution will use the second :attr:`OutputMethod.AUTO` handling approach instead. + + - :attr:`OutputMethod.COPY`: + + Force hard copy of the directory to the destination, and hard copy of all its underlying contents by resolving + any symbolic link along the way, regardless if the source location is directly a directory or a link to one. + + - :attr:`OutputMethod.MOVE`: + + Move the local directory's contents under the output directory instead of copying or linking it. + If the output directory already contains anything, raises an :class:`OSError`. + If exclusion filters yield any item to be omitted, those items will be deleted entirely from the file system. + + - :attr:`OutputMethod.AUTO` (default): + + Resolve conditionally as follows. + + * When the source is a symbolic link itself, the destination will be a link to it + (handled as :attr:`OutputMethod.LINK`), unless its restriction regarding filtered listing applies. + In that case, switches to the other handling method below. 
+ + * When the source is a direct directory reference (or a link with differing listing after filter), the + destination will be a recursive copy of the source directory, but any encountered links will remain links + instead of resolving them and creating a copy (as accomplished by :attr:`OutputMethod.COPY`). + + .. seealso:: + :func:`filter_directory_patterns` + + :param location: Local reference to the source directory. + :param out_dir: Local reference to the output directory. + :param out_method: Method employed to handle the generation of the output directory. + :param include: Any matching patterns for files that should be explicitly included. + :param exclude: Any matching patterns for files that should be excluded unless included. + :param matcher: Pattern matching method to evaluate if a file path matches include and exclude definitions. + :returns: Listing of files after resolution and filtering if applicable. + """ + if location.startswith("file://"): + location = location[7:] + if not os.path.isdir(location): + raise OSError("Cannot operate with directory. " + f"Reference location [{location}] does not exist or is not a directory!") + + loc_dir = os.path.realpath(location) + out_dir = os.path.realpath(out_dir) if os.path.isdir(out_dir) else out_dir + loc_dir = loc_dir.rstrip("/") + "/" + out_dir = out_dir.rstrip("/") + "/" + listing = list_directory_recursive(loc_dir) + # Use relative paths to filter items to ensure forbidden or include/exclude patterns match + # the provided definitions as expected, since patterns more often do not use the full path. + # In case the patterns do use full paths though, adjust them to ensure they still work as well. + include = [incl.replace(loc_dir, "", 1) if incl.startswith(loc_dir) else incl for incl in include or []] + exclude = [excl.replace(loc_dir, "", 1) if excl.startswith(loc_dir) else excl for excl in exclude or []] + relative = (path.replace(loc_dir, "") for path in listing) + relative = filter_directory_forbidden(relative) + relative = list(sorted(relative)) + filtered = filter_directory_patterns(relative, include, exclude, matcher) + filtered = list(sorted(filtered)) + extras = list(set(relative) - set(filtered)) + extras = [os.path.join(out_dir, path) for path in extras] + desired = [os.path.join(loc_dir, path) for path in filtered] + filtered = list(sorted(os.path.join(out_dir, path) for path in filtered)) + + if loc_dir == out_dir: + if not extras: + LOGGER.debug("Local directory reference has no action to take, already exists: [%s]", loc_dir) + return filtered + LOGGER.debug("Local directory reference [%s] matches output, but desired listing differs. " + "Removing additional items:\n%s", loc_dir, repr_json(extras)) + for file_path in extras: + os.remove(file_path) + return filtered + + # Any operation (islink, remove, etc.) that must operate on the link itself rather than the directory it points + # to must not have the final '/' in the path. Otherwise, the link path (without final '/') is resolved before + # evaluating the operation, which make them attempt their call on the real directory itself. + link_dir = location.rstrip("/") + + if (os.path.exists(out_dir) and not os.path.isdir(out_dir)) or (os.path.isdir(out_dir) and os.listdir(out_dir)): + LOGGER.debug("References under [%s] cannot be placed under target path [%s] " + "(output is not a directory or output directory is not empty).", location, out_dir) + raise OSError("Cannot operate with directory." 
+ f"Output location [{out_dir}] already exists or is not an empty directory!") + if os.path.exists(out_dir): + os.rmdir(out_dir) # need to remove to avoid moving contents nested under it + + # avoid unnecessary copy of files marked for exclusion + def copy_func(src, dst, *args, **kwargs): + # type: (Path, Path, *Any, **Any) -> None + if dst not in desired: + shutil.copy2(src, dst, *args, **kwargs) + + if out_method == OutputMethod.MOVE: + # Calling 'shutil.move' raises 'NotADirectoryError' if the source directory is a link + # (although contents would still be moved). Use the resolved path to avoid the error. + shutil.move(loc_dir, out_dir, copy_function=copy_func) + # Remove the original link location pointing to the resolved directory to be consistent + # with 'move' from a direct directory where the original location would not exist anymore. + if location != loc_dir and os.path.islink(link_dir): + os.remove(link_dir) + for file_path in extras: + os.remove(file_path) + return filtered + elif out_method == OutputMethod.LINK and not extras: # fallback AUTO if not exact listing + if os.path.islink(link_dir): + loc_dir = os.readlink(link_dir) + out_dir = out_dir.rstrip("/") + os.symlink(loc_dir, out_dir, target_is_directory=True) + return filtered + # AUTO: partial copy (links remain links) + # LINK: idem, when listing differ + # COPY: full copy (resolve symlinks) + shutil.copytree(loc_dir, out_dir, + symlinks=out_method != OutputMethod.COPY, + ignore_dangling_symlinks=True, + copy_function=copy_func) + return filtered + + +def list_directory_recursive(directory, relative=False): + # type: (Path, bool) -> Iterator[Path] + """ + Obtain a flat list of files recursively contained within a local directory. + """ + for path, _, files in os.walk(directory, followlinks=True): + for file_name in files: + yield file_name if relative else os.path.join(path, file_name) + + +def fetch_directory(location, # type: str + out_dir, # type: Path + *, # force named keyword arguments after + out_method=OutputMethod.AUTO, # type: OutputMethod + include=None, # type: Optional[List[str]] + exclude=None, # type: Optional[List[str]] + matcher=PathMatchingMethod.GLOB, # type: PathMatchingMethod + settings=None, # type: Optional[AnySettingsContainer] + **option_kwargs, # type: Any # Union[SchemeOptions, RequestOptions] + ): # type: (...) -> List[str] + """ + Fetches all files that can be listed from a directory in local or remote location. + + .. seealso:: + - :func:`fetch_reference` + - :func:`resolve_scheme_options` + - :func:`adjust_directory_local` + - :func:`download_files_html` + - :func:`download_files_s3` + - :func:`download_files_url` + + .. note:: + When using include/exclude filters, items that do not match a valid entry from the real listing are ignored. + Special directories such as ``..`` and ``.`` for navigation purpose are always excluded regardless of filters. + + :param location: Directory reference (URL, S3, local). Trailing slash required. + :param out_dir: Output local directory path under which to place fetched files. + :param out_method: + Method employed to handle the generation of the output directory. + Only applicable when the file reference is local. Remote location always generates a local copy. + :param include: Any matching patterns for files that should be explicitly included. + :param exclude: Any matching patterns for files that should be excluded unless included. + :param matcher: Pattern matching method to evaluate if a file path matches include and exclude definitions. 
+ :param settings: Additional request-related settings from the application configuration (notably request-options). + :param option_kwargs: + Additional keywords to forward to the relevant handling method by scheme. + Keywords should be defined as ``{scheme}_{option}`` with one of the known :data:`SUPPORTED_FILE_SCHEMES`. + If not prefixed by any scheme, the option will apply to all handling methods (if applicable). + :returns: File locations retrieved from directory listing. + """ + if not get_url_without_query(location).endswith("/"): + raise ValueError(f"Invalid directory location [{location}] must have a trailing slash.") + LOGGER.debug("Fetching directory reference: [%s] using options:\n%s", location, repr_json(option_kwargs)) + if location.startswith("s3://"): + LOGGER.debug("Fetching listed files under directory resolved as S3 bucket reference.") + listing = download_files_s3(location, out_dir, + include=include, exclude=exclude, matcher=matcher, **option_kwargs) + elif location.startswith("https://s3."): + LOGGER.debug("Fetching listed files under directory resolved as HTTP-like S3 bucket reference.") + s3_ref, s3_region = resolve_s3_from_http(location) + option_kwargs["s3_region_name"] = s3_region + listing = download_files_s3(s3_ref, out_dir, + include=include, exclude=exclude, + settings=settings, **option_kwargs) + elif location.startswith("http://") or location.startswith("https://"): + LOGGER.debug("Fetch directory resolved as remote HTTP reference. Will attempt listing contents.") + resp = request_extra("GET", location) + if resp.status_code != 200: + LOGGER.error("Invalid response [%s] for directory listing from [%s]", resp.status_code, location) + raise ValueError(f"Cannot parse directory location [{location}] from [{resp.status_code}] response.") + ctype = get_header("Content-Type", resp.headers, default=ContentType.TEXT_HTML) + if any(_type in ctype for _type in [ContentType.TEXT_HTML] + list(ContentType.ANY_XML)): + listing = download_files_html(resp.text, out_dir, location, + include=include, exclude=exclude, matcher=matcher, + settings=settings, **option_kwargs) + elif ContentType.APP_JSON in ctype: + body = resp.json() # type: JSON + if isinstance(body, list) and all(isinstance(file, str) for file in body): + listing = download_files_url(body, out_dir, location, + include=include, exclude=exclude, matcher=matcher, + settings=settings, **option_kwargs) + else: + LOGGER.error("Invalid JSON from [%s] is not a list of files:\n%s", location, repr_json(body)) + raise ValueError(f"Cannot parse directory location [{location}] " + "expected as JSON response contents providing a list of files.") + else: + raise ValueError(f"Cannot list directory [{location}]. 
Unknown parsing of Content-Type [{ctype}] response.") + elif location.startswith("file://") or location.startswith("/"): + LOGGER.debug("Fetch directory resolved as local reference.") + listing = adjust_directory_local(location, out_dir, out_method, + include=include, exclude=exclude, matcher=matcher) + else: + raise ValueError(f"Unknown scheme for directory location [{location}].") + listing = list(sorted(listing)) + if LOGGER.isEnabledFor(logging.DEBUG): + for item in listing: + LOGGER.debug("Resolved file [%s] from [%s] directory listing.", item, location) + return listing + + +@overload +def fetch_reference(reference, # type: str + out_dir, # type: Path + *, # force named keyword arguments after + out_listing=False, # type: Literal[False] + out_method=OutputMethod.AUTO, # type: OutputMethod + settings=None, # type: Optional[AnySettingsContainer] + **option_kwargs, # type: Any # Union[SchemeOptions, RequestOptions] + ): # type: (...) -> str + ... + + +@overload +def fetch_reference(reference, # type: str + out_dir, # type: Path + *, # force named keyword arguments after + out_listing=False, # type: Literal[True] + out_method=OutputMethod.AUTO, # type: OutputMethod + settings=None, # type: Optional[AnySettingsContainer] + **option_kwargs, # type: Any # Union[SchemeOptions, RequestOptions] + ): # type: (...) -> List[str] + ... + + +def fetch_reference(reference, # type: str + out_dir, # type: Path + *, # force named keyword arguments after + out_listing=False, # type: bool + out_method=OutputMethod.AUTO, # type: OutputMethod + settings=None, # type: Optional[AnySettingsContainer] + **option_kwargs, # type: Any # Union[SchemeOptions, RequestOptions] + ): # type: (...) -> Union[str, List[str]] + """ + Fetches the single file or nested directory files from a local or remote location. + + The appropriate method depends on the format of the location. + If conditions from :ref:`cwl-dir` are met, the reference will be considered a ``Directory``. + In every other situation, a single ``File`` reference will be considered. + + .. seealso:: + See the relevant handling methods below for other optional arguments. + + - :func:`fetch_file` + - :func:`fetch_directory` + + :param reference: + Local filesystem path (optionally prefixed with ``file://``), ``s3://`` bucket location or ``http(s)://`` + remote URL file or directory reference. Reference ``https://s3.[...]`` are also considered as ``s3://``. + :param out_dir: Output local directory path under which to place the fetched file or directory. + :param out_listing: + Request that the complete file listing of the directory reference is returned. + Otherwise, return the local directory reference itself. + In the event of a file reference as input, the returned path will always be the fetched file itself, but it + will be contained within a single-item list if listing was ``True`` for consistency in the returned type with + the corresponding call for a directory reference. + :param settings: Additional request-related settings from the application configuration (notably request-options). + :param out_method: + Method employed to handle the generation of the output file or directory. + Only applicable when the reference is local. Remote location always generates a local copy. + :param option_kwargs: + Additional keywords to forward to the relevant handling method by scheme. + Keywords should be defined as ``{scheme}_{option}`` with one of the known :data:`SUPPORTED_FILE_SCHEMES`. 
+ If not prefixed by any scheme, the option will apply to all handling methods (if applicable). + :return: Path of the local copy of the fetched file, the directory, or the listing of the directory files. + :raises HTTPException: applicable HTTP-based exception if any occurred during the operation. + :raises ValueError: when the reference scheme cannot be identified. + """ + if reference.endswith("/"): + path = fetch_directory(reference, out_dir, out_method=out_method, settings=settings, **option_kwargs) + path = path if out_listing else (os.path.realpath(out_dir) + "/") + else: + path = fetch_file(reference, out_dir, out_method=out_method, settings=settings, **option_kwargs) + return [path] if out_listing and isinstance(path, str) else path + + def load_file(file_path, text=False): # type: (str, bool) -> Union[JSON, str] """ @@ -1857,7 +2941,7 @@ def load_file(file_path, text=False): if is_remote_file(file_path): settings = get_settings() headers = {"Accept": ContentType.TEXT_PLAIN} - cwl_resp = request_extra("get", file_path, headers=headers, settings=settings) + cwl_resp = request_extra("GET", file_path, headers=headers, settings=settings) return cwl_resp.content if text else yaml.safe_load(cwl_resp.content) with open(file_path, mode="r", encoding="utf-8") as f: return f.read() if text else yaml.safe_load(f) @@ -2059,8 +3143,8 @@ def transform_json(json_data, # type: Dict[str, JSON] return json_data -def generate_diff(val, ref, val_name="Test", ref_name="Reference"): - # type: (Any, Any, str, str) -> str +def generate_diff(val, ref, val_name="Test", ref_name="Reference", val_show=False, ref_show=False, json=True, indent=2): + # type: (Any, Any, str, str, bool, bool, bool, Optional[int]) -> str """ Generates a line-by-line diff result of the test value against the reference value. @@ -2071,16 +3155,29 @@ def generate_diff(val, ref, val_name="Test", ref_name="Reference"): :param ref: Reference input value. :param val_name: Name to apply in diff for test input value. :param ref_name: Name to apply in diff for reference input value. + :param val_show: Whether to include full contents of test value. + :param ref_show: Whether to include full contents of reference value. + :param json: Whether to consider contents as :term:`JSON` for diff evaluation. + :param indent: Indentation to employ when using :term:`JSON` contents. :returns: Formatted multiline diff, """ - try: - val = json.dumps(val, sort_keys=True, indent=2, ensure_ascii=False) - except Exception: # noqa + import json as _json + if json: + try: + val = _json.dumps(val, sort_keys=True, indent=indent, ensure_ascii=False) + except Exception: # noqa + val = str(val) + try: + ref = _json.dumps(ref, sort_keys=True, indent=indent, ensure_ascii=False) + except Exception: # noqa + ref = str(ref) + else: val = str(val) - try: - ref = json.dumps(ref, sort_keys=True, indent=2, ensure_ascii=False) - except Exception: # noqa ref = str(ref) + if val_show: + val_name += f"\n\n{val}" + if ref_show: + ref_name += f"\n\n{ref}" val = val.splitlines() ref = ref.splitlines() return "\n".join(difflib.context_diff(val, ref, fromfile=val_name, tofile=ref_name)) @@ -2128,7 +3225,7 @@ def apply_number_with_unit(number, unit="", binary=False, decimals=3): def parse_number_with_unit(number, binary=None): # type: (str, Optional[bool]) -> Number """ - Parses a numeric value accompanied with a unit to generate the unit-less value without prefix factor. + Parses a numeric value accompanied by a unit to generate the unit-less value without prefix factor. 
:param number: Numerical value and unit. Unit is dissociated from value with first non-numerical match. diff --git a/weaver/wps/service.py b/weaver/wps/service.py index 9423b5929..6ec135d75 100644 --- a/weaver/wps/service.py +++ b/weaver/wps/service.py @@ -330,7 +330,7 @@ def execute_job(self, self.dispatched_processes[worker_process_id] = remote_process wps_response = super(WorkerService, self).execute(worker_process_id, wps_request, job.uuid) - # re-enable creation of status file so we can find it since we disabled 'status' earlier for sync execution + # re-enable creation of status file, so we can find it since we disabled 'status' earlier for sync execution wps_response.store_status_file = True # update execution status with actual status file and apply required references execution = check_wps_status(location=wps_response.process.status_location, settings=self.settings) diff --git a/weaver/wps/utils.py b/weaver/wps/utils.py index a1a9f310d..6ccfb375e 100644 --- a/weaver/wps/utils.py +++ b/weaver/wps/utils.py @@ -29,7 +29,8 @@ is_uuid, make_dirs, request_extra, - retry_on_cache_error + retry_on_cache_error, + validate_s3 ) from weaver.wps_restapi import swagger_definitions as sd @@ -177,7 +178,7 @@ def get_wps_local_status_location(url_status_location, container, must_exist=Tru if found or not must_exist: out_path = out_path_join if not found and must_exist: - out_path_join = os.path.join(dir_path, out_path[1:] if out_path.startswith("/") else out_path) + out_path_join = os.path.join(dir_path, out_path.lstrip("/")) if not os.path.isfile(out_path_join): LOGGER.debug("Could not map WPS status reference [%s] to input local file path [%s].", url_status_location, out_path) @@ -194,13 +195,17 @@ def map_wps_output_location(reference, container, url=False, exists=True, file_s :param reference: Local file path or file URL to be mapped. :param container: Retrieve application settings. - :param url: Perform URL mapping (local path -> URL endpoint), or map to local path (URL -> local path). + :param url: Perform URL mapping (``True``: local path -> URL endpoint, ``False``: URL endpoint -> local path). :param exists: Ensure that the mapped file exists, otherwise don't map it (otherwise ``None``). :param file_scheme: Ensure that the 'file://' scheme is applied to resulting local file location when mapped from WPS output URL. When in 'reverse' mode, 'file://' is always removed if present to form a potential local file path. :returns: Mapped reference that corresponds to the local/URL WPS output location. 
""" + def ref_exists(ref): + # type: (str) -> bool + return os.path.isdir(ref) if ref.endswith("/") else os.path.isfile(ref) + settings = get_settings(container) wps_out_dir = get_wps_output_dir(settings) wps_out_url = get_wps_output_url(settings) @@ -208,11 +213,11 @@ def map_wps_output_location(reference, container, url=False, exists=True, file_s reference = reference[7:] if url and reference.startswith(wps_out_dir): wps_out_ref = reference.replace(wps_out_dir, wps_out_url, 1) - if not exists or os.path.isfile(reference): + if not exists or ref_exists(reference): return wps_out_ref elif not url and reference.startswith(wps_out_url): wps_out_ref = reference.replace(wps_out_url, wps_out_dir, 1) - if not exists or os.path.isfile(wps_out_ref): + if not exists or ref_exists(wps_out_ref): if file_scheme: return "file://" + wps_out_ref return wps_out_ref @@ -344,6 +349,7 @@ def get_exception_from_xml_status(xml): Expects the following :term:`XML` status response structure (``ows:Exception`` block can be at any level): .. code-block:: xml + Error message about the cause of the exception. @@ -368,7 +374,7 @@ def get_exception_from_xml_status(xml): if ows_exc_txt is None: ows_exc_txt = ows_exc_xml.xpath("//ows:ExceptionText", namespaces=xml.nsmap)[0] ows_exc_msg = ows_exc_txt.text - ows_exc_loc = ows_exc_xml.attrib.get("locator") or ows_exc_xml.attrib.get("locater") # some WPS have typo + ows_exc_loc = ows_exc_xml.attrib.get("locator") or ows_exc_xml.attrib.get("locater") # noqa # WPS can have typo ows_exc_code = ows_exc_xml.attrib["exceptionCode"] for ows_exc_name in dir(owsexceptions): ows_exc_cls = getattr(owsexceptions, ows_exc_name) @@ -437,7 +443,7 @@ def load_pywps_config(container, config=None): output_dir = get_wps_output_dir(settings) make_dirs(output_dir, exist_ok=True) # find output url from app config (path/url) or wps config (url only) - # note: needs to be configured even when using S3 bucket since XML status is provided locally + # note: needs to be configured even when using AWS S3 bucket since XML status is provided locally if "weaver.wps_output_url" not in settings: output_path = settings.get("weaver.wps_output_path", "").rstrip("/") if output_path and isinstance(output_path, str): @@ -450,7 +456,7 @@ def load_pywps_config(container, config=None): make_dirs(settings["weaver.wps_workdir"], exist_ok=True) pywps_config.CONFIG.set("server", "workdir", settings["weaver.wps_workdir"]) - # configure S3 bucket if requested, storage of all process outputs + # configure AWS S3 bucket if requested, storage of all process outputs # note: # credentials and default profile are picked up automatically by 'boto3' from local AWS configs or env vars # region can also be picked from there unless explicitly provided by weaver config @@ -462,20 +468,23 @@ def load_pywps_config(container, config=None): pywps_config.CONFIG.set("server", "storagetype", "file") # pywps_config.CONFIG.set("server", "storagetype", "s3") if s3_bucket: - LOGGER.debug("Updating WPS S3 bucket configuration.") + LOGGER.debug("Updating WPS AWS S3 bucket configuration.") import boto3 from botocore.exceptions import ClientError s3 = boto3.client("s3") s3_region = settings.get("weaver.wps_output_s3_region", s3.meta.region_name) - LOGGER.info("Validating that S3 [Bucket=%s, Region=%s] exists or creating it.", s3_bucket, s3_region) + LOGGER.info("Validating that AWS S3 [Region=%s, Bucket=%s] exists or creating it.", s3_region, s3_bucket) + validate_s3(region=s3_region, bucket=s3_bucket) try: - s3.create_bucket(Bucket=s3_bucket, 
CreateBucketConfiguration={"LocationConstraint": s3_region}) - LOGGER.info("S3 bucket for WPS output created.") + s3.create_bucket(Bucket=s3_bucket, + CreateBucketConfiguration={"LocationConstraint": s3_region}) # type: ignore + LOGGER.info("AWS S3 bucket [Region=%s, Bucket=%s] for WPS output created.", s3_region, s3_bucket) except ClientError as exc: if exc.response.get("Error", {}).get("Code") != "BucketAlreadyExists": - LOGGER.error("Failed setup of S3 bucket for WPS output: [%s]", exc) + LOGGER.error("Failed setup of AWS S3 bucket [Region=%s, Bucket=%s] for WPS output: [%s]", + s3_region, s3_bucket, exc) raise - LOGGER.info("S3 bucket for WPS output already exists.") + LOGGER.info("AWS S3 bucket [Region=%s, Bucket=%s] for WPS output already exists.", s3_region, s3_bucket) pywps_config.CONFIG.set("s3", "region", s3_region) pywps_config.CONFIG.set("s3", "bucket", s3_bucket) pywps_config.CONFIG.set("s3", "public", "false") # don't automatically push results as publicly accessible diff --git a/weaver/wps_restapi/colander_extras.py b/weaver/wps_restapi/colander_extras.py index 665cdb802..2e676ab7b 100644 --- a/weaver/wps_restapi/colander_extras.py +++ b/weaver/wps_restapi/colander_extras.py @@ -69,6 +69,7 @@ ) from cornice_swagger.converters.schema import ( STRING_FORMATTERS, + BaseStringTypeConverter, NumberTypeConverter, ObjectTypeConverter, TypeConversionDispatcher, @@ -98,6 +99,11 @@ OpenAPISpecParameter ) +try: + RegexPattern = re.Pattern +except AttributeError: # Python 3.6 backport # pragma: no cover + RegexPattern = type(re.compile("_")) + # pylint: disable=C0209,consider-using-f-string @@ -111,6 +117,18 @@ # colander.Enum, # not supported but could be (literal int/str inferred from Python Enum object) ]) +# patch URL with negative look-ahead to invalidate following // after scheme +NO_DOUBLE_SLASH_PATTERN = r"(?!.*//.*$)" +URL_REGEX = colander.URL_REGEX.replace(r"://)?", rf"://)?{NO_DOUBLE_SLASH_PATTERN}") +URL = colander.Regex(URL_REGEX, msg=colander._("Must be a URL"), flags=re.IGNORECASE) +URI_REGEX = colander.URI_REGEX.replace(r"://", r"://(?!//)") +FILE_URI = colander.Regex(URI_REGEX, msg=colander._("Must be a file:// URI scheme"), flags=re.IGNORECASE) +STRING_FORMATTERS.update({ + "uri": {"converter": BaseStringTypeConverter, "validator": URL}, + "url": {"converter": BaseStringTypeConverter, "validator": URL}, + "file": {"converter": BaseStringTypeConverter, "validator": FILE_URI}, +}) + def _make_node_instance(schema_node_or_class): # type: (Union[colander.SchemaNode, Type[colander.SchemaNode]]) -> colander.SchemaNode @@ -161,7 +179,7 @@ def _get_schema_type(schema_node, check=False): def _get_node_name(schema_node, schema_name=False): # type: (colander.SchemaNode, bool) -> str """ - Obtains the name of the node with best available value. + Obtains the name of the node with the best available value. :param schema_node: node for which to retrieve the name. :param schema_name: @@ -321,17 +339,21 @@ class SchemeURL(colander.Regex): .. 
seealso:: :class:`colander.url` [remote http(s)/ftp(s)] :class:`colander.file_uri` [local file://] + :data:`URL` """ def __init__(self, schemes=None, path_pattern=None, msg=None, flags=re.IGNORECASE): - # type: (Optional[Iterable[str]], Optional[str], Optional[str], Optional[re.RegexFlag]) -> None + # type: (Optional[Iterable[str]], Union[None, str, RegexPattern], Optional[str], Optional[re.RegexFlag]) -> None if not schemes: schemes = [""] if not msg: msg = colander._(f"Must be a URL matching one of schemes {schemes}") # noqa regex_schemes = r"(?:" + "|".join(schemes) + r")" - regex = colander.URL_REGEX.replace(r"(?:http|ftp)s?", regex_schemes) + regex = URL_REGEX.replace(r"(?:http|ftp)s?", regex_schemes) + if path_pattern: + if isinstance(path_pattern, RegexPattern): + path_pattern = path_pattern.pattern regex = regex[:-1] + path_pattern + "$" super(SchemeURL, self).__init__(regex, msg=msg, flags=flags) @@ -640,7 +662,7 @@ def __init__(self, *args, **kwargs): if self.validator is None and isinstance(schema_type, colander.String): _format = kwargs.pop("format", getattr(self, "format", None)) pattern = kwargs.pop("pattern", getattr(self, "pattern", None)) - if isinstance(pattern, str): + if isinstance(pattern, (str, RegexPattern)): self.validator = colander.Regex(pattern) elif isinstance(pattern, colander.Regex): self.validator = pattern @@ -1660,7 +1682,9 @@ class OneOfWithRequiredFields(OneOfKeywordSchema, RequiredByBoth): As a shortcut, the OpenAPI keyword ``discriminator`` can be provided to try matching as a last resort. - For example:: + For example: + + .. code-block:: python class Animal(ExtendedMappingSchema): name = ExtendedSchemaNode(String()) @@ -1676,6 +1700,16 @@ class Dog(Animal): [...] # many **OPTIONAL** fields # With the discriminator keyword, following is possible + # (each schema must provide the same property name) + class SomeAnimal(OneOfMappingSchema): + discriminator = "type" + _one_of = [ + Cat(), + Dog(), + ] + + # If more specific mapping resolutions than 1-to-1 by name are needed, + # an explicit dictionary can be specified instead. class SomeAnimal(OneOfMappingSchema): discriminator = { "propertyName": "type", # correspond to 'type' of 'Animal' @@ -2105,6 +2139,17 @@ def _deserialize_keyword(self, cstruct): return ExtendedMappingSchema.deserialize(self, cstruct) +class ExtendedTypeConverter(TypeConverter): + def convert_type(self, schema_node): + # type: (colander.SchemaNode) -> OpenAPISchema + # base type converters expect raw pattern string + # undo the compiled pattern to allow conversion + pattern = getattr(schema_node, "pattern", None) + if isinstance(pattern, RegexPattern): + setattr(schema_node, "pattern", pattern.pattern) + return super(ExtendedTypeConverter, self).convert_type(schema_node) + + class KeywordTypeConverter(TypeConverter): """ Generic keyword converter that builds schema with a list of sub-schemas under the keyword. 
@@ -2248,7 +2293,7 @@ class DecimalTypeConverter(NumberTypeConverter): class MoneyTypeConverter(DecimalTypeConverter): - pattern = "^[0-9]+.[0-9]{2}$" + pattern = re.compile("^[0-9]+.[0-9]{2}$") convert_validator = ValidatorConversionDispatcher( convert_range_validator(colander.Range(min=0)), convert_regex_validator(colander.Regex(pattern, msg="Number must be formatted as currency decimal.")) @@ -2286,6 +2331,17 @@ def __init__(self, custom_converters=None, default_converter=None): if custom_converters: extended_converters.update(custom_converters) super(OAS3TypeConversionDispatcher, self).__init__(extended_converters, default_converter) + self.extend_converters() + + def extend_converters(self): + """ + Extend base :class:`TypeConverter` derived classes to provide additional capabilities seamlessly. + """ + for typ, cvt in self.converters.items(): + if issubclass(cvt, TypeConverter) and not issubclass(cvt, ExtendedTypeConverter): + class Extended(ExtendedTypeConverter, cvt): + __name__ = f"Extended{cvt}" + self.converters[typ] = Extended def __call__(self, schema_node): # type: (colander.SchemaNode) -> OpenAPISchema diff --git a/weaver/wps_restapi/swagger_definitions.py b/weaver/wps_restapi/swagger_definitions.py index 0df845df3..4f99c9b8c 100644 --- a/weaver/wps_restapi/swagger_definitions.py +++ b/weaver/wps_restapi/swagger_definitions.py @@ -15,12 +15,13 @@ # pylint: disable=C0103,invalid-name import datetime import os +import re from copy import copy from typing import TYPE_CHECKING import duration import yaml -from colander import DateTime, Email, Length, Money, OneOf, Range, Regex, drop, null, required +from colander import DateTime, Email, Length, Money, OneOf, Range, drop, null, required from dateutil import parser as date_parser from weaver import __meta__ @@ -37,6 +38,7 @@ CWL_REQUIREMENT_APP_WPS1, CWL_REQUIREMENT_CUDA, CWL_REQUIREMENT_INIT_WORKDIR, + CWL_REQUIREMENT_INLINE_JAVASCRIPT, CWL_REQUIREMENT_NETWORK_ACCESS, OAS_COMPLEX_TYPES, OAS_DATA_TYPES, @@ -51,8 +53,10 @@ from weaver.quotation.status import QuoteStatus from weaver.sort import Sort, SortMethods from weaver.status import JOB_STATUS_CODE_API, JOB_STATUS_SEARCH_API, Status +from weaver.utils import AWS_S3_BUCKET_REFERENCE_PATTERN from weaver.visibility import Visibility from weaver.wps_restapi.colander_extras import ( + NO_DOUBLE_SLASH_PATTERN, AllOfKeywordSchema, AnyOfKeywordSchema, BoundedRange, @@ -299,14 +303,17 @@ class SLUG(ExtendedSchemaNode): schema_type = String description = "Slug name pattern." example = "some-object-slug-name" - pattern = r"^[A-Za-z0-9]+(?:(-|_)[A-Za-z0-9]+)*$" + pattern = re.compile(r"^[A-Za-z0-9]+(?:[-_][A-Za-z0-9]+)*$") class Tag(ExtendedSchemaNode): schema_type = String - description = "Identifier with optional tagged version forming an unique reference." + description = "Identifier with optional tagged version forming a unique reference." # ranges used to remove starting/ending ^$ characters - pattern = SLUG.pattern[:-1] + rf"(:{SemanticVersion(v_prefix=False, rc_suffix=False).pattern[1:-1]})?$" + pattern = re.compile( + rf"{SLUG.pattern.pattern[:-1]}" + rf"(:{SemanticVersion(v_prefix=False, rc_suffix=False).pattern[1:-1]})?$" + ) class URL(ExtendedSchemaNode): @@ -319,7 +326,7 @@ class MediaType(ExtendedSchemaNode): schema_type = String description = "IANA identifier of content and format." 
example = ContentType.APP_JSON - pattern = r"^\w+\/[-.\w]+(?:\+[-.\w]+)?(?:\;\s*.+)*$" + pattern = re.compile(r"^\w+\/[-.\w]+(?:\+[-.\w]+)?(?:\;\s*.+)*$") class QueryBoolean(Boolean): @@ -344,25 +351,30 @@ class DateTimeInterval(ExtendedSchemaNode): "to get values with a specific date-time just pass the datetime. " ) example = "2022-03-02T03:32:38.487000+00:00/.." - regex_datetime = r"(\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?(([+-]\d\d:\d\d)|Z)?)" - regex_interval_closed = fr"{regex_datetime}\/{regex_datetime}" - regex_interval_open_start = fr"\.\.\/{regex_datetime}" - regex_interval_open_end = fr"{regex_datetime}\/\.\." - - pattern = fr"^{regex_datetime}|{regex_interval_closed}|{regex_interval_open_start}|{regex_interval_open_end}$" + regex_datetime = re.compile(r"(\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?(([+-]\d\d:\d\d)|Z)?)") + regex_interval_closed = re.compile(rf"{regex_datetime.pattern}\/{regex_datetime.pattern}") + regex_interval_open_start = re.compile(rf"\.\.\/{regex_datetime.pattern}") + regex_interval_open_end = re.compile(rf"{regex_datetime.pattern}\/\.\.") + pattern = re.compile( + rf"^{regex_datetime.pattern}" + rf"|{regex_interval_closed.pattern}" + rf"|{regex_interval_open_start.pattern}" + rf"|{regex_interval_open_end.pattern}" + r"$" + ) -class S3Bucket(ExtendedSchemaNode): +class S3BucketReference(ExtendedSchemaNode): schema_type = String - description = "S3 bucket shorthand URL representation [s3://{bucket}/{job-uuid}/{output}.ext]" - pattern = r"^s3://\S+$" + description = "S3 bucket shorthand URL representation: 's3://{bucket}/[{dirs}/][{file-key}]'" + pattern = AWS_S3_BUCKET_REFERENCE_PATTERN class FileLocal(ExtendedSchemaNode): schema_type = String description = "Local file reference." format = "file" - validator = Regex(r"^(file://)?(?:/|[/?]\S+)$") + pattern = re.compile(rf"^(file://)?{NO_DOUBLE_SLASH_PATTERN}(?:/|[/?]\S+)$") class FileURL(ExtendedSchemaNode): @@ -376,7 +388,7 @@ class VaultReference(ExtendedSchemaNode): schema_type = String description = "Vault file reference." example = "vault://399dc5ac-ff66-48d9-9c02-b144a975abe4" - pattern = r"^vault://[a-f0-9]{8}(?:-?[a-f0-9]{4}){3}-?[a-f0-9]{12}$" + pattern = re.compile(r"^vault://[a-f0-9]{8}(?:-?[a-f0-9]{4}){3}-?[a-f0-9]{12}$") class ProcessURL(ExtendedSchemaNode): @@ -390,7 +402,7 @@ class ReferenceURL(AnyOfKeywordSchema): _any_of = [ FileURL(), FileLocal(), - S3Bucket(), + S3BucketReference(), ] @@ -398,7 +410,7 @@ class ExecuteReferenceURL(AnyOfKeywordSchema): _any_of = [ FileURL(), FileLocal(), - S3Bucket(), + S3BucketReference(), VaultReference(), ] @@ -408,7 +420,7 @@ class UUID(ExtendedSchemaNode): description = "Unique identifier." 
example = "a9d14bf4-84e0-449a-bac8-16e598efe807" format = "uuid" - pattern = "^[a-f0-9]{8}(?:-?[a-f0-9]{4}){3}-?[a-f0-9]{12}$" + pattern = re.compile("^[a-f0-9]{8}(?:-?[a-f0-9]{4}){3}-?[a-f0-9]{12}$") title = "UUID" @@ -699,7 +711,7 @@ class Link(LinkRelationship, LinkBase): class MetadataValue(NotKeywordSchema, ValueLanguage, MetadataBase): _not = [ # make sure value metadata does not allow 'rel' and 'hreflang' reserved for link reference - # explicitly refuse them such that when an href/rel link is provided, only link details are possible + # explicitly refuse them such that when a href/rel link is provided, only link details are possible LinkRelationship(description="Field 'rel' must refer to a link reference with 'href'."), LinkLanguage(description="Field 'hreflang' must refer to a link reference with 'href'."), ] @@ -774,27 +786,27 @@ class Format(ExtendedMappingSchema): schema = FormatSchema(missing=drop) -class DeployFormatDefaultMimeType(FormatMimeType): +class FormatDefaultMimeType(FormatMimeType): description = ( "Format for process input are assumed plain/text if the media-type was omitted and is not one of the known " "formats by this instance. When executing a job, the best match against supported formats by the process " - "definition will be used to run the process, and will fallback to the default as last resort." + "definition will be used to run the process, and will fall back to the default as last resort." ) # NOTE: # The default is overridden from FormatMimeType since the FormatSelection 'oneOf' always fails, - # due to the 'default' value which is always generated and it causes the presence of both Format and FormatMimeType + # due to the 'default' value which is always generated, and it causes the presence of both Format and FormatMimeType mimeType = MediaType(example=ContentType.APP_JSON) -class DeployFormatDefault(Format): +class FormatDefaultMediaType(Format): description = ( "Format for process input are assumed plain/text if the media-type was omitted and is not one of the known " "formats by this instance. When executing a job, the best match against supported formats by the process " - "definition will be used to run the process, and will fallback to the default as last resort." + "definition will be used to run the process, and will fall back to the default as last resort." ) # NOTE: # The default is overridden from Format since the FormatSelection 'oneOf' always fails, - # due to the 'default' value which is always generated and it causes the presence of both Format and FormatMimeType + # due to the 'default' value which is always generated, and it causes the presence of both Format and FormatMimeType mediaType = MediaType(example=ContentType.APP_JSON) @@ -803,15 +815,15 @@ class FormatSelection(OneOfKeywordSchema): Validation against ``mimeType`` or ``mediaType`` format. .. seealso:: - - :class:`DeployFormatDefault` - - :class:`DeployFormatDefaultMimeType` + - :class:`FormatDefaultMediaType` + - :class:`FormatDefaultMimeType` .. note:: Format are validated to be retro-compatible with pre-existing/deployed/remote processes. 
""" _one_of = [ - DeployFormatDefault(), - DeployFormatDefaultMimeType() + FormatDefaultMediaType(), + FormatDefaultMimeType() ] @@ -1207,7 +1219,7 @@ class DeployMinMaxOccurs(ExtendedMappingSchema): maxOccurs = MaxOccursDefinition(default=null, missing=null) -# does not inherit from 'DescriptionLinks' because other 'ProcessDescription<>' schema depend from this without 'links' +# does not inherit from 'DescriptionLinks' because other 'ProcessDescription<>' schema depend on this without 'links' class ProcessDescriptionType(DescriptionBase, DescriptionExtra): id = ProcessIdentifierTag() version = Version(missing=None, default=None, example="1.2.3") @@ -2984,7 +2996,7 @@ class ProcessDeploymentProfile(ExtendedMappingSchema): class Process( # following are like 'ProcessSummary', - # except without 'ProcessControl' and 'DescriptionLinks' that are outside of nested 'process' + # except without 'ProcessControl' and 'DescriptionLinks' that are outside the nested 'process' ProcessDescriptionType, DescriptionMeta, # following are additional fields only in description, just like for OGC-API ProcessDescription ProcessContext, ProcessVisibility, ProcessLocations @@ -3312,7 +3324,7 @@ class ExecuteInputReference(Reference): class ExecuteInputFile(AnyOfKeywordSchema): - _any_of = [ + _any_of = [ # 'href' required for both to provide file link/reference ExecuteInputFileLink(), # 'OGC' schema with 'type: ' ExecuteInputReference(), # 'OLD' schema with 'format: {mimeType|mediaType: }' ] @@ -3370,8 +3382,8 @@ class ExecuteInputInlineOrRefData(OneOfKeywordSchema): schema_ref = f"{OGC_API_SCHEMA_CORE}/inlineOrRefData.yaml" _one_of = [ ExecuteInputInlineValue(), # - ExecuteInputQualifiedValue(), # {"value": } - ExecuteInputFile(), # 'href' with either 'type' (OGC) or 'format' (OLD) + ExecuteInputQualifiedValue(), # {"value": , "mediaType": "<>", "schema": } + ExecuteInputFile(), # 'href' with either 'type' (OGC) or 'format' (OLD) # FIXME: other types here, 'bbox+crs', 'collection', 'nested process', etc. ] @@ -3715,12 +3727,14 @@ class DockerGpuRequirementClass(DockerGpuRequirementSpecification): _class = RequirementClass(example=CWL_REQUIREMENT_APP_DOCKER_GPU, validator=OneOf([CWL_REQUIREMENT_APP_DOCKER_GPU])) -class DirectoryListing(PermissiveMappingSchema): +class DirectoryListingItem(PermissiveMappingSchema): entry = ExtendedSchemaNode(String(), missing=drop) + entryname = ExtendedSchemaNode(String(), missing=drop) + writable = ExtendedSchemaNode(Boolean(), missing=drop) class InitialWorkDirListing(ExtendedSequenceSchema): - listing = DirectoryListing() + item = DirectoryListingItem() class InitialWorkDirRequirementSpecification(PermissiveMappingSchema): @@ -3736,6 +3750,32 @@ class InitialWorkDirRequirementClass(InitialWorkDirRequirementSpecification): validator=OneOf([CWL_REQUIREMENT_INIT_WORKDIR])) +class InlineJavascriptLibraries(ExtendedSequenceSchema): + description = ( + "Additional code fragments that will also be inserted before executing the expression code. " + "Allows for function definitions that may be called from CWL expressions." + ) + exp_lib = ExtendedSchemaNode(String(), missing=drop) + + +class InlineJavascriptRequirementSpecification(PermissiveMappingSchema): + description = ( + "Indicates that the workflow platform must support inline Javascript expressions. " + "If this requirement is not present, the workflow platform must not perform expression interpolation. 
" + "https://www.commonwl.org/v1.2/CommandLineTool.html#InlineJavascriptRequirement" + ) + expressionLib = InlineJavascriptLibraries(missing=drop) + + +class InlineJavascriptRequirementMap(ExtendedMappingSchema): + req = InlineJavascriptRequirementSpecification(name=CWL_REQUIREMENT_INLINE_JAVASCRIPT) + + +class InlineJavascriptRequirementClass(InlineJavascriptRequirementSpecification): + _class = RequirementClass(example=CWL_REQUIREMENT_INLINE_JAVASCRIPT, + validator=OneOf([CWL_REQUIREMENT_INLINE_JAVASCRIPT])) + + class BuiltinRequirementSpecification(PermissiveMappingSchema): title = CWL_REQUIREMENT_APP_BUILTIN description = ( @@ -3818,16 +3858,21 @@ class CWLRequirementsMap(AnyOfKeywordSchema): DockerRequirementMap(missing=drop), DockerGpuRequirementMap(missing=drop), InitialWorkDirRequirementMap(missing=drop), + InlineJavascriptRequirementMap(missing=drop), NetworkAccessRequirementMap(missing=drop), PermissiveMappingSchema(missing=drop), ] class CWLRequirementsItem(OneOfKeywordSchema): + # in case there is any conflict between definitions, + # the class field can be used to discriminate which one is expected. + discriminator = "class" _one_of = [ DockerRequirementClass(missing=drop), DockerGpuRequirementClass(missing=drop), InitialWorkDirRequirementClass(missing=drop), + InlineJavascriptRequirementClass(missing=drop), NetworkAccessRequirementClass(missing=drop), UnknownRequirementClass(missing=drop), # allows anything, must be last ] @@ -3851,6 +3896,7 @@ class CWLHintsMap(AnyOfKeywordSchema, PermissiveMappingSchema): DockerRequirementMap(missing=drop), DockerGpuRequirementMap(missing=drop), InitialWorkDirRequirementMap(missing=drop), + InlineJavascriptRequirementMap(missing=drop), NetworkAccessRequirementMap(missing=drop), ESGF_CWT_RequirementMap(missing=drop), OGCAPIRequirementMap(missing=drop), @@ -3868,6 +3914,7 @@ class CWLHintsItem(OneOfKeywordSchema, PermissiveMappingSchema): DockerRequirementClass(missing=drop), DockerGpuRequirementClass(missing=drop), InitialWorkDirRequirementClass(missing=drop), + InlineJavascriptRequirementClass(missing=drop), NetworkAccessRequirementClass(missing=drop), ESGF_CWT_RequirementClass(missing=drop), OGCAPIRequirementClass(missing=drop), diff --git a/weaver/wps_restapi/utils.py b/weaver/wps_restapi/utils.py index 14b2b8c7f..506aaae2c 100644 --- a/weaver/wps_restapi/utils.py +++ b/weaver/wps_restapi/utils.py @@ -20,7 +20,7 @@ from weaver.wps_restapi import swagger_definitions as sd if TYPE_CHECKING: - from typing import Any, Callable, Dict, Optional, Union + from typing import Any, Dict, Optional, Union from weaver.formats import ContentType from weaver.typedefs import ( @@ -30,7 +30,8 @@ AnyRequestType, AnySettingsContainer, HeadersType, - ReturnValue + Params, + Return ) LOGGER = logging.getLogger(__name__) @@ -144,7 +145,7 @@ def get_schema_ref(schema, container=None, ref_type="$schema", ref_name=True): def handle_schema_validation(schema=None): - # type: (Optional[colander.SchemaNode]) -> Callable + # type: (Optional[colander.SchemaNode]) -> AnyCallableWrapped """ Convert a schema validation error into an HTTP error with error details about the failure. 
@@ -154,7 +155,7 @@ def handle_schema_validation(schema=None): def decorator(func): # type: (AnyCallableWrapped) -> AnyCallableWrapped @functools.wraps(func) def wrapped(*args, **kwargs): - # type: (*Any, **Any) -> ReturnValue + # type: (Params.args, Params.kwargs) -> Return try: return func(*args, **kwargs) except colander.Invalid as ex: diff --git a/weaver/xml_util.py b/weaver/xml_util.py index c0ef4cdfc..cd5cb3652 100644 --- a/weaver/xml_util.py +++ b/weaver/xml_util.py @@ -2,12 +2,12 @@ Define a default XML parser that avoids XXE injection. Package :mod:`lxml` is employed directly even though some linters (e.g.: ``bandit``) report to employ ``defusedxml`` -instead, because that package's extension with ``lxml`` is marked as deprecated. +instead, because that package's extension with :mod:`lxml` is marked as deprecated. .. seealso:: https://pypi.org/project/defusedxml/#defusedxml-lxml -To use the module, import is as if importing ``lxml.etree``: +To use the module, import it as if importing :mod:`lxml.etree`: .. code-block:: python @@ -18,12 +18,13 @@ """ from typing import TYPE_CHECKING +from bs4.builder._lxml import LXMLTreeBuilder # noqa: W0212 from lxml import etree as lxml_etree # nosec: B410 # flagged known issue, this is what the applied fix below is about from owslib.wps import etree as owslib_wps_etree if TYPE_CHECKING: from io import BufferedReader - from typing import AnyStr, Union + from typing import Any, AnyStr, Union XML_PARSER = lxml_etree.XMLParser( @@ -64,3 +65,35 @@ def parse(source, parser=XML_PARSER): # override OWSLib call with adjusted method reference with configured parser enforced owslib_wps_etree.fromstring = fromstring + + +HTML = lxml_etree.HTML + + +def _lxml_tree_parser_maker(**parser_kwargs): + # type: (**Any) -> lxml_etree.HTMLParser + """ + Generate the XML/HTML tree parser. + + Uses parameters similar to those in :meth:`bs4.builder._lxml.LXMLTreeBuilderForXML.default_parser`, + but overrides some options to make it more secure. + + Without this modification, the builder is usually created using: + + .. code-block:: python + + etree.XMLParser(target=self, strip_cdata=False, recover=True, encoding=encoding) + """ + parser_kwargs.update(dict( + no_network=True, + remove_pis=True, + huge_tree=False, + strip_cdata=True, + recover=True, + )) + return lxml_etree.HTMLParser(**parser_kwargs) + + +HTML_PARSER = _lxml_tree_parser_maker() +HTML_TREE_BUILDER = LXMLTreeBuilder(parser=_lxml_tree_parser_maker) +LXML_TREE_BUILDER = HTML_TREE_BUILDER
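As a usage sketch, the hardened parser above can be exercised directly with plain :mod:`lxml`; the option values are the ones forced by ``_lxml_tree_parser_maker``, while the sample markup is illustrative only:

.. code-block:: python

    from lxml import etree

    # same hardened options as enforced above: no network access while parsing,
    # processing instructions dropped, CDATA stripped, recovery enabled
    parser = etree.HTMLParser(
        no_network=True,
        remove_pis=True,
        huge_tree=False,
        strip_cdata=True,
        recover=True,
    )
    html = "<html><body><a href='dir/file.txt'>file.txt</a></body></html>"
    tree = etree.fromstring(html, parser=parser)
    print([link.get("href") for link in tree.iter("a")])  # ['dir/file.txt']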