common-workflow-language · mr-c · Jun 5, 2023 · Jun 12, 2023 · Jun 15, 2023 · Jun 15, 2023
diff --git a/cwlref-runner/README b/cwlref-runner/README
@@ -1,4 +1,4 @@
-This an optional companion package to "cwltool" which provides provides an
+This is an optional companion package to "cwltool" which provides an
 additional entry point under the alias "cwl-runner", which is the
 implementation-agnostic name for the default CWL interpreter installed on a
 host.
diff --git a/cwlref-runner/setup.py b/cwlref-runner/setup.py
@@ -4,22 +4,23 @@
 from setuptools import setup, find_packages
 
 SETUP_DIR = os.path.dirname(__file__)
-README = os.path.join(SETUP_DIR, 'README')
+README = os.path.join(SETUP_DIR, "README")
 
-setup(name='cwlref-runner',
-      version='1.0',
-      description='Common workflow language reference implementation',
-      long_description=open(README).read(),
-      author='Common workflow language working group',
-      author_email='[email protected]',
-      url="http://www.commonwl.org",
-      download_url="https://github.com/common-workflow-language/common-workflow-language",
-      license='Apache 2.0',
-      install_requires=[
-          'cwltool'
-        ],
-      entry_points={
-          'console_scripts': [ "cwl-runner=cwltool.main:main" ]
-      },
-      zip_safe=True
+# Read the README through a context manager so the file handle is closed
+# deterministically instead of being left to the garbage collector.
+with open(README, encoding="utf-8") as readme_file:
+    LONG_DESCRIPTION = readme_file.read()
+
+setup(
+    name="cwlref-runner",
+    version="1.0",
+    description="Common workflow language reference implementation",
+    long_description=LONG_DESCRIPTION,
+    author="Common workflow language working group",
+    author_email="[email protected]",
+    url="http://www.commonwl.org",
+    download_url="https://github.com/common-workflow-language/common-workflow-language",
+    license="Apache 2.0",
+    install_requires=["cwltool"],
+    entry_points={"console_scripts": ["cwl-runner=cwltool.main:main"]},
+    zip_safe=True,
 )
diff --git a/cwltool/argparser.py b/cwltool/argparser.py
@@ -287,6 +287,24 @@ def arg_parser() -> argparse.ArgumentParser:
         type=str,
     )
 
+    # TODO: Not yet implemented
+    provgroup.add_argument(
+        "--no-data",  # Maybe change to no-input and no-intermediate to ignore those kinds of files?
+        default=False,
+        action="store_true",
+        help="Disables the storage of input and output data files in provenance folder",
+        dest="no_data",
+    )
+
+    # TODO: Not yet implemented
+    provgroup.add_argument(
+        "--no-input",  # Narrower variant of --no-data: only input data files are skipped
+        default=False,
+        action="store_true",
+        help="Disables the storage of input data files in provenance folder",
+        dest="no_input",
+    )
+
     printgroup = parser.add_mutually_exclusive_group()
     printgroup.add_argument(
         "--print-rdf",

diff --git a/cwltool/builder.py b/cwltool/builder.py
@@ -573,6 +573,10 @@ def addsf(
                 datum = cast(CWLObjectType, datum)
                 ll = schema.get("loadListing") or self.loadListing
                 if ll and ll != "no_listing":
+                    # Debug trace of every field in datum; lazy %-args skip formatting when DEBUG is off.
+                    for key, field in datum.items():
+                        _logger.debug("Datum:  %s: %s", key, field)
+                    _logger.debug("----------------------------------------")
                     get_listing(
                         self.fs_access,
                         datum,

diff --git a/cwltool/cwlprov/__init__.py b/cwltool/cwlprov/__init__.py
@@ -6,7 +6,11 @@
 import re
 import uuid
 from getpass import getuser
-from typing import IO, Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union
+from typing import IO, Any, Dict, List, Optional, Tuple, TypedDict, Union
+
+from cwltool.cwlprov.provenance_constants import Hasher
+
+from ..loghandler import _logger
 
 
 def _whoami() -> Tuple[str, str]:
@@ -135,17 +139,16 @@
 def checksum_copy(
     src_file: IO[Any],
     dst_file: Optional[IO[Any]] = None,
-    hasher: Optional[Callable[[], "hashlib._Hash"]] = None,
+    hasher: Optional[str] = Hasher,
     buffersize: int = 1024 * 1024,
 ) -> str:
     """Compute checksums while copying a file."""
-    # TODO: Use hashlib.new(Hasher_str) instead?
     if hasher:
-        checksum = hasher()
+        checksum = hashlib.new(hasher)
     else:
         from .provenance_constants import Hasher
 
-        checksum = Hasher()
+        checksum = hashlib.new(Hasher)
     contents = src_file.read(buffersize)
     if dst_file and hasattr(dst_file, "name") and hasattr(src_file, "name"):
         temp_location = os.path.join(os.path.dirname(dst_file.name), str(uuid.uuid4()))
@@ -158,6 +161,34 @@
             pass
         if os.path.exists(temp_location):
             os.rename(temp_location, dst_file.name)  # type: ignore
+
+    return content_processor(contents, src_file, dst_file, checksum, buffersize)
+
+
+def checksum_only(
+    src_file: IO[Any],
+    dst_file: Optional[IO[Any]] = None,
+    hasher: str = Hasher,
+    buffersize: int = 1024 * 1024,
+) -> str:
+    """Calculate the checksum only; never copies data, even if dst_file is given."""
+    if dst_file is not None:
+        _logger.error(
+            "[Debug Checksum Only] Destination file should be None but it is %s", dst_file
+        )
+    checksum = hashlib.new(hasher)
+    contents = src_file.read(buffersize)
+    return content_processor(contents, src_file, None, checksum, buffersize)
+
+
+def content_processor(
+    contents: Any,
+    src_file: IO[Any],
+    dst_file: Optional[IO[Any]],
+    checksum: "hashlib._Hash",
+    buffersize: int,
+) -> str:
+    """Calculate the checksum based on the content."""
     while contents != b"":
         if dst_file is not None:
             dst_file.write(contents)

diff --git a/cwltool/cwlprov/provenance_constants.py b/cwltool/cwlprov/provenance_constants.py
@@ -1,4 +1,3 @@
-import hashlib
 import os
 import uuid
 
@@ -18,7 +17,12 @@
 
 # Research Object folders
 METADATA = "metadata"
+# data folder and its sub-folders
 DATA = "data"
+INPUT_DATA = "data/input"
+INTM_DATA = "data/intermediate"
+OUTPUT_DATA = "data/output"
+
 WORKFLOW = "workflow"
 SNAPSHOT = "snapshot"
 # sub-folders
@@ -43,10 +47,11 @@
 # sha1, compatible with the File type's "checksum" field
 # e.g. "checksum" = "sha1$47a013e660d408619d894b20806b1d5086aab03b"
 # See ./cwltool/schemas/v1.0/Process.yml
-Hasher = hashlib.sha1
 SHA1 = "sha1"
 SHA256 = "sha256"
 SHA512 = "sha512"
+# Name of the default hash algorithm (SHA1), passed as a string to hashlib.new()
+Hasher = SHA1
 
 # TODO: Better identifiers for user, at least
 # these should be preserved in ~/.config/cwl for every execution

diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py
@@ -31,6 +31,8 @@
 from ..stdfsaccess import StdFsAccess
 from ..utils import CWLObjectType, JobsType, get_listing, posix_path, versionstring
 from ..workflow_job import WorkflowJob
+
+# from . import provenance_constants
 from .provenance_constants import (
     ACCOUNT_UUID,
     CWLPROV,
@@ -43,11 +45,14 @@
     SCHEMA,
     SHA1,
     SHA256,
+    Hasher,
     TEXT_PLAIN,
     UUID,
     WF4EVER,
     WFDESC,
     WFPROV,
+    INPUT_DATA,
+    OUTPUT_DATA,
 )
 from .writablebagfile import create_job, write_bag_file  # change this later
 
@@ -111,14 +116,24 @@
             _logger.debug("[provenance] Creator Full name: %s", self.full_name)
         self.workflow_run_uuid = run_uuid or uuid.uuid4()
         self.workflow_run_uri = self.workflow_run_uuid.urn
+        # Defaults to INPUT_DATA; currently only INPUT_DATA and OUTPUT_DATA are valid values.
+        self.current_data_source = INPUT_DATA
         self.generate_prov_doc()
 
     def __str__(self) -> str:
         """Represent this Provenvance profile as a string."""
         return f"ProvenanceProfile <{self.workflow_run_uri}> in <{self.research_object}>"
 
     def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
-        """Add basic namespaces."""
+        """Generate a provenance document.
+
+        This method adds basic namespaces to the provenance document and records host provenance.
+        It also adds information about the cwltool version, namespaces for various entities,
+        and creates agents, activities, and associations to represent the workflow execution.
+
+        Returns:
+            A tuple containing the workflow run URI and the generated ProvDocument.
+        """
 
         def host_provenance(document: ProvDocument) -> None:
             """Record host provenance."""
@@ -152,7 +167,7 @@
         #  https://tools.ietf.org/html/draft-thiemann-hash-urn-01
         # TODO: Change to nih:sha-256; hashes
         #  https://tools.ietf.org/html/rfc6920#section-7
-        self.document.add_namespace("data", "urn:hash::sha1:")
+        self.document.add_namespace("data", f"urn:hash::{Hasher}:")
         # Also needed for docker images
         self.document.add_namespace(SHA256, "nih:sha-256;")
 
@@ -287,6 +302,7 @@
         process_run_id: str,
         outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
         when: datetime.datetime,
+        # load_listing: None,
     ) -> None:
         self.generate_output_prov(outputs, process_run_id, process_name)
         self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when)
@@ -300,14 +316,19 @@
         if "checksum" in value:
             csum = cast(str, value["checksum"])
             (method, checksum) = csum.split("$", 1)
-            if method == SHA1 and self.research_object.has_data_file(checksum):
+            # TODO: decide how intermediate files should be looked up here
+            if method == SHA1 and self.research_object.has_data_file(
+                self.current_data_source, checksum
+            ):
                 entity = self.document.entity("data:" + checksum)
 
         if not entity and "location" in value:
             location = str(value["location"])
             # If we made it here, we'll have to add it to the RO
             with self.fsaccess.open(location, "rb") as fhandle:
-                relative_path = self.research_object.add_data_file(fhandle)
+                relative_path = self.research_object.add_data_file(
+                    fhandle, current_source=self.current_data_source
+                )
                 # FIXME: This naively relies on add_data_file setting hash as filename
                 checksum = PurePath(relative_path).name
                 entity = self.document.entity("data:" + checksum, {PROV_TYPE: WFPROV["Artifact"]})
@@ -408,8 +429,10 @@
         # a later call to this method will sort that
         is_empty = True
 
-        if "listing" not in value:
-            get_listing(self.fsaccess, value)
+        # Load the listing unless disabled; recurse only when loadListing is "deep_listing".
+        load_listing = value.get("loadListing")
+        if load_listing and load_listing != "no_listing":
+            get_listing(self.fsaccess, value, load_listing == "deep_listing")
         for entry in cast(MutableSequence[CWLObjectType], value.get("listing", [])):
             is_empty = False
             # Declare child-artifacts
@@ -472,7 +495,9 @@
     def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
         """Save as string in UTF-8."""
         byte_s = BytesIO(str(value).encode(ENCODING))
-        data_file = self.research_object.add_data_file(byte_s, content_type=TEXT_PLAIN)
+        data_file = self.research_object.add_data_file(
+            byte_s, current_source=self.current_data_source, content_type=TEXT_PLAIN
+        )
         checksum = PurePosixPath(data_file).name
         # FIXME: Don't naively assume add_data_file uses hash in filename!
         data_id = f"data:{PurePosixPath(data_file).stem}"
@@ -505,7 +530,9 @@
         if isinstance(value, bytes):
             # If we got here then we must be in Python 3
             byte_s = BytesIO(value)
-            data_file = self.research_object.add_data_file(byte_s)
+            data_file = self.research_object.add_data_file(
+                byte_s, current_source=self.current_data_source
+            )
             # FIXME: Don't naively assume add_data_file uses hash in filename!
             data_id = f"data:{PurePosixPath(data_file).stem}"
             return self.document.entity(
@@ -604,6 +631,7 @@
         job_order: Union[CWLObjectType, List[CWLObjectType]],
         process_run_id: str,
         name: Optional[str] = None,
+        # load_listing=None,
     ) -> None:
         """Add used() for each data artefact."""
         if isinstance(job_order, list):
@@ -634,7 +662,17 @@
         process_run_id: Optional[str],
         name: Optional[str],
     ) -> None:
-        """Call wasGeneratedBy() for each output,copy the files into the RO."""
+        """
+        Call wasGeneratedBy() for each output, copy the files into the RO.
+
+        Sets self.current_data_source to OUTPUT_DATA. That value is later
+        passed to the research object's add_data_file() as current_source,
+        so the data can be saved in the matching folder (input or output;
+        possibly intermediate in the future). Tracking the source on the
+        instance avoids changing the provenance_constants DATA value.
+        """
+        self.current_data_source = OUTPUT_DATA
+
         if isinstance(final_output, MutableSequence):
             for entry in final_output:
                 self.generate_output_prov(entry, process_run_id, name)
@@ -660,6 +698,7 @@
                 self.document.wasGeneratedBy(
                     entity, process_run_id, timestamp, None, {"prov:role": role}
                 )
+        # NOTE(review): current_data_source stays OUTPUT_DATA after this call; confirm intended.
 
     def prospective_prov(self, job: JobsType) -> None:
         """Create prospective prov recording as wfdesc prov:Plan."""
@@ -733,6 +772,8 @@
         # TODO: Also support other profiles than CWLProv, e.g. ProvOne
 
         # list of prov identifiers of provenance files
+        # NOTE: prov_ids are file names prepared for provenance/RO files in
+        # metadata/provenance for each sub-workflow of main workflow
         prov_ids = []
 
         # https://www.w3.org/TR/prov-xml/