diff --git a/pyproject.toml b/pyproject.toml
index 47b2f37..7bb2df0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,10 +20,10 @@ dependencies = [
     "boto3",
     "boto3-stubs[s3]",
     "pydantic-settings>=2.0",
-    "pydantic>=2.7,<2.9",
+    "pydantic>=2.10",
     "pymongo==4.3.3",
     "dask==2023.5.0",
-    "aind-data-schema==1.0.0",
+    "aind-data-schema==1.2.0",
     "aind-codeocean-api==0.5.0",
 ]
diff --git a/src/aind_data_asset_indexer/__init__.py b/src/aind_data_asset_indexer/__init__.py
index 9dd0800..24dc33a 100644
--- a/src/aind_data_asset_indexer/__init__.py
+++ b/src/aind_data_asset_indexer/__init__.py
@@ -1,3 +1,3 @@
 """Package"""
 
-__version__ = "0.12.0"
+__version__ = "0.13.0"
diff --git a/src/aind_data_asset_indexer/aind_bucket_indexer.py b/src/aind_data_asset_indexer/aind_bucket_indexer.py
index eb08449..48899a4 100644
--- a/src/aind_data_asset_indexer/aind_bucket_indexer.py
+++ b/src/aind_data_asset_indexer/aind_bucket_indexer.py
@@ -11,6 +11,7 @@
 
 import boto3
 import dask.bag as dask_bag
+from aind_data_schema.base import is_dict_corrupt
 from mypy_boto3_s3 import S3Client
 from mypy_boto3_s3.type_defs import CopySourceTypeDef
 from pymongo import MongoClient
@@ -30,7 +31,6 @@
     get_dict_of_file_info,
     get_s3_bucket_and_prefix,
     get_s3_location,
-    is_dict_corrupt,
     is_prefix_valid,
     is_record_location_valid,
     iterate_through_top_level,
@@ -215,15 +215,15 @@ def _resolve_schema_information(
         The fields in the DocDb record that will require updating.
         """
         docdb_record_fields_to_update = dict()
-        for core_schema_file_name in core_schema_file_names:
-            field_name = core_schema_file_name.replace(".json", "")
+        for (
+            field_name,
+            core_schema_file_name,
+        ) in core_schema_file_names.items():
             is_in_record = docdb_record.get(field_name) is not None
             is_in_root = (
                 core_schema_info_in_root.get(core_schema_file_name) is not None
             )
-            is_in_copy_subdir = (
-                core_schema_file_name in list_of_schemas_in_copy_subdir
-            )
+            is_in_copy_subdir = field_name in list_of_schemas_in_copy_subdir
             # To avoid copying and pasting the same arguments, we'll keep
             # them in a dict
             common_kwargs = {
@@ -580,8 +580,9 @@ def _process_prefix(
                         collection = db[
                             self.job_settings.doc_db_collection_name
                         ]
-                        if "_id" in json_contents:
-                            # TODO: check is_dict_corrupt(json_contents)
+                        if "_id" in json_contents and not is_dict_corrupt(
+                            json_contents
+                        ):
                             logging.info(
                                 f"Adding record to docdb for: {location}"
                             )
@@ -602,6 +603,10 @@
                                     self.job_settings.copy_original_md_subdir
                                 ),
                             )
+                        elif "_id" in json_contents:
+                            logging.warning(
+                                f"Metadata record for {location} is corrupt!"
+                            )
                         else:
                             logging.warning(
                                 f"Metadata record for {location} "
diff --git a/src/aind_data_asset_indexer/utils.py b/src/aind_data_asset_indexer/utils.py
index e0a7870..16f5ac1 100644
--- a/src/aind_data_asset_indexer/utils.py
+++ b/src/aind_data_asset_indexer/utils.py
@@ -11,8 +11,12 @@
 
 from aind_codeocean_api.codeocean import CodeOceanClient
 from aind_data_schema.core.data_description import DataLevel, DataRegex
-from aind_data_schema.core.metadata import ExternalPlatforms, Metadata
-from aind_data_schema.utils.json_writer import SchemaWriter
+from aind_data_schema.core.metadata import CORE_FILES as CORE_SCHEMAS
+from aind_data_schema.core.metadata import (
+    ExternalPlatforms,
+    Metadata,
+    create_metadata_json,
+)
 from botocore.exceptions import ClientError
 from mypy_boto3_s3 import S3Client
 from mypy_boto3_s3.type_defs import (
@@ -23,12 +27,9 @@
 
 metadata_filename = Metadata.default_filename()
 
-# TODO: This would be better if it was available in aind-data-schema
-core_schema_file_names = [
-    s.default_filename()
-    for s in SchemaWriter.get_schemas()
-    if s.default_filename() != metadata_filename
-]
+core_schema_file_names = {
+    field_name: f"{field_name}.json" for field_name in CORE_SCHEMAS
+}
 
 
 def create_object_key(prefix: str, filename: str) -> str:
@@ -328,11 +329,10 @@
         Bucket=bucket, Prefix=copy_prefix, Delimiter="/"
     )
     if "Contents" in response:
-        core_schemas = [s.replace(".json", "") for s in core_schema_file_names]
         pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
         for obj in response["Contents"]:
            m = re.match(pattern, obj["Key"])
-            if m is not None and m.group(1) in core_schemas:
+            if m is not None and m.group(1) in CORE_SCHEMAS:
                 return True
     return False
@@ -359,8 +359,8 @@ def list_metadata_copies(
     Returns
     -------
     List[str]
-        A list of the core schemas in the copy_subdir without timestamp, e..g,
-        ["subject.json", "procedures.json", "processing.json"]
+        A list of the core schemas in the copy_subdir without timestamp, e.g.,
+        ["subject", "procedures", "processing"]
     """
     # Use trailing slash and delimiter to get top-level objects in copy_subdir
     copy_prefix = create_object_key(prefix, copy_subdir.strip("/") + "/")
@@ -369,12 +369,11 @@
     )
     files = []
     if "Contents" in response:
-        core_schemas = [s.replace(".json", "") for s in core_schema_file_names]
         pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
         for obj in response["Contents"]:
             m = re.match(pattern, obj["Key"])
-            if m is not None and m.group(1) in core_schemas:
-                files.append(f"{m.group(1)}.json")
+            if m is not None and m.group(1) in CORE_SCHEMAS:
+                files.append(m.group(1))
     return files
@@ -440,12 +439,10 @@
         ...
     }
     """
-    key_map = dict(
-        [
-            (create_object_key(prefix=prefix, filename=s), s)
-            for s in core_schema_file_names
-        ]
-    )
+    key_map = {
+        create_object_key(prefix=prefix, filename=file_name): file_name
+        for file_name in core_schema_file_names.values()
+    }
     file_info = get_dict_of_file_info(
         s3_client=s3_client, bucket=bucket, keys=list(key_map.keys())
     )
@@ -491,32 +488,6 @@ def iterate_through_top_level(
     ]
 
 
-def is_dict_corrupt(input_dict: dict) -> bool:
-    """
-    Checks that all the keys, included nested keys, don't contain '$' or '.'
-
-    Parameters
-    ----------
-    input_dict : dict
-
-    Returns
-    -------
-    bool
-        True if input_dict is not a dict, or if nested dictionary keys contain
-        forbidden characters. False otherwise.
- - """ - if not isinstance(input_dict, dict): - return True - for key, value in input_dict.items(): - if "$" in key or "." in key: - return True - elif isinstance(value, dict): - if is_dict_corrupt(value): - return True - return False - - def download_json_file_from_s3( s3_client: S3Client, bucket: str, object_key: str ) -> Optional[dict]: @@ -578,44 +549,34 @@ def build_metadata_record_from_prefix( there are issues with Metadata construction. """ - file_keys = [ - create_object_key(prefix=prefix, filename=file_name) - for file_name in core_schema_file_names - ] - s3_file_responses = get_dict_of_file_info( - s3_client=s3_client, bucket=bucket, keys=file_keys + core_files_infos = get_dict_of_core_schema_file_info( + s3_client=s3_client, bucket=bucket, prefix=prefix ) record_name = prefix.strip("/") if optional_name is None else optional_name try: - metadata_dict = { - "name": record_name, - "location": get_s3_location(bucket=bucket, prefix=prefix), - } - if optional_created is not None: - metadata_dict["created"] = optional_created - if optional_external_links is not None: - metadata_dict["external_links"] = optional_external_links - for object_key, response_data in s3_file_responses.items(): + core_jsons = dict() + for field_name, file_name in core_schema_file_names.items(): + response_data = core_files_infos.get(file_name) if response_data is not None: - field_name = object_key.split("/")[-1].replace(".json", "") + object_key = create_object_key(prefix, file_name) json_contents = download_json_file_from_s3( s3_client=s3_client, bucket=bucket, object_key=object_key ) if json_contents is not None: - is_corrupt = is_dict_corrupt(input_dict=json_contents) - if not is_corrupt: - metadata_dict[field_name] = json_contents - # TODO: We should handle constructing the Metadata file in a better way - # in aind-data-schema. By using model_validate, a lot of info from the - # original files get removed. For now, we can use model_construct - # until a better method is implemented in aind-data-schema. 
This will - # mark all the initial files as metadata_status=Unknown - metadata_dict = Metadata.model_construct( - **metadata_dict - ).model_dump_json(warnings=False, by_alias=True) + core_jsons[field_name] = json_contents + # Construct Metadata file using core schema jsons + # Validation and de/serialization are handled in aind-data-schema + metadata_dict = create_metadata_json( + name=record_name, + location=get_s3_location(bucket=bucket, prefix=prefix), + core_jsons=core_jsons, + optional_created=optional_created, + optional_external_links=optional_external_links, + ) + metadata_str = json.dumps(metadata_dict) except Exception: - metadata_dict = None - return metadata_dict + metadata_str = None + return metadata_str def cond_copy_then_sync_core_json_files( @@ -705,17 +666,13 @@ def copy_core_json_files( """ tgt_copy_subdir = copy_original_md_subdir.strip("/") tgt_copy_prefix = create_object_key(prefix, tgt_copy_subdir) - core_files_keys = [ - create_object_key(prefix=prefix, filename=s) - for s in core_schema_file_names - ] - core_files_infos = get_dict_of_file_info( - s3_client=s3_client, bucket=bucket, keys=core_files_keys + core_files_infos = get_dict_of_core_schema_file_info( + s3_client=s3_client, bucket=bucket, prefix=prefix ) - for file_name in core_schema_file_names: + for file_name in core_schema_file_names.values(): source = create_object_key(prefix, file_name) source_location = get_s3_location(bucket=bucket, prefix=source) - source_file_info = core_files_infos[source] + source_file_info = core_files_infos[file_name] if source_file_info is not None: date_stamp = source_file_info["last_modified"].strftime("%Y%m%d") target = create_object_key( @@ -766,16 +723,11 @@ def sync_core_json_files( None """ md_record_json = json.loads(metadata_json) - core_files_keys = [ - create_object_key(prefix=prefix, filename=s) - for s in core_schema_file_names - ] - core_files_infos = get_dict_of_file_info( - s3_client=s3_client, bucket=bucket, keys=core_files_keys + core_files_infos = get_dict_of_core_schema_file_info( + s3_client=s3_client, bucket=bucket, prefix=prefix ) - for file_name in core_schema_file_names: + for field_name, file_name in core_schema_file_names.items(): object_key = create_object_key(prefix, file_name) - field_name = file_name.replace(".json", "") location = get_s3_location(bucket=bucket, prefix=object_key) if ( field_name in md_record_json @@ -785,7 +737,7 @@ def sync_core_json_files( field_contents_str = json.dumps(field_contents) # Core schema jsons are created if they don't already exist. # Otherwise, they are only updated if their contents are outdated. - if core_files_infos[object_key] is None: + if core_files_infos[file_name] is None: logging.info(f"Uploading new {field_name} to {location}") response = upload_json_str_to_s3( bucket=bucket, @@ -795,7 +747,7 @@ def sync_core_json_files( ) logging.debug(response) else: - s3_object_hash = core_files_infos[object_key]["e_tag"].strip( + s3_object_hash = core_files_infos[file_name]["e_tag"].strip( '"' ) core_field_md5_hash = compute_md5_hash(field_contents_str) @@ -818,7 +770,7 @@ def sync_core_json_files( else: # If a core field is None but the core json exists, # delete the core json. - if core_files_infos[object_key] is not None: + if core_files_infos[file_name] is not None: logging.info( f"{field_name} not found in metadata.nd.json for " f"{prefix} but {location} exists! Deleting." 
diff --git a/tests/resources/utils/example_metadata.nd.json b/tests/resources/utils/example_metadata.nd.json
index 96d9124..fa1a36f 100644
--- a/tests/resources/utils/example_metadata.nd.json
+++ b/tests/resources/utils/example_metadata.nd.json
@@ -4,114 +4,139 @@
    "created": "2024-05-13T22:01:56.035469",
    "data_description": null,
    "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py",
-   "external_links": [],
+   "external_links": {},
    "instrument": null,
    "last_modified": "2024-05-13T22:01:56.035475",
    "location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_642478_2023-01-17_13-56-29",
-   "metadata_status": "Unknown",
+   "metadata_status": "Valid",
    "name": "ecephys_642478_2023-01-17_13-56-29",
    "procedures": null,
    "processing": {
-      "data_processes": [
-         {
-            "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer",
-            "end_date_time": "2023-01-20T19:13:36.434644+00:00",
-            "input_location": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29",
-            "name": "Ephys preprocessing",
-            "notes": null,
-            "output_location": "s3://aind-ephys-data/ecephys_642478_2023-01-17_13-56-29",
-            "parameters": {
-               "aws_secret_names": {
-                  "code_ocean_api_token_name": "codeocean-api-token",
-                  "region": "us-west-2",
-                  "video_encryption_password": "video_encryption_password"
-               },
-               "clip_data_job": {
-                  "clip_kwargs": {}
-               },
-               "compress_data_job": {
-                  "compressor": {
-                     "compressor_name": "wavpack",
-                     "kwargs": {
-                        "level": 3
+      "analyses": [],
+      "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/processing.py",
+      "notes": null,
+      "processing_pipeline": {
+         "data_processes": [
+            {
+               "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer",
+               "code_version": null,
+               "end_date_time": "2023-01-20T19:13:36.434644Z",
+               "input_location": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29",
+               "name": "Ephys preprocessing",
+               "notes": null,
+               "output_location": "s3://aind-ephys-data/ecephys_642478_2023-01-17_13-56-29",
+               "outputs": {},
+               "parameters": {
+                  "aws_secret_names": {
+                     "code_ocean_api_token_name": "codeocean-api-token",
+                     "region": "us-west-2",
+                     "video_encryption_password": "video_encryption_password"
+                  },
+                  "clip_data_job": {
+                     "clip_kwargs": {}
+                  },
+                  "compress_data_job": {
+                     "compressor": {
+                        "compressor_name": "wavpack",
+                        "kwargs": {
+                           "level": 3
+                        }
+                     },
+                     "format_kwargs": {},
+                     "scale_params": {},
+                     "write_kwargs": {
+                        "chunk_duration": "1s",
+                        "n_jobs": 24,
+                        "progress_bar": true
                      }
                   },
-                  "format_kwargs": {},
-                  "scale_params": {},
-                  "write_kwargs": {
-                     "chunk_duration": "1s",
-                     "n_jobs": 24,
-                     "progress_bar": true
+                  "data": {
+                     "name": "openephys"
+                  },
+                  "endpoints": {
+                     "code_repo_location": "https://github.com/AllenNeuralDynamics/aind-data-transfer",
+                     "codeocean_domain": "https://codeocean.allenneuraldynamics.org",
+                     "dest_data_dir": "ecephys_642478_2023-01-17_13-56-29",
+                     "gcp_prefix": "ecephys_642478_2023-01-17_13-56-29",
+                     "metadata_service_url": "http://aind-metadata-service",
+                     "raw_data_dir": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29",
+                     "s3_bucket": "aind-ephys-data",
+                     "s3_prefix": "ecephys_642478_2023-01-17_13-56-29"
+                  },
+                  "jobs": {
+                     "attach_metadata": true,
+                     "clip": true,
+                     "compress": true,
+                     "trigger_codeocean_job": true,
+                     "upload_to_gcp": false,
+                     "upload_to_s3": true
+                  },
+                  "logging": {
+                     "level": "INFO"
+                  },
"trigger_codeocean_job": { + "bucket": "aind-ephys-data", + "capsule_id": "648473aa-791e-4372-bd25-205cc587ec56", + "job_type": "openephys", + "prefix": "ecephys_642478_2023-01-17_13-56-29" + }, + "upload_data_job": { + "dryrun": false } }, - "data": { - "name": "openephys" - }, - "endpoints": { - "code_repo_location": "https://github.com/AllenNeuralDynamics/aind-data-transfer", - "codeocean_domain": "https://codeocean.allenneuraldynamics.org", - "dest_data_dir": "ecephys_642478_2023-01-17_13-56-29", - "gcp_prefix": "ecephys_642478_2023-01-17_13-56-29", - "metadata_service_url": "http://aind-metadata-service", - "raw_data_dir": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29", - "s3_bucket": "aind-ephys-data", - "s3_prefix": "ecephys_642478_2023-01-17_13-56-29" - }, - "jobs": { - "attach_metadata": true, - "clip": true, - "compress": true, - "trigger_codeocean_job": true, - "upload_to_gcp": false, - "upload_to_s3": true - }, - "logging": { - "level": "INFO" - }, - "trigger_codeocean_job": { - "bucket": "aind-ephys-data", - "capsule_id": "648473aa-791e-4372-bd25-205cc587ec56", - "job_type": "openephys", - "prefix": "ecephys_642478_2023-01-17_13-56-29" - }, - "upload_data_job": { - "dryrun": false - } - }, - "start_date_time": "2023-01-20T19:06:02.945386+00:00", - "version": "0.2.9" - } - ], - "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/processing.py", - "pipeline_url": null, - "pipeline_version": null, - "schema_version": "0.1.0" + "resources": null, + "software_version": "0.2.9", + "start_date_time": "2023-01-20T19:06:02.945386Z" + } + ], + "note": null, + "pipeline_url": null, + "pipeline_version": null, + "processor_full_name": "service" + }, + "schema_version": "1.1.3" }, "quality_control": null, "rig": null, - "schema_version": "1.0.0", + "schema_version": "1.1.1", "session": null, "subject": { + "alleles": [], "background_strain": null, - "breeding_group": "Chat-IRES-Cre-neo", + "breeding_info": { + "breeding_group": "Chat-IRES-Cre-neo", + "maternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "maternal_id": "624133", + "paternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "paternal_id": "624115" + }, "date_of_birth": "2022-07-16", - "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/site-packages/aind_data_schema/subject.py", + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/subject.py", "genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", - "home_cage_enrichment": null, - "light_cycle": null, - "maternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", - "maternal_id": "624133", - "mgi_allele_ids": null, + "housing": null, "notes": null, - "paternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", - "paternal_id": "624115", "restrictions": null, "rrid": null, - "schema_version": "0.2.2", + "schema_version": "1.0.0", "sex": "Male", - "source": null, - "species": "Mus musculus", + "source": { + "abbreviation": "AI", + "name": "Allen Institute", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "03cpe7c52" + }, + "species": { + "name": "Mus musculus", + "registry": { + "abbreviation": "NCBI", + "name": "National Center for Biotechnology Information" + }, + "registry_identifier": "NCBI:txid10090" + }, "subject_id": "642478", - "wellness_reports": null + "wellness_reports": [] } } \ No newline 
at end of file diff --git a/tests/resources/utils/example_metadata1.nd.json b/tests/resources/utils/example_metadata1.nd.json index 798da14..b9588de 100644 --- a/tests/resources/utils/example_metadata1.nd.json +++ b/tests/resources/utils/example_metadata1.nd.json @@ -52,7 +52,7 @@ "subject_id": "567890" }, "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py", - "external_links": [], + "external_links": {}, "instrument": null, "last_modified": "2024-05-15T17:41:26.697535", "location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_567890_2000-01-01_04-00-00", @@ -97,7 +97,7 @@ }, "quality_control": null, "rig": null, - "schema_version": "1.0.0", + "schema_version": "1.1.1", "session": null, "subject": { "background_strain": null, diff --git a/tests/resources/utils/example_metadata2.nd.json b/tests/resources/utils/example_metadata2.nd.json index 6f7b08f..e03a770 100644 --- a/tests/resources/utils/example_metadata2.nd.json +++ b/tests/resources/utils/example_metadata2.nd.json @@ -52,7 +52,7 @@ "subject_id": "655019" }, "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py", - "external_links": [], + "external_links": {}, "instrument": null, "last_modified": "2024-05-15T17:41:30.131278", "location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_655019_2000-01-01_01-01-02", @@ -175,7 +175,7 @@ }, "quality_control": null, "rig": null, - "schema_version": "1.0.0", + "schema_version": "1.1.1", "session": null, "subject": { "background_strain": null, diff --git a/tests/resources/utils/processing.json b/tests/resources/utils/processing.json index dc67940..319fe2c 100644 --- a/tests/resources/utils/processing.json +++ b/tests/resources/utils/processing.json @@ -1,76 +1,85 @@ { - "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/processing.py", - "schema_version": "0.1.0", - "pipeline_version": null, - "pipeline_url": null, - "data_processes": [ - { + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/processing.py", + "schema_version": "1.1.3", + "processing_pipeline": { + "data_processes": [ + { "name": "Ephys preprocessing", - "version": "0.2.9", - "start_date_time": "2023-01-20T19:06:02.945386+00:00", - "end_date_time": "2023-01-20T19:13:36.434644+00:00", + "software_version": "0.2.9", + "start_date_time": "2023-01-20T19:06:02.945386Z", + "end_date_time": "2023-01-20T19:13:36.434644Z", "input_location": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29", "output_location": "s3://aind-ephys-data/ecephys_642478_2023-01-17_13-56-29", "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer", + "code_version": null, "parameters": { - "endpoints": { - "raw_data_dir": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29", - "dest_data_dir": "ecephys_642478_2023-01-17_13-56-29", - "s3_bucket": "aind-ephys-data", - "s3_prefix": "ecephys_642478_2023-01-17_13-56-29", - "gcp_prefix": "ecephys_642478_2023-01-17_13-56-29", - "codeocean_domain": "https://codeocean.allenneuraldynamics.org", - "code_repo_location": "https://github.com/AllenNeuralDynamics/aind-data-transfer", - "metadata_service_url": "http://aind-metadata-service" - }, - "aws_secret_names": { - "region": "us-west-2", - "video_encryption_password": "video_encryption_password", - 
"code_ocean_api_token_name": "codeocean-api-token" - }, - "jobs": { - "clip": true, - "compress": true, - "attach_metadata": true, - "upload_to_s3": true, - "upload_to_gcp": false, - "trigger_codeocean_job": true - }, - "data": { - "name": "openephys" - }, - "clip_data_job": { - "clip_kwargs": {} - }, - "compress_data_job": { - "write_kwargs": { - "n_jobs": 24, - "chunk_duration": "1s", - "progress_bar": true - }, - "format_kwargs": {}, - "compressor": { - "compressor_name": "wavpack", - "kwargs": { - "level": 3 - } - }, - "scale_params": {} - }, - "upload_data_job": { - "dryrun": false - }, - "trigger_codeocean_job": { - "capsule_id": "648473aa-791e-4372-bd25-205cc587ec56", - "job_type": "openephys", - "bucket": "aind-ephys-data", - "prefix": "ecephys_642478_2023-01-17_13-56-29" - }, - "logging": { - "level": "INFO" - } + "endpoints": { + "raw_data_dir": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29", + "dest_data_dir": "ecephys_642478_2023-01-17_13-56-29", + "s3_bucket": "aind-ephys-data", + "s3_prefix": "ecephys_642478_2023-01-17_13-56-29", + "gcp_prefix": "ecephys_642478_2023-01-17_13-56-29", + "codeocean_domain": "https://codeocean.allenneuraldynamics.org", + "code_repo_location": "https://github.com/AllenNeuralDynamics/aind-data-transfer", + "metadata_service_url": "http://aind-metadata-service" + }, + "aws_secret_names": { + "region": "us-west-2", + "video_encryption_password": "video_encryption_password", + "code_ocean_api_token_name": "codeocean-api-token" + }, + "jobs": { + "clip": true, + "compress": true, + "attach_metadata": true, + "upload_to_s3": true, + "upload_to_gcp": false, + "trigger_codeocean_job": true + }, + "data": { + "name": "openephys" + }, + "clip_data_job": { + "clip_kwargs": {} + }, + "compress_data_job": { + "write_kwargs": { + "n_jobs": 24, + "chunk_duration": "1s", + "progress_bar": true + }, + "format_kwargs": {}, + "compressor": { + "compressor_name": "wavpack", + "kwargs": { + "level": 3 + } + }, + "scale_params": {} + }, + "upload_data_job": { + "dryrun": false + }, + "trigger_codeocean_job": { + "capsule_id": "648473aa-791e-4372-bd25-205cc587ec56", + "job_type": "openephys", + "bucket": "aind-ephys-data", + "prefix": "ecephys_642478_2023-01-17_13-56-29" + }, + "logging": { + "level": "INFO" + } }, - "notes": null - } - ] + "outputs": {}, + "notes": null, + "resources": null + } + ], + "processor_full_name": "service", + "pipeline_version": null, + "pipeline_url": null, + "note": null + }, + "analyses": [], + "notes": null } \ No newline at end of file diff --git a/tests/resources/utils/subject.json b/tests/resources/utils/subject.json index af1e7a9..30bf2ce 100644 --- a/tests/resources/utils/subject.json +++ b/tests/resources/utils/subject.json @@ -1,23 +1,39 @@ { - "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/site-packages/aind_data_schema/subject.py", - "schema_version": "0.2.2", - "species": "Mus musculus", - "subject_id": "642478", - "sex": "Male", - "date_of_birth": "2022-07-16", - "genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", - "mgi_allele_ids": null, - "background_strain": null, - "source": null, - "rrid": null, - "restrictions": null, - "breeding_group": "Chat-IRES-Cre-neo", - "maternal_id": "624133", - "maternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", - "paternal_id": "624115", - "paternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", - "light_cycle": null, - "home_cage_enrichment": null, - "wellness_reports": null, - "notes": null + 
"describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/subject.py", + "schema_version": "1.0.0", + "subject_id": "642478", + "sex": "Male", + "date_of_birth": "2022-07-16", + "genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "species": { + "name": "Mus musculus", + "registry": { + "name": "National Center for Biotechnology Information", + "abbreviation": "NCBI" + }, + "registry_identifier": "NCBI:txid10090" + }, + "alleles": [], + "background_strain": null, + "breeding_info": { + "breeding_group": "Chat-IRES-Cre-neo", + "maternal_id": "624133", + "maternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "paternal_id": "624115", + "paternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo" + }, + "source": { + "name": "Allen Institute", + "abbreviation": "AI", + "registry": { + "name": "Research Organization Registry", + "abbreviation": "ROR" + }, + "registry_identifier": "03cpe7c52" + }, + "rrid": null, + "restrictions": null, + "wellness_reports": [], + "housing": null, + "notes": null } \ No newline at end of file diff --git a/tests/test_aind_bucket_indexer.py b/tests/test_aind_bucket_indexer.py index 2528a6d..631fd1e 100644 --- a/tests/test_aind_bucket_indexer.py +++ b/tests/test_aind_bucket_indexer.py @@ -70,7 +70,7 @@ def test_write_root_file_with_record_info_same_hash( "last_modified": datetime( 2024, 5, 15, 17, 41, 28, tzinfo=timezone.utc ), - "e_tag": '"275d922d2a1e547f2e0f35b5cc54f493"', + "e_tag": '"a0f1022e3b4a8bc60e63e3677171f784"', "version_id": "version_id", }, prefix="ecephys_642478_2023-01-17_13-56-29", @@ -201,7 +201,7 @@ def test_copy_file_from_root_to_subdir( ) @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one iteration def test_resolve_schema_information_case_1( self, @@ -229,7 +229,7 @@ def test_resolve_schema_information_case_1( prefix="ecephys_642478_2023-01-17_13-56-29", s3_client=mock_s3_client, core_schema_info_in_root=core_schema_info_in_root, - list_of_schemas_in_copy_subdir=["subject.json"], + list_of_schemas_in_copy_subdir=["subject"], docdb_record=self.example_md_record, ) self.assertEqual(dict(), docdb_fields_to_update) @@ -260,7 +260,7 @@ def test_resolve_schema_information_case_1( ) @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one iteration def test_resolve_schema_information_case_2( self, @@ -326,7 +326,7 @@ def test_resolve_schema_information_case_2( ) @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one iteration def test_resolve_schema_information_case_3( self, @@ -346,7 +346,7 @@ def test_resolve_schema_information_case_3( prefix="ecephys_642478_2023-01-17_13-56-29", s3_client=mock_s3_client, core_schema_info_in_root=core_schema_info_in_root, - list_of_schemas_in_copy_subdir=["subject.json"], + list_of_schemas_in_copy_subdir=["subject"], docdb_record=self.example_md_record, ) self.assertEqual(dict(), docdb_fields_to_update) @@ -378,7 +378,7 @@ def test_resolve_schema_information_case_3( @patch("aind_data_asset_indexer.aind_bucket_indexer.get_dict_of_file_info") @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one 
iteration def test_resolve_schema_information_case_4( self, @@ -454,7 +454,7 @@ def test_resolve_schema_information_case_4( ) @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one iteration def test_resolve_schema_information_case_5( self, @@ -485,7 +485,7 @@ def test_resolve_schema_information_case_5( prefix="ecephys_642478_2023-01-17_13-56-29", s3_client=mock_s3_client, core_schema_info_in_root=core_schema_info_in_root, - list_of_schemas_in_copy_subdir=["subject.json"], + list_of_schemas_in_copy_subdir=["subject"], docdb_record=dict(), ) ) @@ -516,7 +516,7 @@ def test_resolve_schema_information_case_5( ) @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one iteration def test_resolve_schema_information_case_6( self, @@ -579,7 +579,7 @@ def test_resolve_schema_information_case_6( ) @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one iteration def test_resolve_schema_information_case_6_corrupt_download( self, @@ -652,7 +652,7 @@ def test_resolve_schema_information_case_6_corrupt_download( ) @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one iteration def test_resolve_schema_information_case_7( self, @@ -674,7 +674,7 @@ def test_resolve_schema_information_case_7( prefix="ecephys_642478_2023-01-17_13-56-29", s3_client=mock_s3_client, core_schema_info_in_root=core_schema_info_in_root, - list_of_schemas_in_copy_subdir=["subject.json"], + list_of_schemas_in_copy_subdir=["subject"], docdb_record=dict(), ) ) @@ -704,7 +704,7 @@ def test_resolve_schema_information_case_7( ) @patch( "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names", - ["subject.json"], + {"subject": "subject.json"}, ) # Mocking this to limit for loop to one iteration def test_resolve_schema_information_case_8( self, @@ -1315,6 +1315,62 @@ def test_process_prefix_no_record_yes_file_good_file( ) mock_upload_metadata_json_str_to_s3.assert_not_called() + @patch( + "aind_data_asset_indexer.aind_bucket_indexer." + "upload_metadata_json_str_to_s3" + ) + @patch( + "aind_data_asset_indexer.aind_bucket_indexer." + "cond_copy_then_sync_core_json_files" + ) + @patch("aind_data_asset_indexer.aind_bucket_indexer.is_dict_corrupt") + @patch( + "aind_data_asset_indexer.aind_bucket_indexer." 
+ "download_json_file_from_s3" + ) + @patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist") + @patch("aind_data_asset_indexer.aind_bucket_indexer.MongoClient") + @patch("boto3.client") + def test_process_prefix_no_record_yes_file_corrupt_file( + self, + mock_s3_client: MagicMock, + mock_docdb_client: MagicMock, + mock_does_s3_object_exist: MagicMock, + mock_download_json_file_from_s3: MagicMock, + mock_is_dict_corrupt: MagicMock, + mock_cond_copy_then_sync_core_json_files: MagicMock, + mock_upload_metadata_json_str_to_s3: MagicMock, + ): + """Tests _process_prefix method when there is no record in DocDb, + there is and there is metadata.nd.json file in S3, and the file can + be serialized to json, but its contents are corrupt.""" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + + mock_does_s3_object_exist.return_value = True + mocked_downloaded_record = deepcopy(self.example_md_record) + mock_download_json_file_from_s3.return_value = mocked_downloaded_record + mock_is_dict_corrupt.return_value = True + + location_to_id_map = dict() + with self.assertLogs(level="DEBUG") as captured: + self.basic_job._process_prefix( + s3_prefix="ecephys_642478_2023-01-17_13-56-29", + docdb_client=mock_docdb_client, + s3_client=mock_s3_client, + location_to_id_map=location_to_id_map, + ) + expected_log_messages = [ + "WARNING:root:Metadata record for s3://aind-ephys-data-dev-u5u0i5/" + "ecephys_642478_2023-01-17_13-56-29 is corrupt!" + ] + self.assertEqual(expected_log_messages, captured.output) + mock_collection.assert_not_called() + mock_cond_copy_then_sync_core_json_files.assert_not_called() + mock_upload_metadata_json_str_to_s3.assert_not_called() + @patch( "aind_data_asset_indexer.aind_bucket_indexer." 
"upload_metadata_json_str_to_s3" diff --git a/tests/test_codeocean_bucket_indexer.py b/tests/test_codeocean_bucket_indexer.py index 1b7b944..5b2f2cc 100644 --- a/tests/test_codeocean_bucket_indexer.py +++ b/tests/test_codeocean_bucket_indexer.py @@ -73,16 +73,17 @@ def setUpClass(cls) -> None: }, }, ] + # corresponds to cls.example_codeocean_records[1] cls.example_dict_of_file_info = { - "ecephys_642478_2023-01-17_13-56-29/acquisition.json": None, - "ecephys_642478_2023-01-17_13-56-29/data_description.json": None, - "ecephys_642478_2023-01-17_13-56-29/instrument.json": None, - "ecephys_642478_2023-01-17_13-56-29/procedures.json": None, - "ecephys_642478_2023-01-17_13-56-29/processing.json": None, - "ecephys_642478_2023-01-17_13-56-29/quality_control.json": None, - "ecephys_642478_2023-01-17_13-56-29/rig.json": None, - "ecephys_642478_2023-01-17_13-56-29/session.json": None, - "ecephys_642478_2023-01-17_13-56-29/subject.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/acquisition.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/data_description.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/instrument.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/procedures.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/processing.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/quality_control.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/rig.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/session.json": None, + "666666cc-66cc-6c66-666c-6c66c6666666/subject.json": None, } cls.example_docdb_records = [ { @@ -358,7 +359,7 @@ def test_process_codeocean_record( ], ) - @patch("aind_data_schema.core.metadata.Metadata.model_construct") + @patch("aind_data_asset_indexer.utils.create_metadata_json") @patch("aind_data_asset_indexer.codeocean_bucket_indexer.MongoClient") @patch("boto3.client") @patch("aind_data_asset_indexer.utils.get_dict_of_file_info") @@ -369,7 +370,7 @@ def test_process_codeocean_record_warning( mock_get_dict_of_file_info: MagicMock, mock_s3_client: MagicMock, mock_docdb_client: MagicMock, - mock_model_construct: MagicMock, + mock_create_metadata_json: MagicMock, ): """Tests _process_codeocean_record when there is an issue building the record""" @@ -378,8 +379,10 @@ def test_process_codeocean_record_warning( self.example_dict_of_file_info ) - # Suppose there is an error using model_construct - mock_model_construct.side_effect = Exception("Something went wrong") + # Suppose there is an error creating metadata file + mock_create_metadata_json.side_effect = Exception( + "Something went wrong" + ) with self.assertLogs(level="DEBUG") as captured: self.basic_job._process_codeocean_record( diff --git a/tests/test_utils.py b/tests/test_utils.py index 3b34ad6..c2f41dd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -29,7 +29,6 @@ get_record_from_docdb, get_s3_bucket_and_prefix, get_s3_location, - is_dict_corrupt, is_prefix_valid, is_record_location_valid, iterate_through_top_level, @@ -145,23 +144,7 @@ def load_json_file(filename: str) -> dict: def test_compute_md5_hash(self): """Tests compute_md5_hash method""" md5_hash = compute_md5_hash(json.dumps(self.example_metadata_nd)) - self.assertEqual("275d922d2a1e547f2e0f35b5cc54f493", md5_hash) - - def test_is_dict_corrupt(self): - """Tests is_dict_corrupt method""" - good_contents = {"a": 1, "b": {"c": 2, "d": 3}} - bad_contents1 = {"a.1": 1, "b": {"c": 2, "d": 3}} - bad_contents2 = {"a": 1, "b": {"c": 2, "$d": 3}} - bad_contents3 = {"a": 1, "b": {"c": 2, "d": 3}, "$e": 4} - bad_contents4 = {"a": 1, 
"b": {"c": {"d": 3}, "$e": 4}} - bad_contents5 = [{"a": 1}, {"b": {"c": 2, "d": 3}}] - self.assertFalse(is_dict_corrupt(good_contents)) - self.assertTrue(is_dict_corrupt(bad_contents1)) - self.assertTrue(is_dict_corrupt(bad_contents2)) - self.assertTrue(is_dict_corrupt(bad_contents3)) - self.assertTrue(is_dict_corrupt(bad_contents4)) - # noinspection PyTypeChecker - self.assertTrue(is_dict_corrupt(bad_contents5)) + self.assertEqual("a0f1022e3b4a8bc60e63e3677171f784", md5_hash) def test_create_object_key(self): """Tests create_object_key""" @@ -480,7 +463,7 @@ def test_list_metadata_copies(self, mock_s3_client: MagicMock): copy_subdir="original_metadata", s3_client=mock_s3_client, ) - self.assertEqual(["data_description.json", "subject.json"], contents) + self.assertEqual(["data_description", "subject"], contents) @patch("boto3.client") def test_does_s3_metadata_copy_exist_none(self, mock_s3_client: MagicMock): @@ -717,17 +700,19 @@ def test_build_metadata_record_from_prefix( }, } mock_download_json_file.side_effect = [ - self.example_processing, self.example_subject, + self.example_processing, ] - # noinspection PyTypeChecker - md = json.loads( - build_metadata_record_from_prefix( - bucket="aind-ephys-data-dev-u5u0i5", - prefix="ecephys_642478_2023-01-17_13-56-29", - s3_client=mock_s3_client, + # there are some userwarnings when creating Subject from json + with self.assertWarns(UserWarning): + # noinspection PyTypeChecker + md = json.loads( + build_metadata_record_from_prefix( + bucket="aind-ephys-data-dev-u5u0i5", + prefix="ecephys_642478_2023-01-17_13-56-29", + s3_client=mock_s3_client, + ) ) - ) mock_get_dict_of_file_info.assert_called_once() mock_download_json_file.assert_has_calls( [ @@ -735,14 +720,14 @@ def test_build_metadata_record_from_prefix( s3_client=mock_s3_client, bucket="aind-ephys-data-dev-u5u0i5", object_key=( - "ecephys_642478_2023-01-17_13-56-29/processing.json" + "ecephys_642478_2023-01-17_13-56-29/subject.json" ), ), call( s3_client=mock_s3_client, bucket="aind-ephys-data-dev-u5u0i5", object_key=( - "ecephys_642478_2023-01-17_13-56-29/subject.json" + "ecephys_642478_2023-01-17_13-56-29/processing.json" ), ), ] @@ -765,15 +750,15 @@ def test_build_metadata_record_from_prefix_with_optional_fields( """Tests build_metadata_record_from_prefix method when 'created' and 'external_links' are provided""" mock_get_dict_of_file_info.return_value = { - "ecephys_642478_2023-01-17_13-56-29/acquisition.json": None, - "ecephys_642478_2023-01-17_13-56-29/data_description.json": None, - "ecephys_642478_2023-01-17_13-56-29/instrument.json": None, - "ecephys_642478_2023-01-17_13-56-29/procedures.json": None, - "ecephys_642478_2023-01-17_13-56-29/processing.json": None, - "ecephys_642478_2023-01-17_13-56-29/quality_control.json": None, - "ecephys_642478_2023-01-17_13-56-29/rig.json": None, - "ecephys_642478_2023-01-17_13-56-29/session.json": None, - "ecephys_642478_2023-01-17_13-56-29/subject.json": None, + "abc-123/acquisition.json": None, + "abc-123/data_description.json": None, + "abc-123/instrument.json": None, + "abc-123/procedures.json": None, + "abc-123/processing.json": None, + "abc-123/quality_control.json": None, + "abc-123/rig.json": None, + "abc-123/session.json": None, + "abc-123/subject.json": None, } # noinspection PyTypeChecker md = json.loads( @@ -793,7 +778,7 @@ def test_build_metadata_record_from_prefix_with_optional_fields( self.assertEqual("2020-01-02T03:04:05", md["created"]) self.assertEqual({"Code Ocean": ["123-456"]}, md["external_links"]) - 
@patch("aind_data_asset_indexer.utils.Metadata.model_construct") + @patch("aind_data_asset_indexer.utils.create_metadata_json") @patch("boto3.client") @patch("aind_data_asset_indexer.utils.get_dict_of_file_info") @patch("aind_data_asset_indexer.utils.download_json_file_from_s3") @@ -802,7 +787,7 @@ def test_build_metadata_record_from_prefix_error( mock_download_json_file: MagicMock, mock_get_dict_of_file_info: MagicMock, mock_s3_client: MagicMock, - mock_metadata_model_construct: MagicMock, + mock_create_metadata_json: MagicMock, ): """Tests build_metadata_record_from_prefix method when there is an error when creating the metadata record""" @@ -830,10 +815,10 @@ def test_build_metadata_record_from_prefix_error( }, } mock_download_json_file.side_effect = [ - self.example_processing, self.example_subject, + self.example_processing, ] - mock_metadata_model_construct.side_effect = ValueError( + mock_create_metadata_json.side_effect = ValueError( "Error creating metadata record" ) # noinspection PyTypeChecker @@ -849,14 +834,14 @@ def test_build_metadata_record_from_prefix_error( s3_client=mock_s3_client, bucket="aind-ephys-data-dev-u5u0i5", object_key=( - "ecephys_642478_2023-01-17_13-56-29/processing.json" + "ecephys_642478_2023-01-17_13-56-29/subject.json" ), ), call( s3_client=mock_s3_client, bucket="aind-ephys-data-dev-u5u0i5", object_key=( - "ecephys_642478_2023-01-17_13-56-29/subject.json" + "ecephys_642478_2023-01-17_13-56-29/processing.json" ), ), ] @@ -955,7 +940,7 @@ def test_sync_core_json_files( f"INFO:root:subject is up-to-date in " f"{s3_loc}/subject.json. Skipping.", ] - self.assertEqual(expected_log_messages, captured.output) + self.assertCountEqual(expected_log_messages, captured.output) mock_get_dict_of_file_info.assert_called_once() # assert that only new or updated core jsons were uploaded to s3 mock_upload_core_record.assert_has_calls( @@ -1057,7 +1042,7 @@ def test_cond_copy_then_sync_core_json_files( f"s3://{bucket}/{pfx}/subject.json", "DEBUG:root:Some Message", ] - self.assertEqual(expected_output_messages, captured.output) + self.assertCountEqual(expected_output_messages, captured.output) # assert that an existing /original_metadata folder was detected mock_does_s3_metadata_copy_exist.assert_called_once_with( bucket=bucket, @@ -1084,7 +1069,8 @@ def test_cond_copy_then_sync_core_json_files( json_str=json.dumps(self.example_metadata_nd["subject"]), s3_client=mock_s3_client, ), - ] + ], + any_order=True, ) mock_s3_client.delete_object.assert_not_called() @@ -1200,7 +1186,7 @@ def test_cond_copy_then_sync_core_json_files_mismatch( f"s3://{bucket}/{pfx}/subject.json", "DEBUG:root:Uploaded json", ] - self.assertEqual(expected_log_messages, captured.output) + self.assertCountEqual(expected_log_messages, captured.output) # assert that the original core jsons were copied, including # corrupt rig.json mock_s3_client.copy_object.assert_has_calls( @@ -1229,7 +1215,8 @@ def test_cond_copy_then_sync_core_json_files_mismatch( }, Key=f"{pfx}/original_metadata/subject.20240202.json", ), - ] + ], + any_order=True, ) # assert that only valid core jsons were overwritten mock_upload_core_record.assert_has_calls( @@ -1248,7 +1235,8 @@ def test_cond_copy_then_sync_core_json_files_mismatch( json_str=json.dumps(self.example_metadata_nd["subject"]), s3_client=mock_s3_client, ), - ] + ], + any_order=True, ) # assert the corrupt core json was deleted mock_s3_client.delete_object.assert_called_once_with(