Commit
Merge pull request #115 from AllenNeuralDynamics/release-v0.14.0
Release v0.14.0
jtyoung84 authored Jan 16, 2025
2 parents 8dc615c + 77c1c2e commit 9570f4e
Showing 15 changed files with 209 additions and 187 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish_dev.yml
@@ -8,7 +8,7 @@ jobs:
publish:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v2
2 changes: 1 addition & 1 deletion .github/workflows/publish_main.yml
@@ -11,7 +11,7 @@ jobs:
outputs:
pkg_version: ${{ steps.output_version.outputs.pkg_version }}
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Get version from file
run: |
pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')
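For context, the version-check steps in both publish and test workflows rely on the grep pipeline shown above to pull the package attribute out of pyproject.toml. A quick sketch of what it yields, assuming the dynamic-version line in this repository's pyproject.toml looks like the one in the comment:

```sh
# Assuming pyproject.toml declares a dynamic version like:
#   version = {attr = "aind_data_asset_indexer.__version__"}
pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')
echo "$pkg_name"  # prints: aind_data_asset_indexer.__version__
```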
6 changes: 3 additions & 3 deletions .github/workflows/run_dev_tests.yml
@@ -10,11 +10,11 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.8', '3.9', '3.10' ]
python-version: [ '3.9', '3.10', '3.11' ]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
8 changes: 4 additions & 4 deletions .github/workflows/run_main_tests.yml
@@ -11,11 +11,11 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.8', '3.9', '3.10' ]
python-version: [ '3.9', '3.10', '3.11' ]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -28,7 +28,7 @@ jobs:
verify_version:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Check version incremented
run: |
pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')
2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ kept in sync:
1. **S3 buckets** store raw metadata files, including the ``metadata.nd.json``.
2. A **document database (DocDB)** contains unstructured json
documents describing the ``metadata.nd.json`` for a data asset.
3. **Code Ocean**: data assets are mounted as CodeOcean data asssets.
3. **Code Ocean**: data assets are mounted as CodeOcean data assets.
Processed results are also stored in an internal Code Ocean bucket.

We have automated jobs to keep changes in DocDB and S3 in sync.
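To make the sync model in this README passage concrete, here is a minimal, hypothetical sketch of looking up a DocDB record by its S3 location with pymongo (a dependency pinned in pyproject.toml later in this diff). The host, database, collection, and location values are illustrative placeholders, not the service's actual configuration:

```python
from pymongo import MongoClient

# Hypothetical connection and names; real values come from job settings.
client = MongoClient("mongodb://example-docdb-host:27017")
collection = client["metadata_index"]["data_assets"]

# Records are keyed by the asset's S3 location.
record = collection.find_one(
    {"location": "s3://example-bucket/ecephys_123456_2024-01-01_00-00-00"}
)
if record is not None:
    print(record["_id"], record.get("last_modified"))
```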
1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -1,4 +1,5 @@
"""Configuration file for the Sphinx documentation builder."""

#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
name = "aind-data-asset-indexer"
description = "Service Capsule to write data asset metadata to document store"
license = {text = "MIT"}
requires-python = ">=3.8"
requires-python = ">=3.9"
authors = [
{name = "AIND"}
]
@@ -24,7 +24,7 @@ dependencies = [
"pymongo==4.3.3",
"dask==2023.5.0",
"aind-data-schema==1.2.0",
"aind-codeocean-api==0.5.0",
"codeocean==0.3.0",
]

[project.optional-dependencies]
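The dependency change above swaps the aind-codeocean-api wrapper for the codeocean SDK, which the indexer modules below now import. A minimal sketch of constructing the new client, matching the calls used later in this commit (domain and token are placeholders):

```python
from codeocean import CodeOcean

# Placeholder credentials; the service reads these from
# CodeOceanIndexBucketJobSettings.
co_client = CodeOcean(
    domain="https://codeocean.example.org",
    token="<api-token>",
)
```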
2 changes: 1 addition & 1 deletion src/aind_data_asset_indexer/__init__.py
@@ -1,3 +1,3 @@
"""Package"""

__version__ = "0.13.0"
__version__ = "0.14.0"
20 changes: 9 additions & 11 deletions src/aind_data_asset_indexer/aind_bucket_indexer.py
@@ -272,14 +272,12 @@ def _resolve_schema_information(
object_key = create_object_key(
prefix=prefix, filename=core_schema_file_name
)
common_kwargs[
"core_schema_info_in_root"
] = get_dict_of_file_info(
s3_client=s3_client,
bucket=self.job_settings.s3_bucket,
keys=[object_key],
).get(
object_key
common_kwargs["core_schema_info_in_root"] = (
get_dict_of_file_info(
s3_client=s3_client,
bucket=self.job_settings.s3_bucket,
keys=[object_key],
).get(object_key)
)
self._copy_file_from_root_to_subdir(**common_kwargs)
# If field is null, a file exists in the root folder, and
@@ -424,9 +422,9 @@ def _process_docdb_record(
)
db = docdb_client[self.job_settings.doc_db_db_name]
collection = db[self.job_settings.doc_db_collection_name]
fields_to_update[
"last_modified"
] = datetime.utcnow().isoformat()
fields_to_update["last_modified"] = (
datetime.utcnow().isoformat()
)
response = collection.update_one(
{"_id": docdb_record["_id"]},
{"$set": fields_to_update},
70 changes: 50 additions & 20 deletions src/aind_data_asset_indexer/codeocean_bucket_indexer.py
@@ -12,13 +12,13 @@

import boto3
import dask.bag as dask_bag
import requests
from aind_codeocean_api.codeocean import CodeOceanClient
from aind_data_schema.core.metadata import ExternalPlatforms
from codeocean import CodeOcean
from codeocean.data_asset import DataAssetSearchOrigin, DataAssetSearchParams
from mypy_boto3_s3 import S3Client
from pymongo import MongoClient
from pymongo.operations import UpdateOne
from requests.exceptions import ReadTimeout
from urllib3.util import Retry

from aind_data_asset_indexer.models import CodeOceanIndexBucketJobSettings
from aind_data_asset_indexer.utils import (
@@ -52,30 +52,51 @@ def __init__(self, job_settings: CodeOceanIndexBucketJobSettings):
"""Class constructor."""
self.job_settings = job_settings

def _get_external_data_asset_records(self) -> Optional[List[dict]]:
@staticmethod
def _get_external_data_asset_records(
co_client: CodeOcean,
) -> Optional[List[dict]]:
"""
Retrieves list of Code Ocean ids and locations for external data
assets.
Parameters
----------
co_client : CodeOcean
Returns
-------
List[dict] | None
List items have shape {"id": str, "location": str}. If an error
occurs, returns None.
"""
try:
response = requests.get(
self.job_settings.temp_codeocean_endpoint,
timeout=600,
search_params = DataAssetSearchParams(
archived=False,
origin=DataAssetSearchOrigin.External,
limit=1000,
)
if response.status_code == 200:
return response.json()
else:
return None
except ReadTimeout:
logging.error(
f"Read timed out at "
f"{self.job_settings.temp_codeocean_endpoint}"
data_assets = co_client.data_assets.search_data_assets_iterator(
search_params=search_params
)
external_records = []
for data_asset in data_assets:
data_asset_source = data_asset.source_bucket
if (
data_asset_source is not None
and data_asset_source.bucket is not None
and data_asset_source.prefix is not None
):
bucket = data_asset_source.bucket
prefix = data_asset_source.prefix
location = f"s3://{bucket}/{prefix}"
external_records.append(
{"id": data_asset.id, "location": location}
)
return external_records
except Exception as e:
logging.exception(e)
return None

@staticmethod
@@ -97,7 +118,7 @@ def _map_external_list_to_dict(external_recs: List[dict]) -> dict:
"""
new_records = dict()
for r in external_recs:
location = r.get("source")
location = r.get("location")
rec_id = r["id"]
if location is not None and new_records.get(location) is not None:
old_id_set = new_records.get(location)
@@ -140,7 +161,7 @@ def _get_co_links_from_record(
return external_links

def _update_external_links_in_docdb(
self, docdb_client: MongoClient
self, docdb_client: MongoClient, co_client: CodeOcean
) -> None:
"""
This method will:
@@ -159,7 +180,9 @@ def _update_external_links_in_docdb(
"""
# Should return a list like [{"id": co_id, "location": "s3://..."},]
list_of_co_ids_and_locations = self._get_external_data_asset_records()
list_of_co_ids_and_locations = self._get_external_data_asset_records(
co_client=co_client
)
db = docdb_client[self.job_settings.doc_db_db_name]
collection = db[self.job_settings.doc_db_collection_name]
if list_of_co_ids_and_locations is not None:
@@ -394,9 +417,16 @@ def _delete_records_from_docdb(self, record_list: List[str]):
def run_job(self):
"""Main method to run."""
logging.info("Starting to scan through CodeOcean.")
co_client = CodeOceanClient(
retry = Retry(
total=5,
backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504],
allowed_methods=["GET", "POST"],
)
co_client = CodeOcean(
domain=self.job_settings.codeocean_domain,
token=self.job_settings.codeocean_token.get_secret_value(),
retries=retry,
)
code_ocean_records = get_all_processed_codeocean_asset_records(
co_client=co_client,
@@ -416,7 +446,7 @@ def run_job(self):
# Use existing client to add external links to fields
logging.info("Adding links to records.")
self._update_external_links_in_docdb(
docdb_client=iterator_docdb_client
docdb_client=iterator_docdb_client, co_client=co_client
)
logging.info("Finished adding links to records")
all_docdb_records = dict()
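Taken together, the changes in this module replace the temporary analytics endpoint with a direct, retry-wrapped scan of Code Ocean's external data assets. A self-contained sketch of the new flow, mirroring _get_external_data_asset_records and run_job above (domain and token are placeholders):

```python
from codeocean import CodeOcean
from codeocean.data_asset import DataAssetSearchOrigin, DataAssetSearchParams
from urllib3.util import Retry

# Retry transient throttling/server errors, as configured in run_job.
retry = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET", "POST"],
)
co_client = CodeOcean(
    domain="https://codeocean.example.org",  # placeholder
    token="<api-token>",  # placeholder
    retries=retry,
)

# Page through non-archived external assets, 1000 per request.
search_params = DataAssetSearchParams(
    archived=False,
    origin=DataAssetSearchOrigin.External,
    limit=1000,
)
for data_asset in co_client.data_assets.search_data_assets_iterator(
    search_params=search_params
):
    source = data_asset.source_bucket
    if source is not None and source.bucket and source.prefix:
        print(data_asset.id, f"s3://{source.bucket}/{source.prefix}")
```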
9 changes: 5 additions & 4 deletions src/aind_data_asset_indexer/models.py
@@ -124,11 +124,12 @@ class CodeOceanIndexBucketJobSettings(IndexJobSettings):
doc_db_collection_name: str
codeocean_domain: str
codeocean_token: SecretStr
temp_codeocean_endpoint: str = Field(
temp_codeocean_endpoint: Optional[str] = Field(
default=None,
description=(
"Temp proxy to access code ocean information from their analytics "
"databases."
)
"(deprecated) Temp proxy to access code ocean information from "
"their analytics databases. Will be removed in a future release."
),
)

@classmethod
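Making the temp endpoint optional (and documenting it as deprecated) means new deployments can simply omit it. A rough sketch of instantiating the settings without the field; all values are placeholders, and fields inherited from IndexJobSettings that are not visible in this diff may also be required:

```python
from aind_data_asset_indexer.models import CodeOceanIndexBucketJobSettings

# Placeholder values; additional required fields inherited from
# IndexJobSettings are omitted here and may be needed in practice.
settings = CodeOceanIndexBucketJobSettings(
    doc_db_db_name="metadata_index",
    doc_db_collection_name="data_assets",
    codeocean_domain="https://codeocean.example.org",
    codeocean_token="<api-token>",
    # temp_codeocean_endpoint no longer needs to be set
)
```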
43 changes: 22 additions & 21 deletions src/aind_data_asset_indexer/utils.py
@@ -9,7 +9,6 @@
from typing import Dict, Iterator, List, Optional
from urllib.parse import urlparse

from aind_codeocean_api.codeocean import CodeOceanClient
from aind_data_schema.core.data_description import DataLevel, DataRegex
from aind_data_schema.core.metadata import CORE_FILES as CORE_SCHEMAS
from aind_data_schema.core.metadata import (
@@ -18,6 +17,12 @@
create_metadata_json,
)
from botocore.exceptions import ClientError
from codeocean import CodeOcean
from codeocean.data_asset import (
DataAssetSearchParams,
DataAssetState,
DataAssetType,
)
from mypy_boto3_s3 import S3Client
from mypy_boto3_s3.type_defs import (
PaginatorConfigTypeDef,
@@ -934,7 +939,7 @@ def build_docdb_location_to_id_map(


def get_all_processed_codeocean_asset_records(
co_client: CodeOceanClient, co_data_asset_bucket: str
co_client: CodeOcean, co_data_asset_bucket: str
) -> Dict[str, dict]:
"""
Gets all the data asset records we're interested in indexing. The location
@@ -943,7 +948,7 @@ def get_all_processed_codeocean_asset_records(
Parameters
----------
co_client : CodeOceanClient
co_client : CodeOcean
co_data_asset_bucket : str
Name of Code Ocean's data asset bucket
Returns
@@ -966,31 +971,27 @@
all_responses = dict()

for tag in {DataLevel.DERIVED.value, "processed"}:
response = co_client.search_all_data_assets(
type="result", query=f"tag:{tag}"
search_params = DataAssetSearchParams(
type=DataAssetType.Result, query=f"tag:{tag}"
)
iter_response = co_client.data_assets.search_data_assets_iterator(
search_params=search_params
)
# There is a bug with the codeocean api that caps the number of
# results in a single request to 10000.
if len(response.json()["results"]) >= 10000:
logging.warning(
"Number of records exceeds 10,000! This can lead to "
"possible data loss."
)
# Extract relevant information
extracted_info = dict()
for data_asset_info in response.json()["results"]:
data_asset_id = data_asset_info["id"]
data_asset_name = data_asset_info["name"]
created_timestamp = data_asset_info["created"]
for data_asset_info in iter_response:
data_asset_id = data_asset_info.id
data_asset_name = data_asset_info.name
created_timestamp = data_asset_info.created
created_datetime = datetime.fromtimestamp(
created_timestamp, tz=timezone.utc
)
# Results hosted externally have a source_bucket field
is_external = (
data_asset_info.get("sourceBucket") is not None
or data_asset_info.get("source_bucket") is not None
)
if not is_external and data_asset_info.get("state") == "ready":
is_external = data_asset_info.source_bucket is not None
if (
not is_external
and data_asset_info.state == DataAssetState.Ready
):
location = f"s3://{co_data_asset_bucket}/{data_asset_id}"
extracted_info[location] = {
"name": data_asset_name,
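The key behavioral change in this hunk is pagination: search_data_assets_iterator pages through results, so the old warning about the 10,000-record cap on a single search request is no longer needed. A minimal sketch of the tag query as used above, assuming a configured client with placeholder credentials:

```python
from codeocean import CodeOcean
from codeocean.data_asset import (
    DataAssetSearchParams,
    DataAssetState,
    DataAssetType,
)

co_client = CodeOcean(
    domain="https://codeocean.example.org",  # placeholder
    token="<api-token>",  # placeholder
)

# Iterate over result-type assets tagged "processed"; internal, ready
# assets are the ones indexed under the Code Ocean bucket.
search_params = DataAssetSearchParams(
    type=DataAssetType.Result, query="tag:processed"
)
for data_asset in co_client.data_assets.search_data_assets_iterator(
    search_params=search_params
):
    if (
        data_asset.source_bucket is None
        and data_asset.state == DataAssetState.Ready
    ):
        print(data_asset.id, data_asset.name, data_asset.created)
```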
6 changes: 3 additions & 3 deletions tests/test_aind_bucket_indexer.py
@@ -920,9 +920,9 @@ def test_process_docdb_record_valid_metadata_nd_json_file(
]
self.assertEqual(expected_log_messages, captured.output)
expected_docdb_record_to_write = deepcopy(mock_docdb_record)
expected_docdb_record_to_write[
"last_modified"
] = "2024-08-25T17:41:28+00:00"
expected_docdb_record_to_write["last_modified"] = (
"2024-08-25T17:41:28+00:00"
)
expected_docdb_record_to_write["subject"] = self.example_md_record.get(
"subject"
)