diff --git a/samples/jax_samples.csv b/samples/jax_samples.csv index d26c345..920b00f 100644 --- a/samples/jax_samples.csv +++ b/samples/jax_samples.csv @@ -1,2 +1,2 @@ -https://raw.githubusercontent.com/TheJacksonLaboratory/jax-ngff-challenge-2024/66a060da191bf80f4e06fdd1f6b4e7196d6c6b45/KOMP_adult_lacZ.csv -https://raw.githubusercontent.com/TheJacksonLaboratory/jax-ngff-challenge-2024/66a060da191bf80f4e06fdd1f6b4e7196d6c6b45/KOMP_histopathology.csv +https://raw.githubusercontent.com/will-moore/jax-ngff-challenge-2024/refs/heads/add_shape_and_ontology_columns/KOMP_adult_lacZ.csv +https://raw.githubusercontent.com/will-moore/jax-ngff-challenge-2024/refs/heads/add_shape_and_ontology_columns/KOMP_histopathology.csv diff --git a/samples/load_zarr_stats.py b/samples/load_zarr_stats.py index 9cddcd3..5505cee 100644 --- a/samples/load_zarr_stats.py +++ b/samples/load_zarr_stats.py @@ -185,6 +185,8 @@ def load_zarr(zarr_url, average_count=5): column_names = [] column_data = [] + +unique_urls = set() # open a local csv file and iterate through rows... with Path(csv_name).open(newline="") as csvfile: csvreader = csv.reader(csvfile, delimiter=",") @@ -205,8 +207,18 @@ def load_zarr(zarr_url, average_count=5): zarr_url = row[url_col] if zarr_url.endswith(".csv"): continue + if zarr_url in unique_urls: + # print(f"Skipping duplicate url: {zarr_url}") + continue + unique_urls.add(zarr_url) average_count = 5 if "written" not in column_names else 1 - stats = load_zarr(zarr_url, average_count) + stats = {} + if ( + "written" not in column_names + or "shape" not in column_names + or "license" not in column_names + ): + stats = load_zarr(zarr_url, average_count) # Add the extra column data here... if "written" not in column_names: row.append(stats.get("written", 0))