From 324793235534225db92730680dd12821e005f523 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 14 Jan 2025 21:55:23 -0800 Subject: [PATCH 1/2] feat: add summary of Pydantic validation error differences for assets In the process, `_output_asset_validation_diff_reports` has been refactored to avoid duplications --- .../cmd_funcs/diff_manifests_reports.py | 155 +++++++++--------- src/dandisets_linkml_status_tools/tools/md.py | 49 ++++++ 2 files changed, 123 insertions(+), 81 deletions(-) diff --git a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py index 1ef667a..b6f21a1 100644 --- a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py +++ b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py @@ -1,10 +1,7 @@ import logging from itertools import chain from pathlib import Path -from typing import TYPE_CHECKING, Annotated, cast - -if TYPE_CHECKING: - from collections.abc import Iterable +from typing import Annotated, Any, cast from jsondiff import diff from pydantic import Field @@ -35,14 +32,10 @@ gen_diff_cell, gen_pydantic_validation_errs_cell, gen_row, - pydantic_validation_err_diff_detailed_table, - validation_err_count_table, - validation_err_diff_detailed_tables, - validation_err_diff_table, + pydantic_validation_err_diff_summary, ) from dandisets_linkml_status_tools.tools.validation_err_counter import ( ValidationErrCounter, - validation_err_diff, ) logger = logging.getLogger(__name__) @@ -322,91 +315,30 @@ def _output_dandiset_validation_diff_reports( logger.info("Creating dandiset validation diff report directory %s", output_dir) output_dir.mkdir(parents=True) - err1_rep_lsts: list[list[tuple[str, str, tuple[str | int], Path]]] = [] - err2_rep_lsts: list[list[tuple[str, str, tuple[str | int], Path]]] = [] + err1_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = [] + err2_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = [] for r in reports: p = Path(r.dandiset_identifier, r.dandiset_version) # Tuple representation of the Pydantic validation errors err1_rep_lsts.append( - [ - (e["type"], e["msg"], tuple(e["loc"]), p) - for e in r.pydantic_validation_errs1 - ] + [pydantic_err_rep(e, p) for e in r.pydantic_validation_errs1] ) err2_rep_lsts.append( - [ - (e["type"], e["msg"], tuple(e["loc"]), p) - for e in r.pydantic_validation_errs2 - ] - ) - - err1_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = ( - chain.from_iterable(err1_rep_lsts) - ) - err2_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = ( - chain.from_iterable(err2_rep_lsts) - ) - - def err_categorizer(err: tuple) -> tuple[str, str, tuple[str, ...]]: - """ - Categorize a Pydantic validation error represented as a tuple using the same - tuple without the path component to the dandiset at a particular version and - with a generalized "loc" with all array indices replaced by "[*]" - - :param err: The tuple representing the Pydantic validation error - :return: The tuple representing the category that the error belongs to - """ - err = cast(tuple[str, str, tuple[str | int, ...], Path], err) - type_, msg = err[0], err[1] - - # Generalize the "loc" by replacing all array indices with "[*]" - loc = cast( - tuple[str, ...], tuple("[*]" if isinstance(v, int) else v for v in err[2]) + [pydantic_err_rep(e, p) for e in r.pydantic_validation_errs2] ) - return type_, msg, loc + pydantic_validation_errs1_ctr = ValidationErrCounter(pydantic_err_categorizer) + pydantic_validation_errs2_ctr = ValidationErrCounter(pydantic_err_categorizer) - pydantic_validation_errs1_ctr = ValidationErrCounter(err_categorizer) - pydantic_validation_errs2_ctr = ValidationErrCounter(err_categorizer) - - pydantic_validation_errs1_ctr.count(err1_reps) - pydantic_validation_errs2_ctr.count(err2_reps) - - pydantic_validation_err_diff = validation_err_diff( - pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr - ) + pydantic_validation_errs1_ctr.count(chain.from_iterable(err1_rep_lsts)) + pydantic_validation_errs2_ctr.count(chain.from_iterable(err2_rep_lsts)) with (output_dir / summary_file_name).open("w") as summary_f: - # === Output counts of different categories of Pydantic validation errors for - # validations done with separate schemas === - summary_f.write("### Pydantic errs 1 counts\n\n") + # Write the summary of the Pydantic validation error differences summary_f.write( - validation_err_count_table(pydantic_validation_errs1_ctr.counts_by_cat) - ) - - summary_f.write("\n") - summary_f.write("### Pydantic errs 2 counts\n\n") - summary_f.write( - validation_err_count_table(pydantic_validation_errs2_ctr.counts_by_cat) - ) - - # Output a table of the differences in the different categories of - # Pydantic validation errors between the two sets of validation results where - # each set is represented, and counted, by a `ValidationErrCounter` object - summary_f.write("\n") - summary_f.write("### Pydantic errs diff\n\n") - summary_f.write(validation_err_diff_table(pydantic_validation_err_diff)) - - # Write a sequence of tables detailing the differences in Pydantic validation - # errors between the two sets of validation results - summary_f.write("\n") - summary_f.write("## Pydantic errs diff detailed tables\n\n") - # noinspection PyTypeChecker - summary_f.write( - validation_err_diff_detailed_tables( - pydantic_validation_err_diff, - pydantic_validation_err_diff_detailed_table, + pydantic_validation_err_diff_summary( + pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr ) ) @@ -506,7 +438,33 @@ def _output_asset_validation_diff_reports( output_dir.mkdir(parents=True) logger.info("Created asset validation diff report directory %s", output_dir) + err1_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = [] + err2_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = [] + for r in reports: + p = Path(r.dandiset_identifier, r.dandiset_version, str(r.asset_idx)) + + # Tuple representation of the Pydantic validation errors + err1_rep_lsts.append( + [pydantic_err_rep(e, p) for e in r.pydantic_validation_errs1] + ) + err2_rep_lsts.append( + [pydantic_err_rep(e, p) for e in r.pydantic_validation_errs2] + ) + + pydantic_validation_errs1_ctr = ValidationErrCounter(pydantic_err_categorizer) + pydantic_validation_errs2_ctr = ValidationErrCounter(pydantic_err_categorizer) + + pydantic_validation_errs1_ctr.count(chain.from_iterable(err1_rep_lsts)) + pydantic_validation_errs2_ctr.count(chain.from_iterable(err2_rep_lsts)) + with (output_dir / summary_file_name).open("w") as summary_f: + # Write the summary of the Pydantic validation error differences + summary_f.write( + pydantic_validation_err_diff_summary( + pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr + ) + ) + # Write the header and alignment rows of the summary table summary_f.write(gen_header_and_alignment_rows(summary_headers)) @@ -585,3 +543,38 @@ def _output_asset_validation_diff_reports( summary_f.write(gen_row(row_cells)) logger.info("Output of asset validation diff reports is complete") + + +def pydantic_err_categorizer(err: tuple) -> tuple[str, str, tuple[str, ...]]: + """ + Categorize a Pydantic validation error represented as a tuple using the same + tuple without the path component to the dandiset at a particular version and + with a generalized "loc" with all array indices replaced by "[*]" + + :param err: The tuple representing the Pydantic validation error + :return: The tuple representing the category that the error belongs to + """ + err = cast(tuple[str, str, tuple[str | int, ...], Path], err) + type_, msg = err[0], err[1] + + # Generalize the "loc" by replacing all array indices with "[*]" + loc = cast( + tuple[str, ...], tuple("[*]" if isinstance(v, int) else v for v in err[2]) + ) + + return type_, msg, loc + + +def pydantic_err_rep( + err: dict[str, Any], path: Path +) -> tuple[str, str, tuple[str | int, ...], Path]: + """ + Get a representation of a Pydantic validation error as a tuple + + :param err: The Pydantic validation error as a `dict` + :param path: The path the data instance that the error pertained to + :return: The representation of the Pydantic validation error as tuple consisting of + the values for the `'type'`, `'msg'`, `'loc'` keys of the error and `path`. + Note: The value of the `'loc'` key is converted to a tuple from a list + """ + return err["type"], err["msg"], tuple(err["loc"]), path diff --git a/src/dandisets_linkml_status_tools/tools/md.py b/src/dandisets_linkml_status_tools/tools/md.py index f707aef..02fcc05 100644 --- a/src/dandisets_linkml_status_tools/tools/md.py +++ b/src/dandisets_linkml_status_tools/tools/md.py @@ -7,6 +7,10 @@ from dandisets_linkml_status_tools.models import PydanticValidationErrsType from dandisets_linkml_status_tools.tools.typing import Stringable +from dandisets_linkml_status_tools.tools.validation_err_counter import ( + ValidationErrCounter, + validation_err_diff, +) def gen_row(cell_values: Iterable[Stringable]) -> str: @@ -247,3 +251,48 @@ def pydantic_validation_err_diff_detailed_table( ) return f"{heading}{header_and_alignment_rows}{rows}" + + +def pydantic_validation_err_diff_summary( + c1: ValidationErrCounter, c2: ValidationErrCounter +) -> str: + """ + Generate a summary of the differences between two sets of Pydantic validation errors + + :param c1: A `ValidationErrCounter` that has counted the first set of Pydantic + validation errors + :param c2: A `ValidationErrCounter` that has counted the second set of Pydantic + validation errors + :return: The string presenting the summary in Markdown format + """ + + # The differences in the different categories of + # Pydantic validation errors between the two sets of validation results where + # each set is represented, and counted, by a `ValidationErrCounter` object + pydantic_validation_err_diff = validation_err_diff(c1, c2) + + count_table1 = validation_err_count_table(c1.counts_by_cat) + count_table2 = validation_err_count_table(c2.counts_by_cat) + + # A table of the differences in the different categories of Pydantic validation + # errors + diff_table = validation_err_diff_table(pydantic_validation_err_diff) + + # A sequence of tables detailing the differences in Pydantic validation + # errors between the two sets of validation results + # noinspection PyTypeChecker + diff_detailed_tables = validation_err_diff_detailed_tables( + pydantic_validation_err_diff, + pydantic_validation_err_diff_detailed_table, + ) + + return ( + f"### Pydantic errs 1 counts\n\n" + f"{count_table1}" + f"\n### Pydantic errs 2 counts\n\n" + f"{count_table2}" + f"\n### Pydantic errs diff\n\n" + f"{diff_table}" + f"\n## Pydantic errs diff detailed tables\n\n" + f"{diff_detailed_tables}" + ) From 9514fe8805f3136b78298e2f69652348a05f9656 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 15 Jan 2025 00:31:45 -0800 Subject: [PATCH 2/2] feat: remove the old summary table for asset The old summary table was too big to be any use and not as uninformative --- .../cmd_funcs/diff_manifests_reports.py | 61 +------------------ 1 file changed, 3 insertions(+), 58 deletions(-) diff --git a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py index b6f21a1..74f52a7 100644 --- a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py +++ b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py @@ -424,17 +424,6 @@ def _output_asset_validation_diff_reports( """ summary_file_name = "summary.md" - summary_headers = [ - "dandiset", - "version", - "asset id", - "asset path", - "asset index", - "pydantic errs 1", - "pydantic errs 2", - "pydantic errs diff", - ] - output_dir.mkdir(parents=True) logger.info("Created asset validation diff report directory %s", output_dir) @@ -465,11 +454,8 @@ def _output_asset_validation_diff_reports( ) ) - # Write the header and alignment rows of the summary table - summary_f.write(gen_header_and_alignment_rows(summary_headers)) - - # Output individual asset validation diff reports by writing the supporting - # files and the summary table row + # Output individual asset validation diff reports by writing the constituting + # files for r in reports: report_dir = ( output_dir @@ -493,7 +479,7 @@ def _output_asset_validation_diff_reports( logger.info( "Dandiset %s:%s - asset %sat index %d: " - "Wrote asset validation diff report supporting files to %s", + "Wrote asset validation diff report constituting files to %s", r.dandiset_identifier, r.dandiset_version, f"{r.asset_id} " if r.asset_id else "", @@ -501,47 +487,6 @@ def _output_asset_validation_diff_reports( report_dir, ) - # === Write the summary table row for the validation diff report === - # Relative directory for storing all validation diff reports of the dandiset - dandiset_dir = Path(r.dandiset_identifier) - # Relative directory for storing all validation diff reports of the dandiset - # at a particular version - version_dir = dandiset_dir / r.dandiset_version - # Relative directory for storing all validation diff reports of the asset - asset_dir = version_dir / str(r.asset_idx) - - row_cells = ( - f" {c} " # Add spaces around the cell content for better readability - for c in [ - # For the dandiset column - f"[{r.dandiset_identifier}]({dandiset_dir})", - # For the version column - f"[{r.dandiset_version}]({version_dir})", - # For the asset id column - f"{r.asset_id}", - # For the asset path column - f"{r.asset_path}", - # For the asset index column - f"[{r.asset_idx}]({asset_dir})", - # For the pydantic errs 1 column - gen_pydantic_validation_errs_cell( - r.pydantic_validation_errs1, - asset_dir / f"{pydantic_errs1_base_fname}.json", - ), - # For the pydantic errs 2 column - gen_pydantic_validation_errs_cell( - r.pydantic_validation_errs2, - asset_dir / f"{pydantic_errs2_base_fname}.json", - ), - # For the pydantic errs diff column - gen_diff_cell( - r.pydantic_validation_errs_diff, - asset_dir / f"{pydantic_errs_diff_base_fname}.json", - ), - ] - ) - summary_f.write(gen_row(row_cells)) - logger.info("Output of asset validation diff reports is complete")