From 324793235534225db92730680dd12821e005f523 Mon Sep 17 00:00:00 2001
From: Isaac To <isaac.chun.to@gmail.com>
Date: Tue, 14 Jan 2025 21:55:23 -0800
Subject: [PATCH 1/2] feat: add summary of Pydantic validation error
 differences for assets

In the process, `_output_asset_validation_diff_reports` has been
refactored to avoid duplications
---
 .../cmd_funcs/diff_manifests_reports.py       | 155 +++++++++---------
 src/dandisets_linkml_status_tools/tools/md.py |  49 ++++++
 2 files changed, 123 insertions(+), 81 deletions(-)

diff --git a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py
index 1ef667a..b6f21a1 100644
--- a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py
+++ b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py
@@ -1,10 +1,7 @@
 import logging
 from itertools import chain
 from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, cast
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable
+from typing import Annotated, Any, cast
 
 from jsondiff import diff
 from pydantic import Field
@@ -35,14 +32,10 @@
     gen_diff_cell,
     gen_pydantic_validation_errs_cell,
     gen_row,
-    pydantic_validation_err_diff_detailed_table,
-    validation_err_count_table,
-    validation_err_diff_detailed_tables,
-    validation_err_diff_table,
+    pydantic_validation_err_diff_summary,
 )
 from dandisets_linkml_status_tools.tools.validation_err_counter import (
     ValidationErrCounter,
-    validation_err_diff,
 )
 
 logger = logging.getLogger(__name__)
@@ -322,91 +315,30 @@ def _output_dandiset_validation_diff_reports(
     logger.info("Creating dandiset validation diff report directory %s", output_dir)
     output_dir.mkdir(parents=True)
 
-    err1_rep_lsts: list[list[tuple[str, str, tuple[str | int], Path]]] = []
-    err2_rep_lsts: list[list[tuple[str, str, tuple[str | int], Path]]] = []
+    err1_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = []
+    err2_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = []
     for r in reports:
         p = Path(r.dandiset_identifier, r.dandiset_version)
 
         # Tuple representation of the Pydantic validation errors
         err1_rep_lsts.append(
-            [
-                (e["type"], e["msg"], tuple(e["loc"]), p)
-                for e in r.pydantic_validation_errs1
-            ]
+            [pydantic_err_rep(e, p) for e in r.pydantic_validation_errs1]
         )
         err2_rep_lsts.append(
-            [
-                (e["type"], e["msg"], tuple(e["loc"]), p)
-                for e in r.pydantic_validation_errs2
-            ]
-        )
-
-    err1_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = (
-        chain.from_iterable(err1_rep_lsts)
-    )
-    err2_reps: Iterable[tuple[str, str, tuple[str | int, ...], Path]] = (
-        chain.from_iterable(err2_rep_lsts)
-    )
-
-    def err_categorizer(err: tuple) -> tuple[str, str, tuple[str, ...]]:
-        """
-        Categorize a Pydantic validation error represented as a tuple using the same
-        tuple without the path component to the dandiset at a particular version and
-        with a generalized "loc" with all array indices replaced by "[*]"
-
-        :param err: The tuple representing the Pydantic validation error
-        :return: The tuple representing the category that the error belongs to
-        """
-        err = cast(tuple[str, str, tuple[str | int, ...], Path], err)
-        type_, msg = err[0], err[1]
-
-        # Generalize the "loc" by replacing all array indices with "[*]"
-        loc = cast(
-            tuple[str, ...], tuple("[*]" if isinstance(v, int) else v for v in err[2])
+            [pydantic_err_rep(e, p) for e in r.pydantic_validation_errs2]
         )
 
-        return type_, msg, loc
+    pydantic_validation_errs1_ctr = ValidationErrCounter(pydantic_err_categorizer)
+    pydantic_validation_errs2_ctr = ValidationErrCounter(pydantic_err_categorizer)
 
-    pydantic_validation_errs1_ctr = ValidationErrCounter(err_categorizer)
-    pydantic_validation_errs2_ctr = ValidationErrCounter(err_categorizer)
-
-    pydantic_validation_errs1_ctr.count(err1_reps)
-    pydantic_validation_errs2_ctr.count(err2_reps)
-
-    pydantic_validation_err_diff = validation_err_diff(
-        pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr
-    )
+    pydantic_validation_errs1_ctr.count(chain.from_iterable(err1_rep_lsts))
+    pydantic_validation_errs2_ctr.count(chain.from_iterable(err2_rep_lsts))
 
     with (output_dir / summary_file_name).open("w") as summary_f:
-        # === Output counts of different categories of Pydantic validation errors for
-        # validations done with separate schemas ===
-        summary_f.write("### Pydantic errs 1 counts\n\n")
+        # Write the summary of the Pydantic validation error differences
         summary_f.write(
-            validation_err_count_table(pydantic_validation_errs1_ctr.counts_by_cat)
-        )
-
-        summary_f.write("\n")
-        summary_f.write("### Pydantic errs 2 counts\n\n")
-        summary_f.write(
-            validation_err_count_table(pydantic_validation_errs2_ctr.counts_by_cat)
-        )
-
-        # Output a table of the differences in the different categories of
-        # Pydantic validation errors between the two sets of validation results where
-        # each set is represented, and counted, by a `ValidationErrCounter` object
-        summary_f.write("\n")
-        summary_f.write("### Pydantic errs diff\n\n")
-        summary_f.write(validation_err_diff_table(pydantic_validation_err_diff))
-
-        # Write a sequence of tables detailing the differences in Pydantic validation
-        # errors between the two sets of validation results
-        summary_f.write("\n")
-        summary_f.write("## Pydantic errs diff detailed tables\n\n")
-        # noinspection PyTypeChecker
-        summary_f.write(
-            validation_err_diff_detailed_tables(
-                pydantic_validation_err_diff,
-                pydantic_validation_err_diff_detailed_table,
+            pydantic_validation_err_diff_summary(
+                pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr
             )
         )
 
@@ -506,7 +438,33 @@ def _output_asset_validation_diff_reports(
     output_dir.mkdir(parents=True)
     logger.info("Created asset validation diff report directory %s", output_dir)
 
+    err1_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = []
+    err2_rep_lsts: list[list[tuple[str, str, tuple[str | int, ...], Path]]] = []
+    for r in reports:
+        p = Path(r.dandiset_identifier, r.dandiset_version, str(r.asset_idx))
+
+        # Tuple representation of the Pydantic validation errors
+        err1_rep_lsts.append(
+            [pydantic_err_rep(e, p) for e in r.pydantic_validation_errs1]
+        )
+        err2_rep_lsts.append(
+            [pydantic_err_rep(e, p) for e in r.pydantic_validation_errs2]
+        )
+
+    pydantic_validation_errs1_ctr = ValidationErrCounter(pydantic_err_categorizer)
+    pydantic_validation_errs2_ctr = ValidationErrCounter(pydantic_err_categorizer)
+
+    pydantic_validation_errs1_ctr.count(chain.from_iterable(err1_rep_lsts))
+    pydantic_validation_errs2_ctr.count(chain.from_iterable(err2_rep_lsts))
+
     with (output_dir / summary_file_name).open("w") as summary_f:
+        # Write the summary of the Pydantic validation error differences
+        summary_f.write(
+            pydantic_validation_err_diff_summary(
+                pydantic_validation_errs1_ctr, pydantic_validation_errs2_ctr
+            )
+        )
+
         # Write the header and alignment rows of the summary table
         summary_f.write(gen_header_and_alignment_rows(summary_headers))
 
@@ -585,3 +543,38 @@ def _output_asset_validation_diff_reports(
             summary_f.write(gen_row(row_cells))
 
     logger.info("Output of asset validation diff reports is complete")
+
+
+def pydantic_err_categorizer(err: tuple) -> tuple[str, str, tuple[str, ...]]:
+    """
+    Categorize a Pydantic validation error represented as a tuple using the same
+    tuple without the path component to the dandiset at a particular version and
+    with a generalized "loc" with all array indices replaced by "[*]"
+
+    :param err: The tuple representing the Pydantic validation error
+    :return: The tuple representing the category that the error belongs to
+    """
+    err = cast(tuple[str, str, tuple[str | int, ...], Path], err)
+    type_, msg = err[0], err[1]
+
+    # Generalize the "loc" by replacing all array indices with "[*]"
+    loc = cast(
+        tuple[str, ...], tuple("[*]" if isinstance(v, int) else v for v in err[2])
+    )
+
+    return type_, msg, loc
+
+
+def pydantic_err_rep(
+    err: dict[str, Any], path: Path
+) -> tuple[str, str, tuple[str | int, ...], Path]:
+    """
+    Get a representation of a Pydantic validation error as a tuple
+
+    :param err: The Pydantic validation error as a `dict`
+    :param path: The path the data instance that the error pertained to
+    :return: The representation of the Pydantic validation error as tuple consisting of
+        the values for the `'type'`, `'msg'`, `'loc'` keys of the error and `path`.
+        Note: The value of the `'loc'` key is converted to a tuple from a list
+    """
+    return err["type"], err["msg"], tuple(err["loc"]), path
diff --git a/src/dandisets_linkml_status_tools/tools/md.py b/src/dandisets_linkml_status_tools/tools/md.py
index f707aef..02fcc05 100644
--- a/src/dandisets_linkml_status_tools/tools/md.py
+++ b/src/dandisets_linkml_status_tools/tools/md.py
@@ -7,6 +7,10 @@
 
 from dandisets_linkml_status_tools.models import PydanticValidationErrsType
 from dandisets_linkml_status_tools.tools.typing import Stringable
+from dandisets_linkml_status_tools.tools.validation_err_counter import (
+    ValidationErrCounter,
+    validation_err_diff,
+)
 
 
 def gen_row(cell_values: Iterable[Stringable]) -> str:
@@ -247,3 +251,48 @@ def pydantic_validation_err_diff_detailed_table(
     )
 
     return f"{heading}{header_and_alignment_rows}{rows}"
+
+
+def pydantic_validation_err_diff_summary(
+    c1: ValidationErrCounter, c2: ValidationErrCounter
+) -> str:
+    """
+    Generate a summary of the differences between two sets of Pydantic validation errors
+
+    :param c1: A `ValidationErrCounter` that has counted the first set of Pydantic
+        validation errors
+    :param c2: A `ValidationErrCounter` that has counted the second set of Pydantic
+        validation errors
+    :return: The string presenting the summary in Markdown format
+    """
+
+    # The differences in the different categories of
+    # Pydantic validation errors between the two sets of validation results where
+    # each set is represented, and counted, by a `ValidationErrCounter` object
+    pydantic_validation_err_diff = validation_err_diff(c1, c2)
+
+    count_table1 = validation_err_count_table(c1.counts_by_cat)
+    count_table2 = validation_err_count_table(c2.counts_by_cat)
+
+    # A table of the differences in the different categories of Pydantic validation
+    # errors
+    diff_table = validation_err_diff_table(pydantic_validation_err_diff)
+
+    # A sequence of tables detailing the differences in Pydantic validation
+    # errors between the two sets of validation results
+    # noinspection PyTypeChecker
+    diff_detailed_tables = validation_err_diff_detailed_tables(
+        pydantic_validation_err_diff,
+        pydantic_validation_err_diff_detailed_table,
+    )
+
+    return (
+        f"### Pydantic errs 1 counts\n\n"
+        f"{count_table1}"
+        f"\n### Pydantic errs 2 counts\n\n"
+        f"{count_table2}"
+        f"\n### Pydantic errs diff\n\n"
+        f"{diff_table}"
+        f"\n## Pydantic errs diff detailed tables\n\n"
+        f"{diff_detailed_tables}"
+    )

From 9514fe8805f3136b78298e2f69652348a05f9656 Mon Sep 17 00:00:00 2001
From: Isaac To <isaac.chun.to@gmail.com>
Date: Wed, 15 Jan 2025 00:31:45 -0800
Subject: [PATCH 2/2] feat: remove the old summary table for asset

The old summary table was too big to be any use and not as
uninformative
---
 .../cmd_funcs/diff_manifests_reports.py       | 61 +------------------
 1 file changed, 3 insertions(+), 58 deletions(-)

diff --git a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py
index b6f21a1..74f52a7 100644
--- a/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py
+++ b/src/dandisets_linkml_status_tools/cmd_funcs/diff_manifests_reports.py
@@ -424,17 +424,6 @@ def _output_asset_validation_diff_reports(
     """
     summary_file_name = "summary.md"
 
-    summary_headers = [
-        "dandiset",
-        "version",
-        "asset id",
-        "asset path",
-        "asset index",
-        "pydantic errs 1",
-        "pydantic errs 2",
-        "pydantic errs diff",
-    ]
-
     output_dir.mkdir(parents=True)
     logger.info("Created asset validation diff report directory %s", output_dir)
 
@@ -465,11 +454,8 @@ def _output_asset_validation_diff_reports(
             )
         )
 
-        # Write the header and alignment rows of the summary table
-        summary_f.write(gen_header_and_alignment_rows(summary_headers))
-
-        # Output individual asset validation diff reports by writing the supporting
-        # files and the summary table row
+        # Output individual asset validation diff reports by writing the constituting
+        # files
         for r in reports:
             report_dir = (
                 output_dir
@@ -493,7 +479,7 @@ def _output_asset_validation_diff_reports(
 
             logger.info(
                 "Dandiset %s:%s - asset %sat index %d: "
-                "Wrote asset validation diff report supporting files to %s",
+                "Wrote asset validation diff report constituting files to %s",
                 r.dandiset_identifier,
                 r.dandiset_version,
                 f"{r.asset_id} " if r.asset_id else "",
@@ -501,47 +487,6 @@ def _output_asset_validation_diff_reports(
                 report_dir,
             )
 
-            # === Write the summary table row for the validation diff report ===
-            # Relative directory for storing all validation diff reports of the dandiset
-            dandiset_dir = Path(r.dandiset_identifier)
-            # Relative directory for storing all validation diff reports of the dandiset
-            # at a particular version
-            version_dir = dandiset_dir / r.dandiset_version
-            # Relative directory for storing all validation diff reports of the asset
-            asset_dir = version_dir / str(r.asset_idx)
-
-            row_cells = (
-                f" {c} "  # Add spaces around the cell content for better readability
-                for c in [
-                    # For the dandiset column
-                    f"[{r.dandiset_identifier}]({dandiset_dir})",
-                    # For the version column
-                    f"[{r.dandiset_version}]({version_dir})",
-                    # For the asset id column
-                    f"{r.asset_id}",
-                    # For the asset path column
-                    f"{r.asset_path}",
-                    # For the asset index column
-                    f"[{r.asset_idx}]({asset_dir})",
-                    # For the pydantic errs 1 column
-                    gen_pydantic_validation_errs_cell(
-                        r.pydantic_validation_errs1,
-                        asset_dir / f"{pydantic_errs1_base_fname}.json",
-                    ),
-                    # For the pydantic errs 2 column
-                    gen_pydantic_validation_errs_cell(
-                        r.pydantic_validation_errs2,
-                        asset_dir / f"{pydantic_errs2_base_fname}.json",
-                    ),
-                    # For the pydantic errs diff column
-                    gen_diff_cell(
-                        r.pydantic_validation_errs_diff,
-                        asset_dir / f"{pydantic_errs_diff_base_fname}.json",
-                    ),
-                ]
-            )
-            summary_f.write(gen_row(row_cells))
-
     logger.info("Output of asset validation diff reports is complete")