add exclude columns to equality test

dbt-labs · gwenwindflower · Mar 5, 2024 · Nov 24, 2022 · Feb 6, 2023 · Feb 7, 2023
commit aefaa9d169d15ff77c837763a68f3e0e8aa23e97
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,12 @@
 ## Contributors:
 --->
 
+# Unreleased
+## New features
+- Add option to ignore columns in equality test ([#734](https://github.com/dbt-labs/dbt-utils/issues/734), [#737](https://github.com/dbt-labs/dbt-utils/pull/737))
+## Contributors:
+- [@brunocostalopes](https://github.com/brunocostalopes)
+
 # Unreleased
 ## Fixes
 - deduplicate macro for Databricks now uses the QUALIFY clause, which fixes NULL columns issues from the default natural join logic
@@ -19,7 +25,7 @@
 
 # dbt utils v1.1.1
 ## New features
-* Improve the performance of the `at_least_one` test by pruning early. This is especially helpful when running against external tables. By @joshuahuntley in https://github.com/dbt-labs/dbt-utils/pull/775
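+* Improve the performance of the `at_least_one` test by pruning early. This is especially helpful when running against external tables. By @joshuahuntley in https://github.com/dbt-labs/dbt-utils/pull/775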
 ## Fixes
 * Fix legacy links in README by @dbeatty10 in https://github.com/dbt-labs/dbt-utils/pull/796
 

diff --git a/README.md b/README.md
@@ -114,21 +114,36 @@ This test supports the `group_by_columns` parameter; see [Grouping in tests](#gr
 
 ### equality ([source](macros/generic_tests/equality.sql))
 
-Asserts the equality of two relations. Optionally specify a subset of columns to compare.
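+Asserts the equality of two relations. Optionally specify either a subset of columns to compare (`compare_columns`) or a list of columns to ignore (`ignore_columns`); providing both raises a compilation error.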
 
 **Usage:**
 
 ```yaml
 version: 2
 
 models:
+  # compare the entire table
   - name: model_name
+    tests:
+      - dbt_utils.equality:
+          compare_model: ref('other_table_name')
+
+  # only compare some of the columns
+  - name: model_name_compare_columns
     tests:
       - dbt_utils.equality:
           compare_model: ref('other_table_name')
           compare_columns:
             - first_column
             - second_column
+
+  # compare all columns except the ones on the ignore list
+  - name: model_name_ignore_columns
+    tests:
+      - dbt_utils.equality:
+          compare_model: ref('other_table_name')
+          ignore_columns:
+            - third_column
 ```
 
 ### expression_is_true ([source](macros/generic_tests/expression_is_true.sql))

diff --git a/integration_tests/data/schema_tests/data_test_equality_a.csv b/integration_tests/data/schema_tests/data_test_equality_a.csv
@@ -0,0 +1,4 @@
+col_a,col_b,col_c
+1,1,3
+1,2,1
+2,3,3
diff --git a/integration_tests/data/schema_tests/data_test_equality_b.csv b/integration_tests/data/schema_tests/data_test_equality_b.csv
@@ -0,0 +1,4 @@
+col_a,col_b,col_c
+1,1,2
+1,2,2
+2,3,2
diff --git a/integration_tests/models/generic_tests/schema.yml b/integration_tests/models/generic_tests/schema.yml
@@ -142,6 +142,24 @@ seeds:
           - dbt_utils.not_null_proportion:
               at_least: 0.9
 
+  - name: data_test_equality_a
+    tests:
+      - dbt_utils.equality:
+          compare_model: ref('data_test_equality_a')
+      - dbt_utils.equality:
+          compare_model: ref('data_test_equality_b')
+          error_if: "<1" #sneaky way to ensure that the test is returning failing rows
+          warn_if: "<0"
+      - dbt_utils.equality:
+          compare_model: ref('data_test_equality_b')
+          compare_columns:
+            - col_a
+            - col_b
+      - dbt_utils.equality:
+          compare_model: ref('data_test_equality_b')
+          ignore_columns:
+            - col_c
+
 models:
   - name: recency_time_included
     tests:

diff --git a/macros/generic_tests/equality.sql b/macros/generic_tests/equality.sql
@@ -1,10 +1,14 @@
-{% test equality(model, compare_model, compare_columns=None) %}
-  {{ return(adapter.dispatch('test_equality', 'dbt_utils')(model, compare_model, compare_columns)) }}
+{% test equality(model, compare_model, compare_columns=None, ignore_columns=None) %}
+  {{ return(adapter.dispatch('test_equality', 'dbt_utils')(model, compare_model, compare_columns, ignore_columns)) }}
 {% endtest %}
 
-{% macro default__test_equality(model, compare_model, compare_columns=None) %}
+{% macro default__test_equality(model, compare_model, compare_columns=None, ignore_columns=None) %}
 
-{% set set_diff %}
+{%- if compare_columns and ignore_columns -%}
+    {{ exceptions.raise_compiler_error("Both a compare and an ignore list were provided to the `equality` macro. Only one is allowed") }}
+{%- endif -%}
+
+{% set set_diff %}  
     count(*) + coalesce(abs(
         sum(case when which_diff = 'a_minus_b' then 1 else 0 end) -
         sum(case when which_diff = 'b_minus_a' then 1 else 0 end)
@@ -29,7 +33,22 @@ information schema — this allows the model to be an ephemeral model
 
 {%- if not compare_columns -%}
     {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%}
-    {%- set compare_columns = adapter.get_columns_in_relation(model) | map(attribute='quoted') -%}
+    {%- set compare_columns = adapter.get_columns_in_relation(model) | map(attribute='name') -%}
+{%- endif -%}
+
+{%- if ignore_columns -%}
+    {#-- Lower case ignore columns for easier comparison --#}
+    {%- set ignore_columns = ignore_columns | map("lower") | list %}
+
+    {%- set include_columns = [] %}
+    {%- for column in compare_columns -%}
+        {%- if column | lower not in ignore_columns -%}
+            {% do include_columns.append(column) %}
+        {%- endif %}
+    {%- endfor %}
+
+    {%- set compare_columns = include_columns %}
+
 {%- endif -%}
 
 {% set compare_cols_csv = compare_columns | join(', ') %}