diff --git a/.circleci/config.yml b/.circleci/config.yml index 1701f721..950a6cfd 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,7 @@ jobs: . dbt_venv/bin/activate python -m pip install --upgrade pip setuptools - python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery + python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks mkdir -p ~/.dbt cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml @@ -51,9 +51,8 @@ jobs: cd integration_tests dbt deps --target postgres dbt seed --target postgres --full-refresh - dbt compile --target postgres - dbt run --target postgres - dbt test --target postgres + dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+ + dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Redshift" @@ -63,9 +62,8 @@ jobs: cd integration_tests dbt deps --target redshift dbt seed --target redshift --full-refresh - dbt compile --target redshift - dbt run --target redshift - dbt test --target redshift + dbt run --target redshift --exclude tag:skip+ tag:temporary_skip+ + dbt test --target redshift --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Snowflake" @@ -75,9 +73,8 @@ jobs: cd integration_tests dbt deps --target snowflake dbt seed --target snowflake --full-refresh - dbt compile --target snowflake - dbt run --target snowflake - dbt test --target snowflake + dbt run --target snowflake --exclude tag:skip+ tag:temporary_skip+ + dbt test --target snowflake --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - BigQuery" @@ -90,10 +87,19 @@ jobs: cd integration_tests dbt deps --target bigquery dbt seed --target bigquery --full-refresh - dbt compile --target bigquery - dbt run --target bigquery --full-refresh - dbt test --target bigquery + dbt run --target bigquery --full-refresh --exclude tag:skip+ tag:temporary_skip+ + dbt test --target bigquery --exclude tag:skip+ tag:temporary_skip+ + - run: + name: "Run Tests - Databricks" + command: | + . 
dbt_venv/bin/activate + echo `pwd` + cd integration_tests + dbt deps --target databricks + dbt seed --target databricks --full-refresh + dbt run --target databricks --exclude tag:skip+ tag:temporary_skip+ + dbt test --target databricks --exclude tag:skip+ tag:temporary_skip+ - save_cache: key: deps1-{{ .Branch }} @@ -115,3 +121,4 @@ workflows: - profile-redshift - profile-snowflake - profile-bigquery + - profile-databricks diff --git a/.gitignore b/.gitignore index a33e3f41..20eb2532 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ target/ dbt_packages/ logs/ -logfile \ No newline at end of file +logfile +.DS_Store +package-lock.yml +integration_tests/package-lock.yml diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..437dcba6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "yaml.schemas": { + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_yml_files-latest.json": [ + "/**/*.yml", + "!profiles.yml", + "!dbt_project.yml", + "!packages.yml", + "!selectors.yml", + "!profile_template.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_project-latest.json": [ + "dbt_project.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/selectors-latest.json": [ + "selectors.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/packages-latest.json": [ + "packages.yml" + ] + }, +} \ No newline at end of file diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml index 843d659e..ea8effc1 100644 --- a/integration_tests/ci/sample.profiles.yml +++ b/integration_tests/ci/sample.profiles.yml @@ -2,10 +2,6 @@ # HEY! This file is used in the dbt-audit-helper integrations tests with CircleCI. # You should __NEVER__ check credentials into version control. 
Thanks for reading :) -config: - send_anonymous_usage_stats: False - use_colors: True - integration_tests: target: postgres outputs: @@ -27,7 +23,7 @@ integration_tests: dbname: "{{ env_var('REDSHIFT_TEST_DBNAME') }}" port: "{{ env_var('REDSHIFT_TEST_PORT') | as_number }}" schema: audit_helper_integration_tests_redshift - threads: 1 + threads: 8 bigquery: type: bigquery @@ -35,7 +31,7 @@ integration_tests: keyfile: "{{ env_var('BIGQUERY_SERVICE_KEY_PATH') }}" project: "{{ env_var('BIGQUERY_TEST_DATABASE') }}" schema: audit_helper_integration_tests_bigquery - threads: 1 + threads: 8 snowflake: type: snowflake @@ -46,4 +42,12 @@ integration_tests: database: "{{ env_var('SNOWFLAKE_TEST_DATABASE') }}" warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}" schema: audit_helper_integration_tests_snowflake - threads: 1 + threads: 8 + + databricks: + type: databricks + schema: dbt_project_evaluator_integration_tests_databricks + host: "{{ env_var('DATABRICKS_TEST_HOST') }}" + http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}" + token: "{{ env_var('DATABRICKS_TEST_ACCESS_TOKEN') }}" + threads: 10 diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 07120e4c..2f4cb84d 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -17,3 +17,14 @@ clean-targets: # directories to be removed by `dbt clean` seeds: +quote_columns: false + +vars: + compare_queries_summarize: true + primary_key_columns_var: ['col1'] + columns_var: ['col1'] + event_time_var: + quick_are_queries_identical_cols: ['col1'] + +flags: + send_anonymous_usage_stats: False + use_colors: True \ No newline at end of file diff --git a/integration_tests/macros/unit_tests/struct_generation_macros.sql b/integration_tests/macros/unit_tests/struct_generation_macros.sql new file mode 100644 index 00000000..2c6767e4 --- /dev/null +++ b/integration_tests/macros/unit_tests/struct_generation_macros.sql @@ -0,0 +1,26 @@ +{%- macro _basic_json_function() -%} + {%- if target.type == 'snowflake' -%} + object_construct + {%- elif target.type == 'bigquery' -%} + json_object + {%- elif target.type == 'databricks' -%} + map + {%- elif execute -%} + {# Only raise exception if it's actually being called, not during parsing #} + {%- do exceptions.raise_compiler_error("Unknown adapter '"~ target.type ~ "'") -%} + {%- endif -%} +{%- endmacro -%} + +{% macro _complex_json_function(json) %} + + {% if target.type == 'redshift' %} + json_parse({{ json }}) + {% elif target.type == 'databricks' %} + from_json({{ json }}, schema_of_json({{ json }})) + {% elif target.type in ['snowflake', 'bigquery'] %} + parse_json({{ json }}) + {% elif execute %} + {# Only raise exception if it's actually being called, not during parsing #} + {%- do exceptions.raise_compiler_error("Unknown adapter '"~ target.type ~ "'") -%} + {% endif %} +{% endmacro %} \ No newline at end of file diff --git a/integration_tests/models/compare_which_columns_differ_exclude_cols.sql b/integration_tests/models/compare_which_columns_differ_exclude_cols.sql deleted file mode 100644 index 7630f549..00000000 --- a/integration_tests/models/compare_which_columns_differ_exclude_cols.sql +++ /dev/null @@ -1,18 +0,0 @@ -{% set a_relation=ref('data_compare_which_columns_differ_a')%} - -{% set b_relation=ref('data_compare_which_columns_differ_b') %} - - -select - lower(column_name) as column_name, - has_difference -from ( - - {{ audit_helper.compare_which_columns_differ( - a_relation=a_relation, - b_relation=b_relation, - primary_key="id", - 
exclude_columns=["becomes_null"] - ) }} - -) as macro_output \ No newline at end of file diff --git a/integration_tests/models/compare_all_columns_concat_pk_with_summary.sql b/integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_concat_pk_with_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql diff --git a/integration_tests/models/compare_all_columns_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_all_columns_where_clause.sql b/integration_tests/models/data_tests/compare_all_columns_where_clause.sql similarity index 100% rename from integration_tests/models/compare_all_columns_where_clause.sql rename to integration_tests/models/data_tests/compare_all_columns_where_clause.sql diff --git a/integration_tests/models/compare_all_columns_with_summary.sql b/integration_tests/models/data_tests/compare_all_columns_with_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_with_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_with_summary.sql diff --git a/integration_tests/models/compare_all_columns_with_summary_and_exclude.sql b/integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql similarity index 100% rename from integration_tests/models/compare_all_columns_with_summary_and_exclude.sql rename to integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql diff --git a/integration_tests/models/compare_all_columns_without_summary.sql b/integration_tests/models/data_tests/compare_all_columns_without_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_without_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_without_summary.sql diff --git a/integration_tests/models/data_tests/compare_and_classify_query_results.sql b/integration_tests/models/data_tests/compare_and_classify_query_results.sql new file mode 100644 index 00000000..747f146f --- /dev/null +++ b/integration_tests/models/data_tests/compare_and_classify_query_results.sql @@ -0,0 +1,11 @@ +-- this has no tests, it's just making sure that the introspective queries for event_time actually run + +{{ + audit_helper.compare_and_classify_query_results( + a_query="select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + b_query="select * from " ~ ref('unit_test_model_b') ~ " where 1=1", + primary_key_columns=['id'], + columns=['id', 'col1', 'col2'], + event_time='created_at' + ) +}} \ No newline at end of file diff --git a/integration_tests/models/compare_queries.sql b/integration_tests/models/data_tests/compare_queries.sql similarity index 100% rename from integration_tests/models/compare_queries.sql rename to integration_tests/models/data_tests/compare_queries.sql diff --git a/integration_tests/models/compare_queries_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_concat_pk_without_summary.sql rename to 
integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_queries_with_summary.sql b/integration_tests/models/data_tests/compare_queries_with_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_with_summary.sql rename to integration_tests/models/data_tests/compare_queries_with_summary.sql diff --git a/integration_tests/models/compare_queries_without_summary.sql b/integration_tests/models/data_tests/compare_queries_without_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_without_summary.sql rename to integration_tests/models/data_tests/compare_queries_without_summary.sql diff --git a/integration_tests/models/compare_relation_columns.sql b/integration_tests/models/data_tests/compare_relation_columns.sql similarity index 100% rename from integration_tests/models/compare_relation_columns.sql rename to integration_tests/models/data_tests/compare_relation_columns.sql diff --git a/integration_tests/models/compare_relations_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_relations_with_exclude.sql b/integration_tests/models/data_tests/compare_relations_with_exclude.sql similarity index 100% rename from integration_tests/models/compare_relations_with_exclude.sql rename to integration_tests/models/data_tests/compare_relations_with_exclude.sql diff --git a/integration_tests/models/compare_relations_with_summary.sql b/integration_tests/models/data_tests/compare_relations_with_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_with_summary.sql rename to integration_tests/models/data_tests/compare_relations_with_summary.sql diff --git a/integration_tests/models/compare_relations_without_exclude.sql b/integration_tests/models/data_tests/compare_relations_without_exclude.sql similarity index 100% rename from integration_tests/models/compare_relations_without_exclude.sql rename to integration_tests/models/data_tests/compare_relations_without_exclude.sql diff --git a/integration_tests/models/compare_relations_without_summary.sql b/integration_tests/models/data_tests/compare_relations_without_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_without_summary.sql rename to integration_tests/models/data_tests/compare_relations_without_summary.sql diff --git a/integration_tests/models/compare_row_counts.sql b/integration_tests/models/data_tests/compare_row_counts.sql similarity index 100% rename from integration_tests/models/compare_row_counts.sql rename to integration_tests/models/data_tests/compare_row_counts.sql diff --git a/integration_tests/models/compare_which_columns_differ.sql b/integration_tests/models/data_tests/compare_which_columns_differ.sql similarity index 76% rename from integration_tests/models/compare_which_columns_differ.sql rename to integration_tests/models/data_tests/compare_which_columns_differ.sql index a68523d3..ef158803 100644 --- a/integration_tests/models/compare_which_columns_differ.sql +++ b/integration_tests/models/data_tests/compare_which_columns_differ.sql @@ -9,9 +9,9 @@ select has_difference from ( - {{ audit_helper.compare_which_columns_differ( + {{ 
audit_helper.compare_which_relation_columns_differ( a_relation=a_relation, b_relation=b_relation, - primary_key="id" + primary_key_columns=["id"] ) }} ) as macro_output diff --git a/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql b/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql new file mode 100644 index 00000000..8d2d5aa2 --- /dev/null +++ b/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql @@ -0,0 +1,25 @@ +{% set a_relation=ref('data_compare_which_columns_differ_a')%} + +{% set b_relation=ref('data_compare_which_columns_differ_b') %} + +{% set pk_cols = ['id'] %} +{% set cols = ['id','value_changes','becomes_not_null','does_not_change'] %} + +{% if target.type == 'snowflake' %} + {% set pk_cols = pk_cols | map("upper") | list %} + {% set cols = cols | map("upper") | list %} +{% endif %} + +select + lower(column_name) as column_name, + has_difference +from ( + + {{ audit_helper.compare_which_relation_columns_differ( + a_relation=a_relation, + b_relation=b_relation, + primary_key_columns=pk_cols, + columns=cols + ) }} + +) as macro_output \ No newline at end of file diff --git a/integration_tests/models/schema.yml b/integration_tests/models/data_tests/schema.yml similarity index 90% rename from integration_tests/models/schema.yml rename to integration_tests/models/data_tests/schema.yml index 4bea9838..fbe74ff7 100644 --- a/integration_tests/models/schema.yml +++ b/integration_tests/models/data_tests/schema.yml @@ -2,96 +2,96 @@ version: 2 models: - name: compare_queries - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') - name: compare_queries_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_queries_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_with_summary') - name: compare_queries_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_relations_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_with_summary') - name: compare_relations_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_relations_with_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_with_exclude') - name: compare_relations_without_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') - name: compare_all_columns_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_with_summary') - name: compare_all_columns_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_without_summary') - name: compare_all_columns_concat_pk_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_concat_pk_with_summary') - name: compare_all_columns_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_concat_pk_without_summary') - name: compare_all_columns_with_summary_and_exclude - tests: + data_tests: - 
dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_with_summary_and_exclude') - name: compare_all_columns_where_clause - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_where_clause') - name: compare_relation_columns - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relation_columns') - name: compare_relations_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_which_columns_differ - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_which_columns_differ') - name: compare_which_columns_differ_exclude_cols - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_which_columns_differ_exclude_cols') - name: compare_row_counts - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_row_counts') diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql new file mode 100644 index 00000000..a4bc3985 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql @@ -0,0 +1 @@ +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql new file mode 100644 index 00000000..a4bc3985 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql @@ -0,0 +1 @@ +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql new file mode 100644 index 00000000..1cfabba6 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -0,0 +1,16 @@ +{{ config(tags=['skip' if (target.type in ['postgres']) else 'runnable']) }} + +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS col2 +{% endif %} \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql new file mode 100644 index 00000000..1cfabba6 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -0,0 +1,16 @@ +{{ config(tags=['skip' if (target.type in ['postgres']) else 'runnable']) }} + +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS 
col2 +{% endif %} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify.sql b/integration_tests/models/unit_test_wrappers/unit_compare_classify.sql new file mode 100644 index 00000000..e2c707a8 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.compare_and_classify_query_results( + "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", + primary_key_columns=var('primary_key_columns_var'), + columns=var('columns_var'), + event_time=var('event_time_var') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml new file mode 100644 index 00000000..759526fe --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml @@ -0,0 +1,256 @@ +unit_tests: + - name: compare_classify_identical_tables + model: unit_compare_classify + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + + - name: compare_classify_identical_tables_event_time_filter + model: unit_compare_classify + overrides: + vars: + columns_var: ['id', 'col1', 'col2', 'created_at'] + event_time_var: 'created_at' + primary_key_columns_var: ['id'] + macros: + audit_helper._get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 2} + + - name: compare_classify_all_statuses + model: unit_compare_classify + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "changed", "col2": "values" } + - { "id": 4, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - 
{"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + + - name: compare_classify_identical_tables_multiple_pk_cols + model: unit_compare_classify + overrides: + vars: + columns_var: ['id', 'id_2', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id', 'id_2'] + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } + - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } + - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } + - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } + - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 12, "id_2": 3, "dbt_audit_num_rows_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 1, "id_2": 23, "dbt_audit_num_rows_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, "id_2": 4, "dbt_audit_num_rows_in_status": 3} + + - name: compare_classify_identical_tables_single_null_pk + model: unit_compare_classify + description: "`nonunique_pk` status checks whether a PK is unique. It's intended to avoid arbitrary comparisons, not protect against null records (that's what constraints or tests are for)." + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: compare_classify_identical_tables_multiple_null_pk + model: unit_compare_classify + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} + + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) 
else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: compare_classify_identical_tables_multi_null_pk_dupe_rows + description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined + model: unit_compare_classify + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: compare_classify_all_statuses_different_column_set + model: unit_compare_classify + overrides: + vars: + primary_key_columns_var: ['id'] + columns_var: ['id', 'col1'] + event_time_var: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc" } + - { "id": 2, "col1": "ddd" } + - { "id": 4, "col1": "nop" } + + expect: + rows: + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + + - name: compare_classify_identical_tables_without_pk_in_cols_list + model: unit_compare_classify + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + columns_var: ['col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql new file mode 100644 index 00000000..4184f3dc --- /dev/null +++ 
b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.compare_and_classify_query_results( + "select * from " ~ ref('unit_test_struct_model_a') ~ " where 1=1", + "select * from " ~ ref('unit_test_struct_model_b') ~ " where 1=1", + primary_key_columns=var('primary_key_columns_var'), + columns=var('columns_var'), + event_time=var('event_time_var') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml new file mode 100644 index 00000000..ab86ed44 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml @@ -0,0 +1,148 @@ +unit_tests: + - name: compare_classify_simple_struct + model: unit_compare_classify_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + fixture: simple_struct + - input: ref('unit_test_struct_model_b') + format: sql + fixture: simple_struct + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + + - name: unit_compare_classify_struct_identical_values_different_order + model: unit_compare_classify_struct + description: Objects' keys are generally sorted alphabetically, so sort order is ignored. + given: + - input: ref('unit_test_struct_model_a') + format: sql + fixture: simple_struct + - input: ref('unit_test_struct_model_b') + format: sql + fixture: simple_struct_different_order + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + config: + #Databricks cares about the order and considers it a difference. We're not trying to have identical behaviour across warehouses so that's OK. + tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" + + - name: unit_compare_classify_struct_identical_values_different_order_dbx + model: unit_compare_classify_struct + description: Most platforms don't care about sort order. Databricks does. 
+ given: + - input: ref('unit_test_struct_model_a') + format: sql + fixture: simple_struct + - input: ref('unit_test_struct_model_b') + format: sql + fixture: simple_struct_different_order + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + config: + #Only for databricks + tags: "{{ 'skip' if (target.type not in ['databricks']) else 'runnable' }}" + + - name: unit_compare_classify_struct_removed_key + model: unit_compare_classify_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + fixture: simple_struct + - input: ref('unit_test_struct_model_b') + format: sql + fixture: simple_struct_removed_key + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + # config: + # tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols + + - name: compare_classify_complex_struct + model: unit_compare_classify_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + fixture: complex_struct + - input: ref('unit_test_struct_model_b') + format: sql + fixture: complex_struct + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + # config: + # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet + + - name: compare_classify_complex_struct_different_values + model: unit_compare_classify_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + fixture: complex_struct + - input: ref('unit_test_struct_model_b') + format: sql + fixture: complex_struct_different_value + + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + # config: + # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet + + - name: unit_compare_classify_complex_struct_identical_values_different_order + model: unit_compare_classify_struct + description: Snowflake sorts objects' keys alphabetically, but respects the order items are added to arrays so differences are detected. 
+ given: + - input: ref('unit_test_struct_model_a') + format: sql + fixture: complex_struct + - input: ref('unit_test_struct_model_b') + format: sql + fixture: complex_struct_different_order + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + # config: + # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql new file mode 100644 index 00000000..03272c9f --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql @@ -0,0 +1,8 @@ + +{{ + audit_helper.compare_queries( + "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", + summarize = var('compare_queries_summarize') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml new file mode 100644 index 00000000..0308e509 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml @@ -0,0 +1,47 @@ +unit_tests: + - name: identical_records_compare_queries + model: unit_compare_queries + description: The world's most basic unit test. + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"in_a": true, "in_b": true} + + overrides: + vars: + compare_queries_summarize: true + + - name: identical_records_compare_queries_no_summarize + model: unit_compare_queries + description: The world's second most basic unit test. 
+ + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: [] + + overrides: + vars: + compare_queries_summarize: false diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.sql b/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.sql new file mode 100644 index 00000000..d2c12fde --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.sql @@ -0,0 +1,17 @@ +{% set pk_cols = var('primary_key_columns_var') %} +{% set cols = var('columns_var') %} + +{% if target.type == 'snowflake' and flags.WHICH == 'run' %} + {% set pk_cols = pk_cols | map("upper") | list %} + {% set cols = cols | map("upper") | list %} +{% endif %} + +{{ + audit_helper.compare_which_query_columns_differ( + a_query = "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + b_query = "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", + primary_key_columns = pk_cols, + columns = cols, + event_time = var('event_time_var') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.yml b/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.yml new file mode 100644 index 00000000..49007dd1 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.yml @@ -0,0 +1,124 @@ +unit_tests: + - name: compare_cols_identical_tables + model: unit_compare_which_query_columns_differ + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"column_name": 'id', 'has_difference': false} + - {"column_name": 'col1', 'has_difference': false} + - {"column_name": 'col2', 'has_difference': false} + + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['snowflake']) else 'runnable' }}" #Case sensitivity + + - name: compare_cols_identical_tables_event_time_filter + model: unit_compare_which_query_columns_differ + overrides: + vars: + columns_var: ['id', 'col1', 'col2', 'created_at'] + event_time_var: 'created_at' + primary_key_columns_var: ['id'] + macros: + audit_helper._get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"column_name": 'id', 
"has_difference": false} + - {"column_name": 'col1', "has_difference": false} + - {"column_name": 'col2', "has_difference": false} + - {"column_name": 'created_at', "has_difference": false} + config: + tags: "{{ 'skip' if (target.type in ['snowflake']) else 'runnable' }}" #Case sensitivity + + - name: compare_cols_identical_tables_snowflake + model: unit_compare_which_query_columns_differ + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"column_name": 'ID', 'has_difference': false} + - {"column_name": 'COL1', 'has_difference': false} + - {"column_name": 'COL2', 'has_difference': false} + + overrides: + vars: + columns_var: ['ID', 'COL1', 'COL2'] + event_time_var: + primary_key_columns_var: ['ID'] + config: + tags: "{{ 'skip' if (target.type not in ['snowflake']) else 'runnable' }}" #Case sensitivity + + - name: compare_cols_identical_tables_event_time_filter_snowflake + model: unit_compare_which_query_columns_differ + overrides: + vars: + columns_var: ['ID', 'COL1', 'COL2', 'CREATED_AT'] + event_time_var: 'CREATED_AT' + primary_key_columns_var: ['ID'] + macros: + audit_helper._get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"column_name": 'ID', "has_difference": false} + - {"column_name": 'COL1', "has_difference": false} + - {"column_name": 'COL2', "has_difference": false} + - {"column_name": 'CREATED_AT', "has_difference": false} + config: + tags: "{{ 'skip' if (target.type not in ['snowflake']) else 'runnable' }}" #Case sensitivity \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.sql b/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.sql new file mode 100644 index 00000000..a76f30cd --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.sql @@ -0,0 +1,19 @@ +{% set results = + audit_helper._ensure_all_pks_are_in_column_set( + primary_key_columns=var('primary_key_columns_var', ['a_column_with_a_large_unwieldy_name']), + columns=var('columns_var', ['b_column_with_a_large_unwieldy_name']), + ) +%} + +{% if (var('primary_key_columns_var') | length == 0) and (var('columns_var') | length == 0) %} +-- need to still provide a table shape +select 'abcdefabcdef' as col, 1 as row_index +limit 0 +{% endif %} + +{% for result in results %} + select '{{ result }}' as col, {{ loop.index }} as row_index + {% if not loop.last %} + union all + {% endif %} +{% endfor %} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.yml 
b/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.yml new file mode 100644 index 00000000..35767c9d --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.yml @@ -0,0 +1,130 @@ +unit_tests: + - name: ensure_all_pks_in_columns + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk1', 'pk2'] + columns_var: ['pk1', 'pk2', 'column_a', 'column_b'] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + - {"col": 'column_a', "row_index": 3} + - {"col": 'column_b', "row_index": 4} + + - name: ensure_all_pks_in_columns_pks_at_end + model: unit_ensure_all_pks_are_in_column_set + description: PKs are specified in `columns` so should be at end of list + given: [] + overrides: + vars: + primary_key_columns_var: ['pk1', 'pk2'] + columns_var: ['column_a', 'column_b', 'pk1', 'pk2'] + + expect: + rows: + - {"col": 'column_a', "row_index": 1} + - {"col": 'column_b', "row_index": 2} + - {"col": 'pk1', "row_index": 3} + - {"col": 'pk2', "row_index": 4} + + - name: ensure_all_pks_in_columns_one_missing_pk + model: unit_ensure_all_pks_are_in_column_set + description: PK specified in `columns` should be at end of list, missing PK will be added at front + given: [] + overrides: + vars: + primary_key_columns_var: ['pk1', 'pk2'] + columns_var: ['column_a', 'column_b', 'pk2'] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'column_a', "row_index": 2} + - {"col": 'column_b', "row_index": 3} + - {"col": 'pk2', "row_index": 4} + + - name: ensure_all_pks_in_columns_empty_sets + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: [] + columns_var: [] + + expect: + rows: [] + + - name: ensure_all_pks_in_columns_no_pks + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: [] + columns_var: ['column_a', 'column_b'] + + expect: + rows: + - {"col": 'column_a', "row_index": 1} + - {"col": 'column_b', "row_index": 2} + + - name: ensure_all_pks_in_columns_no_cols + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk1', 'pk2'] + columns_var: [] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + + - name: ensure_all_pks_in_columns_caps_pk + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk2', 'PK1'] + columns_var: ['pk1', 'pk2', 'column_a', 'column_b'] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + - {"col": 'column_a', "row_index": 3} + - {"col": 'column_b', "row_index": 4} + + - name: ensure_all_pks_in_columns_caps_col + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk2', 'pk1'] + columns_var: ['pk1', 'pk2', 'COLUMN_A', 'column_b'] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + - {"col": 'COLUMN_A', "row_index": 3} + - {"col": 'column_b', "row_index": 4} + + - name: ensure_all_pks_in_columns_caps_pk_in_both + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk2', 'PK1'] + columns_var: ['PK1', 'pk2', 'column_a', 'column_b'] + + expect: + rows: + - {"col": 'PK1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + - {"col": 'column_a', "row_index": 3} + - {"col": 'column_b', 
"row_index": 4} + \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql new file mode 100644 index 00000000..92661c51 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -0,0 +1,10 @@ +{{ config(tags=['skip' if (target.type in ['redshift', 'postgres', 'databricks']) else 'runnable']) }} + +{{ + audit_helper.quick_are_queries_identical( + "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", + columns=var('quick_are_queries_identical_cols'), + event_time=var('event_time_var') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml new file mode 100644 index 00000000..66f56a27 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -0,0 +1,97 @@ +unit_tests: + - name: quick_are_queries_identical_identical_tables + model: unit_quick_are_queries_identical + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": true} + + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + event_time_var: + + - name: quick_are_queries_identical_identical_tables_event_time_filter + model: unit_quick_are_queries_identical + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2', 'created_at'] + event_time_var: 'created_at' + macros: + audit_helper._get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"are_tables_identical": true} + + - name: quick_are_queries_identical_differences + model: unit_quick_are_queries_identical + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + event_time_var: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "changed", "col2": "values" } + - { "id": 4, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": false} + + - name: quick_are_queries_identical_identical_tables_with_null_pks + model: unit_quick_are_queries_identical + + given: + - input: ref('unit_test_model_a') + rows: + - { "id":, "col1": "abc", "col2": "def" } + - { "id":, "col1": "hij", "col2": 
"klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id":, "col1": "abc", "col2": "def" } + - { "id":, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": true} + + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + event_time_var: diff --git a/integration_tests/tests/fixtures/complex_struct.sql b/integration_tests/tests/fixtures/complex_struct.sql new file mode 100644 index 00000000..b96206c3 --- /dev/null +++ b/integration_tests/tests/fixtures/complex_struct.sql @@ -0,0 +1,8 @@ +{% set json %} + '{"emails":["john.doe@example.com","john.d@example.com"],"phones":[{"number":"123-456-7890","type":"home"},{"number":"987-654-3210","type":"work"}]}' +{% endset %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 \ No newline at end of file diff --git a/integration_tests/tests/fixtures/complex_struct_different_order.sql b/integration_tests/tests/fixtures/complex_struct_different_order.sql new file mode 100644 index 00000000..24ead4fc --- /dev/null +++ b/integration_tests/tests/fixtures/complex_struct_different_order.sql @@ -0,0 +1,8 @@ +{% set json %} + '{"emails":["john.doe@example.com","john.d@example.com"],"phones":[{"number":"987-654-3210","type":"work"}, {"number":"123-456-7890","type":"home"}]}' +{% endset %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 \ No newline at end of file diff --git a/integration_tests/tests/fixtures/complex_struct_different_value.sql b/integration_tests/tests/fixtures/complex_struct_different_value.sql new file mode 100644 index 00000000..5446b11a --- /dev/null +++ b/integration_tests/tests/fixtures/complex_struct_different_value.sql @@ -0,0 +1,8 @@ +{% set json %} +'{"emails":["john.smith@example.com","john.s@example.com"],"phones":[{"number":"123-456-7890","type":"home"},{"number":"987-654-3210","type":"work"}]}' +{% endset %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 \ No newline at end of file diff --git a/integration_tests/tests/fixtures/simple_struct.sql b/integration_tests/tests/fixtures/simple_struct.sql new file mode 100644 index 00000000..006e62f6 --- /dev/null +++ b/integration_tests/tests/fixtures/simple_struct.sql @@ -0,0 +1,14 @@ +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS col2 +{% endif %} \ No newline at end of file diff --git a/integration_tests/tests/fixtures/simple_struct_different_order.sql b/integration_tests/tests/fixtures/simple_struct_different_order.sql new file mode 100644 index 00000000..ee89fb70 --- /dev/null +++ b/integration_tests/tests/fixtures/simple_struct_different_order.sql @@ -0,0 +1,14 @@ +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}( 'state', 'CA', 'street', '123 Main St', 'city', 'Anytown') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"state": "CA", "street": "123 Main St", "city": "Anytown"}') AS col2 +{% endif %} \ No newline at 
end of file diff --git a/integration_tests/tests/fixtures/simple_struct_removed_key.sql b/integration_tests/tests/fixtures/simple_struct_removed_key.sql new file mode 100644 index 00000000..ae3084bd --- /dev/null +++ b/integration_tests/tests/fixtures/simple_struct_removed_key.sql @@ -0,0 +1,14 @@ +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'state', 'CA') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"street": "123 Main St", "state": "CA"}') AS col2 +{% endif %} \ No newline at end of file diff --git a/macros/compare_and_classify_query_results.sql b/macros/compare_and_classify_query_results.sql new file mode 100644 index 00000000..c96353f0 --- /dev/null +++ b/macros/compare_and_classify_query_results.sql @@ -0,0 +1,65 @@ +{% macro compare_and_classify_query_results(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} + + {% set columns = audit_helper._ensure_all_pks_are_in_column_set(primary_key_columns, columns) %} + {% set joined_cols = columns | join(", ") %} + + {% if event_time %} + {% set event_time_props = audit_helper._get_comparison_bounds(a_query, b_query, event_time) %} + {% endif %} + + with + + {{ audit_helper._generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props)}} + + , + + all_records as ( + + select + *, + true as dbt_audit_in_a, + true as dbt_audit_in_b + from a_intersect_b + + union all + + select + *, + true as dbt_audit_in_a, + false as dbt_audit_in_b + from a_except_b + + union all + + select + *, + false as dbt_audit_in_a, + true as dbt_audit_in_b + from b_except_a + + ), + + classified as ( + select + *, + {{ audit_helper._classify_audit_row_status() }} as dbt_audit_row_status + from all_records + ), + + final as ( + select + *, + {{ audit_helper._count_num_rows_in_status() }} as dbt_audit_num_rows_in_status, + -- using dense_rank so that modified rows (which have a full row for both the left and right side) both get picked up in the sample. 
+ -- For every other type this is equivalent to a row_number() + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) as dbt_audit_sample_number + from classified + ) + + select * from final + {% if sample_limit %} + where dbt_audit_sample_number <= {{ sample_limit }} + {% endif %} + order by dbt_audit_row_status, dbt_audit_sample_number + +{% endmacro %} \ No newline at end of file diff --git a/macros/compare_and_classify_relation_rows.sql b/macros/compare_and_classify_relation_rows.sql new file mode 100644 index 00000000..d7c6f0f3 --- /dev/null +++ b/macros/compare_and_classify_relation_rows.sql @@ -0,0 +1,16 @@ +{% macro compare_and_classify_relation_rows(a_relation, b_relation, primary_key_columns=[], columns=None, event_time=None, sample_limit=20) %} + {%- if not columns -%} + {%- set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) -%} + {%- endif -%} + + {{ + audit_helper.compare_and_classify_query_results( + "select * from " ~ a_relation, + "select * from " ~ b_relation, + primary_key_columns, + columns, + event_time, + sample_limit + ) + }} +{% endmacro %} \ No newline at end of file diff --git a/macros/compare_which_columns_differ.sql b/macros/compare_which_columns_differ.sql deleted file mode 100644 index bc7c16cc..00000000 --- a/macros/compare_which_columns_differ.sql +++ /dev/null @@ -1,46 +0,0 @@ -{% macro compare_which_columns_differ(a_relation, b_relation, primary_key, exclude_columns=[]) %} - {{ return(adapter.dispatch('compare_which_columns_differ', 'audit_helper')(a_relation, b_relation, primary_key, exclude_columns)) }} -{% endmacro %} - -{% macro default__compare_which_columns_differ(a_relation, b_relation, primary_key, exclude_columns=[]) %} - -{% set column_names = dbt_utils.get_filtered_columns_in_relation(from=a_relation, except=exclude_columns) %} - -with bool_or as ( - - select - true as anchor - {% for column in column_names %} - {% set column_name = adapter.quote(column) %} - {% set compare_statement %} - ((a.{{ column_name }} != b.{{ column_name }}) - or (a.{{ column_name }} is null and b.{{ column_name }} is not null) - or (a.{{ column_name }} is not null and b.{{ column_name }} is null)) - {% endset %} - - , {{ dbt.bool_or(compare_statement) }} as {{ column | lower }}_has_difference - - {% endfor %} - from {{ a_relation }} as a - inner join {{ b_relation }} as b - on a.{{ primary_key }} = b.{{ primary_key }} - -) - -{% for column in column_names %} - - select - '{{ column }}' as column_name, - {{ column | lower }}_has_difference as has_difference - - from bool_or - - {% if not loop.last %} - - union all - - {% endif %} - -{% endfor %} - -{% endmacro %} diff --git a/macros/compare_which_query_columns_differ.sql b/macros/compare_which_query_columns_differ.sql new file mode 100644 index 00000000..139b8c17 --- /dev/null +++ b/macros/compare_which_query_columns_differ.sql @@ -0,0 +1,64 @@ +{% macro compare_which_query_columns_differ(a_query, b_query, primary_key_columns=[], columns=[], event_time=None) %} + {{ return(adapter.dispatch('compare_which_query_columns_differ', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time)) }} +{% endmacro %} + +{% macro default__compare_which_query_columns_differ(a_query, b_query, primary_key_columns, columns, event_time) %} + {% set columns = audit_helper._ensure_all_pks_are_in_column_set(primary_key_columns, columns) %} + {% if event_time %} + {% set event_time_props = 
audit_helper._get_comparison_bounds(a_query, b_query, event_time) %}
+    {% endif %}
+
+    {% set joined_cols = columns | join(", ") %}
+
+    with a as (
+        select
+            {{ joined_cols }},
+            {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key
+        from ({{ a_query }}) as a_subq
+        {{ audit_helper.event_time_filter(event_time_props) }}
+    ),
+    b as (
+        select
+            {{ joined_cols }},
+            {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key
+        from ({{ b_query }}) as b_subq
+        {{ audit_helper.event_time_filter(event_time_props) }}
+    ),
+
+    calculated as (
+        select
+            {% for column in columns %}
+                {% set quoted_column = adapter.quote(column) %}
+                {% set compare_statement %}
+                    (
+                        (a.{{ quoted_column }} != b.{{ quoted_column }})
+                        or (a.{{ quoted_column }} is null and b.{{ quoted_column }} is not null)
+                        or (a.{{ quoted_column }} is not null and b.{{ quoted_column }} is null)
+                    )
+                {% endset %}
+
+                {{ dbt.bool_or(compare_statement) }} as {{ column | lower }}_has_difference
+
+                {%- if not loop.last %}, {% endif %}
+            {% endfor %}
+        from a
+        inner join b on a.dbt_audit_surrogate_key = b.dbt_audit_surrogate_key
+    )
+
+    {% for column in columns %}
+
+        select
+            '{{ column }}' as column_name,
+            {{ column | lower }}_has_difference as has_difference
+
+        from calculated
+
+        {% if not loop.last %}
+
+        union all
+
+        {% endif %}
+
+    {% endfor %}
+
+{% endmacro %}
diff --git a/macros/compare_which_relation_columns_differ.sql b/macros/compare_which_relation_columns_differ.sql
new file mode 100644
index 00000000..ac6efe12
--- /dev/null
+++ b/macros/compare_which_relation_columns_differ.sql
@@ -0,0 +1,15 @@
+{% macro compare_which_relation_columns_differ(a_relation, b_relation, primary_key_columns=[], columns=[], event_time=None) %}
+    {%- if not columns -%}
+        {%- set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) -%}
+    {%- endif -%}
+
+    {{
+        audit_helper.compare_which_query_columns_differ(
+            "select * from " ~ a_relation,
+            "select * from " ~ b_relation,
+            primary_key_columns,
+            columns,
+            event_time
+        )
+    }}
+{% endmacro %}
\ No newline at end of file
diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql
new file mode 100644
index 00000000..a0800c3d
--- /dev/null
+++ b/macros/quick_are_queries_identical.sql
@@ -0,0 +1,70 @@
+/*
+As described by the Infinite Lambda team here: https://infinitelambda.com/data-validation-refactoring-snowflake/
+
+Some platforms let you take a hash of the whole table, which can be very fast compared to comparing each row.
+
+If you run this and it returns false, you still have to run the more in-depth queries to find out what specific changes there are,
+but it's a good way to quickly verify identical results if that's what you're expecting.
+*/
+
+{% macro quick_are_queries_identical(query_a, query_b, columns=[], event_time=None) %}
+    {{ return(adapter.dispatch('quick_are_queries_identical', 'audit_helper')(query_a, query_b, columns, event_time)) }}
+{% endmacro %}
+
+{% macro default__quick_are_queries_identical(query_a, query_b, columns, event_time) %}
+    {% if execute %}
+        {# Need to only throw this error when the macro is actually trying to be used, not during the initial parse phase #}
+        {# if/when unit tests get support for `enabled` config, this check can be removed as they won't be supplied for parse anyway #}
+        {% do exceptions.raise_compiler_error("quick_are_queries_identical() is not implemented for adapter '" ~ target.type ~ "'") %}
+    {% endif %}
+{% endmacro %}
+
+{% macro bigquery__quick_are_queries_identical(query_a, query_b, columns, event_time) %}
+    {% set joined_cols = columns | join(", ") %}
+    {% if event_time %}
+        {% set event_time_props = audit_helper._get_comparison_bounds(query_a, query_b, event_time) %}
+    {% endif %}
+
+    with query_a as (
+        select {{ joined_cols }}
+        from ({{ query_a }})
+        {{ audit_helper.event_time_filter(event_time_props) }}
+    ),
+    query_b as (
+        select {{ joined_cols }}
+        from ({{ query_b }})
+        {{ audit_helper.event_time_filter(event_time_props) }}
+    )
+
+    select count(distinct hash_result) = 1 as are_tables_identical
+    from (
+        select bit_xor(farm_fingerprint(to_json_string(query_a))) as hash_result
+        from query_a
+
+        union all
+
+        select bit_xor(farm_fingerprint(to_json_string(query_b))) as hash_result
+        from query_b
+    ) as hashes
+{% endmacro %}
+
+{% macro snowflake__quick_are_queries_identical(query_a, query_b, columns, event_time) %}
+    {% set joined_cols = columns | join(", ") %}
+    {% if event_time %}
+        {% set event_time_props = audit_helper._get_comparison_bounds(query_a, query_b, event_time) %}
+    {% endif %}
+
+    select count(distinct hash_result) = 1 as are_tables_identical
+    from (
+        select hash_agg({{ joined_cols }}) as hash_result
+        from ({{ query_a }}) query_a_subq
+        {{ audit_helper.event_time_filter(event_time_props) }}
+
+        union all
+
+        select hash_agg({{ joined_cols }}) as hash_result
+        from ({{ query_b }}) query_b_subq
+        {{ audit_helper.event_time_filter(event_time_props) }}
+
+    ) as hashes
+{% endmacro %}
\ No newline at end of file
diff --git a/macros/quick_are_relations_identical.sql b/macros/quick_are_relations_identical.sql
new file mode 100644
index 00000000..2b3173da
--- /dev/null
+++ b/macros/quick_are_relations_identical.sql
@@ -0,0 +1,14 @@
+{% macro quick_are_relations_identical(a_relation, b_relation, columns=None, event_time=None) %}
+    {% if not columns %}
+        {% set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) %}
+    {% endif %}
+
+    {{
+        audit_helper.quick_are_queries_identical(
+            "select * from " ~ a_relation,
+            "select * from " ~ b_relation,
+            columns,
+            event_time
+        )
+    }}
+{% endmacro %}
\ No newline at end of file
diff --git a/macros/utils/_classify_audit_row_status.sql b/macros/utils/_classify_audit_row_status.sql
new file mode 100644
index 00000000..73dd631d
--- /dev/null
+++ b/macros/utils/_classify_audit_row_status.sql
@@ -0,0 +1,28 @@
+{% macro _classify_audit_row_status() %}
+    {{ return(adapter.dispatch('_classify_audit_row_status', 'audit_helper')()) }}
+{% endmacro %}
+
+{%- macro default___classify_audit_row_status() -%}
+    case
+        when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk'
+        when dbt_audit_in_a and dbt_audit_in_b then 'identical'
+        when {{
dbt.bool_or('dbt_audit_in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + and {{ dbt.bool_or('dbt_audit_in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + then 'modified' + when dbt_audit_in_a then 'removed' + when dbt_audit_in_b then 'added' + end +{% endmacro %} + + +{%- macro redshift___classify_audit_row_status() -%} + {#- Redshift doesn't support bitwise operations (e.g. bool_or) inside of a window function :( -#} + case + when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' + when dbt_audit_in_a and dbt_audit_in_b then 'identical' + when max(case when dbt_audit_in_a then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + and max(case when dbt_audit_in_b then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + then 'modified' + when dbt_audit_in_a then 'removed' + when dbt_audit_in_b then 'added' + end{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql new file mode 100644 index 00000000..fa81c591 --- /dev/null +++ b/macros/utils/_count_num_rows_in_status.sql @@ -0,0 +1,28 @@ +{% macro _count_num_rows_in_status() %} + {{ return(adapter.dispatch('_count_num_rows_in_status', 'audit_helper')()) }} +{% endmacro %} + +{%- macro default___count_num_rows_in_status() -%} + count(distinct dbt_audit_surrogate_key, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) +{% endmacro %} + +{%- macro bigquery___count_num_rows_in_status() -%} + count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) +{% endmacro %} + +{%- macro postgres___count_num_rows_in_status() -%} + {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }} +{% endmacro %} + +{%- macro databricks___count_num_rows_in_status() -%} + {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }} +{% endmacro %} + +{% macro _count_num_rows_in_status_without_distinct_window_func() %} + {#- Some platforms don't support count(distinct) inside of window functions -#} + {#- You can get the same outcome by dense_rank, assuming no nulls (we've already handled that) #} + {# https://stackoverflow.com/a/22347502 -#} + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key desc, dbt_audit_pk_row_num desc) + - 1 +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_ensure_all_pks_are_in_column_set.sql b/macros/utils/_ensure_all_pks_are_in_column_set.sql new file mode 100644 index 00000000..5c190541 --- /dev/null +++ b/macros/utils/_ensure_all_pks_are_in_column_set.sql @@ -0,0 +1,19 @@ +{# If someone forgot to include the PK columns in their main set of columns, fix it up for them #} +{# Assuming that the PKs are the most important columns, so they go to the front of the list #} + +{% macro _ensure_all_pks_are_in_column_set(primary_key_columns, columns) %} + {% set lower_cols = columns | map('lower') | list %} + {% set missing_pks = [] %} + + {% for pk in primary_key_columns %} + {% if pk | lower not in lower_cols %} + {% do missing_pks.append(pk) %} + {% endif %} + {% endfor %} + + {% if missing_pks | length > 0 %} + {% set columns = missing_pks + columns %} + {% endif %} + + {% do return (columns) %} +{% endmacro %} \ No 
newline at end of file diff --git a/macros/utils/_generate_null_safe_sk.sql b/macros/utils/_generate_null_safe_sk.sql new file mode 100644 index 00000000..26ed6450 --- /dev/null +++ b/macros/utils/_generate_null_safe_sk.sql @@ -0,0 +1,25 @@ +{# Taken from https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/generate_surrogate_key.sql but without the option to treat nulls as empty strings #} + +{%- macro _generate_null_safe_surrogate_key(field_list) -%} + {{ return(adapter.dispatch('_generate_null_safe_surrogate_key', 'audit_helper')(field_list)) }} +{% endmacro %} + +{%- macro default___generate_null_safe_surrogate_key(field_list) -%} + +{%- set fields = [] -%} + +{%- for field in field_list -%} + + {%- do fields.append( + "coalesce(cast(" ~ field ~ " as " ~ dbt.type_string() ~ "), '_dbt_audit_helper_surrogate_key_null_')" + ) -%} + + {%- if not loop.last %} + {%- do fields.append("'-'") -%} + {%- endif -%} + +{%- endfor -%} + +{{ dbt.hash(dbt.concat(fields)) }} + +{%- endmacro -%} \ No newline at end of file diff --git a/macros/utils/_generate_set_results.sql b/macros/utils/_generate_set_results.sql new file mode 100644 index 00000000..890cb9d1 --- /dev/null +++ b/macros/utils/_generate_set_results.sql @@ -0,0 +1,219 @@ +{#- + Set generation is dispatched because it's possible to get performance optimisations + on some platforms, while keeping the post-processing standardised + See https://infinitelambda.com/data-validation-refactoring-snowflake/ for an example and background +-#} + +{% macro _generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props=None) %} + {{ return(adapter.dispatch('_generate_set_results', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time_props)) }} +{% endmacro %} + +{% macro default___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + + a_base as ( + select + {{ joined_cols }}, + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ( {{- a_query -}} ) a_base_subq + {{ audit_helper.event_time_filter(event_time_props) }} + ), + + b_base as ( + select + {{ joined_cols }}, + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ( {{- b_query -}} ) b_base_subq + {{ audit_helper.event_time_filter(event_time_props) }} + ), + + a as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from a_base + ), + + b as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from b_base + ), + + a_intersect_b as ( + + select * from a + {{ dbt.intersect() }} + select * from b + + ), + + a_except_b as ( + + select * from a + {{ dbt.except() }} + select * from b + + ), + + b_except_a as ( + + select * from b + {{ dbt.except() }} + select * from a + + ) +{% endmacro %} + +{% macro bigquery___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + {% set surrogate_key = audit_helper._generate_null_safe_surrogate_key(primary_key_columns) %} + subset_columns_a as ( + select + {{ joined_cols }}, + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num + from ( {{- a_query -}} ) + {{ 
audit_helper.event_time_filter(event_time_props) }} + ), + + subset_columns_b as ( + select + {{ joined_cols }}, + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num + from ( {{- b_query -}} ) + {{ audit_helper.event_time_filter(event_time_props) }} + ), + + a as ( + select + *, + farm_fingerprint(to_json_string(subset_columns_a)) as dbt_audit_row_hash + from subset_columns_a + ), + + b as ( + select + *, + farm_fingerprint(to_json_string(subset_columns_b)) as dbt_audit_row_hash + from subset_columns_b + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} + +{% macro databricks___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set cast_columns = [] %} + {# Map types can't be compared by default (you need to opt in to a legacy behaviour flag) #} + {# so everything needs to be cast as a string first :( #} + {% for col in columns %} + {% do cast_columns.append(dbt.cast(col, api.Column.translate_type("string"))) %} + {% endfor %} + {% set joined_cols = cast_columns | join(", ") %} + {% set surrogate_key = audit_helper._generate_null_safe_surrogate_key(primary_key_columns) %} + a as ( + select + {{ joined_cols }}, + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num, + xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- a_query -}} ) + {{ audit_helper.event_time_filter(event_time_props) }} + ), + + b as ( + select + {{ joined_cols }}, + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num, + xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- b_query -}} ) + {{ audit_helper.event_time_filter(event_time_props) }} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} + +{% macro snowflake___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + a as ( + select + {{ joined_cols }}, + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- a_query -}} ) + {{ audit_helper.event_time_filter(event_time_props) }} + ), + + b as ( + select + {{ joined_cols }}, + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- b_query -}} ) + 
{{ audit_helper.event_time_filter(event_time_props) }} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_get_comparison_bounds.sql b/macros/utils/_get_comparison_bounds.sql new file mode 100644 index 00000000..8ac05731 --- /dev/null +++ b/macros/utils/_get_comparison_bounds.sql @@ -0,0 +1,52 @@ +/* +The idea here is that if the event_time is set, we will only compare records enclosed in both models. +This improves performance and allows us to compare apples to apples, instead of detecting millions/billions +of "deletions" identified due to prod having all data while CI only has a few days' worth. + +In the diagram below, the thatched section is the comparison bounds. You can think of it as + + greatest(model_a.min_value, model_b.min_value) + least(model_a.max_value, model_b.max_value) + + ┌────────────────────────────┐ + a min_value │ a max_value │ + └──► ┌───────┼────────────────────┐ ◄───┘ │ + │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ +model_a │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ model_b + │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ + └───────┼────────────────────┘ │ + ┌──► └────────────────────────────┘ ◄────┐ + b min_value b max_value +*/ +{% macro _get_comparison_bounds(a_query, b_query, event_time) %} + {% set min_max_queries %} + with min_maxes as ( + select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time + from ({{ a_query }}) a_subq + union all + select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time + from ({{ b_query }}) b_subq + ) + select max(min_event_time) as min_event_time, min(max_event_time) as max_event_time + from min_maxes + {% endset %} + + {% set query_response = dbt_utils.get_query_results_as_dict(min_max_queries) %} + + {% set event_time_props = {"event_time": event_time} %} + + {# query_response.keys() are only `min_event_time` and `max_event_time`, but they have indeterminate capitalisation #} + {# hence the dynamic approach for what is otherwise just two well-known values #} + {% for k in query_response.keys() %} + {% do event_time_props.update({k | lower: query_response[k][0]}) %} + {% endfor %} + + {% do return(event_time_props) %} +{% endmacro %} + +{% macro event_time_filter(event_time_props) %} + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_get_intersecting_columns_from_relations.sql b/macros/utils/_get_intersecting_columns_from_relations.sql new file mode 100644 index 00000000..18d2ccb1 --- /dev/null +++ b/macros/utils/_get_intersecting_columns_from_relations.sql @@ -0,0 +1,13 @@ +{% macro _get_intersecting_columns_from_relations(a_relation, b_relation) %} + {%- set a_cols = dbt_utils.get_filtered_columns_in_relation(a_relation) -%} + {%- set b_cols = dbt_utils.get_filtered_columns_in_relation(b_relation) -%} + + {%- set intersection = [] -%} + {%- for col in a_cols -%} + {%- if col in b_cols -%} + {%- do intersection.append(col) -%} + {%- endif -%} + {%- endfor -%} + + {% do return(intersection) %} +{% endmacro %} 
\ No newline at end of file
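A few usage sketches for the macros introduced in this patch follow; every relation name, column name, and file path in them is an illustrative assumption, and they presume audit_helper is installed as a package in a dbt project. First, the hash-based check. Per this patch only Snowflake and BigQuery implement quick_are_queries_identical; on other adapters the default macro raises a compiler error at execution time. A minimal sketch, placed in a hypothetical analysis file:

    -- analyses/quick_check_dim_orders.sql (hypothetical path)
    {% set prod_relation = api.Relation.create(database="analytics", schema="prod", identifier="dim_orders") %}

    {{
        audit_helper.quick_are_relations_identical(
            a_relation=prod_relation,
            b_relation=ref("dim_orders"),
            event_time="created_at"
        )
    }}

Compiling the analysis and running the generated SQL returns a single row with are_tables_identical. Because columns is omitted, the intersection of columns present in both relations is compared, and the optional event_time argument restricts both sides to their overlapping created_at window.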
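When the hash check returns false, or is unavailable on the adapter, the new row-level entry point gives the detail. A sketch under the same assumptions; primary_key_columns feeds the null-safe surrogate key and sample_limit caps the rows returned per status:

    {{
        audit_helper.compare_and_classify_relation_rows(
            a_relation=api.Relation.create(database="analytics", schema="prod", identifier="dim_orders"),
            b_relation=ref("dim_orders"),
            primary_key_columns=["order_id"],
            sample_limit=20
        )
    }}

Each returned row carries dbt_audit_row_status ('identical', 'modified', 'added', 'removed', or 'nonunique_pk'), along with dbt_audit_num_rows_in_status and dbt_audit_sample_number, so one query yields a per-status summary plus a bounded sample of the differing rows.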
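The column-level comparison, which replaces the deleted compare_which_columns_differ and now supports composite keys, can be sketched the same way (names are again assumptions):

    {{
        audit_helper.compare_which_relation_columns_differ(
            a_relation=api.Relation.create(database="analytics", schema="prod", identifier="dim_orders"),
            b_relation=ref("dim_orders"),
            primary_key_columns=["order_id", "order_line_id"]
        )
    }}

The output has one row per compared column with column_name and has_difference, evaluated only across rows whose null-safe surrogate keys exist on both sides (the macro inner-joins a and b on dbt_audit_surrogate_key).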
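To make the event_time behaviour in _get_comparison_bounds concrete: suppose, hypothetically, the production relation spans 2024-01-01 through 2024-03-31 on created_at while the CI build only holds 2024-03-25 through 2024-03-31. The macro takes the greatest of the two minimums and the least of the two maximums, and event_time_filter then renders roughly the following predicate on both sides of the comparison:

    where created_at >= '2024-03-25'
      and created_at <= '2024-03-31'

so rows that exist only in the wider production window are filtered out rather than being reported as removed.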
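The dense_rank trick in _count_num_rows_in_status_without_distinct_window_func can be verified in isolation: a forward rank plus a backward rank over the same ordering, minus one, equals the number of distinct values in the window. A standalone, Postgres-flavoured sketch (single key column only, for brevity):

    with rows_in_status as (
        -- four rows, three distinct surrogate keys
        select * from (values ('a'), ('a'), ('b'), ('c')) as t (dbt_audit_surrogate_key)
    )

    select
        dbt_audit_surrogate_key,
        dense_rank() over (order by dbt_audit_surrogate_key)
            + dense_rank() over (order by dbt_audit_surrogate_key desc)
            - 1 as distinct_keys_in_window  -- returns 3 on every row
    from rows_in_status

which matches what count(distinct ...) over (...) would return on the platforms that allow it.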
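Finally, a small sketch of _ensure_all_pks_are_in_column_set, since every entry point above relies on it: primary key columns missing from the column list are prepended, and matching is case-insensitive, so an already-present key is not duplicated. Values below are illustrative:

    {% set cols = audit_helper._ensure_all_pks_are_in_column_set(
        primary_key_columns=["order_id"],
        columns=["status", "amount"]
    ) %}
    {# cols is now ["order_id", "status", "amount"];
       columns=["ORDER_ID", "status", "amount"] would be returned unchanged #}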