diff --git a/macros/get_dependencies.sql b/macros/get_dependencies.sql
deleted file mode 100644
index b516f734..00000000
--- a/macros/get_dependencies.sql
+++ /dev/null
@@ -1,93 +0,0 @@
-{% macro get_dependencies() %}
-
-    {%- set sql -%}
-    -- one record for each node in the DAG (models and sources) and its direct parent
-    with direct_relationships as (
-
-    {%- for model in graph.nodes.values() | selectattr("resource_type", "equalto", "model") -%}
-    {%- set outer_loop = loop -%}
-
-        {%- if model.depends_on.nodes|length == 0 -%}
-
-            select
-                '{{model.name}}' as model,
-                NULL as direct_parent,
-                NULL as direct_parent_type {# if this field still useful? if not could simply by looping through depends_on.nodes instead of refs & sources seperately #}
-
-        {%- else -%}
-
-            {%- for model_parent in model.refs -%}
-
-                select
-                    '{{model.name}}' as model,
-                    '{{model_parent.0}}' as direct_parent,
-                    'model' as direct_parent_type
-                {% if not loop.last %}union all{% endif %}
-
-            {% endfor -%}
-
-            {%- for source_parent in model.sources -%}
-
-                {% if loop.first and model.refs|length > 0 %}union all{% endif %}
-                select
-                    '{{model.name}}' as model,
-                    '{{source_parent.0}}.{{source_parent.1}}' as direct_parent,
-                    'source' as direct_parent_type
-                {% if not loop.last %}union all{% endif %}
-
-            {% endfor -%}
-
-        {%- endif -%}
-
-        {% if not outer_loop.last %}union all{% endif %}
-
-    {% endfor -%}
-
-    {%- for source in graph.sources.values() -%}
-
-        {% if loop.first and graph.nodes|length > 0 %}union all{% endif %}
-        select
-            '{{source.source_name}}.{{source.name}}' as model,
-            NULL as direct_parent,
-            NULL as direct_parent_type
-        {% if not loop.last %}union all{% endif %}
-
-    {% endfor -%}
-
-    ),
-
-    -- recursive CTE
-    -- one record for every node and each of its downstream children (including itself)
-    all_relationships as (
-
-        -- anchor
-        select distinct
-            model as parent,
-            model as child,
-            0 as distance,
-            array_construct(child) as path {# snowflake-specific, but helpful for troubleshooting right now #}
-        from direct_relationships
-        -- where direct_parent is null {# optional lever to change filtering of anchor clause to only include root nodes #}
-
-        union all
-
-        -- recursive clause
-        select
-            all_relationships.parent as parent,
-            direct_relationships.model as child,
-            all_relationships.distance+1 as distance,
-            array_append(all_relationships.path, direct_relationships.model) as path
-        from direct_relationships
-        inner join all_relationships
-            on all_relationships.child = direct_relationships.direct_parent
-
-    )
-
-    select * from all_relationships
-    order by parent, distance
-
-    {%- endset -%}
-
-    {% do log(sql, info=true) %}
-
-{% endmacro %}
\ No newline at end of file
diff --git a/models/audit/dag/dag.yml b/models/audit/dag/dag.yml
new file mode 100644
index 00000000..71cf3560
--- /dev/null
+++ b/models/audit/dag/dag.yml
@@ -0,0 +1,9 @@
+version: 2
+
+models:
+  - name: stg_dag_relationships
+    description: "This table shows one record for every node and each of its downstream children (including itself)."
+  - name: fct_source_fanout
+    description: "This table shows each parent/child relationship where a source is the direct parent of multiple nodes in the DAG."
+    tests:
+      - is_empty
\ No newline at end of file
diff --git a/models/audit/dag/fct_source_fanout.sql b/models/audit/dag/fct_source_fanout.sql
new file mode 100644
index 00000000..56302249
--- /dev/null
+++ b/models/audit/dag/fct_source_fanout.sql
@@ -0,0 +1,40 @@
+-- this model finds cases where a source is used in multiple direct downstream models
+with direct_source_relationships as (
+    -- distance = 1 keeps only direct parent/child pairs
+    select
+        parent,
+        parent_type,
+        child,
+        distance,
+        path
+    from {{ ref('stg_dag_relationships') }}
+    where distance = 1
+        and parent_type = 'source'
+),
+
+-- sources that are the direct parent of more than one node
+source_fanout as (
+    select
+        parent,
+        count(*) as n_direct_children
+    from direct_source_relationships
+    group by parent
+    having count(*) > 1
+),
+
+-- one record per fanned-out source and each of its direct children
+final as (
+    select
+        direct_source_relationships.parent,
+        direct_source_relationships.parent_type,
+        direct_source_relationships.child,
+        direct_source_relationships.distance,
+        direct_source_relationships.path
+    from direct_source_relationships
+    inner join source_fanout
+        on direct_source_relationships.parent = source_fanout.parent
+)
+
+-- ordering lives in the outermost query: an order by inside a CTE does not determine the order of the final result
+select * from final
+order by parent, child
\ No newline at end of file
diff --git a/models/audit/dag/stg_dag_relationships.sql b/models/audit/dag/stg_dag_relationships.sql
new file mode 100644
index 00000000..c4bb249f
--- /dev/null
+++ b/models/audit/dag/stg_dag_relationships.sql
@@ -0,0 +1,98 @@
+-- TO DO: only include ENABLED nodes
+-- TO DO: exclude models that are part of the audit package
+    -- can use package_name attribute in final version
+-- TO DO: fix whitespace
+
+-- one record for each node in the DAG (models and sources) and its direct parent
+with direct_relationships as (
+
+{#- graph.nodes is not limited to models, so filter to models once and reuse the list for the union guard further down -#}
+{%- set model_nodes = graph.nodes.values() | selectattr("resource_type", "equalto", "model") | list -%}
+{%- for model in model_nodes -%}
+{%- set outer_loop = loop -%}
+
+    {%- if model.depends_on.nodes|length == 0 -%}
+
+        select
+            '{{model.name}}' as node,
+            '{{model.unique_id}}' as node_id,
+            'model' as node_type,
+            NULL as direct_parent_id
+
+    {%- else -%}
+
+        {%- for model_parent in model.depends_on.nodes -%}
+
+            select
+                '{{model.name}}' as node,
+                '{{model.unique_id}}' as node_id,
+                'model' as node_type,
+                '{{model_parent}}' as direct_parent_id
+            {% if not loop.last %}union all{% endif %}
+
+        {% endfor -%}
+
+    {%- endif %}
+
+    {% if not outer_loop.last %}union all{% endif %}
+
+{% endfor -%}
+
+{%- for source in graph.sources.values() -%}
+
+    {#- bridge to the model selects only when the model loop above emitted at least one select -#}
+    {% if loop.first and model_nodes|length > 0 %}union all{% endif %}
+    select
+        '{{source.source_name}}.{{source.name}}' as node,
+        '{{source.unique_id}}' as node_id,
+        'source' as node_type,
+        NULL as direct_parent_id
+    {% if not loop.last %}union all{% endif %}
+
+{% endfor -%}
+
+),
+
+-- recursive CTE
+-- one record for every node and each of its downstream children (including itself)
+all_relationships as (
+    -- anchor
+    select distinct
+        node as parent,
+        node_id as parent_id,
+        node_type as parent_type,
+        node as child,
+        node_id as child_id,
+        0 as distance,
+        array_construct(node) as path {# snowflake-specific, but helpful for troubleshooting right now #}
+    from direct_relationships
+    -- where direct_parent_id is null {# optional lever to change filtering of anchor clause to only include root nodes #}
+
+    union all
+
+    -- recursive clause
+    select
+        all_relationships.parent as parent,
+        all_relationships.parent_id as parent_id,
+        all_relationships.parent_type as parent_type,
+        direct_relationships.node as child,
+        direct_relationships.node_id as child_id,
+        all_relationships.distance+1 as distance,
+        array_append(all_relationships.path, direct_relationships.node) as path
+    from direct_relationships
+    inner join all_relationships
+        on all_relationships.child_id = direct_relationships.direct_parent_id
+),
+
+final as (
+    select
+        parent,
+        parent_type,
+        child,
+        distance,
+        path
+    from all_relationships
+)
+
+select * from final
+order by parent, distance
\ No newline at end of file
diff --git a/tests/generic/test_is_empty.sql b/tests/generic/test_is_empty.sql
new file mode 100644
index 00000000..ab08745d
--- /dev/null
+++ b/tests/generic/test_is_empty.sql
@@ -0,0 +1,12 @@
+{% test is_empty(model) %}
+
+  {#- configured as a warning; fail_calc reads the single n_records value returned by the query below -#}
+  {{ config (
+    severity = 'warn',
+    fail_calc = "n_records"
+  ) }}
+
+  select count(*) as n_records
+  from {{ model }}
+
+{% endtest %}
\ No newline at end of file