Dbt setup #4011

Draft: wants to merge 33 commits into main from dbt_setup

Commits (33)
bf40ffb  Add basic dbt setup (zschira, Jan 9, 2025)
9aac625  Update to dagster 1.9.7 & grpcio 1.67.1 (zaneselvans, Jan 10, 2025)
415a113  Setup multiple dbt profiles (zschira, Jan 10, 2025)
ba32bd8  Merge remote-tracking branch 'refs/remotes/origin/dbt_setup' into dbt… (zaneselvans, Jan 10, 2025)
dc51c8f  Add all vcerare dbt tests (zschira, Jan 10, 2025)
590b02a  Add more example dbt tests (zschira, Jan 13, 2025)
63e663a  Merge branch 'dbt_setup' of github.com:catalyst-cooperative/pudl into… (zaneselvans, Jan 13, 2025)
d428b5d  Merge changes from main and revert to python 3.12 (zaneselvans, Jan 13, 2025)
784cf96  Bump gdal to v3.10.1 bugfix release. (zaneselvans, Jan 14, 2025)
48a16e1  Merge branch 'main' into dbt_setup (zaneselvans, Jan 14, 2025)
6f45ba5  Merge branch 'main' into dbt_setup (zaneselvans, Jan 15, 2025)
ac41a41  Update to dagster 1.9.9 (zaneselvans, Jan 19, 2025)
0ce1648  Merge branch 'main' into dbt_setup (zaneselvans, Jan 20, 2025)
c19cfd8  Merge branch 'main' into dbt_setup (zaneselvans, Jan 20, 2025)
6335e94  Reorganize dbt into multiple schema.yml files (zschira, Jan 21, 2025)
2585eca  Merge branch 'dbt_setup' of github.com:catalyst-cooperative/pudl into… (zschira, Jan 21, 2025)
e24af8c  Move dbt project to top level of repo (zschira, Jan 22, 2025)
1ed85b3  Only set parquet path in dbt project once (zschira, Jan 30, 2025)
e92f5be  Standardize dbt maning scheme (zschira, Jan 30, 2025)
5de9ebe  Add more detail to README (zschira, Jan 30, 2025)
da9ae93  Add script to generate dbt scaffolding and row count tests (zschira, Feb 5, 2025)
7461786  Add documentation for dbt helper script (zschira, Feb 5, 2025)
0d120c6  Add out_ferc1__yearly_steam_plants_fuel_by_plant_sched402 to yearly r… (zschira, Feb 5, 2025)
3666360  Add weighted quantile test (broken) (zschira, Feb 5, 2025)
a3579dc  Change row count test name (zschira, Feb 5, 2025)
c98219c  Update dbt initialization process (zschira, Feb 5, 2025)
f9b3fa7  Make dbt helper script work properly with non-yearly partitioned tables (zschira, Feb 5, 2025)
79e2153  Update dbt readme (zschira, Feb 5, 2025)
012ba4a  Regenerate ferc dbt schemas (zschira, Feb 6, 2025)
ff766b3  Merge branch 'main' into dbt_setup (zaneselvans, Feb 10, 2025)
94267a5  Improve dbt_helper command line usability (zschira, Feb 10, 2025)
8f660fd  Merge branch 'dbt_setup' of github.com:catalyst-cooperative/pudl into… (zschira, Feb 10, 2025)
70e6895  Flesh out test migration command (zschira, Feb 13, 2025)
dbt/README.md (121 additions, 0 deletions)
@@ -0,0 +1,121 @@
## Overview
This directory contains an initial setup of a `dbt` project intended for writing
[data tests](https://docs.getdbt.com/docs/build/data-tests) against PUDL data. The
project is set up with profiles that let you run tests on `nightly`
builds, `etl-full` outputs, or `etl-fast` outputs. The `nightly` profile operates
directly on parquet files in our S3 bucket, while the `etl-full` and `etl-fast`
profiles look for parquet files based on your `PUDL_OUTPUT` environment
variable. See the `Usage` section below for examples using these profiles.


## Development
To set up the `dbt` project, install the PUDL `conda` environment as usual,
then run the following commands from this directory.

```
dbt deps   # install dbt package dependencies (dbt-expectations, dbt-utils)
dbt seed   # load seed tables, e.g. the expected row counts in seeds/row_counts.csv
```

### Adding new tables
#### Helper script
To add a new table to the project, you must add it as a
[dbt source](https://docs.getdbt.com/docs/build/sources). We've included a
helper script, `devtools/dbt_helper.py`, to automate the process.

#### Usage
Basic usage of the helper script looks like:

```
python devtools/dbt_helper.py add-tables --tables {table_name(s)}
```

This will add a file called `dbt/models/{data_source}/{table_name}/schema.yml` which
tells `dbt` about the table and its schema. It will also apply the test
`check_row_counts_per_partition`, which by default checks row counts per year.
To support this, it adds expected row counts to the file `seeds/row_counts.csv`;
these get compared to the observed row counts in the table when the tests run.

If a table is not partitioned by year, you can add the option
`--partition-column {column_name}` to the command. This will count rows per
unique value in that column, which is common for monthly and hourly tables that
are often partitioned by `report_date` and `datetime_utc` respectively, as in
the sketch below.
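
For example, a hypothetical invocation for one of the hourly tables in this PR
(a sketch based on the options described above, not a verified command line):

```
# Count rows per unique datetime_utc value instead of per year
python devtools/dbt_helper.py add-tables \
    --tables out_vcerare__hourly_available_capacity_factor \
    --partition-column datetime_utc
```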
> **Review comment (Member) on lines +39 to +42:** Even most annual tables (outside of FERC?) don't really have a dedicated column with just the year or annual frequency report_date, and the annual row-counts that we've been using have been more linked to the frequency with which the data is released / the chunks it's bundled into, rather than the frequency of the data reported in the tables. Having finer partitions will give us more information about where things are changing if they're changing, but with monthly it'll be hundreds of partitions and hourly it'll be tens of thousands. Is that really maintainable? Would it be easy to allow a groupby that can count rows in a time period even if it's not an explicitly unique column value (years or months within a date column?)


To see all options for this command, run:

```
python devtools/dbt_helper.py add-tables --help
```

### Adding tests
#### Default case
Once a table is included as a `source`, you can add tests for it. You can
either add a generic test directly in `dbt/models/{data_source}/{table_name}/schema.yml`,
or create a SQL file in the `dbt/tests/` directory which references the `source`.
When adding SQL tests like this, construct a query that `SELECT`s rows
indicating a failure: if the query returns any rows, `dbt` will fail
that test.
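
For example, a minimal SQL test might look like the sketch below. The file name
and threshold are hypothetical; the source name, table, and columns come from
the schemas added in this PR:

```
-- dbt/tests/eia923/no_negative_fuel_consumed.sql (hypothetical)
-- The test passes only if this query returns zero rows.
SELECT plant_id_eia, report_date, fuel_consumed_mmbtu
FROM {{ source('pudl', 'out_eia923__boiler_fuel') }}
WHERE fuel_consumed_mmbtu < 0
```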

The project includes [dbt-expectations](https://github.com/calogica/dbt-expectations)
and [dbt-utils](https://github.com/dbt-labs/dbt-utils) as dependencies. These
packages provide useful out-of-the-box tests that can be applied to any table
in the project. There are several examples in
`dbt/models/vcerare/out_vcerare__hourly_available_capacity_factor/schema.yml` which
use `dbt-expectations`.
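
As a sketch, applying a `dbt-expectations` test to a source column in a
`schema.yml` looks roughly like this (the bounds are illustrative, not vetted
validation values):

```
version: 2
sources:
  - name: pudl
    tables:
      - name: out_eia923__boiler_fuel
        columns:
          - name: sulfur_content_pct
            data_tests:
              - dbt_expectations.expect_column_values_to_be_between:
                  min_value: 0
                  max_value: 100
```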

#### Modifying a table before testing
In some cases you may want to modify a table before applying tests. There are two
ways to accomplish this. First, you can add the table as a `source` as described
above, then create a SQL file in the `tests/` directory like
`tests/{data_source}/{table_name}.sql`. There you can construct a SQL query that
modifies the table and tests the intermediate result. `dbt`
expects a SQL test to be a query that returns 0 rows for a successful test. See
the `dbt` [source function](https://docs.getdbt.com/reference/dbt-jinja-functions/source)
for guidance on how to reference a `source` from a SQL file.
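
A minimal sketch of this pattern (the aggregation and threshold are hypothetical):

```
-- Aggregate the source table first, then test the intermediate result.
-- Any returned rows count as failures.
WITH monthly_totals AS (
    SELECT report_date, SUM(fuel_consumed_mmbtu) AS total_mmbtu
    FROM {{ source('pudl', 'out_eia923__boiler_fuel') }}
    GROUP BY report_date
)
SELECT report_date, total_mmbtu
FROM monthly_totals
WHERE total_mmbtu < 0
```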

The second method is to create a [model](https://docs.getdbt.com/docs/build/models)
which produces the intermediate table you want to test. To use this
approach, add a SQL file named `validate_{table_name}.sql` to the directory
`dbt/models/{data_source}/{table_name}/` and define the model that produces
the intermediate table there. Finally, add the model to the `schema.yml` file
and define tests exactly as you would for a `source` table. See
`models/ferc1/out_ferc1__yearly_steam_plants_fuel_by_plant_sched402` for an example of this
pattern.
> **Review comment (Member) on lines +76 to +83:** When new models are defined, does dbt persist them somewhere e.g. using duckdb?


### Usage
There are a few ways to execute tests. To run all tests with a single command:

```
dbt build
```

This command will first run any models, then execute all tests.

For more fine-grained control, first run:
> **Review comment (Member):** "run" here meaning not "run the tests" but "run dbt in the traditional sense in which we're not really using it -- to build database tables"? To the folks unfamiliar with dbt (most of us right now) I think "run the models" will be confusing.


```
dbt run
```

This will run all models, preparing any SQL views that will be referenced in
tests. Once you've done this, you can run all tests with:

```
dbt test
```

To run all tests for a single source table:

```
dbt test --select source:pudl.{table_name}
```

To run all tests for a model table:

```
dbt test --select {model_name}
```

#### Selecting target profile
To select between `nightly`, `etl-full`, and `etl-fast` profiles, append
`--target {target_name}` to any of the previous commands.
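
For example, to run a single source table's tests against the fast ETL outputs
(this combines the flags shown above; it assumes `PUDL_OUTPUT` is set):

```
dbt test --select source:pudl.{table_name} --target etl-fast
```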
dbt/dbt_project.yml (23 additions, 0 deletions)
@@ -0,0 +1,23 @@
# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "pudl_dbt"
version: "1.0.0"

# This setting configures which "profile" dbt uses for this project.
profile: "pudl_dbt"

# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
macro-paths: ["macros"]
seed-paths: ["seeds"]
test-paths: ["tests"]

sources:
  pudl_dbt:
    +external_location: |
      {%- if target.name == "nightly" -%} 'https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/nightly/{name}.parquet'
      {%- else -%} '{{ env_var('PUDL_OUTPUT') }}/parquet/{name}.parquet'
      {%- endif -%}
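
As an illustration (not from the PR itself), the `{name}` placeholder resolves to
the source table's name, so a table like `out_eia923__boiler_fuel` would be read
from:

```
# target = nightly:
https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/nightly/out_eia923__boiler_fuel.parquet
# any other target:
$PUDL_OUTPUT/parquet/out_eia923__boiler_fuel.parquet
```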
dbt/macros/check_row_counts_per_partition.sql (18 additions, 0 deletions)
@@ -0,0 +1,18 @@
{% test check_row_counts_per_partition(model, table_name, partition_column) %}
> **Review comment (Member Author):** Compares row counts in the table to expected counts which are stored in seeds/row_counts.csv


WITH
expected AS (
SELECT table_name, partition, row_count as expected_count
FROM {{ ref("row_counts") }} WHERE table_name = '{{ table_name }}'
),
observed AS (
SELECT {{ partition_column }} as partition, COUNT(*) as observed_count
FROM {{ model }}
GROUP BY {{ partition_column }}
)
SELECT expected.partition, expected.expected_count, observed.observed_count
FROM expected
INNER JOIN observed ON expected.partition=observed.partition
WHERE expected.expected_count != observed.observed_count

{% endtest %}
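
The macro assumes the `row_counts` seed has columns `table_name`, `partition`, and
`row_count`. A hypothetical `seeds/row_counts.csv` (the counts are made up for
illustration) would look like:

```
table_name,partition,row_count
out_ferc1__yearly_steam_plants_sched402,2020,1042
out_ferc1__yearly_steam_plants_sched402,2021,1057
```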
dbt/macros/expect_column_weighted_quantile_values_to_be_between.sql (21 additions, 0 deletions)
@@ -0,0 +1,21 @@
{% test expect_column_weighted_quantile_values_to_be_between(model, column_name,
quantile,
weight_column,
min_value=None,
max_value=None,
group_by=None,
row_condition=None,
strictly=False
) %}
{% set expression %}
{{ weighted_quantile(column_name, weight_column, quantile) }}
{% endset %}
{{ dbt_expectations.expression_between(model,
expression=expression,
min_value=min_value,
max_value=max_value,
group_by_columns=group_by,
row_condition=row_condition,
strictly=strictly
) }}
{% endtest %}
dbt/macros/weighted_quantile.sql (26 additions, 0 deletions)
@@ -0,0 +1,26 @@
{% macro weighted_quantile(model, column_name, weight_col, quantile) %}
> **Review comment (Member Author):** @marianneke this is the macro that's currently broken. It gets called in expect_column_weighted_quantile_values_to_be_between.sql. The idea is that this will compute a weighted quantile on a table, then that test will check the quantile vs bounds. I think the issue is with the syntax using WITH here. The macro dbt/macros/check_row_counts_per_partition.sql has an example where I fixed this to use the proper duckdb syntax.
>
> **Reply (Member):** Can you clarify what the syntax issue was and how you solved it in the row count macro?
>
> **Reply (Member Author):** Hmm, I'm actually not sure that it was the issue after all. I was getting a similar error message with the row count macro and I did have a syntax error there, but this looks ok to me. It might also be something to do with the expect_column_weighted_quantile_values_to_be_between macro which is calling this under the hood.


WITH CumulativeWeights AS (
SELECT
{{ column_name }},
{{ weight_col }},
SUM({{ weight_col }}) OVER (ORDER BY {{ column_name }}) AS cumulative_weight,
SUM({{ weight_col }}) OVER () AS total_weight
FROM bf
),
QuantileData AS (
SELECT
{{ column_name }},
{{ weight_col }},
cumulative_weight,
total_weight,
cumulative_weight / total_weight AS cumulative_probability
FROM CumulativeWeights
)
SELECT {{ column_name }}
FROM QuantileData
WHERE cumulative_probability >= {{ quantile }} AND {{ column_name }} < {{ lower_bound }}
ORDER BY {{ column_name }}
LIMIT 1

{% endmacro %}
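
For reference, a possible repair is sketched below. This is an assumption, not
the PR's eventual fix: it selects from the passed-in `model` instead of the
stray `bf` alias, drops the undefined `lower_bound` reference, and wraps the
query in a parenthesized scalar subquery so it can be embedded in a larger
expression (DuckDB allows a WITH clause inside a subquery):

```
{% macro weighted_quantile(model, column_name, weight_col, quantile) %}
(
    -- Cumulative share of total weight, ordered by the value column.
    WITH cumulative_weights AS (
        SELECT
            {{ column_name }},
            SUM({{ weight_col }}) OVER (ORDER BY {{ column_name }})
                / SUM({{ weight_col }}) OVER () AS cumulative_probability
        FROM {{ model }}
    )
    -- First value whose cumulative weight fraction reaches the target quantile.
    SELECT {{ column_name }}
    FROM cumulative_weights
    WHERE cumulative_probability >= {{ quantile }}
    ORDER BY {{ column_name }}
    LIMIT 1
)
{% endmacro %}
```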
dbt/models/eia923/out_eia923__boiler_fuel/schema.yml (28 additions, 0 deletions)
@@ -0,0 +1,28 @@
version: 2
> **Review comment (Member Author):** @zaneselvans do you know why out_eia923__boiler_fuel isn't using the monthly naming convention?
>
> **Reply (Member):** There are 3 out_eia923__*_boiler_fuel tables -- this one hasn't been aggregated by month, so there are probably some cases in which there are multiple records for a single month that get combined in the __monthly version. But I'd need to compare the monthly and unaggregated tables to be sure. I know there's at least one case in the generation tables where the monthly and unaggregated tables are literally identical.

sources:
  - name: pudl
    tables:
      - name: out_eia923__boiler_fuel
        data_tests:
          - check_row_counts_per_partition:
              table_name: out_eia923__boiler_fuel
              partition_column: report_date
        columns:
          - name: report_date
          - name: plant_id_eia
          - name: plant_id_pudl
          - name: plant_name_eia
          - name: utility_id_eia
          - name: utility_id_pudl
          - name: utility_name_eia
          - name: boiler_id
          - name: unit_id_pudl
          - name: energy_source_code
          - name: prime_mover_code
          - name: fuel_type_code_pudl
          - name: fuel_consumed_units
          - name: fuel_mmbtu_per_unit
          - name: fuel_consumed_mmbtu
          - name: sulfur_content_pct
          - name: ash_content_pct
          - name: data_maturity
dbt/models/ferc1/out_ferc1__yearly_steam_plants_fuel_by_plant_sched402/schema.yml (67 additions, 0 deletions)
@@ -0,0 +1,67 @@
version: 2
sources:
  - name: pudl
    tables:
      - name: out_ferc1__yearly_steam_plants_fuel_by_plant_sched402
        data_tests:
          - check_row_counts_per_partition:
              table_name: out_ferc1__yearly_steam_plants_fuel_by_plant_sched402
              partition_column: report_year
        columns:
          - name: report_year
          - name: utility_id_ferc1
          - name: utility_id_pudl
          - name: utility_name_ferc1
          - name: plant_id_pudl
          - name: plant_name_ferc1
          - name: coal_fraction_cost
          - name: coal_fraction_mmbtu
          - name: fuel_cost
          - name: fuel_mmbtu
          - name: gas_fraction_cost
          - name: gas_fraction_mmbtu
          - name: nuclear_fraction_cost
          - name: nuclear_fraction_mmbtu
          - name: oil_fraction_cost
          - name: oil_fraction_mmbtu
          - name: primary_fuel_by_cost
          - name: primary_fuel_by_mmbtu
          - name: waste_fraction_cost
          - name: waste_fraction_mmbtu

models:
  - name: validate_ferc1__yearly_steam_plants_fuel_by_plant_sched402
    columns:
      - name: gas_cost_per_mmbtu
        data_tests:
          - dbt_expectations.expect_column_quantile_values_to_be_between:
> **Review comment (Member):** My intuition is that these should probably really be weighted quantiles, or that they are relatively low-value checks because the FERC fuel reporting is such a mess.

              quantile: 0.05
              min_value: 1.5
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.90
              max_value: 15.0
          - dbt_expectations.expect_column_median_to_be_between:
              min_value: 2.0
              max_value: 10.0
      - name: oil_cost_per_mmbtu
        data_tests:
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.10
              min_value: 3.5
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.90
              max_value: 25.0
          - dbt_expectations.expect_column_median_to_be_between:
              min_value: 6.5
              max_value: 17.0
      - name: coal_cost_per_mmbtu
        data_tests:
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.10
              min_value: 0.75
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.90
              max_value: 4.0
          - dbt_expectations.expect_column_median_to_be_between:
              min_value: 1.0
              max_value: 2.5
dbt/models/ferc1/out_ferc1__yearly_steam_plants_fuel_by_plant_sched402/validate_ferc1__yearly_steam_plants_fuel_by_plant_sched402.sql (6 additions, 0 deletions)
@@ -0,0 +1,6 @@

select
{% for fuel_type in ["gas", "oil", "coal"] %}
{{ fuel_type }}_fraction_cost * fuel_cost / ({{ fuel_type }}_fraction_mmbtu * fuel_mmbtu) as {{ fuel_type }}_cost_per_mmbtu,
{% endfor %}
from {{ source('pudl', 'out_ferc1__yearly_steam_plants_fuel_by_plant_sched402') }}
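
For clarity (this expansion is deterministic, not additional code in the PR), the
Jinja loop above renders one derived column per fuel type:

```
select
    gas_fraction_cost * fuel_cost / (gas_fraction_mmbtu * fuel_mmbtu) as gas_cost_per_mmbtu,
    oil_fraction_cost * fuel_cost / (oil_fraction_mmbtu * fuel_mmbtu) as oil_cost_per_mmbtu,
    coal_fraction_cost * fuel_cost / (coal_fraction_mmbtu * fuel_mmbtu) as coal_cost_per_mmbtu,
from {{ source('pudl', 'out_ferc1__yearly_steam_plants_fuel_by_plant_sched402') }}
```

DuckDB tolerates the trailing comma left by the loop's final iteration.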
dbt/models/ferc1/out_ferc1__yearly_steam_plants_sched402/schema.yml (76 additions, 0 deletions)
@@ -0,0 +1,76 @@
version: 2
sources:
  - name: pudl
    tables:
      - name: out_ferc1__yearly_steam_plants_sched402
        data_tests:
          - check_row_counts_per_partition:
              table_name: out_ferc1__yearly_steam_plants_sched402
              partition_column: report_year
        columns:
          - name: report_year
          - name: utility_id_ferc1
          - name: utility_id_pudl
          - name: utility_name_ferc1
          - name: plant_id_pudl
          - name: plant_id_ferc1
          - name: plant_name_ferc1
          - name: asset_retirement_cost
          - name: avg_num_employees
          - name: capacity_factor
          - name: capacity_mw
          - name: capex_annual_addition
          - name: capex_annual_addition_rolling
          - name: capex_annual_per_kw
          - name: capex_annual_per_mw
          - name: capex_annual_per_mw_rolling
          - name: capex_annual_per_mwh
          - name: capex_annual_per_mwh_rolling
          - name: capex_equipment
          - name: capex_land
          - name: capex_per_mw
          - name: capex_structures
          - name: capex_total
          - name: capex_wo_retirement_total
          - name: construction_type
          - name: construction_year
          - name: installation_year
          - name: net_generation_mwh
          - name: not_water_limited_capacity_mw
          - name: opex_allowances
          - name: opex_boiler
          - name: opex_coolants
          - name: opex_electric
          - name: opex_engineering
          - name: opex_fuel
          - name: opex_fuel_per_mwh
          - name: opex_misc_power
          - name: opex_misc_steam
          - name: opex_nonfuel_per_mwh
          - name: opex_operations
          - name: opex_per_mwh
          - name: opex_plants
          - name: opex_production_total
          - name: opex_rents
          - name: opex_steam
          - name: opex_steam_other
          - name: opex_structures
          - name: opex_total_nonfuel
          - name: opex_transfer
          - name: peak_demand_mw
          - name: plant_capability_mw
          - name: plant_hours_connected_while_generating
          - name: plant_type
          - name: record_id
          - name: water_limited_capacity_mw

models:
  - name: validate_ferc1__yearly_steam_plants_sched402
    columns:
      - name: gas_cost_per_mmbtu
        data_tests:
          - expect_column_weighted_quantile_values_to_be_between:
              quantile: 0.5
              min_value: 200000
              max_value: 600000
              weight_column: capacity_mw