Dbt setup #4011

Draft: wants to merge 33 commits into base: main. Changes shown are from 20 of the 33 commits.

Commits
- bf40ffb Add basic dbt setup (zschira, Jan 9, 2025)
- 9aac625 Update to dagster 1.9.7 & grpcio 1.67.1 (zaneselvans, Jan 10, 2025)
- 415a113 Setup multiple dbt profiles (zschira, Jan 10, 2025)
- ba32bd8 Merge remote-tracking branch 'refs/remotes/origin/dbt_setup' into dbt… (zaneselvans, Jan 10, 2025)
- dc51c8f Add all vcerare dbt tests (zschira, Jan 10, 2025)
- 590b02a Add more example dbt tests (zschira, Jan 13, 2025)
- 63e663a Merge branch 'dbt_setup' of github.com:catalyst-cooperative/pudl into… (zaneselvans, Jan 13, 2025)
- d428b5d Merge changes from main and revert to python 3.12 (zaneselvans, Jan 13, 2025)
- 784cf96 Bump gdal to v3.10.1 bugfix release. (zaneselvans, Jan 14, 2025)
- 48a16e1 Merge branch 'main' into dbt_setup (zaneselvans, Jan 14, 2025)
- 6f45ba5 Merge branch 'main' into dbt_setup (zaneselvans, Jan 15, 2025)
- ac41a41 Update to dagster 1.9.9 (zaneselvans, Jan 19, 2025)
- 0ce1648 Merge branch 'main' into dbt_setup (zaneselvans, Jan 20, 2025)
- c19cfd8 Merge branch 'main' into dbt_setup (zaneselvans, Jan 20, 2025)
- 6335e94 Reorganize dbt into multiple schema.yml files (zschira, Jan 21, 2025)
- 2585eca Merge branch 'dbt_setup' of github.com:catalyst-cooperative/pudl into… (zschira, Jan 21, 2025)
- e24af8c Move dbt project to top level of repo (zschira, Jan 22, 2025)
- 1ed85b3 Only set parquet path in dbt project once (zschira, Jan 30, 2025)
- e92f5be Standardize dbt maning scheme (zschira, Jan 30, 2025)
- 5de9ebe Add more detail to README (zschira, Jan 30, 2025)
- da9ae93 Add script to generate dbt scaffolding and row count tests (zschira, Feb 5, 2025)
- 7461786 Add documentation for dbt helper script (zschira, Feb 5, 2025)
- 0d120c6 Add out_ferc1__yearly_steam_plants_fuel_by_plant_sched402 to yearly r… (zschira, Feb 5, 2025)
- 3666360 Add weighted quantile test (broken) (zschira, Feb 5, 2025)
- a3579dc Change row count test name (zschira, Feb 5, 2025)
- c98219c Update dbt initialization process (zschira, Feb 5, 2025)
- f9b3fa7 Make dbt helper script work properly with non-yearly partitioned tables (zschira, Feb 5, 2025)
- 79e2153 Update dbt readme (zschira, Feb 5, 2025)
- 012ba4a Regenerate ferc dbt schemas (zschira, Feb 6, 2025)
- ff766b3 Merge branch 'main' into dbt_setup (zaneselvans, Feb 10, 2025)
- 94267a5 Improve dbt_helper command line usability (zschira, Feb 10, 2025)
- 8f660fd Merge branch 'dbt_setup' of github.com:catalyst-cooperative/pudl into… (zschira, Feb 10, 2025)
- 70e6895 Flesh out test migration command (zschira, Feb 13, 2025)
6 changes: 6 additions & 0 deletions .gitignore
@@ -49,3 +49,9 @@ devtools/datasette/fly/Dockerfile
devtools/datasette/fly/inspect-data.json
devtools/datasette/fly/metadata.yml
devtools/datasette/fly/all_dbs.tar.zst

# dbt specific ignores
dbt/dbt_packages/
dbt/target/
dbt/logs/
dbt/.user.yml
97 changes: 97 additions & 0 deletions dbt/README.md
@@ -0,0 +1,97 @@
## Overview
This directory contains an initial setup of a `dbt` project meant to write
[data tests](https://docs.getdbt.com/docs/build/data-tests) for PUDL data. The
project is set up with profiles that let you run tests against `nightly`
builds, `etl-full` outputs, or `etl-fast` outputs. The `nightly` profile operates
directly on parquet files in our S3 bucket, while both the `etl-full` and `etl-fast`
profiles look for parquet files based on your `PUDL_OUTPUT` environment
variable. See the `Usage` section below for examples using these profiles.


## Development
To set up the `dbt` project, simply install the PUDL `conda` environment as usual,
then run the following command from this directory.

```
dbt deps
```

### Adding new tables
To add a new table to the project, you must add it as a
[dbt source](https://docs.getdbt.com/docs/build/sources). The standard way to do
this is to create a new file `models/{data_source}/{table_name}.yml`. If the
`data_source` doesn't already have a directory within `models/`, you should first
create one and add the yaml file there.

> **Review comment (Member):** @zschira I am confused by the difference between this instruction that tells users to add the table schema as an individual yml file with the table name and the instructions below for adding tests, where you mention a schema.yml file that would presumably contain all models.
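
As a minimal sketch, such a source definition file might look like the following. The table name is borrowed from one this PR actually adds; the real schema files in the PR also attach tests and column definitions here.

```
version: 2

sources:
  - name: pudl
    tables:
      - name: out_vcerare__hourly_available_capacity_factor
```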

### Adding tests
#### Default case
Once a table is included as a `source`, you can add tests for the table. You can
either add a generic test directly in `src/pudl/dbt/models/schema.yml`, or create
a `sql` file in the directory `src/pudl/dbt/tests/`, which references the `source`.
When adding `sql` tests like this, you should construct a query that selects rows
indicating a failure. That is, if the query returns any rows, `dbt` will mark the
test as failed.
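
As an illustrative sketch (the file name is made up, but the table and column come from this PR), such a test file might look like:

```
-- tests/vcerare/no_negative_solar_capacity_factor.sql (hypothetical file name)
-- dbt treats every returned row as a failure, so select only the "bad" rows.
select county_id_fips, datetime_utc, capacity_factor_solar_pv
from {{ source('pudl', 'out_vcerare__hourly_available_capacity_factor') }}
where capacity_factor_solar_pv < 0
```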

The project includes [dbt-expectations](https://github.com/calogica/dbt-expectations)
and [dbt-utils](https://github.com/dbt-labs/dbt-utils) as dependencies. These
packages include useful tests out of the box that can be applied to any tables
in the project. There are several examples in `src/pudl/dbt/models/schema.yml` which
use `dbt-expectations`.

#### Modifying a table before testing
In some cases you may want to modify the table before applying tests. There are two
ways to accomplish this. First, you can add the table as a `source` as described
above, then create a SQL file in the `tests/` directory, like
`tests/{data_source}/{table_name}.sql`. There you can construct a SQL query that
modifies the table and executes a test on the intermediate result. `dbt`
expects a SQL test to be a query that returns 0 rows for a successful test. See
the `dbt` [source function](https://docs.getdbt.com/reference/dbt-jinja-functions/source)
for guidance on how to reference a `source` from a SQL file.

The second method is to create a [model](https://docs.getdbt.com/docs/build/models)
which produces the intermediate table you want to test. To use this
approach, first create a directory named `tests/{data_source}/{table_name}/` and move
your yaml file defining the `source` table to `tests/{data_source}/{table_name}/schema.yml`.
Next, add a SQL file to this directory named `validate_{table_name}.sql` and define
the model that produces the intermediate table there. Finally, add the model to the
`schema.yml` file and define tests exactly as you would for a `source` table. See
`models/ferc1/out_ferc1__yearly_steam_plants_fuel_by_plant_sched402` for an example of this
pattern.

### Usage
There are a few ways to execute tests. To run all tests with a single command:

```
dbt build
```

This command will first run any models, then execute all tests.

For more fine-grained control, first run:

> **Review comment (Member):** "run" here meaning not "run the tests" but "run dbt in the traditional sense which we're not really using it -- to build database tables"? To the folks unfamiliar with dbt (most of us right now) I think "run the models" will be confusing.

```
dbt run
```

This will run all models, preparing any `sql` views that will be referenced in
tests. Once you've done this, you can run all tests with:

```
dbt test
```

To run all tests for a single source table:

```
dbt test --select source:pudl.{table_name}
```

To run all tests for a model table:

```
dbt test --select {model_name}
```

#### Selecting target profile
To select between `nightly`, `etl-full`, and `etl-fast` profiles, append
`--target {target_name}` to any of the previous commands.
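
For example, to run only the tests on the FERC 1 fuel-cost model defined in this PR against your local fast-ETL outputs (assuming the `etl-fast` profile is configured as in `profiles.yml`):

```
dbt test --select validate_ferc1__yearly_steam_plants_fuel_by_plant_sched402 --target etl-fast
```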
22 changes: 22 additions & 0 deletions dbt/dbt_project.yml
@@ -0,0 +1,22 @@
# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "pudl_dbt"
version: "1.0.0"

# This setting configures which "profile" dbt uses for this project.
profile: "pudl_dbt"

# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
macro-paths: ["macros"]
test-paths: ["tests"]

sources:
  pudl_dbt:
    +external_location: |
      {%- if target.name == "nightly" -%} 'https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/nightly/{name}.parquet'
      {%- else -%} '{{ env_var('PUDL_OUTPUT') }}/parquet/{name}.parquet'
      {%- endif -%}
@@ -0,0 +1,43 @@
version: 2

sources:
  - name: pudl
    tables:
      - name: out_ferc1__yearly_steam_plants_fuel_by_plant_sched402

models:
  - name: validate_ferc1__yearly_steam_plants_fuel_by_plant_sched402
    columns:
      - name: gas_cost_per_mmbtu
        data_tests:
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.05
              min_value: 1.5
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.90
              max_value: 15.0
          - dbt_expectations.expect_column_median_to_be_between:
              min_value: 2.0
              max_value: 10.0
      - name: oil_cost_per_mmbtu
        data_tests:
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.10
              min_value: 3.5
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.90
              max_value: 25.0
          - dbt_expectations.expect_column_median_to_be_between:
              min_value: 6.5
              max_value: 17.0
      - name: coal_cost_per_mmbtu
        data_tests:
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.10
              min_value: 0.75
          - dbt_expectations.expect_column_quantile_values_to_be_between:
              quantile: 0.90
              max_value: 4.0
          - dbt_expectations.expect_column_median_to_be_between:
              min_value: 1.0
              max_value: 2.5

> **Review comment (Member)** (on the quantile tests above): My intuition is that these should probably really be weighted quantiles, or that they are relatively low-value checks because the FERC fuel reporting is such a mess.
@@ -0,0 +1,6 @@

select
{% for fuel_type in ["gas", "oil", "coal"] %}
{{ fuel_type }}_fraction_cost * fuel_cost / ({{ fuel_type }}_fraction_mmbtu * fuel_mmbtu) as {{ fuel_type }}_cost_per_mmbtu,
{% endfor %}
from {{ source('pudl', 'out_ferc1__yearly_steam_plants_fuel_by_plant_sched402') }}
@@ -0,0 +1,46 @@
version: 2

sources:
  - name: pudl
    tables:
      - name: out_vcerare__hourly_available_capacity_factor
        data_tests:
          - dbt_expectations.expect_table_row_count_to_equal:
              value: |
                {%- if target.name == "etl-fast" -%} 27287400
                {%- else -%} 136437000
                {%- endif -%}
          - dbt_expectations.expect_compound_columns_to_be_unique:
              column_list: ["county_id_fips", "datetime_utc"]
              row_condition: "county_id_fips is not null"
        columns:
          - name: capacity_factor_solar_pv
            data_tests:
              - not_null
              - dbt_expectations.expect_column_max_to_be_between:
                  max_value: 1.02
              - dbt_expectations.expect_column_min_to_be_between:
                  min_value: 0.00
          - name: capacity_factor_offshore_wind
            data_tests:
              - not_null
              - dbt_expectations.expect_column_max_to_be_between:
                  max_value: 1.00
              - dbt_expectations.expect_column_min_to_be_between:
                  min_value: 0.00
          - name: hour_of_year
            data_tests:
              - not_null
              - dbt_expectations.expect_column_max_to_be_between:
                  min_value: 8759
                  max_value: 8761
          - name: datetime_utc
            data_tests:
              - not_null
              - dbt_expectations.expect_column_values_to_not_be_in_set:
                  value_set: ["{{ dbt_date.date(2020, 12, 31) }}"]
          - name: county_or_lake_name
            data_tests:
              - not_null
              - dbt_expectations.expect_column_values_to_not_be_in_set:
                  value_set: ["bedford_city", "clifton_forge_city"]
8 changes: 8 additions & 0 deletions dbt/package-lock.yml
@@ -0,0 +1,8 @@
packages:
  - package: calogica/dbt_expectations
    version: 0.10.4
  - package: dbt-labs/dbt_utils
    version: 1.3.0
  - package: calogica/dbt_date
    version: 0.10.1
sha1_hash: 29571f46f50e6393ca399c3db7361c22657f2d6b
5 changes: 5 additions & 0 deletions dbt/packages.yml
@@ -0,0 +1,5 @@
packages:
  - package: calogica/dbt_expectations
    version: [">=0.10.0", "<0.11.0"]
  - package: dbt-labs/dbt_utils
    version: [">=1.3.0", "<1.4.0"]
17 changes: 17 additions & 0 deletions dbt/profiles.yml
@@ -0,0 +1,17 @@
pudl_dbt:
  outputs:
    # Define targets for nightly builds, and local ETL full/fast
    # See models/schema.yml for further configuration
    nightly:
      type: duckdb
      path: "{{ env_var('PUDL_OUTPUT') }}/pudl.duckdb"
      filesystems:
        - fs: s3
    etl-full:
      type: duckdb
      path: "{{ env_var('PUDL_OUTPUT') }}/pudl.duckdb"
    etl-fast:
      type: duckdb
      path: "{{ env_var('PUDL_OUTPUT') }}/pudl.duckdb"
  target: nightly