Prototyping: Duckdb based metrics calculation (#2469)
* Reorder metrics queries

* wip

* more wip

* test

* Fixes

* fix

* tester

* test

* test

* test

* test

* More fixes

* test

* test

* test

* test

* more testing

* fixes

* add connectorx

* big rewrite, try using trino as part of the export mechanism

* add bokeh

* test

* Update dockerfile

* Clean up some in progress work
ravenac95 authored Nov 19, 2024
1 parent c6e20ac commit 34df5fb
Showing 14 changed files with 1,109 additions and 67 deletions.
4 changes: 3 additions & 1 deletion .dockerignore
@@ -77,4 +77,6 @@ Dockerfile
 
 # Github directory
 .github/scripts
-.git
+.git
+logs
+.mypy_cache
10 changes: 6 additions & 4 deletions docker/images/dagster-dask/Dockerfile
@@ -4,17 +4,19 @@ FROM ghcr.io/opensource-observer/oso-public-vars:latest AS public_vars
 FROM ubuntu:jammy
 
 ARG GCLOUD_VERSION=478.0.0
+ARG PYTHON_VERSION=3.12
+ARG PYTHON_PACKAGE=python3.12
 
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && \
     apt-get install -y software-properties-common && \
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get update && \
-    apt-get install -y python3.12 && \
+    apt-get install -y ${PYTHON_PACKAGE} && \
     apt-get install -y curl git && \
     curl -o get-pip.py https://bootstrap.pypa.io/get-pip.py && \
-    python3.12 get-pip.py && \
-    pip3.12 install poetry && \
+    python${PYTHON_VERSION} get-pip.py && \
+    pip${PYTHON_VERSION} install poetry && \
     curl -o gcloud.tar.gz https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-${GCLOUD_VERSION}-linux-x86_64.tar.gz && \
     tar xvf gcloud.tar.gz && \
     bash ./google-cloud-sdk/install.sh && \
@@ -48,6 +50,6 @@ ENV DAGSTER_DBT_TARGET_BASE_DIR=/dbt_targets
 
 COPY --from=public_vars ./public/vars.env /usr/src/app/vars.env
 RUN mkdir -p ${DAGSTER_DBT_TARGET_BASE_DIR} && \
-    python3.12 -m oso_dagster.compile --additional-vars /usr/src/app/vars.env
+    python${PYTHON_VERSION} -m oso_dagster.compile --additional-vars /usr/src/app/vars.env
 
 ENTRYPOINT []
4 changes: 2 additions & 2 deletions ops/tf-modules/warehouse-cluster/main.tf
@@ -135,12 +135,12 @@ locals {
     # SQLMesh Workers
     {
       name                              = "${var.cluster_name}-sqlmesh-worker-node-pool"
-      machine_type                      = "n1-highmem-32"
+      machine_type                      = "n1-highmem-16"
       node_locations                    = join(",", var.cluster_zones)
       min_count                         = 0
       max_count                         = 10
       local_ssd_count                   = 0
-      local_ssd_ephemeral_storage_count = 2
+      local_ssd_ephemeral_storage_count = 1
       spot                              = false
       disk_size_gb                      = 100
       disk_type                         = "pd-standard"
144 changes: 143 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -64,6 +64,8 @@ google-auth = "^2.34.0"
 pillow = "^10.4.0"
 dagster-k8s = "^0.24.6"
 pyiceberg = { extras = ["hive"], version = "^0.7.1" }
+connectorx = "^0.4.0"
+bokeh = "^3.6.1"
 
 
 [tool.poetry.scripts]
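Two new runtime dependencies back this prototype: connectorx, a Rust-based loader that reads query results straight into dataframes, and bokeh, which Dask's distributed dashboard needs. Below is a rough sketch of the export path the commit message hints at ("try using trino as part of the export mechanism"), assuming connectorx's Trino support; the connection string and query are placeholders, not values from this commit:

```python
import connectorx as cx

# Placeholder connection string and query: connectorx reads the result set
# directly into a pandas DataFrame instead of iterating rows through a
# Python DB-API driver.
df = cx.read_sql(
    "trino://user@trino.example.internal:8080/catalog",
    "SELECT bucket_day, event_type, SUM(amount) AS amount "
    "FROM events_daily GROUP BY 1, 2",
)
print(df.dtypes)
```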
2 changes: 2 additions & 0 deletions warehouse/metrics_mesh/fixtures/README.md
@@ -0,0 +1,2 @@
+This folder contains both the data generator and the CSV most recently generated
+from that data generator, so we can easily update tests as needed.
32 changes: 16 additions & 16 deletions warehouse/metrics_mesh/models/metrics_factories.py
@@ -21,6 +21,22 @@
         ref="stars.sql",
         time_aggregations=["daily", "weekly", "monthly"],
     ),
+    "active_addresses": MetricQueryDef(
+        ref="active_addresses.sql",
+        time_aggregations=["daily", "weekly", "monthly"],
+    ),
+    "commits": MetricQueryDef(
+        ref="commits.sql",
+        time_aggregations=["daily", "weekly", "monthly"],
+    ),
+    "forks": MetricQueryDef(
+        ref="forks.sql",
+        time_aggregations=["daily", "weekly", "monthly"],
+    ),
+    "gas_fees": MetricQueryDef(
+        ref="gas_fees.sql",
+        time_aggregations=["daily", "weekly", "monthly"],
+    ),
     # This defines something with a rolling option that allows you to look back
     # to some arbitrary window. So you specify the window and specify the unit.
     # The unit and the window are used to pass in variables to the query. So it's
@@ -60,22 +76,6 @@
             cron="@daily",
         ),
     ),
-    "active_addresses": MetricQueryDef(
-        ref="active_addresses.sql",
-        time_aggregations=["daily", "weekly", "monthly"],
-    ),
-    "commits": MetricQueryDef(
-        ref="commits.sql",
-        time_aggregations=["daily", "weekly", "monthly"],
-    ),
-    "forks": MetricQueryDef(
-        ref="forks.sql",
-        time_aggregations=["daily", "weekly", "monthly"],
-    ),
-    "gas_fees": MetricQueryDef(
-        ref="gas_fees.sql",
-        time_aggregations=["daily", "weekly", "monthly"],
-    ),
     "change_in_30_developer_activity": MetricQueryDef(
         vars={
             "comparison_interval": 30,
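The truncated comment in the hunk above describes the rolling-window option: you pick a window size and a unit, and both are passed into the referenced query as variables. Here is a self-contained sketch of such a definition, using hypothetical stand-in classes; the real MetricQueryDef and RollingConfig live in the factory module and likely differ:

```python
# Hypothetical stand-ins so this sketch runs on its own; the real classes live
# in the metrics factory module and are not shown in this diff.
import typing as t
from dataclasses import dataclass


@dataclass
class RollingConfig:
    windows: t.List[int]  # look-back window sizes
    unit: str             # unit for each window, e.g. "day"
    cron: str             # how often the rolling computation runs


@dataclass
class MetricQueryDef:
    ref: str
    rolling: t.Optional[RollingConfig] = None
    time_aggregations: t.Optional[t.List[str]] = None


# Each (window, unit) pair is handed to the referenced SQL as variables,
# per the comment in the hunk above.
developer_active_days = MetricQueryDef(
    ref="active_days.sql",
    rolling=RollingConfig(windows=[30, 90], unit="day", cron="@daily"),
)
```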
20 changes: 20 additions & 0 deletions warehouse/metrics_tools/compute/cluster.py
@@ -0,0 +1,20 @@
+"""Sets up a dask cluster
+"""
+
+import typing as t
+from dask_kubernetes.operator import KubeCluster
+
+
+def start_duckdb_cluster(
+    namespace: str,
+    gcs_key_id: str,
+    gcs_secret: str,
+    duckdb_path: str,
+    cluster_spec: t.Optional[dict] = None,
+):
+    options: t.Dict[str, t.Any] = {"namespace": namespace}
+    if cluster_spec:
+        options["custom_cluster_spec"] = cluster_spec
+    cluster = KubeCluster(**options)
+    cluster.adapt(minimum=6, maximum=6)
+    return cluster
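A hedged usage sketch for the new helper follows. The import path mirrors the file location and every argument value is a placeholder; note that gcs_key_id, gcs_secret, and duckdb_path are accepted but not yet used by the function, and adapt(minimum=6, maximum=6) effectively pins the cluster at six workers:

```python
# Usage sketch only: argument values are placeholders, not from this commit.
from dask.distributed import Client

from metrics_tools.compute.cluster import start_duckdb_cluster

cluster = start_duckdb_cluster(
    namespace="dask-workers",                       # Kubernetes namespace for worker pods
    gcs_key_id="<hmac-key-id>",                     # accepted but unused in this prototype
    gcs_secret="<hmac-secret>",                     # accepted but unused in this prototype
    duckdb_path="gs://some-bucket/metrics.duckdb",  # accepted but unused
)
client = Client(cluster)  # point a Dask client at the new cluster
try:
    pass  # submit work here
finally:
    client.close()
    cluster.close()
```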
