External backing storage demo #668

Open · wants to merge 4 commits into master
109 changes: 109 additions & 0 deletions docker/compose-controller-spark-sql-external-storage.yaml
@@ -0,0 +1,109 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This docker-compose configuration is for bringing up a pipeline controller
# along with a single-process Spark environment with a JDBC endpoint.

# Environment variables:
#
# PIPELINE_CONFIG: The directory that contains pipeline configurations, namely
# application.yaml and flink-conf.yaml files.
#
# DWH_ROOT: The directory where Parquet files are written. This is shared
# between all containers; the pipeline writes to it and the Spark containers
# read from it.
#
# Note that if local paths are used, they should start with `./` or `../`. Also,
# the mounted files should be readable by the containers, e.g., world-readable.
#

# NOTES ON SPARK:
# This is a minimal single-process Spark configuration for running SQL queries
# against the Parquet files generated by the pipeline. It exposes an endpoint
# on port 10001 which can be used for JDBC connections from any SQL client.
#
# For a more complete configuration that shows the different pieces needed for
# a cluster environment, please see `compose-controller-spark-sql.yaml`.

# NOTES ON METASTORE:
# By default, the thriftserver would use the embedded Derby database as its
# Metastore. This demo instead mounts `hive-site_example.xml` so that the
# Metastore is kept in the external PostgreSQL database provided by the
# `postgres` service below.

# OTHER CONFIGS:
# If you want to change Spark default configs, you can mount your config files
# to /opt/bitnami/spark/conf/
# https://spark.apache.org/docs/latest/configuration.html

version: '2'

services:
  drivers-build:
    container_name: drivers-build
    build:
      context: ./drivers-build
    command:
      # copies the drivers from the drivers-build folder to the jdbcDrivers volume to be used by the spark containers
      - /bin/sh
      - -ec
      - |-
        cp -R /jdbcDrivers/* /drivers-build/jdbcDrivers
    volumes:
      - jdbcDrivers:/drivers-build/jdbcDrivers
  pipeline-controller:
    # to force a build use `--build` option of `docker-compose up`.
    build:
      context: ..
    container_name: pipeline-controller
    volumes:
      - ${PIPELINE_CONFIG}:/app/config:ro
      - ${DWH_ROOT}:/dwh
    ports:
      - '8090:8080'

  spark:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-thriftserver
    command:
      # copies the drivers to the jars directory before the thrift server starts.
      - /bin/bash
      - -ec
      - |-
        cp -R /drivers-build/jdbcDrivers/* /opt/bitnami/spark/jars/
        sbin/start-thriftserver.sh
    environment:
      - HIVE_SERVER2_THRIFT_PORT=10000
    ports:
      - '10001:10000'
      - '4041:4040'
    volumes:
      - ${DWH_ROOT}:/dwh
      - ./hive-site_example.xml:/opt/bitnami/spark/conf/hive-site.xml
    volumes_from:
      - drivers-build

  postgres:
    image: postgres:14
    ports:
      - "5470:5432"
    environment:
      - "POSTGRES_PASSWORD=admin"
      - "POSTGRES_USER=admin"
      - "POSTGRES_DB=custom_metastore_db"
    volumes:
      - pgdata:/var/lib/postgresql/data

volumes:
  jdbcDrivers:
  pgdata:
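
For reference, a minimal way to try this compose file could look like the sketch below. The config and DWH paths are placeholders, and it assumes the command is run from the repository root; the beeline check simply reuses the CLI that ships inside the Spark image.

```bash
# Build and start the controller, the drivers-build helper, the Spark
# thriftserver and the Postgres metastore defined in this file.
# PIPELINE_CONFIG and DWH_ROOT must be set; the paths here are placeholders.
PIPELINE_CONFIG=./docker/config DWH_ROOT=./dwh \
  docker-compose -f docker/compose-controller-spark-sql-external-storage.yaml up --build -d

# Once the thriftserver is up, any JDBC-capable SQL client can connect to
# port 10001 on the host. As a quick smoke test, the beeline CLI bundled in
# the spark-thriftserver container can connect to the internal port 10000:
docker exec -it spark-thriftserver beeline -u jdbc:hive2://localhost:10000 -e 'SHOW TABLES;'
```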
12 changes: 12 additions & 0 deletions docker/drivers-build/Dockerfile
@@ -0,0 +1,12 @@
FROM alpine:3.17.3

WORKDIR /jdbcDrivers

ARG POSTGRESQL_DRIVER_VERSION=42.6.0

# Install required packages
RUN apk update && apk add curl

# Fetch drivers
RUN curl -s https://jdbc.postgresql.org/download/postgresql-$POSTGRESQL_DRIVER_VERSION.jar \
-o postgresql-$POSTGRESQL_DRIVER_VERSION.jar
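
This image only exists to fetch the driver jar into the shared volume, but it can also be built and inspected on its own. A hedged standalone check, using a hypothetical `jdbc-drivers-build` tag that is not part of the compose flow:

```bash
# Build the helper image and list the fetched PostgreSQL JDBC driver jar.
docker build -t jdbc-drivers-build docker/drivers-build
docker run --rm jdbc-drivers-build ls -l /jdbcDrivers
```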
6 changes: 6 additions & 0 deletions docker/drivers-build/README.md
@@ -0,0 +1,6 @@
# Overview

The docker/drivers-build directory contains a sample Dockerfile for packaging the PostgreSQL JDBC driver so that the spark-thriftserver can use PostgreSQL as external backing storage. See the demonstration [docker compose file](../compose-controller-spark-sql-external-storage.yaml).


> This folder should be ignored during continuous integration tests.
3 changes: 2 additions & 1 deletion docker/hive-site_example.xml
@@ -17,7 +17,8 @@ https://cwiki.apache.org/confluence/display/Hive/Configuration+Properties#Config
<!-- This setup assumes that custom_metastore_db is already created. -->
<!-- The IP address should be adjusted based on how the thriftserver sees
the external DB; this example is for a docker image on Linux. -->
-<value>jdbc:postgresql://172.18.0.1:5432/custom_metastore_db</value>
+<!-- value>jdbc:postgresql://172.18.0.1:5432/custom_metastore_db</value -->
+<value>jdbc:postgresql://postgres:5432/custom_metastore_db</value>
<!-- value>jdbc:postgresql://localhost:5432/custom_metastore_db</value -->
</property>
<property>
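
With the compose file above, the thriftserver's Metastore lives in the `custom_metastore_db` database of the `postgres` service, which is published on host port 5470. A quick way to confirm that the Metastore schema was created, assuming a local `psql` client is available (the exact table list varies by Hive version):

```bash
# Connect to the external metastore database through the published port and
# list its tables (e.g. DBS, TBLS) once the thriftserver has initialized it.
PGPASSWORD=admin psql -h localhost -p 5470 -U admin -d custom_metastore_db -c '\dt'
```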