diff --git a/README.md b/README.md
index 8fffc15..2410e41 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
-
+
Examples and custom spark images for working with the spark-on-k8s operator on AWS.
@@ -14,24 +14,23 @@ Allows using Spark 2 with IRSA and Spark 3 with IRSA and AWS Glue as a metastore
---
-![docker](https://img.shields.io/docker/automated/bbenzikry/spark-eks?style=plastic)
-![build](https://img.shields.io/docker/build/bbenzikry/spark-eks?style=plastic)
-
-![spark2](https://img.shields.io/docker/v/bbenzikry/spark-eks/spark2-latest)
-![pyspark2](https://img.shields.io/docker/v/bbenzikry/spark-eks/pyspark2-latest)
-![spark3](https://img.shields.io/docker/v/bbenzikry/spark-eks/spark3-latest)
-![pyspark3](https://img.shields.io/docker/v/bbenzikry/spark-eks/pyspark3-latest)
-![spark3-edge](https://img.shields.io/docker/v/bbenzikry/spark-eks/spark3-edge)
-![pyspark3-edge](https://img.shields.io/docker/v/bbenzikry/spark-eks/pyspark3-edge)
-![operator](https://img.shields.io/docker/v/bbenzikry/spark-eks/operator)
+![operator](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks-operator?style=plastic&label=operator)
+![spark2](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/spark2-latest?label=spark2)
+![pyspark2](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/pyspark2-latest?label=pyspark2)
+![spark3](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/spark3-latest?label=spark3)
+![pyspark3](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/pyspark3-latest?label=pyspark3)
+![spark3-edge](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/spark3-edge?label=spark3-edge)
+![pyspark3-edge](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/pyspark3-edge?label=pyspark3-edge)
## Prerequisites
-- Deploy [spark-on-k8s operator](https://github.com/GoogleCloudPlatform/spark-on-k8s-operator) using the [helm chart](https://github.com/helm/charts/tree/master/incubator/sparkoperator) or with [flux](./flux/releases/operator.yaml) using the [patched operator](https://github.com/bbenzikry/spark-on-k8s-operator/tree/hive-subpath) image.
+- Deploy [spark-on-k8s operator](https://github.com/GoogleCloudPlatform/spark-on-k8s-operator) using the [helm chart](https://github.com/helm/charts/tree/master/incubator/sparkoperator) and the [patched operator](https://github.com/bbenzikry/spark-on-k8s-operator/tree/hive-subpath) image `bbenzikry/spark-eks-operator:latest`
+
+Suggested values for the helm chart can be found in the [flux](./flux/releases/operator.yaml) example.
-> Note: Do not create the spark service account automatically as part of chart use
+> Note: Do not create the spark service account automatically as part of chart use.
## using IAM roles for service accounts on EKS
@@ -42,10 +41,10 @@ Allows using Spark 2 with IRSA and Spark 3 with IRSA and AWS Glue as a metastore
> [AWS docs on creating policies and roles](https://docs.aws.amazon.com/eks/latest/userguide/create-service-account-iam-policy-and-role.html)
-- Add default service account EKS role for executors in your spark job namespace
+- Add default service account EKS role for executors in your spark job namespace (optional)
```yaml
-# NOTE: This is only required when not building spark from source or using a version of spark < 3.1. If using our edge docker images for spark3/pyspark3 you can skip this step
+# NOTE: Only required when not building spark from source or when using a spark version < 3.1. If you use our *-edge docker images for spark3/pyspark3 you can skip this step, as executors will fall back to the driver pod's service account.
apiVersion: v1
kind: ServiceAccount
metadata:
@@ -60,6 +59,7 @@ metadata:
```yaml
## With the spark3 source builds, when this is configured and no executor role exists, executors default to this SA as well.
+# This is not recommended for production until a stable release is provided.
apiVersion: v1
kind: ServiceAccount
metadata:
@@ -77,19 +77,36 @@ metadata:
- For pyspark, see [pyspark.Dockerfile](./docker/pyspark.Dockerfile)
+### Submit your spark application with IRSA support
+
+#### Select the right implementation for you
+
+> Below are examples for latest versions.
+>
+> If you want to use pinned versions, all images are tagged by the commit SHA.
+>
+> You can find a full list of tags [here](https://hub.docker.com/repository/docker/bbenzikry/spark-eks/tags)
+
```dockerfile
+# spark2
+FROM bbenzikry/spark-eks:spark2-latest
# spark3
FROM bbenzikry/spark-eks:spark3-latest
-# source build
-FROM bbenzikry/spark-eks:spark3-edge-latest
-# pyspark
-
+# source / master build
+FROM bbenzikry/spark-eks:spark3-edge
+# pyspark2
+FROM bbenzikry/spark-eks:pyspark2-latest
+# pyspark3
+FROM bbenzikry/spark-eks:pyspark3-latest
+# pyspark3-edge
+FROM bbenzikry/spark-eks:pyspark3-edge
```
-### Submit your spark application with IRSA support
+#### Submit your SparkApplication spec
```yaml
hadoopConf:
+ # IRSA configuration
"fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider"
driver:
.....
@@ -104,7 +121,37 @@ driver:
- Full example [here]()
-## Working with AWS Glue as metastore
+### Working with AWS Glue as metastore
+
+#### Prerequisites
+
+- Make sure your driver and executor roles have the relevant glue permissions
+
+```json
+{
+  /* The example below is a sample policy for accessing db1/table1.
+     Modify the resources to match the databases and tables you need to access.
+     The last 3 resources must be present, adjusted to your region.
+ */
+
+ "Effect": "Allow",
+ "Action": ["glue:*Database*", "glue:*Table*", "glue:*Partition*"],
+ "Resource": [
+ "arn:aws:glue:us-west-2:123456789012:catalog",
+ "arn:aws:glue:us-west-2:123456789012:database/db1",
+ "arn:aws:glue:us-west-2:123456789012:table/db1/table1",
+
+ "arn:aws:glue:eu-west-1:123456789012:database/default",
+ "arn:aws:glue:eu-west-1:123456789012:database/global_temp",
+ "arn:aws:glue:eu-west-1:123456789012:database/parquet"
+ ]
+}
+```
+
+- Make sure you are using the patched operator image
+- Add a config map to your spark job namespace as defined [here](conf/configmap.yaml)
+
+### Submitting your application
## Working with the spark history server on S3
diff --git a/conf/configmap.yaml b/conf/configmap.yaml
new file mode 100644
index 0000000..78e8fb3
--- /dev/null
+++ b/conf/configmap.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+data:
+ hive-site.xml: |-
+    <configuration>
+      <property>
+        <name>hive.imetastoreclient.factory.class</name>
+        <value>com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory</value>
+      </property>
+    </configuration>
+kind: ConfigMap
+metadata:
+ namespace: SPARK_JOB_NAMESPACE
+ name: spark-config-map
diff --git a/docker/spark3.Dockerfile b/docker/spark3.Dockerfile
index 07378ba..79c3d05 100644
--- a/docker/spark3.Dockerfile
+++ b/docker/spark3.Dockerfile
@@ -5,15 +5,17 @@ ARG BUILD_DATE
ARG VCS_REF
FROM python:3.7-slim-buster as builder
-
# Build options
ARG spark_version=3.0.0
+ARG scala_version=2.12
# uncomment if you want the dev build
# ARG spark_dev_version=v3.0.1-rc2
# HIVE version for glue support
ARG hive_version=2.3.7
# Hadoop and SDK versions for IRSA support
ARG hadoop_version=3.3.0
+# hardcoded because Dockerfile ARG defaults do not support shell substring substitution
+ARG hadoop_major_version=3
ARG aws_java_sdk_version=1.11.797
ARG jmx_prometheus_javaagent_version=0.12.0
@@ -81,11 +83,29 @@ RUN mv hadoop-${hadoop_version} hadoop
# Delete unnecessary hadoop documentation
RUN rm -rf hadoop/share/doc
+WORKDIR /spark/jars
+# Copy patched hive jar to distro
+RUN cp /hive/ql/target/hive-exec-${hive_version}.jar .
+
+ADD https://repo1.maven.org/maven2/org/apache/spark/spark-hive_${scala_version}/${spark_version}/spark-hive_${scala_version}-${spark_version}.jar .
+
+# Add updated guava
+RUN rm -f guava-14.0.1.jar
+ADD https://repo1.maven.org/maven2/com/google/guava/guava/23.0/guava-23.0.jar .
+
+# Add GCS and BQ just in case
+ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop${hadoop_major_version}.jar .
+ADD https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-latest.jar .
+
+# chmods
+RUN chmod 0644 guava-23.0.jar spark-hive_${scala_version}-${spark_version}.jar spark-bigquery-latest.jar gcs-connector-latest-hadoop${hadoop_major_version}.jar
+
WORKDIR /hadoop/share/hadoop/tools/lib
RUN rm ./aws-java-sdk-bundle-*.jar
ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${aws_java_sdk_version}/aws-java-sdk-bundle-${aws_java_sdk_version}.jar .
RUN chmod 0644 aws-java-sdk-bundle*.jar
+
FROM openjdk:8-jdk-slim as final
LABEL maintainer="bbenzikry@gmail.com" \
org.label-schema.build-date=$BUILD_DATE \
@@ -128,6 +148,7 @@ ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop:$HADOOP_HOME/share/hadoop/comm
ENV SPARK_EXTRA_CLASSPATH="$SPARK_DIST_CLASSPATH"
ENV LD_LIBRARY_PATH /lib64
+
WORKDIR /opt/spark/work-dir
RUN chmod g+w /opt/spark/work-dir
# RUN chmod a+x /opt/decom.sh
diff --git a/docker/spark3.edge.Dockerfile b/docker/spark3.edge.Dockerfile
index b6d2888..12302f1 100644
--- a/docker/spark3.edge.Dockerfile
+++ b/docker/spark3.edge.Dockerfile
@@ -15,6 +15,7 @@ ARG spark_version=3.0.0
ARG hive_version=2.3.7
# Hadoop and SDK versions for IRSA support
ARG hadoop_version=3.3.0
+ARG hadoop_major_version=3
ARG aws_java_sdk_version=1.11.797
ARG jmx_prometheus_javaagent_version=0.12.0
@@ -62,10 +63,24 @@ RUN mkdir /jars && find /catalog -name "*.jar" -exec cp {} /jars \;
WORKDIR /
RUN git clone https://github.com/apache/spark
-# Uncomment for source
WORKDIR /spark
+# We get local hive here, so no need to get specific jar
RUN dev/make-distribution.sh --name custom-spark --pip -Pkubernetes -Phive -Phive-thriftserver -Phadoop-provided -Dhive.version=${hive_version}
+WORKDIR /spark/jars
+
+# Add updated guava
+RUN rm -f guava-14.0.1.jar
+ADD https://repo1.maven.org/maven2/com/google/guava/guava/23.0/guava-23.0.jar .
+
+# Add GCS and BQ just in case
+ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop${hadoop_major_version}.jar .
+ADD https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-latest.jar .
+
+# chmods
+RUN chmod 0644 guava-23.0.jar spark-bigquery-latest.jar gcs-connector-latest-hadoop${hadoop_major_version}.jar
+
+
WORKDIR /
# Hadoop
ADD http://mirrors.whoishostingthis.com/apache/hadoop/common/hadoop-${hadoop_version}/hadoop-${hadoop_version}.tar.gz .