diff --git a/README.md b/README.md
index 8fffc15..2410e41 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
-
+ Examples and custom spark images for working with the spark-on-k8s operator on AWS.

@@ -14,24 +14,23 @@ Allows using Spark 2 with IRSA and Spark 3 with IRSA and AWS Glue as a metastore

 ---

-![docker](https://img.shields.io/docker/automated/bbenzikry/spark-eks?style=plastic)
-![build](https://img.shields.io/docker/build/bbenzikry/spark-eks?style=plastic)
-
-![spark2](https://img.shields.io/docker/v/bbenzikry/spark-eks/spark2-latest)
-![pyspark2](https://img.shields.io/docker/v/bbenzikry/spark-eks/pyspark2-latest)
-![spark3](https://img.shields.io/docker/v/bbenzikry/spark-eks/spark3-latest)
-![pyspark3](https://img.shields.io/docker/v/bbenzikry/spark-eks/pyspark3-latest)
-![spark3-edge](https://img.shields.io/docker/v/bbenzikry/spark-eks/spark3-edge)
-![pyspark3-edge](https://img.shields.io/docker/v/bbenzikry/spark-eks/pyspark3-edge)
-![operator](https://img.shields.io/docker/v/bbenzikry/spark-eks/operator)
+![operator](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks-operator?style=plastic&label=operator)
+![spark2](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/spark2-latest?label=spark2)
+![pyspark2](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/pyspark2-latest?label=pyspark2)
+![spark3](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/spark3-latest?label=spark3)
+![pyspark3](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/pyspark3-latest?label=pyspark3)
+![spark3-edge](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/spark3-edge?label=spark3-edge)
+![pyspark3-edge](https://img.shields.io/docker/cloud/build/bbenzikry/spark-eks/pyspark3-edge?label=pyspark3-edge)
 ## Prerequisites

-- Deploy [spark-on-k8s operator](https://github.com/GoogleCloudPlatform/spark-on-k8s-operator) using the [helm chart](https://github.com/helm/charts/tree/master/incubator/sparkoperator) or with [flux](./flux/releases/operator.yaml) using the [patched operator](https://github.com/bbenzikry/spark-on-k8s-operator/tree/hive-subpath) image.
+- Deploy [spark-on-k8s operator](https://github.com/GoogleCloudPlatform/spark-on-k8s-operator) using the [helm chart](https://github.com/helm/charts/tree/master/incubator/sparkoperator) and the [patched operator](https://github.com/bbenzikry/spark-on-k8s-operator/tree/hive-subpath) image `bbenzikry/spark-eks-operator:latest`.
+
+Suggested values for the helm chart can be found in the [flux](./flux/releases/operator.yaml) example.

-> Note: Do not create the spark service account automatically as part of chart use
+> Note: Do not create the spark service account automatically as part of chart use.

 ## Using IAM roles for service accounts on EKS

@@ -42,10 +41,10 @@ Allows using Spark 2 with IRSA and Spark 3 with IRSA and AWS Glue as a metastore

 > [AWS docs on creating policies and roles](https://docs.aws.amazon.com/eks/latest/userguide/create-service-account-iam-policy-and-role.html)

-- Add default service account EKS role for executors in your spark job namespace
+- Add a default service account EKS role for executors in your spark job namespace (optional)

 ```yaml
-# NOTE: This is only required when not building spark from source or using a version of spark < 3.1. If using our edge docker images for spark3/pyspark3 you can skip this step
+# NOTE: Only required for spark < 3.1, i.e. when not building spark from source. If you use our *-edge docker images for spark3/pyspark3 you can skip this step, as executors will fall back to the driver's service account.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
@@ -60,6 +59,7 @@ metadata:

 ```yaml
 ## With the spark3 source builds, when this is configured and no executor role exists, executors default to this SA as well.
+# This is not recommended for production until a stable release is provided.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
@@ -77,19 +77,36 @@ metadata:

 - For pyspark, see [pyspark.Dockerfile](./docker/pyspark.Dockerfile)

+### Submit your spark application with IRSA support
+
+#### Select the right implementation for you
+
+> Below are examples for the latest versions.
+>
+> If you want to use pinned versions, all images are tagged by commit SHA.
+>
+> You can find a full list of tags [here](https://hub.docker.com/repository/docker/bbenzikry/spark-eks/tags)
+
 ```dockerfile
+# spark2
+FROM bbenzikry/spark-eks:spark2-latest
 # spark3
 FROM bbenzikry/spark-eks:spark3-latest
-# source build
-FROM bbenzikry/spark-eks:spark3-edge-latest
-# pyspark
-
+# source / master build
+FROM bbenzikry/spark-eks:spark3-edge
+# pyspark2
+FROM bbenzikry/spark-eks:pyspark2-latest
+# pyspark3
+FROM bbenzikry/spark-eks:pyspark3-latest
+# pyspark3-edge
+FROM bbenzikry/spark-eks:pyspark3-edge
 ```

-### Submit your spark application with IRSA support
+#### Submit your SparkApplication spec

 ```yaml
 hadoopConf:
+  # IRSA configuration
   "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider"
 driver:
   .....
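+  # Hedged sketch of the IRSA-relevant fields that the "....." above elides.
+  # "spark-driver-sa" is an illustrative name; use the IRSA-annotated service
+  # account created in the step above. Executors use the namespace's default
+  # service account unless you run spark >= 3.1 or the *-edge images, which is
+  # why annotating the default service account is suggested as an optional step.
+  serviceAccount: spark-driver-sa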
@@ -104,7 +121,37 @@ driver:

 - Full example [here]()

-## Working with AWS Glue as metastore
+### Working with AWS Glue as metastore
+
+#### Prerequisites
+
+- Make sure your driver and executor roles have the relevant glue permissions
+
+```json
+{
+  /* Below is an example configuration for accessing db1/table1.
+     Adjust the actions and resources to the access your jobs actually need.
+     The last 3 resources (the default, global_temp and parquet databases)
+     must be present for the region you run in.
+  */
+
+  "Effect": "Allow",
+  "Action": ["glue:*Database*", "glue:*Table*", "glue:*Partition*"],
+  "Resource": [
+    "arn:aws:glue:us-west-2:123456789012:catalog",
+    "arn:aws:glue:us-west-2:123456789012:database/db1",
+    "arn:aws:glue:us-west-2:123456789012:table/db1/table1",
+
+    "arn:aws:glue:eu-west-1:123456789012:database/default",
+    "arn:aws:glue:eu-west-1:123456789012:database/global_temp",
+    "arn:aws:glue:eu-west-1:123456789012:database/parquet"
+  ]
+}
+```
+
+- Make sure you are using the patched operator image
+- Add a config map to your spark job namespace as defined [here](conf/configmap.yaml)
+
+#### Submitting your application
+
+With the config map in place and the patched operator image deployed, submit your application as in the IRSA example above; the patched operator is what makes the config map's hive-site.xml available to your jobs.

 ## Working with the spark history server on S3

diff --git a/conf/configmap.yaml b/conf/configmap.yaml
new file mode 100644
index 0000000..78e8fb3
--- /dev/null
+++ b/conf/configmap.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+data:
+  hive-site.xml: |-
+    <configuration>
+      <property>
+        <name>hive.imetastoreclient.factory.class</name>
+        <value>com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory</value>
+      </property>
+    </configuration>
+kind: ConfigMap
+metadata:
+  namespace: SPARK_JOB_NAMESPACE
+  name: spark-config-map
diff --git a/docker/spark3.Dockerfile b/docker/spark3.Dockerfile
index 07378ba..79c3d05 100644
--- a/docker/spark3.Dockerfile
+++ b/docker/spark3.Dockerfile
@@ -5,15 +5,17 @@ ARG BUILD_DATE
 ARG VCS_REF
 FROM python:3.7-slim-buster as builder
-
 # Build options
 ARG spark_version=3.0.0
+ARG scala_version=2.12
 # uncomment if you want the dev build
 # ARG spark_dev_version=v3.0.1-rc2
 # HIVE version for glue support
 ARG hive_version=2.3.7
 # Hadoop and SDK versions for IRSA support
 ARG hadoop_version=3.3.0
+# hardcoded, as ARG defaults don't support bash substring substitution
+ARG hadoop_major_version=3
 ARG aws_java_sdk_version=1.11.797
 ARG jmx_prometheus_javaagent_version=0.12.0
@@ -81,11 +83,29 @@ RUN mv hadoop-${hadoop_version} hadoop
 # Delete unnecessary hadoop documentation
 RUN rm -rf hadoop/share/doc
+WORKDIR /spark/jars
+# Copy patched hive jar to distro
+RUN cp /hive/ql/target/hive-exec-${hive_version}.jar .
+
+ADD https://repo1.maven.org/maven2/org/apache/spark/spark-hive_${scala_version}/${spark_version}/spark-hive_${scala_version}-${spark_version}.jar .
+
+# Add updated guava
+RUN rm -f guava-14.0.1.jar
+ADD https://repo1.maven.org/maven2/com/google/guava/guava/23.0/guava-23.0.jar .
+
+# Add GCS and BQ just in case
+ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop${hadoop_major_version}.jar .
+ADD https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-latest.jar .
+
+# chmods
+RUN chmod 0644 guava-23.0.jar spark-hive_${scala_version}-${spark_version}.jar spark-bigquery-latest.jar gcs-connector-latest-hadoop${hadoop_major_version}.jar
+
 WORKDIR /hadoop/share/hadoop/tools/lib
 RUN rm ./aws-java-sdk-bundle-*.jar
 ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${aws_java_sdk_version}/aws-java-sdk-bundle-${aws_java_sdk_version}.jar .
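+# NOTE: files fetched over HTTP(S) with ADD are written with 600 permissions,
+# which is why the SDK bundle is chmodded to be world-readable below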
 RUN chmod 0644 aws-java-sdk-bundle*.jar
+
 FROM openjdk:8-jdk-slim as final
 LABEL maintainer="bbenzikry@gmail.com" \
   org.label-schema.build-date=$BUILD_DATE \
@@ -128,6 +148,7 @@ ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop:$HADOOP_HOME/share/hadoop/comm
 ENV SPARK_EXTRA_CLASSPATH="$SPARK_DIST_CLASSPATH"
 ENV LD_LIBRARY_PATH /lib64
+
 WORKDIR /opt/spark/work-dir
 RUN chmod g+w /opt/spark/work-dir
 # RUN chmod a+x /opt/decom.sh
diff --git a/docker/spark3.edge.Dockerfile b/docker/spark3.edge.Dockerfile
index b6d2888..12302f1 100644
--- a/docker/spark3.edge.Dockerfile
+++ b/docker/spark3.edge.Dockerfile
@@ -15,6 +15,8 @@ ARG spark_version=3.0.0
 ARG hive_version=2.3.7
 # Hadoop and SDK versions for IRSA support
 ARG hadoop_version=3.3.0
+# hardcoded, as ARG defaults don't support bash substring expansion like ${hadoop_version:0:1}
+ARG hadoop_major_version=3
 ARG aws_java_sdk_version=1.11.797
 ARG jmx_prometheus_javaagent_version=0.12.0
@@ -62,10 +63,24 @@ RUN mkdir /jars && find /catalog -name "*.jar" -exec cp {} /jars \;
 WORKDIR /
 RUN git clone https://github.com/apache/spark
-# Uncomment for source
 WORKDIR /spark
+# We resolve the local hive build here, so there is no need to fetch a specific hive jar
 RUN dev/make-distribution.sh --name custom-spark --pip -Pkubernetes -Phive -Phive-thriftserver -Phadoop-provided -Dhive.version=${hive_version}
+WORKDIR /spark/jars
+
+# Add updated guava
+RUN rm -f guava-14.0.1.jar
+ADD https://repo1.maven.org/maven2/com/google/guava/guava/23.0/guava-23.0.jar .
+
+# Add GCS and BQ just in case
+ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop${hadoop_major_version}.jar .
+ADD https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-latest.jar .
+
+# chmods
+RUN chmod 0644 guava-23.0.jar spark-bigquery-latest.jar gcs-connector-latest-hadoop${hadoop_major_version}.jar
+
+
 WORKDIR /
 # Hadoop
 ADD http://mirrors.whoishostingthis.com/apache/hadoop/common/hadoop-${hadoop_version}/hadoop-${hadoop_version}.tar.gz .
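+
+# Usage sketch (illustrative invocation, not from the original docs):
+#   docker build -f docker/spark3.edge.Dockerfile \
+#     --build-arg spark_version=3.0.0 --build-arg hadoop_version=3.3.0 \
+#     -t spark-eks:spark3-edge .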