Spark: Remove Spark 3.3 support
manuzhang committed Feb 15, 2025
1 parent abb4783 commit cd52168
Showing 511 changed files with 18 additions and 110,400 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish-snapshot.yml
@@ -41,4 +41,4 @@ jobs:
- run: |
./gradlew printVersion
./gradlew -DallModules publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
./gradlew -DflinkVersions= -DsparkVersions=3.3,3.4,3.5 -DscalaVersion=2.13 -DkafkaVersions=3 publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
./gradlew -DflinkVersions= -DsparkVersions=3.4,3.5 -DscalaVersion=2.13 -DkafkaVersions=3 publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }}
4 changes: 1 addition & 3 deletions .github/workflows/spark-ci.yml
@@ -70,13 +70,11 @@ jobs:
strategy:
matrix:
jvm: [11, 17, 21]
spark: ['3.3', '3.4', '3.5']
spark: ['3.4', '3.5']
scala: ['2.12', '2.13']
exclude:
# Spark 3.5 is the first version not failing on Java 21 (https://issues.apache.org/jira/browse/SPARK-42369)
# Full Java 21 support is coming in Spark 4 (https://issues.apache.org/jira/browse/SPARK-43831)
- jvm: 21
spark: '3.3'
- jvm: 21
spark: '3.4'
env:
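With Spark 3.3 removed from the matrix, the workflow now covers Spark 3.4 and 3.5 on JVM 11/17/21 and Scala 2.12/2.13, with only the Java 21 + Spark 3.4 combination excluded. A rough local equivalent of a single matrix cell is sketched below; the exact Gradle tasks and flags the workflow runs are assumptions, not taken from this diff:

```
# Sketch of one CI matrix cell (jvm=17, spark=3.5, scala=2.13) run locally.
# Task names and flags are illustrative assumptions, not copied from spark-ci.yml.
./gradlew -DsparkVersions=3.5 -DscalaVersion=2.13 -DflinkVersions= -DkafkaVersions= \
  :iceberg-spark:iceberg-spark-3.5_2.13:check \
  :iceberg-spark:iceberg-spark-extensions-3.5_2.13:check \
  :iceberg-spark:iceberg-spark-runtime-3.5_2.13:check
```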
2 changes: 0 additions & 2 deletions .gitignore
@@ -31,8 +31,6 @@ site/docs/.asf.yaml
site/docs/javadoc/

# benchmark output
spark/v3.3/spark/benchmark/*
spark/v3.3/spark-extensions/benchmark/*
spark/v3.4/spark/benchmark/*
spark/v3.4/spark-extensions/benchmark/*
spark/v3.5/spark/benchmark/*
1 change: 0 additions & 1 deletion build.gradle
@@ -1080,7 +1080,6 @@ project(':iceberg-bom') {
// the BOM references the artifacts for all Scala versions.
def sparkScalaPattern = ~"(.*)-([0-9][.][0-9]+)_([0-9][.][0-9]+)"
def sparkScalaVersions = [
"3.3": ["2.12", "2.13"],
"3.4": ["2.12", "2.13"],
"3.5": ["2.12", "2.13"],
]
3 changes: 1 addition & 2 deletions dev/stage-binaries.sh
@@ -20,14 +20,13 @@

SCALA_VERSION=2.12
FLINK_VERSIONS=1.18,1.19,1.20
SPARK_VERSIONS=3.3,3.4,3.5
SPARK_VERSIONS=3.4,3.5
KAFKA_VERSIONS=3

./gradlew -Prelease -DscalaVersion=$SCALA_VERSION -DflinkVersions=$FLINK_VERSIONS -DsparkVersions=$SPARK_VERSIONS -DkafkaVersions=$KAFKA_VERSIONS publishApachePublicationToMavenRepository

# Also publish Scala 2.13 Artifacts for versions that support it.
# Flink does not yet support 2.13 (and is largely dropping a user-facing dependency on Scala). Hive doesn't need a Scala specification.
./gradlew -Prelease -DscalaVersion=2.13 -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.13:publishApachePublicationToMavenRepository :iceberg-spark:iceberg-spark-extensions-3.3_2.13:publishApachePublicationToMavenRepository :iceberg-spark:iceberg-spark-runtime-3.3_2.13:publishApachePublicationToMavenRepository
./gradlew -Prelease -DscalaVersion=2.13 -DsparkVersions=3.4 :iceberg-spark:iceberg-spark-3.4_2.13:publishApachePublicationToMavenRepository :iceberg-spark:iceberg-spark-extensions-3.4_2.13:publishApachePublicationToMavenRepository :iceberg-spark:iceberg-spark-runtime-3.4_2.13:publishApachePublicationToMavenRepository
./gradlew -Prelease -DscalaVersion=2.13 -DsparkVersions=3.5 :iceberg-spark:iceberg-spark-3.5_2.13:publishApachePublicationToMavenRepository :iceberg-spark:iceberg-spark-extensions-3.5_2.13:publishApachePublicationToMavenRepository :iceberg-spark:iceberg-spark-runtime-3.5_2.13:publishApachePublicationToMavenRepository

18 changes: 9 additions & 9 deletions docs/docs/aws.md
@@ -440,7 +440,7 @@ This is turned off by default.
### S3 Tags

Custom [tags](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-tagging.html) can be added to S3 objects while writing and deleting.
For example, to write S3 tags with Spark 3.3, you can start the Spark SQL shell with:
For example, to write S3 tags with Spark 3.5, you can start the Spark SQL shell with:
```
spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket/my/key/prefix \
@@ -457,7 +457,7 @@ The property is set to `true` by default.

With the `s3.delete.tags` config, objects are tagged with the configured key-value pairs before deletion.
Users can configure tag-based object lifecycle policy at bucket level to transition objects to different tiers.
For example, to add S3 delete tags with Spark 3.3, you can start the Spark SQL shell with:
For example, to add S3 delete tags with Spark 3.5, you can start the Spark SQL shell with:

```
sh spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
@@ -473,7 +473,7 @@ Users can also use the catalog property `s3.delete.num-threads` to mention the n

When the catalog property `s3.write.table-tag-enabled` and `s3.write.namespace-tag-enabled` is set to `true` then the objects in S3 will be saved with tags: `iceberg.table=<table-name>` and `iceberg.namespace=<namespace-name>`.
Users can define access and data retention policy per namespace or table based on these tags.
For example, to write table and namespace name as S3 tags with Spark 3.3, you can start the Spark SQL shell with:
For example, to write table and namespace name as S3 tags with Spark 3.5, you can start the Spark SQL shell with:
```
sh spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.my_catalog.warehouse=s3://iceberg-warehouse/s3-tagging \
@@ -493,7 +493,7 @@ disaster recovery, etc.
For using cross-region access points, we need to additionally set `use-arn-region-enabled` catalog property to
`true` to enable `S3FileIO` to make cross-region calls, it's not required for same / multi-region access points.

For example, to use S3 access-point with Spark 3.3, you can start the Spark SQL shell with:
For example, to use S3 access-point with Spark 3.5, you can start the Spark SQL shell with:
```
spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket2/my/key/prefix \
@@ -520,7 +520,7 @@ you to fallback to using your IAM role (and its permission sets directly) to acc
is unable to authorize your S3 call. This can be done using the `s3.access-grants.fallback-to-iam` boolean catalog property. By default,
this property is set to `false`.

For example, to add the S3 Access Grants Integration with Spark 3.3, you can start the Spark SQL shell with:
For example, to add the S3 Access Grants Integration with Spark 3.5, you can start the Spark SQL shell with:
```
spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket2/my/key/prefix \
@@ -537,7 +537,7 @@ For more details on using S3 Access Grants, please refer to [Managing access wit
S3 Cross-Region bucket access can be turned on by setting catalog property `s3.cross-region-access-enabled` to `true`.
This is turned off by default to avoid first S3 API call increased latency.

For example, to enable S3 Cross-Region bucket access with Spark 3.3, you can start the Spark SQL shell with:
For example, to enable S3 Cross-Region bucket access with Spark 3.5, you can start the Spark SQL shell with:
```
spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket2/my/key/prefix \
@@ -554,7 +554,7 @@ For more details, please refer to [Cross-Region access for Amazon S3](https://do

To use S3 Acceleration, we need to set `s3.acceleration-enabled` catalog property to `true` to enable `S3FileIO` to make accelerated S3 calls.

For example, to use S3 Acceleration with Spark 3.3, you can start the Spark SQL shell with:
For example, to use S3 Acceleration with Spark 3.5, you can start the Spark SQL shell with:
```
spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket2/my/key/prefix \
@@ -572,7 +572,7 @@ When clients request a dual-stack endpoint, the bucket URL resolves to an IPv6 a

To use S3 Dual-stack, we need to set `s3.dualstack-enabled` catalog property to `true` to enable `S3FileIO` to make dual-stack S3 calls.

For example, to use S3 Dual-stack with Spark 3.3, you can start the Spark SQL shell with:
For example, to use S3 Dual-stack with Spark 3.5, you can start the Spark SQL shell with:
```
spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket2/my/key/prefix \
@@ -698,7 +698,7 @@ LIB_PATH=/usr/share/aws/aws-java-sdk/


ICEBERG_PACKAGES=(
"iceberg-spark-runtime-3.3_2.12"
"iceberg-spark-runtime-3.5_2.12"
"iceberg-flink-runtime"
"iceberg-aws-bundle"
)
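The spark-sql invocations above are truncated by the diff view. For reference, a fuller sketch of the S3 tagging configuration is shown below; the `io-impl`, `s3.write.tags.<key>`, and `s3.delete.tags.<key>` property names are assumptions based on the AWS integration docs rather than lines shown in this commit:

```
# Sketch only: catalog property names below are assumed from the Iceberg AWS docs,
# not shown in this diff. Replace the bucket, tag keys and values with real ones.
spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
    --conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket/my/key/prefix \
    --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO \
    --conf spark.sql.catalog.my_catalog.s3.write.tags.my_key1=my_val1 \
    --conf spark.sql.catalog.my_catalog.s3.delete.tags.my_key2=my_val2
```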
8 changes: 4 additions & 4 deletions docs/docs/nessie.md
@@ -33,17 +33,17 @@ See [Project Nessie](https://projectnessie.org) for more information on Nessie.
## Enabling Nessie Catalog

The `iceberg-nessie` module is bundled with Spark and Flink runtimes for all versions from `0.11.0`. To get started
with Nessie (with spark-3.3) and Iceberg simply add the Iceberg runtime to your process. Eg: `spark-sql --packages
org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:{{ icebergVersion }}`.
with Nessie (with spark-3.5) and Iceberg simply add the Iceberg runtime to your process. Eg: `spark-sql --packages
org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:{{ icebergVersion }}`.

## Spark SQL Extensions

Nessie SQL extensions can be used to manage the Nessie repo as shown below.
Example for Spark 3.3 with scala 2.12:
Example for Spark 3.5 with scala 2.12:

```
bin/spark-sql
--packages "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:{{ icebergVersion }},org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:{{ nessieVersion }}"
--packages "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:{{ icebergVersion }},org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:{{ nessieVersion }}"
--conf spark.sql.extensions="org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions"
--conf <other settings>
```
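To make the packages line above usable end to end, a minimal Nessie catalog configuration might look like the sketch below; the catalog name, URI, branch, and warehouse are placeholders, and the `catalog-impl`, `uri`, `ref`, and `warehouse` properties are assumed from the Nessie documentation rather than this diff:

```
# Sketch: the Nessie catalog settings are assumptions based on the Nessie docs, not
# part of this commit. Point uri/ref/warehouse at a real Nessie server and store.
bin/spark-sql \
  --packages "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:{{ icebergVersion }},org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:{{ nessieVersion }}" \
  --conf spark.sql.extensions="org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions" \
  --conf spark.sql.catalog.nessie=org.apache.iceberg.spark.SparkCatalog \
  --conf spark.sql.catalog.nessie.catalog-impl=org.apache.iceberg.nessie.NessieCatalog \
  --conf spark.sql.catalog.nessie.uri=http://localhost:19120/api/v1 \
  --conf spark.sql.catalog.nessie.ref=main \
  --conf spark.sql.catalog.nessie.warehouse=s3://my-bucket/warehouse
```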
2 changes: 1 addition & 1 deletion gradle.properties
@@ -19,7 +19,7 @@ jmhIncludeRegex=.*
systemProp.defaultFlinkVersions=1.20
systemProp.knownFlinkVersions=1.18,1.19,1.20
systemProp.defaultSparkVersions=3.5
systemProp.knownSparkVersions=3.3,3.4,3.5
systemProp.knownSparkVersions=3.4,3.5
systemProp.defaultKafkaVersions=3
systemProp.knownKafkaVersions=3
systemProp.defaultScalaVersion=2.12
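These properties define which engine versions the build recognizes and builds by default; the `-DsparkVersions`, `-DscalaVersion`, `-DflinkVersions`, and `-DkafkaVersions` flags used elsewhere in this commit override them per invocation. A minimal sketch:

```
# Build only the Spark 3.5 modules against Scala 2.13, overriding the defaults in
# gradle.properties; 3.3 is no longer in knownSparkVersions after this commit.
./gradlew -DsparkVersions=3.5 -DscalaVersion=2.13 -DflinkVersions= -DkafkaVersions= build
```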
1 change: 0 additions & 1 deletion gradle/libs.versions.toml
@@ -78,7 +78,6 @@ roaringbitmap = "1.3.0"
scala-collection-compat = "2.13.0"
slf4j = "2.0.16"
snowflake-jdbc = "3.22.0"
spark-hive33 = "3.3.4"
spark-hive34 = "3.4.4"
spark-hive35 = "3.5.4"
sqlite-jdbc = "3.49.0.0"
5 changes: 0 additions & 5 deletions jmh.gradle
@@ -38,11 +38,6 @@ if (flinkVersions.contains("1.20")) {
jmhProjects.add(project(":iceberg-flink:iceberg-flink-1.20"))
}

if (sparkVersions.contains("3.3")) {
jmhProjects.add(project(":iceberg-spark:iceberg-spark-3.3_${scalaVersion}"))
jmhProjects.add(project(":iceberg-spark:iceberg-spark-extensions-3.3_${scalaVersion}"))
}

if (sparkVersions.contains("3.4")) {
jmhProjects.add(project(":iceberg-spark:iceberg-spark-3.4_${scalaVersion}"))
jmhProjects.add(project(":iceberg-spark:iceberg-spark-extensions-3.4_${scalaVersion}"))
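With the 3.3 block removed, JMH projects are only registered for Spark 3.4 and 3.5. A hedged sketch of running a benchmark in one of the remaining modules follows; the `jmh` task, the benchmark class name, and the `jmhOutputPath` property are assumptions (only `jmhIncludeRegex` appears in gradle.properties above):

```
# Sketch: run a JMH benchmark in the Spark 3.5 extensions module. The jmh task,
# benchmark class and jmhOutputPath are illustrative assumptions, not from this diff.
./gradlew -DsparkVersions=3.5 -DscalaVersion=2.12 \
  :iceberg-spark:iceberg-spark-extensions-3.5_2.12:jmh \
  -PjmhIncludeRegex=IcebergSortCompactionBenchmark \
  -PjmhOutputPath=benchmark/iceberg-sort-compaction-benchmark-result.txt
```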
12 changes: 0 additions & 12 deletions settings.gradle
@@ -137,18 +137,6 @@ if (flinkVersions.contains("1.20")) {
project(":iceberg-flink:flink-runtime-1.20").name = "iceberg-flink-runtime-1.20"
}

if (sparkVersions.contains("3.3")) {
include ":iceberg-spark:spark-3.3_${scalaVersion}"
include ":iceberg-spark:spark-extensions-3.3_${scalaVersion}"
include ":iceberg-spark:spark-runtime-3.3_${scalaVersion}"
project(":iceberg-spark:spark-3.3_${scalaVersion}").projectDir = file('spark/v3.3/spark')
project(":iceberg-spark:spark-3.3_${scalaVersion}").name = "iceberg-spark-3.3_${scalaVersion}"
project(":iceberg-spark:spark-extensions-3.3_${scalaVersion}").projectDir = file('spark/v3.3/spark-extensions')
project(":iceberg-spark:spark-extensions-3.3_${scalaVersion}").name = "iceberg-spark-extensions-3.3_${scalaVersion}"
project(":iceberg-spark:spark-runtime-3.3_${scalaVersion}").projectDir = file('spark/v3.3/spark-runtime')
project(":iceberg-spark:spark-runtime-3.3_${scalaVersion}").name = "iceberg-spark-runtime-3.3_${scalaVersion}"
}

if (sparkVersions.contains("3.4")) {
include ":iceberg-spark:spark-3.4_${scalaVersion}"
include ":iceberg-spark:spark-extensions-3.4_${scalaVersion}"
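After this change the `spark-3.3` include block is gone entirely, so Gradle no longer knows about those subprojects. A quick way to confirm which Spark modules remain wired in (using Gradle's built-in `projects` task) is sketched below:

```
# List configured subprojects; the iceberg-spark-3.3_* modules should no longer appear.
./gradlew -DsparkVersions=3.4,3.5 projects | grep iceberg-spark
```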
2 changes: 1 addition & 1 deletion site/docs/multi-engine-support.md
@@ -67,7 +67,7 @@ Each engine version undergoes the following lifecycle stages:
| 3.0 | End of Life | 0.9.0 | 1.0.0 | [iceberg-spark-runtime-3.0_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.0_2.12/1.0.0/iceberg-spark-runtime-3.0_2.12-1.0.0.jar) |
| 3.1 | End of Life | 0.12.0 | 1.3.1 | [iceberg-spark-runtime-3.1_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.1_2.12/1.3.1/iceberg-spark-runtime-3.1_2.12-1.3.1.jar) [1] |
| 3.2 | End of Life | 0.13.0 | 1.4.3 | [iceberg-spark-runtime-3.2_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/1.4.3/iceberg-spark-runtime-3.2_2.12-1.4.3.jar) |
| 3.3 | Deprecated | 0.14.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.3_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.3_2.12-{{ icebergVersion }}.jar) |
| 3.3 | End of Life | 0.14.0 | 1.8.0 | [iceberg-spark-runtime-3.3_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/1.8.0/iceberg-spark-runtime-3.3_2.12-1.8.0.jar) |
| 3.4 | Maintained | 1.3.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.4_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.4_2.12-{{ icebergVersion }}.jar) |
| 3.5 | Maintained | 1.4.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.5_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.5_2.12-{{ icebergVersion }}.jar) |

4 changes: 0 additions & 4 deletions spark/build.gradle
@@ -20,10 +20,6 @@
// add enabled Spark version modules to the build
def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")).split(",")

if (sparkVersions.contains("3.3")) {
apply from: file("$projectDir/v3.3/build.gradle")
}

if (sparkVersions.contains("3.4")) {
apply from: file("$projectDir/v3.4/build.gradle")
}