Merge pull request #18 from lyft/lyft/3.0

Update master to the current 3.0.1
lyft · Nov 16, 2020 · 1c5396f · 1c5396f
2 parents 5c29572 + d0e5c87
commit 1c5396f
Show file tree

Hide file tree

Showing 4,725 changed files with 449,775 additions and 103,604 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -1,2 +1,7 @@
 *.bat text eol=crlf
 *.cmd text eol=crlf
+*.java text eol=lf
+*.scala text eol=lf
+*.xml text eol=lf
+*.py text eol=lf
+*.R text eol=lf
diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE
@@ -1,10 +1,42 @@
-## What changes were proposed in this pull request?
+<!--
+Thanks for sending a pull request!  Here are some tips for you:
+  1. If this is your first time, please read our contributor guidelines: https://spark.apache.org/contributing.html
+  2. Ensure you have added or run the appropriate tests for your PR: https://spark.apache.org/developer-tools.html
+  3. If the PR is unfinished, add '[WIP]' in your PR title, e.g., '[WIP][SPARK-XXXX] Your PR title ...'.
+  4. Be sure to keep the PR description updated to reflect all changes.
+  5. Please write your PR title to summarize what this PR proposes.
+  6. If possible, provide a concise example to reproduce the issue for a faster review.
+-->
 
-(Please fill in changes proposed in this fix)
+### What changes were proposed in this pull request?
+<!--
+Please clarify what changes you are proposing. The purpose of this section is to outline the changes and how this PR fixes the issue. 
+If possible, please consider writing useful notes for better and faster reviews in your PR. See the examples below.
+  1. If you refactor code in a way that changes classes, showing the class hierarchy will help reviewers.
+  2. If you fix some SQL features, you can provide some references of other DBMSes.
+  3. If there is design documentation, please add the link.
+  4. If there is a discussion in the mailing list, please add the link.
+-->
 
-## How was this patch tested?
 
-(Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests)
-(If this patch involves UI changes, please attach a screenshot; otherwise, remove this)
+### Why are the changes needed?
+<!--
+Please clarify why the changes are needed. For instance,
+  1. If you propose a new API, clarify the use case for a new API.
+  2. If you fix a bug, you can clarify why it is a bug.
+-->
 
-Please review http://spark.apache.org/contributing.html before opening a pull request.
+
+### Does this PR introduce any user-facing change?
+<!--
+If yes, please clarify the previous behavior and the change this PR proposes - provide the console output, description and/or an example to show the behavior difference if possible.
+If no, write 'No'.
+-->
+
+
+### How was this patch tested?
+<!--
+If tests were added, say they were added here. Please make sure to add some test cases that check the changes thoroughly including negative and positive cases if possible.
+If it was tested in a way different from regular unit tests, please clarify how you tested step by step, ideally copy and paste-able, so that other reviewers can test and check, and descendants can verify in the future.
+If tests were not added, please describe why they were not added and/or why it was difficult to add.
+-->
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -0,0 +1,287 @@
+name: Build and test  # test_report.yml's workflow_run trigger refers to this workflow by this name.
+
+on:  # Run only for pushes to, and pull requests targeting, branch-3.0.
+  push:
+    branches:
+    - branch-3.0
+  pull_request:
+    branches:
+    - branch-3.0
+
+jobs:
+  # Build: build Spark and run the tests for specified modules.
+  build:
+    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # Keep the other matrix jobs running when one of them fails.
+      matrix:
+        java:  # Versions are quoted so YAML keeps them as strings instead of floats.
+          - "1.8"
+        hadoop:
+          - hadoop2.7
+        hive:
+          - hive2.3
+        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
+        # Kinesis tests depends on external Amazon kinesis service.
+        # Note that the modules below are from sparktestsupport/modules.py.
+        modules:
+          - >-
+            core, unsafe, kvstore, avro,
+            network-common, network-shuffle, repl, launcher,
+            examples, sketch, graphx
+          - >-
+            catalyst, hive-thriftserver
+          - >-
+            streaming, sql-kafka-0-10, streaming-kafka-0-10,
+            mllib-local, mllib,
+            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
+          - >-
+            pyspark-sql, pyspark-mllib
+          - >-
+            pyspark-core, pyspark-streaming, pyspark-ml
+          - >-
+            sparkr
+        # Here, we split Hive and SQL tests into the slow ones and the rest of them.
+        included-tags: [""]
+        # Some tests are disabled in GitHub Actions. Ideally, we should remove this tag
+        # and run all tests.
+        excluded-tags: ["org.apache.spark.tags.GitHubActionsUnstableTest"]
+        comment: [""]
+        include:  # 'hive' and 'sql' are not in 'modules' above, so each entry below adds its own job.
+          # Hive tests
+          - modules: hive
+            java: "1.8"
+            hadoop: hadoop2.7
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- slow tests"
+          - modules: hive
+            java: "1.8"
+            hadoop: hadoop2.7
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.SlowHiveTest,org.apache.spark.tags.GitHubActionsUnstableTest
+            comment: "- other tests"
+          # SQL tests
+          - modules: sql
+            java: "1.8"
+            hadoop: hadoop2.7
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.ExtendedSQLTest
+            comment: "- slow tests"
+          - modules: sql
+            java: "1.8"
+            hadoop: hadoop2.7
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.GitHubActionsUnstableTest
+            comment: "- other tests"
+    env:  # Matrix values exported for the 'Run tests' step below.
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
+      HADOOP_PROFILE: ${{ matrix.hadoop }}
+      HIVE_PROFILE: ${{ matrix.hive }}
+      # GitHub Actions' default miniconda to use in pip packaging test.
+      CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+      # In order to fetch changed files
+      with:
+        fetch-depth: 0
+    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+    - name: Cache Scala, SBT, Maven and Zinc
+      uses: actions/cache@v2
+      with:
+        path: |
+          build/apache-maven-*
+          build/zinc-*
+          build/scala-*
+          build/*.jar
+        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+        restore-keys: |
+          build-
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.m2/repository
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
+        restore-keys: |
+          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
+    - name: Cache Ivy local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.ivy2/cache
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
+    - name: Install JDK ${{ matrix.java }}
+      uses: actions/setup-java@v1
+      with:
+        java-version: ${{ matrix.java }}
+    # PySpark
+    - name: Install PyPy3
+      # Note that order of Python installations here matters because default python3 is
+      # overridden by pypy3.
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark')
+      with:
+        python-version: pypy3
+        architecture: x64
+    - name: Install Python 2.7
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark')
+      with:
+        python-version: "2.7"
+        architecture: x64
+    - name: Install Python 3.8
+      uses: actions/setup-python@v2
+      # We should install one Python version that is 3 or higher for SQL and Yarn because:
+      # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
+      # - Yarn has a Python specific test too, for example, YarnClusterSuite.
+      if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      with:
+        python-version: "3.8"
+        architecture: x64
+    - name: Install Python packages (Python 2.7 and PyPy3)
+      if: contains(matrix.modules, 'pyspark')
+      # PyArrow is not supported in PyPy yet, see ARROW-2651.
+      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
+      run: |
+        python2.7 -m pip install numpy pyarrow pandas scipy xmlrunner
+        python2.7 -m pip list
+        # PyPy does not have xmlrunner
+        pypy3 -m pip install numpy pandas
+        pypy3 -m pip list
+    - name: Install Python packages (Python 3.8)
+      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      run: |
+        python3.8 -m pip install numpy pyarrow pandas scipy xmlrunner
+        python3.8 -m pip list
+    # SparkR
+    - name: Install R 4.0
+      if: contains(matrix.modules, 'sparkr')
+      run: |
+        sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+        curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+        sudo apt-get update
+        sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
+    - name: Install R packages
+      if: contains(matrix.modules, 'sparkr')
+      run: |
+        # qpdf is required to reduce the size of PDFs to make CRAN check pass. See SPARK-32497.
+        sudo apt-get install -y libcurl4-openssl-dev qpdf
+        sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
+        # Show installed packages in R.
+        sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
+    # Run the tests.
+    - name: Run tests
+      run: |
+        # Hive tests become flaky when running in parallel as it's too intensive.
+        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
+        mkdir -p ~/.m2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
+        rm -rf ~/.m2/repository/org/apache/spark
+    - name: Upload test results to report
+      if: always()  # Upload the reports even when an earlier step failed.
+      uses: actions/upload-artifact@v2
+      with:
+        name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+        path: "**/target/test-reports/*.xml"
+    - name: Upload unit tests log files
+      if: failure()  # Logs are uploaded only when an earlier step failed.
+      uses: actions/upload-artifact@v2
+      with:
+        name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+        path: "**/target/unit-tests.log"
+
+  # Static analysis, and documentation build
+  lint:
+    name: Linters, licenses, dependencies and documentation generation
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.m2/repository
+        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
+        restore-keys: |
+          docs-maven-
+    - name: Install JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: "1.8"  # Quoted so YAML keeps the version as a string, not a float.
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
+      with:
+        python-version: "3.6"
+        architecture: x64
+    - name: Install Python linter dependencies
+      run: |
+        pip3 install flake8 sphinx numpy
+    - name: Install R 4.0
+      run: |
+        sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+        curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+        sudo apt-get update
+        sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
+    - name: Install R linter dependencies and SparkR  # Installs devtools, a tag-pinned lintr, then runs ./R/install-dev.sh.
+      run: |
+        sudo apt-get install -y libcurl4-openssl-dev
+        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
+        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
+        ./R/install-dev.sh
+    - name: Install Ruby 2.7 for documentation generation
+      uses: actions/setup-ruby@v1
+      with:
+        ruby-version: "2.7"
+    - name: Install dependencies for documentation generation
+      run: |
+        sudo apt-get install -y libcurl4-openssl-dev pandoc
+        pip install sphinx mkdocs numpy
+        gem install jekyll jekyll-redirect-from rouge
+        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+    - name: Scala linter
+      run: ./dev/lint-scala
+    - name: Java linter
+      run: ./dev/lint-java
+    - name: Python linter
+      run: ./dev/lint-python
+    - name: R linter
+      run: ./dev/lint-r
+    - name: License test
+      run: ./dev/check-license
+    - name: Dependencies test
+      run: ./dev/test-dependencies.sh
+    - name: Run documentation build  # Builds the Jekyll site from the docs/ directory.
+      run: |
+        cd docs
+        jekyll build
+
+  java11:  # Builds the project with Maven on JDK 11; tests are skipped (-DskipTests).
+    name: Java 11 build
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.m2/repository
+        key: java11-maven-${{ hashFiles('**/pom.xml') }}  # The key changes whenever any pom.xml changes.
+        restore-keys: |
+          java11-maven-
+    - name: Install Java 11
+      uses: actions/setup-java@v1
+      with:
+        java-version: 11
+    - name: Build with Maven  # Afterwards removes the installed org/apache/spark artifacts from ~/.m2/repository.
+      run: |
+        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
+        export MAVEN_CLI_OPTS="--no-transfer-progress"
+        mkdir -p ~/.m2
+        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
+        rm -rf ~/.m2/repository/org/apache/spark
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
@@ -0,0 +1,24 @@
+name: Close stale PRs
+
+on:
+  schedule:
+  - cron: "0 0 * * *"  # Once a day at midnight (GitHub evaluates schedules in UTC).
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/stale@v1.1.0
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        stale-pr-message: >
+          We're closing this PR because it hasn't been updated in a while.
+          This isn't a judgement on the merit of the PR in any way. It's just
+          a way of keeping the PR queue manageable.
+
+          If you'd like to revive this PR, please reopen it and ask a
+          committer to remove the Stale tag!
+        days-before-stale: 100  # PRs with no activity for 100 days are marked stale.
+        # Setting this to 0 is the same as setting it to 1.
+        # See: https://github.com/actions/stale/issues/28
+        days-before-close: 0
diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
@@ -0,0 +1,24 @@
+name: Report test results
+on:
+  workflow_run:
+    workflows: ["Build and test"]  # Must match the `name:` of the workflow whose runs are reported.
+    types:
+      - completed  # Run once the triggering workflow run has completed.
+
+jobs:
+  test_report:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Download test results to report
+      uses: dawidd6/action-download-artifact@v2
+      with:
+        github_token: ${{ secrets.GITHUB_TOKEN }}
+        workflow: ${{ github.event.workflow_run.workflow_id }}  # The workflow that triggered this run.
+        commit: ${{ github.event.workflow_run.head_commit.id }}
+    - name: Publish test report
+      uses: scacap/action-surefire-report@v1
+      with:
+        check_name: Report test results
+        github_token: ${{ secrets.GITHUB_TOKEN }}
+        report_paths: "**/target/test-reports/*.xml"  # Same glob that build_and_test.yml uploads as test-result artifacts.
+        commit: ${{ github.event.workflow_run.head_commit.id }}  # Head commit of the triggering run.