Skip to content

Commit

Permalink
[CI] Upgrade to CUDA 12.8 (#11202)
Browse files Browse the repository at this point in the history
  • Loading branch information
hcho3 authored Feb 5, 2025
1 parent fc32798 commit 30a7fd5
Show file tree
Hide file tree
Showing 8 changed files with 26 additions and 29 deletions.
4 changes: 2 additions & 2 deletions demo/dask/gpu_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d
.. versionadded:: 1.2.0
"""
X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))
X = dd.from_dask_array(X).to_backend("cudf")
y = dd.from_dask_array(y).to_backend("cudf")

# `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful that it can not
# be used for anything else other than training unless a reference is specified. See
Expand Down
6 changes: 3 additions & 3 deletions tests/buildkite/conftest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ function set_buildkite_env_vars_in_container {

set -x

CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
RAPIDS_VERSION=24.06
CUDA_VERSION=12.8.0
NCCL_VERSION=2.25.1-1
RAPIDS_VERSION=24.12
DEV_RAPIDS_VERSION=24.06
SPARK_VERSION=3.5.1
JDK_VERSION=8
Expand Down
10 changes: 0 additions & 10 deletions tests/buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,6 @@ steps:
queue: linux-amd64-cpu
- wait
#### -------- BUILD --------
- label: ":console: Run clang-tidy"
command: "tests/buildkite/run-clang-tidy.sh"
key: run-clang-tidy
agents:
queue: linux-amd64-cpu
- label: ":console: Build CPU"
command: "tests/buildkite/build-cpu.sh"
key: build-cpu
Expand All @@ -41,11 +36,6 @@ steps:
key: build-cuda
agents:
queue: linux-amd64-cpu
- label: ":console: Build CUDA with RMM"
command: "tests/buildkite/build-cuda-with-rmm.sh"
key: build-cuda-with-rmm
agents:
queue: linux-amd64-cpu
- label: ":console: Build R package with CUDA"
command: "tests/buildkite/build-gpu-rpkg.sh"
key: build-gpu-rpkg
Expand Down
13 changes: 3 additions & 10 deletions tests/buildkite/test-cpp-gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ set -euo pipefail

source tests/buildkite/conftest.sh

# Work around https://github.com/dmlc/xgboost/issues/11154
export CI_DOCKER_EXTRA_PARAMS_INIT='-e NCCL_RAS_ENABLE=0'

echo "--- Run Google Tests with CUDA, using a GPU"
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
chmod +x build/testxgboost
Expand All @@ -12,13 +15,3 @@ tests/ci_build/ci_build.sh gpu --use-gpus \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
--build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
build/testxgboost

echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
rm -rfv build/
buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
chmod +x build/testxgboost
tests/ci_build/ci_build.sh gpu --use-gpus \
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
--build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
build/testxgboost --use-rmm-pool
3 changes: 2 additions & 1 deletion tests/buildkite/test-cpp-mgpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ set -euo pipefail

source tests/buildkite/conftest.sh

# Work around https://github.com/dmlc/xgboost/issues/11154
# Allocate extra space in /dev/shm to enable NCCL
export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
export CI_DOCKER_EXTRA_PARAMS_INIT='-e NCCL_RAS_ENABLE=0 --shm-size=4g'

echo "--- Run Google Tests with CUDA, using multiple GPUs"
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
Expand Down
6 changes: 4 additions & 2 deletions tests/ci_build/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@ ENV PATH=/opt/miniforge/bin:$PATH
RUN \
export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \
mamba create -y -n gpu_test -c rapidsai -c nvidia -c conda-forge \
python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cuda-version=$CUDA_VERSION_ARG \
"nccl>=${NCCL_SHORT_VER}" \
dask \
dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz "hypothesis<=6.112" \
numpy pytest pytest-timeout scipy \
"scikit-learn<=1.5.2" \
pandas matplotlib wheel python-kubernetes urllib3 graphviz "hypothesis<=6.112" \
"pyspark>=3.4.0" cloudpickle cuda-python && \
mamba clean --all && \
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
Expand Down
1 change: 1 addition & 0 deletions tests/ci_build/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ case "$suite" in
set -x
install_xgboost
setup_pyspark_envs
export NCCL_RAS_ENABLE=0
pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_dask
pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_spark
Expand Down
12 changes: 11 additions & 1 deletion tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest
from hypothesis import given, note, settings, strategies
from hypothesis._settings import duration
from packaging.version import parse as parse_version

import xgboost as xgb
from xgboost import testing as tm
Expand Down Expand Up @@ -41,14 +42,20 @@
try:
import cudf
import dask.dataframe as dd
from dask import __version__ as dask_version
from dask import array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from xgboost import dask as dxgb
from xgboost.testing.dask import check_init_estimation, check_uneven_nan
except ImportError:
pass
dask_version = None


dask_version_ge110 = dask_version and parse_version(dask_version) >= parse_version(
"2024.11.0"
)


def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
Expand Down Expand Up @@ -378,6 +385,9 @@ def test_early_stopping(self, local_cuda_client: Client) -> None:
dump = booster.get_dump(dump_format="json")
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1

@pytest.mark.xfail(
dask_version_ge110, reason="Test cannot pass with Dask 2024.11.0+"
)
@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.parametrize("model", ["boosting"])
def test_dask_classifier(self, model: str, local_cuda_client: Client) -> None:
Expand Down

0 comments on commit 30a7fd5

Please sign in to comment.