diff --git a/.clang-tidy b/.clang-tidy index 04689c330..9b3f844c9 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -64,4 +64,6 @@ CheckOptions: value: '1' - key: readability-magic-numbers.IgnorePowersOf2IntegerValues value: '1' + - key: cppcoreguidelines-avoid-do-while.IgnoreMacros + value: 'true' ... diff --git a/.clangd b/.clangd new file mode 100644 index 000000000..7c4fe036d --- /dev/null +++ b/.clangd @@ -0,0 +1,65 @@ +# https://clangd.llvm.org/config + +# Apply a config conditionally to all C files +If: + PathMatch: .*\.(c|h)$ + +--- + +# Apply a config conditionally to all C++ files +If: + PathMatch: .*\.(c|h)pp + +--- + +# Apply a config conditionally to all CUDA files +If: + PathMatch: .*\.cuh? +CompileFlags: + Add: + - "-x" + - "cuda" + # No error on unknown CUDA versions + - "-Wno-unknown-cuda-version" + # Allow variadic CUDA functions + - "-Xclang=-fcuda-allow-variadic-functions" +Diagnostics: + Suppress: + - "variadic_device_fn" + - "attributes_not_allowed" + +--- + +# Tweak the clangd parse settings for all files +CompileFlags: + Add: + # report all errors + - "-ferror-limit=0" + - "-fmacro-backtrace-limit=0" + - "-ftemplate-backtrace-limit=0" + # Skip the CUDA version check + - "--no-cuda-version-check" + Remove: + # remove gcc's -fcoroutines + - -fcoroutines + # remove nvc++ flags unknown to clang + - "-gpu=*" + - "-stdpar*" + # remove nvcc flags unknown to clang + - "-arch*" + - "-gencode*" + - "--generate-code*" + - "-ccbin*" + - "-t=*" + - "--threads*" + - "-Xptxas*" + - "-Xcudafe*" + - "-Xfatbin*" + - "-Xcompiler*" + - "--diag-suppress*" + - "--diag_suppress*" + - "--compiler-options*" + - "--expt-extended-lambda" + - "--expt-relaxed-constexpr" + - "-forward-unknown-to-host-compiler" + - "-Werror=cross-execution-space-call" diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 000000000..9d35e3f97 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,30 @@ +# syntax=docker/dockerfile:1.5 + +ARG BASE +ARG PYTHON_PACKAGE_MANAGER=conda + +FROM ${BASE} as pip-base + +ENV DEFAULT_VIRTUAL_ENV=rapids + +FROM ${BASE} as conda-base + +ENV DEFAULT_CONDA_ENV=rapids + +FROM ${PYTHON_PACKAGE_MANAGER}-base + +ARG CUDA +ENV CUDAARCHS="RAPIDS" +ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" + +ARG PYTHON_PACKAGE_MANAGER +ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}" + +ENV PYTHONSAFEPATH="1" +ENV PYTHONUNBUFFERED="1" +ENV PYTHONDONTWRITEBYTECODE="1" + +ENV SCCACHE_REGION="us-east-2" +ENV SCCACHE_BUCKET="rapids-sccache-devs" +ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV HISTFILE="/home/coder/.cache/._bash_history" diff --git a/.devcontainer/README.md b/.devcontainer/README.md new file mode 100644 index 000000000..31ddab9aa --- /dev/null +++ b/.devcontainer/README.md @@ -0,0 +1,35 @@ +# RMM Development Containers + +This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces). + +This container is a turnkey development environment for building and testing the RMM C++ and Python libraries. 
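+
+Once the container is running, the `rapids-build-utils` feature (installed by the `devcontainer.json` files below) provides unified build commands. For example, this is the same invocation the repo's CI runs in its devcontainer job (see the `devcontainer` job in `.github/workflows/pr.yaml` in this diff):
+
+```shell
+sccache -z;                                # reset sccache stats
+build-all -DBUILD_BENCHMARKS=ON --verbose; # build all RMM components
+sccache -s;                                # report sccache hit rates for the build
+```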
+
+## Table of Contents
+
+* [Prerequisites](#prerequisites)
+* [Host bind mounts](#host-bind-mounts)
+* [Launch a Dev Container](#launch-a-dev-container)
+
+## Prerequisites
+
+* [VSCode](https://code.visualstudio.com/download)
+* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+
+## Host bind mounts
+
+By default, the following directories are bind-mounted into the devcontainer:
+
+* `${repo}:/home/coder/rmm`
+* `${repo}/../.aws:/home/coder/.aws`
+* `${repo}/../.local:/home/coder/.local`
+* `${repo}/../.cache:/home/coder/.cache`
+* `${repo}/../.conda:/home/coder/.conda`
+* `${repo}/../.config:/home/coder/.config`
+
+This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs.
+
+## Launch a Dev Container
+
+To launch a devcontainer from VSCode, open the RMM repo and select the "Reopen in Container" button in the bottom right:
+
+*(screenshot: the "Reopen in Container" prompt)*
+ +Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command. diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json new file mode 100644 index 000000000..ba3555a26 --- /dev/null +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/rmm,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json new file mode 100644 index 000000000..a5b4290e9 --- /dev/null +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -0,0 +1,36 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/rmm,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json new file mode 100644 index 000000000..03282635e --- /dev/null +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:23.10-cpp-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/rmm,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json new file mode 100644 index 000000000..8dc607f28 --- /dev/null +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -0,0 +1,36 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda12.0-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/rmm,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 000000000..895ba83ee --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 57bdd6110..9a0b41550 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,6 +5,4 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -copy_prs: true -rerun_tests: true recently_updated: true diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 67af4dafc..d80bf1d41 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,11 +64,11 @@ jobs: date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -78,7 +78,7 @@ jobs: wheel-publish: needs: wheel-build 
secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f5b7e3bf5..95ac72d59 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,58 +20,67 @@ jobs: - docs-build - wheel-build - wheel-tests + - devcontainer secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: build_type: pull-request script: ci/build_wheel.sh wheel-tests: needs: wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: build_type: pull-request script: ci/test_wheel.sh + devcontainer: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.10 + with: + build_command: | + sccache -z; + build-all -DBUILD_BENCHMARKS=ON --verbose; + sccache -s; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9a26dac4a..81f1b791b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.gitignore b/.gitignore index 1ab57e4d4..0f98d3e44 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ DartConfiguration.tcl .DS_Store *.manifest *.spec +compile_commands.json ## Python build directories & artifacts dist/ @@ -153,3 +154,7 @@ rmm_log.txt # cibuildwheel /wheelhouse + +# clang tooling +compile_commands.json +.clangd/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aa076e6be..7f095025a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.1 + rev: v16.0.6 hooks: - id: clang-format types_or: [c, c++, cuda] @@ -69,6 +69,13 @@ repos: # of dependencies, so we'll have to update this manually. additional_dependencies: - cmakelang==0.6.13 + - id: doxygen-check + name: doxygen-check + entry: ./scripts/doxygen.sh + types_or: [file] + language: system + pass_filenames: false + verbose: true - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.0.278 hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index 453ab54c2..901b3790a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,42 @@ +# RMM 23.10.00 (11 Oct 2023) + +## 🚨 Breaking Changes + +- Update to Cython 3.0.0 ([#1313](https://github.com/rapidsai/rmm/pull/1313)) [@vyasr](https://github.com/vyasr) + +## 🐛 Bug Fixes + +- Compile cdef public functions from torch_allocator with C ABI ([#1350](https://github.com/rapidsai/rmm/pull/1350)) [@wence-](https://github.com/wence-) +- Make doxygen only a conda dependency. ([#1344](https://github.com/rapidsai/rmm/pull/1344)) [@bdice](https://github.com/bdice) +- Use `conda mambabuild` not `mamba mambabuild` ([#1338](https://github.com/rapidsai/rmm/pull/1338)) [@wence-](https://github.com/wence-) +- Fix stream_ordered_memory_resource attempt to record event in stream from another device ([#1333](https://github.com/rapidsai/rmm/pull/1333)) [@harrism](https://github.com/harrism) + +## 📖 Documentation + +- Clean up headers in CMakeLists.txt. 
([#1341](https://github.com/rapidsai/rmm/pull/1341)) [@bdice](https://github.com/bdice) +- Add pre-commit hook to validate doxygen ([#1334](https://github.com/rapidsai/rmm/pull/1334)) [@vyasr](https://github.com/vyasr) +- Fix doxygen warnings ([#1317](https://github.com/rapidsai/rmm/pull/1317)) [@vyasr](https://github.com/vyasr) +- Treat warnings as errors in Python documentation ([#1316](https://github.com/rapidsai/rmm/pull/1316)) [@vyasr](https://github.com/vyasr) + +## 🚀 New Features + +- Enable RMM Debug Logging via Python ([#1339](https://github.com/rapidsai/rmm/pull/1339)) [@harrism](https://github.com/harrism) + +## 🛠️ Improvements + +- Update image names ([#1346](https://github.com/rapidsai/rmm/pull/1346)) [@AyodeAwe](https://github.com/AyodeAwe) +- Update to clang 16.0.6. ([#1343](https://github.com/rapidsai/rmm/pull/1343)) [@bdice](https://github.com/bdice) +- Update doxygen to 1.9.1 ([#1337](https://github.com/rapidsai/rmm/pull/1337)) [@vyasr](https://github.com/vyasr) +- Simplify wheel build scripts and allow alphas of RAPIDS dependencies ([#1335](https://github.com/rapidsai/rmm/pull/1335)) [@divyegala](https://github.com/divyegala) +- Use `copy-pr-bot` ([#1329](https://github.com/rapidsai/rmm/pull/1329)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add RMM devcontainers ([#1328](https://github.com/rapidsai/rmm/pull/1328)) [@trxcllnt](https://github.com/trxcllnt) +- Add Python bindings for `limiting_resource_adaptor` ([#1327](https://github.com/rapidsai/rmm/pull/1327)) [@pentschev](https://github.com/pentschev) +- Fix missing jQuery error in docs ([#1321](https://github.com/rapidsai/rmm/pull/1321)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use fetch_rapids.cmake. ([#1319](https://github.com/rapidsai/rmm/pull/1319)) [@bdice](https://github.com/bdice) +- Update to Cython 3.0.0 ([#1313](https://github.com/rapidsai/rmm/pull/1313)) [@vyasr](https://github.com/vyasr) +- Branch 23.10 merge 23.08 ([#1312](https://github.com/rapidsai/rmm/pull/1312)) [@vyasr](https://github.com/vyasr) +- Branch 23.10 merge 23.08 ([#1309](https://github.com/rapidsai/rmm/pull/1309)) [@vyasr](https://github.com/vyasr) + # RMM 23.08.00 (9 Aug 2023) ## 🚨 Breaking Changes diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c4453968..814f41be0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,11 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/RAPIDS.cmake - ${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) -endif() -include(${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) +include(fetch_rapids.cmake) include(rapids-cmake) include(rapids-cpm) @@ -27,16 +23,21 @@ include(rapids-find) project( RMM - VERSION 23.08.00 + VERSION 23.10.00 LANGUAGES CXX) # Write the version header rapids_cmake_write_version_file(include/rmm/version_config.hpp) +# ################################################################################################## +# * build type ------------------------------------------------------------------------------------- + # Set a default build type if none was specified rapids_cmake_build_type(Release) -# build options +# ################################################################################################## +# * build options ---------------------------------------------------------------------------------- + option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build (google) 
benchmarks" OFF) set(RMM_LOGGING_LEVEL @@ -52,17 +53,28 @@ message(STATUS "RMM: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'") # cudart can be statically linked or dynamically linked the python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +# ################################################################################################## +# * compiler options ------------------------------------------------------------------------------- + # find packages we depend on rapids_find_package( CUDAToolkit REQUIRED BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports) + +# ################################################################################################## +# * dependencies ----------------------------------------------------------------------------------- + +# add third party dependencies using CPM rapids_cpm_init() + include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) include(cmake/thirdparty/get_thrust.cmake) -# library targets +# ################################################################################################## +# * library targets -------------------------------------------------------------------------------- + add_library(rmm INTERFACE) add_library(rmm::rmm ALIAS rmm) @@ -83,6 +95,9 @@ target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) target_link_libraries(rmm INTERFACE dl) target_compile_features(rmm INTERFACE cxx_std_17 $) +# ################################################################################################## +# * tests and benchmarks --------------------------------------------------------------------------- + if((BUILD_TESTS OR BUILD_BENCHMARKS) AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) include(rapids-cuda) rapids_cuda_init_architectures(RMM) @@ -94,7 +109,9 @@ if((BUILD_TESTS OR BUILD_BENCHMARKS) AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAM message(STATUS "RMM: Building benchmarks with GPU Architectures: ${CMAKE_CUDA_ARCHITECTURES}") endif() -# optionally build tests +# ################################################################################################## +# * add tests -------------------------------------------------------------------------------------- + if(BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) include(cmake/thirdparty/get_gtest.cmake) include(CTest) # calls enable_testing() @@ -102,19 +119,19 @@ if(BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) add_subdirectory(tests) endif() -# optionally build benchmarks +# ################################################################################################## +# * add benchmarks --------------------------------------------------------------------------------- + if(BUILD_BENCHMARKS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) include(${rapids-cmake-dir}/cpm/gbench.cmake) rapids_cpm_gbench() add_subdirectory(benchmarks) endif() -include(CPack) +# ################################################################################################## +# * install targets -------------------------------------------------------------------------------- -# optionally assemble Thrust pretty-printers -if(Thrust_SOURCE_DIR) - configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY) -endif() +include(CPack) # install export targets install(TARGETS rmm EXPORT rmm-exports) @@ -147,7 +164,9 @@ rapids_export( DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) -# build export targets +# 
################################################################################################## +# * build export ----------------------------------------------------------------------------------- + rapids_export( BUILD rmm EXPORT_SET rmm-exports @@ -156,7 +175,8 @@ rapids_export( DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) -# make documentation +# ################################################################################################## +# * make documentation ----------------------------------------------------------------------------- add_custom_command( OUTPUT RMM_DOXYGEN @@ -169,3 +189,11 @@ add_custom_target( rmm_doc DEPENDS RMM_DOXYGEN COMMENT "Target for the custom command to build the RMM doxygen docs") + +# ################################################################################################## +# * make gdb helper scripts ------------------------------------------------------------------------ + +# optionally assemble Thrust pretty-printers +if(Thrust_SOURCE_DIR) + configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY) +endif() diff --git a/README.md b/README.md index 8d65b7d33..4059c9ce0 100644 --- a/README.md +++ b/README.md @@ -354,11 +354,32 @@ objects for each device and sets them as the per-device resource for that device ```c++ std::vector> per_device_pools; for(int i = 0; i < N; ++i) { - cudaSetDevice(i); // set device i before creating MR - // Use a vector of unique_ptr to maintain the lifetime of the MRs - per_device_pools.push_back(std::make_unique()); - // Set the per-device resource for device i - set_per_device_resource(cuda_device_id{i}, &per_device_pools.back()); + cudaSetDevice(i); // set device i before creating MR + // Use a vector of unique_ptr to maintain the lifetime of the MRs + per_device_pools.push_back(std::make_unique()); + // Set the per-device resource for device i + set_per_device_resource(cuda_device_id{i}, &per_device_pools.back()); +} +``` + +Note that the CUDA device that is current when creating a `device_memory_resource` must also be +current any time that `device_memory_resource` is used to deallocate memory, including in a +destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an +(incorrect) example that assumes the above example loop has been run to create a +`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the +line of the error comment. + +```c++ +{ + RMM_CUDA_TRY(cudaSetDevice(0)); + rmm::device_buffer buf_a(16); + + { + RMM_CUDA_TRY(cudaSetDevice(1)); + rmm::device_buffer buf_b(16); + } + + // Error: when buf_a is destroyed, the current device must be 0, but it is 1 } ``` @@ -560,9 +581,12 @@ of more detailed logging. The default is `INFO`. Available levels are `TRACE`, ` The log relies on the [spdlog](https://github.com/gabime/spdlog.git) library. -Note that to see logging below the `INFO` level, the C++ application must also call -`rmm::logger().set_level()`, e.g. to enable all levels of logging down to `TRACE`, call -`rmm::logger().set_level(spdlog::level::trace)` (and compile with `-DRMM_LOGGING_LEVEL=TRACE`). +Note that to see logging below the `INFO` level, the application must also set the logging level at +run time. C++ applications must must call `rmm::logger().set_level()`, for example to enable all +levels of logging down to `TRACE`, call `rmm::logger().set_level(spdlog::level::trace)` (and compile +librmm with `-DRMM_LOGGING_LEVEL=TRACE`). 
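+
+For example, in C++ (a minimal sketch):
+
+```c++
+#include <rmm/logger.hpp>
+
+int main()
+{
+  // Lowering the runtime level only reveals log sites that were compiled in,
+  // so librmm must also be built with -DRMM_LOGGING_LEVEL=TRACE.
+  rmm::logger().set_level(spdlog::level::trace);
+}
+```
+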
Python applications must call `rmm.set_logging_level()`, +for example to enable all levels of logging down to `TRACE`, call `rmm.set_logging_level("trace")` +(and compile the RMM Python module with `-DRMM_LOGGING_LEVEL=TRACE`). Note that debug logging is different from the CSV memory allocation logging provided by `rmm::mr::logging_resource_adapter`. The latter is for logging a history of allocation / diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index 8c670525d..470442830 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -68,7 +68,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr, { std::default_random_engine generator; - max_usage *= size_mb; // convert to bytes + max_usage *= size_mb; // convert to bytes constexpr int allocation_probability{73}; // percent constexpr int max_op_chance{99}; diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index bc6e18021..0ae14a88f 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -11,6 +11,7 @@ rapids-print-env rapids-logger "Begin cpp build" -rapids-mamba-retry mambabuild conda/recipes/librmm +# This calls mambabuild when boa is installed (as is the case in the CI images) +rapids-conda-retry mambabuild conda/recipes/librmm rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 7e1efef9a..d54947106 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -25,7 +25,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ rmm librmm -export RAPIDS_VERSION_NUMBER="23.08" +export RAPIDS_VERSION_NUMBER="23.10" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" @@ -37,11 +37,11 @@ popd rapids-logger "Build Python docs" pushd python/docs -sphinx-build -b dirhtml . _html -sphinx-build -b text . _text +make dirhtml +make text mkdir -p "${RAPIDS_DOCS_DIR}/rmm/"{html,txt} -mv _html/* "${RAPIDS_DOCS_DIR}/rmm/html" -mv _text/* "${RAPIDS_DOCS_DIR}/rmm/txt" +mv _build/dirhtml/* "${RAPIDS_DOCS_DIR}/rmm/html" +mv _build/text/* "${RAPIDS_DOCS_DIR}/rmm/txt" popd rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index b306d3e47..9da4025c7 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -13,6 +13,7 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) -rapids-mamba-retry mambabuild -c "${CPP_CHANNEL}" conda/recipes/rmm +# This calls mambabuild when boa is installed (as is the case in the CI images) +rapids-conda-retry mambabuild -c "${CPP_CHANNEL}" conda/recipes/rmm rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 09c1e104e..660c88387 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -3,6 +3,9 @@ set -euo pipefail +package_name="rmm" +package_dir="python" + source rapids-configure-sccache source rapids-date-string @@ -12,15 +15,34 @@ version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -ci/release/apply_wheel_modifications.sh ${version_override} "-${RAPIDS_PY_CUDA_SUFFIX}" -echo "The package name and/or version was modified in the package source. The git diff is:" -git diff +# This is the version of the suffix with a preceding hyphen. It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +# Patch project metadata files to include the CUDA version suffix and version override. 
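+# For example, with RAPIDS_PY_CUDA_SUFFIX=cu12 the substitutions below rename the
+# package to "rmm-cu12", stamp in the computed version_override, and (for CUDA 12
+# wheels only) relax the cuda-python requirement to the 12.x range.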
+pyproject_file="${package_dir}/pyproject.toml" + +sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} +sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} +fi -cd python +cd "${package_dir}" SKBUILD_CONFIGURE_OPTIONS="-DRMM_BUILD_WHEELS=ON" python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check mkdir -p final_dist python -m auditwheel repair -w final_dist dist/* -RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist diff --git a/ci/check_style.sh b/ci/check_style.sh index 4beaca333..e96ad8bf1 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --force -f env.yaml -n checks conda activate checks -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh deleted file mode 100755 index cc71389ea..000000000 --- a/ci/release/apply_wheel_modifications.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. 
-# -# Usage: bash apply_wheel_modifications.sh - -VERSION=${1} -CUDA_SUFFIX=${2} - -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/pyproject.toml - -sed -i "s/^name = \"rmm\"/name = \"rmm${CUDA_SUFFIX}\"/g" python/pyproject.toml - -if [[ $CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" python/pyproject.toml -fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 7314442ff..8b7f07167 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -22,6 +22,9 @@ NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} +# Need to distutils-normalize the original version +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") + echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" # Inplace sed replace; workaround for Linux and Mac @@ -31,14 +34,15 @@ function sed_runner() { # cpp update sed_runner 's/'" VERSION .*"'/'" VERSION ${NEXT_FULL_TAG}"'/g' CMakeLists.txt -sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' CMakeLists.txt # Python update sed_runner 's/'"rmm_version .*)"'/'"rmm_version ${NEXT_FULL_TAG})"'/g' python/CMakeLists.txt -sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' python/CMakeLists.txt sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/rmm/__init__.py sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pyproject.toml +# rapids-cmake version +sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake + # cmake-format rapids-cmake definitions sed_runner 's/'"branch-.*\/cmake-format-rapids-cmake.json"'/'"branch-${NEXT_SHORT_TAG}\/cmake-format-rapids-cmake.json"'/g' ci/check_style.sh @@ -54,3 +58,9 @@ for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +# .devcontainer files +find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do + sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" +done diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f668c4a4c..8571a1928 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -4,17 +4,28 @@ channels: - rapidsai - conda-forge dependencies: +- c-compiler +- clang-tools==16.0.6 +- clang==16.0.6 - cmake>=3.26.4 - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 - cudatoolkit -- cython>=0.29,<0.30 +- cxx-compiler +- cython>=3.0.0 +- doxygen=1.9.1 - fmt>=9.1.0,<10 +- gcc_linux-64=11.* - gcovr>=5.0 +- graphviz - identify>=2.5.20 +- ipython +- make +- nbsphinx - ninja - numba>=0.57 - numpy>=1.21 +- numpydoc - nvcc_linux-64=11.8 - pre-commit - pytest @@ -22,5 +33,10 @@ dependencies: - python>=3.9,<3.11 - scikit-build>=0.13.1 - spdlog>=1.11.0,<1.12 +- sphinx +- sphinx-copybutton +- sphinx-markdown-tables +- 
sphinx_rtd_theme +- sysroot_linux-64==2.17 - tomli name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 40291ba33..aec31b3db 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -4,22 +4,38 @@ channels: - rapidsai - conda-forge dependencies: +- c-compiler +- clang-tools==16.0.6 +- clang==16.0.6 - cmake>=3.26.4 - cuda-nvcc - cuda-python>=12.0,<13.0a0 - cuda-version=12.0 -- cython>=0.29,<0.30 +- cxx-compiler +- cython>=3.0.0 +- doxygen=1.9.1 - fmt>=9.1.0,<10 +- gcc_linux-64=11.* - gcovr>=5.0 +- graphviz - identify>=2.5.20 +- ipython +- make +- nbsphinx - ninja - numba>=0.57 - numpy>=1.21 +- numpydoc - pre-commit - pytest - pytest-cov - python>=3.9,<3.11 - scikit-build>=0.13.1 - spdlog>=1.11.0,<1.12 +- sphinx +- sphinx-copybutton +- sphinx-markdown-tables +- sphinx_rtd_theme +- sysroot_linux-64==2.17 - tomli name: all_cuda-120_arch-x86_64 diff --git a/conda/recipes/rmm/meta.yaml b/conda/recipes/rmm/meta.yaml index 5d905b853..e13254d4c 100644 --- a/conda/recipes/rmm/meta.yaml +++ b/conda/recipes/rmm/meta.yaml @@ -58,7 +58,7 @@ requirements: - cuda-cudart-dev - cuda-python ==12.0.0 {% endif %} - - cython >=0.29,<0.30 + - cython >=3.0.0 - librmm ={{ version }} - python - scikit-build >=0.13.1 diff --git a/dependencies.yaml b/dependencies.yaml index 91eed8280..7d9af48cf 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -10,6 +10,7 @@ files: - checks - cudatoolkit - develop + - docs - run - test_python test_python: @@ -62,12 +63,14 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - &cmake_ver cmake>=3.26.4 - - cython>=0.29,<0.30 + - cython>=3.0.0 - ninja - scikit-build>=0.13.1 - tomli - output_types: conda packages: + - c-compiler + - cxx-compiler - fmt>=9.1.0,<10 - spdlog>=1.11.0,<1.12 - python>=3.9,<3.11 @@ -76,6 +79,18 @@ dependencies: - wheel - setuptools>=61.0.0 specific: + - output_types: conda + matrices: + - matrix: + arch: x86_64 + packages: + - gcc_linux-64=11.* + - sysroot_linux-64==2.17 + - matrix: + arch: aarch64 + packages: + - gcc_linux-aarch64=11.* + - sysroot_linux-aarch64==2.17 - output_types: conda matrices: - matrix: @@ -110,6 +125,9 @@ dependencies: # pre-commit requires identify minimum version 1.0, but clang-format requires textproto support and that was # added in 2.5.20, so we need to call out the minimum version needed for our plugins - identify>=2.5.20 + - output_types: conda + packages: + - &doxygen doxygen=1.9.1 cudatoolkit: specific: - output_types: conda @@ -148,13 +166,18 @@ dependencies: - output_types: [conda, requirements] packages: - gcovr>=5.0 + - output_types: conda + packages: + - clang==16.0.6 + - clang-tools==16.0.6 docs: common: - - output_types: [conda] + - output_types: conda packages: - - doxygen=1.8.20 + - *doxygen - graphviz - ipython + - make - nbsphinx - numpydoc - sphinx diff --git a/doxygen/Doxyfile b/doxygen/Doxyfile index 264d99df0..5d6153f81 100644 --- a/doxygen/Doxyfile +++ b/doxygen/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.20 +# Doxyfile 1.9.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -32,13 +32,13 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "RMM" +PROJECT_NAME = RMM # The PROJECT_NUMBER tag can be used to enter a project or revision number. 
This # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 23.08 +PROJECT_NUMBER = 23.10 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -259,7 +259,11 @@ TAB_SIZE = 4 # commands \{ and \} for these it is advised to use the version @{ and @} or use # a double escape (\\{ and \\}) -ALIASES = +ALIASES = "briefreturn{1}=@brief \1 @return \1" \ + "default_copy_constructor=Default copy constructor" \ + "default_move_constructor=Default move constructor" \ + "default_copy_assignment{1}=@brief Default copy assignment operator @return \1& Reference to the assigned object" \ + "default_move_assignment{1}=@brief Default move assignment operator @return \1& Reference to the assigned object" # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -313,7 +317,10 @@ OPTIMIZE_OUTPUT_SLICE = NO # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = cu=C++ \ cuh=C++ @@ -524,6 +531,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -561,11 +575,18 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# (including Cygwin) and Mac users are advised to set this option to NO. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. # The default value is: system dependent. 
CASE_SENSE_NAMES = YES @@ -804,7 +825,10 @@ WARN_IF_DOC_ERROR = YES WARN_NO_PARAMDOC = YES # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = NO @@ -840,8 +864,8 @@ INPUT = ../include # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 @@ -854,13 +878,15 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), -# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen -# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cpp \ @@ -898,7 +924,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = */detail/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -1090,13 +1116,6 @@ VERBATIM_HEADERS = YES ALPHABETICAL_INDEX = YES -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored @@ -1267,10 +1286,11 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/xcode/), introduced with OSX -# 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. 
Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. @@ -1312,8 +1332,8 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1388,7 +1408,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1396,8 +1417,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1405,16 +1426,16 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = @@ -1426,9 +1447,9 @@ QHP_CUST_FILTER_ATTRS = QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. 
# This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1555,7 +1576,7 @@ USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. @@ -1585,7 +1606,8 @@ MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1632,7 +1654,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). +# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1645,8 +1668,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. SEARCHENGINE_URL = @@ -2325,10 +2349,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. If the actual line length exceeds this threshold +# significantly it will wrapped across multiple lines. Some heuristics are apply +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. 
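
The `ALIASES` defined earlier in this Doxyfile are exercised by the RMM headers later in this diff. Illustratively, a member documented with `@briefreturn{...}` (as in `cuda_device.hpp`) is processed as if the `@brief`/`@return` pair were written out:

```c++
/// @briefreturn{The wrapped integer value}
[[nodiscard]] constexpr value_type value() const noexcept { return id_; }

// Doxygen expands the alias as if this had been written:
/// @brief The wrapped integer value
/// @return The wrapped integer value
```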
@@ -2518,9 +2564,11 @@ DOT_MULTI_TARGETS = NO GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc and +# plantuml temporary files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake new file mode 100644 index 000000000..1ff8119ba --- /dev/null +++ b/fetch_rapids.cmake @@ -0,0 +1,18 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake + ${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) +endif() +include(${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) diff --git a/include/rmm/cuda_device.hpp b/include/rmm/cuda_device.hpp index ab225490e..8d355ee23 100644 --- a/include/rmm/cuda_device.hpp +++ b/include/rmm/cuda_device.hpp @@ -26,23 +26,22 @@ namespace rmm { * */ struct cuda_device_id { - using value_type = int; + using value_type = int; ///< Integer type used for device identifier /** * @brief Construct a `cuda_device_id` from the specified integer value * - * @param id The device's integer identifier + * @param dev_id The device's integer identifier */ explicit constexpr cuda_device_id(value_type dev_id) noexcept : id_{dev_id} {} - /// Returns the wrapped integer value + /// @briefreturn{The wrapped integer value} [[nodiscard]] constexpr value_type value() const noexcept { return id_; } private: value_type id_; }; -namespace detail { /** * @brief Returns a `cuda_device_id` for the current device * @@ -50,11 +49,56 @@ namespace detail { * * @return `cuda_device_id` for the current device */ -inline cuda_device_id current_device() +inline cuda_device_id get_current_cuda_device() { - int dev_id{}; - RMM_CUDA_TRY(cudaGetDevice(&dev_id)); + cuda_device_id::value_type dev_id{-1}; + RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id)); return cuda_device_id{dev_id}; } -} // namespace detail + +/** + * @brief Returns the number of CUDA devices in the system + * + * @return Number of CUDA devices in the system + */ +inline int get_num_cuda_devices() +{ + cuda_device_id::value_type num_dev{-1}; + RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev)); + return num_dev; +} + +/** + * @brief RAII class that sets the current CUDA device to the specified device on construction + * and restores the previous device on destruction. 
+ */ +struct cuda_set_device_raii { + /** + * @brief Construct a new cuda_set_device_raii object and sets the current CUDA device to `dev_id` + * + * @param dev_id The device to set as the current CUDA device + */ + explicit cuda_set_device_raii(cuda_device_id dev_id) + : old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()} + { + if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value())); + } + /** + * @brief Reactivates the previous CUDA device + */ + ~cuda_set_device_raii() noexcept + { + if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value())); + } + + cuda_set_device_raii(cuda_set_device_raii const&) = delete; + cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete; + cuda_set_device_raii(cuda_set_device_raii&&) = delete; + cuda_set_device_raii& operator=(cuda_set_device_raii&&) = delete; + + private: + cuda_device_id old_device_; + bool needs_reset_; +}; + } // namespace rmm diff --git a/include/rmm/cuda_stream.hpp b/include/rmm/cuda_stream.hpp index 9d2dab4f0..b778cf7bd 100644 --- a/include/rmm/cuda_stream.hpp +++ b/include/rmm/cuda_stream.hpp @@ -46,6 +46,8 @@ class cuda_stream { * * A moved-from cuda_stream is invalid and it is Undefined Behavior to call methods that access * the owned stream. + * + * @return A reference to this cuda_stream */ cuda_stream& operator=(cuda_stream&&) = default; ~cuda_stream() = default; diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp index bc30f26d8..150fe3957 100644 --- a/include/rmm/cuda_stream_view.hpp +++ b/include/rmm/cuda_stream_view.hpp @@ -33,41 +33,47 @@ namespace rmm { */ class cuda_stream_view { public: - constexpr cuda_stream_view() = default; - constexpr cuda_stream_view(cuda_stream_view const&) = default; - constexpr cuda_stream_view(cuda_stream_view&&) = default; - constexpr cuda_stream_view& operator=(cuda_stream_view const&) = default; - constexpr cuda_stream_view& operator=(cuda_stream_view&&) = default; - ~cuda_stream_view() = default; + constexpr cuda_stream_view() = default; + ~cuda_stream_view() = default; + constexpr cuda_stream_view(cuda_stream_view const&) = default; ///< @default_copy_constructor + constexpr cuda_stream_view(cuda_stream_view&&) = default; ///< @default_move_constructor + constexpr cuda_stream_view& operator=(cuda_stream_view const&) = + default; ///< @default_copy_assignment{cuda_stream_view} + constexpr cuda_stream_view& operator=(cuda_stream_view&&) = + default; ///< @default_move_assignment{cuda_stream_view} // Disable construction from literal 0 constexpr cuda_stream_view(int) = delete; //< Prevent cast from 0 constexpr cuda_stream_view(std::nullptr_t) = delete; //< Prevent cast from nullptr /** - * @brief Implicit conversion from cudaStream_t. + * @brief Constructor from a cudaStream_t + * + * @param stream The underlying stream for this view */ constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {} /** * @brief Get the wrapped stream. * - * @return cudaStream_t The wrapped stream. + * @return cudaStream_t The underlying stream referenced by this cuda_stream_view */ [[nodiscard]] constexpr cudaStream_t value() const noexcept { return stream_; } /** * @brief Implicit conversion to cudaStream_t. + * + * @return cudaStream_t The underlying stream referenced by this cuda_stream_view */ constexpr operator cudaStream_t() const noexcept { return value(); } /** - * @brief Return true if the wrapped stream is the CUDA per-thread default stream. 
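A minimal usage sketch for the device utilities introduced above, `get_num_cuda_devices`, `get_current_cuda_device`, and the `cuda_set_device_raii` guard (illustrative only; assumes at least one CUDA device is present):

```cpp
#include <rmm/cuda_device.hpp>

#include <cstdio>

void visit_all_devices()
{
  for (int dev = 0; dev < rmm::get_num_cuda_devices(); ++dev) {
    // The guard sets the active device and restores the previous one at scope exit.
    rmm::cuda_set_device_raii with_device{rmm::cuda_device_id{dev}};
    std::printf("active device: %d\n", rmm::get_current_cuda_device().value());
  }
}
```

Note that the guard skips the `cudaSetDevice` call entirely when the requested device is already active, so it costs nothing in the common single-device case.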
+ * @briefreturn{true if the wrapped stream is the CUDA per-thread default stream} */ [[nodiscard]] inline bool is_per_thread_default() const noexcept; /** - * @brief Return true if the wrapped stream is explicitly the CUDA legacy default stream. + * @briefreturn{true if the wrapped stream is explicitly the CUDA legacy default stream} */ [[nodiscard]] inline bool is_default() const noexcept; diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp index 28121e6a8..b45dbae25 100644 --- a/include/rmm/detail/dynamic_load_runtime.hpp +++ b/include/rmm/detail/dynamic_load_runtime.hpp @@ -115,7 +115,7 @@ struct async_alloc { int cuda_pool_supported{}; auto result = cudaDeviceGetAttribute(&cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, - rmm::detail::current_device().value()); + rmm::get_current_cuda_device().value()); return result == cudaSuccess and cuda_pool_supported == 1; }()}; return runtime_supports_pool and driver_supports_pool; @@ -139,7 +139,7 @@ struct async_alloc { if (cudaMemHandleTypeNone != handle_type) { auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask, cudaDevAttrMemoryPoolSupportedHandleTypes, - rmm::detail::current_device().value()); + rmm::get_current_cuda_device().value()); // Don't throw on cudaErrorInvalidValue auto const unsupported_runtime = (result == cudaErrorInvalidValue); diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index bd8a8211a..84638ee89 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -29,7 +29,6 @@ namespace rmm { /** - * @file device_buffer.hpp * @brief RAII construct for device memory allocation * * This class allocates untyped and *uninitialized* device memory using a @@ -202,6 +201,8 @@ class device_buffer { * replaced by the `other.stream()`. * * @param other The `device_buffer` whose contents will be moved. + * + * @return A reference to this `device_buffer` */ device_buffer& operator=(device_buffer&& other) noexcept { @@ -331,22 +332,22 @@ class device_buffer { } /** - * @brief Returns raw pointer to underlying device memory allocation + * @briefreturn{Const pointer to the device memory allocation} */ [[nodiscard]] void const* data() const noexcept { return _data; } /** - * @brief Returns raw pointer to underlying device memory allocation + * @briefreturn{Pointer to the device memory allocation} */ void* data() noexcept { return _data; } /** - * @brief Returns the number of bytes. + * @briefreturn{The number of bytes} */ [[nodiscard]] std::size_t size() const noexcept { return _size; } /** - * @brief Returns the signed number of bytes. + * @briefreturn{The signed number of bytes} */ [[nodiscard]] std::int64_t ssize() const noexcept { @@ -356,11 +357,10 @@ class device_buffer { } /** - * @brief returns the number of bytes that can be held in currently allocated storage. + * @briefreturn{Whether or not the buffer currently holds any data} * * If `is_empty() == true`, the `device_buffer` may still hold an allocation * if `capacity() > 0`. - * */ [[nodiscard]] bool is_empty() const noexcept { return 0 == size(); } @@ -368,11 +368,13 @@ class device_buffer { * @brief Returns actual size in bytes of device memory allocation. * * The invariant `size() <= capacity()` holds. 
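As a sketch of how the `device_buffer` accessors documented in this hunk relate to one another (illustrative only, not part of the patch):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>

#include <cassert>

void buffer_accessors()
{
  rmm::cuda_stream stream;
  rmm::device_buffer buf{256, stream.view()};

  assert(buf.size() <= buf.capacity());  // the documented invariant
  assert(!buf.is_empty());
  void* raw = buf.data();  // raw device pointer, valid for stream-ordered use
  (void)raw;
}
```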
+ * + * @return The actual size in bytes of the device memory allocation */ [[nodiscard]] std::size_t capacity() const noexcept { return _capacity; } /** - * @brief Returns stream most recently specified for allocation/deallocation + * @briefreturn{The stream most recently specified for allocation/deallocation} */ [[nodiscard]] cuda_stream_view stream() const noexcept { return _stream; } @@ -384,20 +386,21 @@ class device_buffer { * will be used for deallocation in the `rmm::device_uvector` destructor. * However, if either of `resize()` or `shrink_to_fit()` is called after this, * the later stream parameter will be stored and used in the destructor. + * + * @param stream The stream to use for deallocation */ void set_stream(cuda_stream_view stream) noexcept { _stream = stream; } /** - * @brief Returns pointer to the memory resource used to allocate and - * deallocate the device memory + * @briefreturn{Pointer to the memory resource used to allocate and deallocate} */ [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } private: - void* _data{nullptr}; ///< Pointer to device memory allocation - std::size_t _size{}; ///< Requested size of the device memory allocation - std::size_t _capacity{}; ///< The actual size of the device memory allocation - cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation + void* _data{nullptr}; ///< Pointer to device memory allocation + std::size_t _size{}; ///< Requested size of the device memory allocation + std::size_t _capacity{}; ///< The actual size of the device memory allocation + cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation mr::device_memory_resource* _mr{ mr::get_current_device_resource()}; ///< The memory resource used to ///< allocate/deallocate device memory diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp index e476b2d87..83fcdda09 100644 --- a/include/rmm/device_scalar.hpp +++ b/include/rmm/device_scalar.hpp @@ -37,18 +37,25 @@ class device_scalar { public: static_assert(std::is_trivially_copyable::value, "Scalar type must be trivially copyable"); - using value_type = typename device_uvector::value_type; - using reference = typename device_uvector::reference; - using const_reference = typename device_uvector::const_reference; - using pointer = typename device_uvector::pointer; - using const_pointer = typename device_uvector::const_pointer; + using value_type = typename device_uvector::value_type; ///< T, the type of the scalar element + using reference = typename device_uvector::reference; ///< value_type& + using const_reference = typename device_uvector::const_reference; ///< const value_type& + using pointer = + typename device_uvector::pointer; ///< The type of the pointer returned by data() + using const_pointer = typename device_uvector::const_pointer; ///< The type of the iterator + ///< returned by data() const RMM_EXEC_CHECK_DISABLE ~device_scalar() = default; RMM_EXEC_CHECK_DISABLE - device_scalar(device_scalar&&) noexcept = default; + device_scalar(device_scalar&&) noexcept = default; ///< Default move constructor + /** + * @brief Default move assignment operator + * + * @return device_scalar& A reference to the assigned-to object + */ device_scalar& operator=(device_scalar&&) noexcept = default; /** @@ -224,6 +231,8 @@ class device_scalar { * specified to the constructor, then appropriate dependencies must be inserted between the * streams (e.g. 
using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`), otherwise there may * be a race condition. + * + * @return Pointer to underlying device memory */ [[nodiscard]] pointer data() noexcept { return static_cast(_storage.data()); } @@ -234,6 +243,8 @@ class device_scalar { * specified to the constructor, then appropriate dependencies must be inserted between the * streams (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`), otherwise there may * be a race condition. + * + * @return Const pointer to underlying device memory */ [[nodiscard]] const_pointer data() const noexcept { @@ -241,12 +252,14 @@ class device_scalar { } /** - * @brief Returns stream most recently specified for allocation/deallocation + * @briefreturn{Stream associated with the device memory allocation} */ [[nodiscard]] cuda_stream_view stream() const noexcept { return _storage.stream(); } /** * @brief Sets the stream to be used for deallocation + * + * @param stream Stream to be used for deallocation */ void set_stream(cuda_stream_view stream) noexcept { _storage.set_stream(stream); } diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 40f5d8c5d..47ed1adff 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -71,22 +71,24 @@ class device_uvector { "device_uvector only supports types that are trivially copyable."); public: - using value_type = T; - using size_type = std::size_t; - using reference = value_type&; - using const_reference = value_type const&; - using pointer = value_type*; - using const_pointer = value_type const*; - using iterator = pointer; - using const_iterator = const_pointer; + using value_type = T; ///< T; stored value type + using size_type = std::size_t; ///< The type used for the size of the vector + using reference = value_type&; ///< value_type&; reference type returned by operator[](size_type) + using const_reference = value_type const&; ///< value_type const&; constant reference type + ///< returned by operator[](size_type) const + using pointer = value_type*; ///< The type of the pointer returned by data() + using const_pointer = value_type const*; ///< The type of the pointer returned by data() const + using iterator = pointer; ///< The type of the iterator returned by begin() + using const_iterator = const_pointer; ///< The type of the const iterator returned by cbegin() RMM_EXEC_CHECK_DISABLE ~device_uvector() = default; RMM_EXEC_CHECK_DISABLE - device_uvector(device_uvector&&) noexcept = default; + device_uvector(device_uvector&&) noexcept = default; ///< @default_move_constructor - device_uvector& operator=(device_uvector&&) noexcept = default; + device_uvector& operator=(device_uvector&&) noexcept = + default; ///< @default_move_assignment{device_uvector} /** * @brief Copy ctor is deleted as it doesn't allow a stream argument @@ -497,12 +499,12 @@ class device_uvector { [[nodiscard]] const_iterator end() const noexcept { return cend(); } /** - * @brief Returns the number of elements. + * @briefreturn{The number of elements in the vector} */ [[nodiscard]] std::size_t size() const noexcept { return bytes_to_elements(_storage.size()); } /** - * @brief Returns the signed number of elements. + * @briefreturn{The signed number of elements in the vector} */ [[nodiscard]] std::int64_t ssize() const noexcept { @@ -512,17 +514,12 @@ class device_uvector { } /** - * @brief Returns true if the vector contains no elements, i.e., `size() == 0`. 
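A short sketch of how the `device_uvector` members documented here fit together (illustrative only):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>

#include <cassert>

void uvector_basics()
{
  rmm::cuda_stream stream;
  rmm::device_uvector<int> vec(100, stream.view());

  assert(vec.size() == 100);
  assert(!vec.is_empty());
  int* dev_ptr = vec.data();         // pointer, as returned by data()
  (void)dev_ptr;
  auto* mr = vec.memory_resource();  // resource used to allocate/deallocate the storage
  (void)mr;
}
```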
- * - * @return true The vector is empty - * @return false The vector is not empty + * @briefreturn{true if the vector contains no elements, i.e. `size() == 0`} */ [[nodiscard]] bool is_empty() const noexcept { return size() == 0; } /** - * @brief Returns pointer to the resource used to allocate and deallocate the device storage. - * - * @return Pointer to underlying resource + * @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage} */ [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { @@ -530,7 +527,7 @@ class device_uvector { } /** - * @brief Returns stream most recently specified for allocation/deallocation + * @briefreturn{Stream most recently specified for allocation/deallocation} */ [[nodiscard]] cuda_stream_view stream() const noexcept { return _storage.stream(); } @@ -542,6 +539,8 @@ class device_uvector { * will be used for deallocation in the `rmm::device_uvector destructor. * However, if either of `resize()` or `shrink_to_fit()` is called after this, * the later stream parameter will be stored and used in the destructor. + * + * @param stream The stream to use for deallocation */ void set_stream(cuda_stream_view stream) noexcept { _storage.set_stream(stream); } diff --git a/include/rmm/exec_policy.hpp b/include/rmm/exec_policy.hpp index 015c3646d..78c747652 100644 --- a/include/rmm/exec_policy.hpp +++ b/include/rmm/exec_policy.hpp @@ -30,6 +30,9 @@ namespace rmm { +/** + * @brief Synchronous execution policy for allocations using thrust + */ using thrust_exec_policy_t = thrust::detail::execute_with_allocator, thrust::cuda_cub::execute_on_stream_base>; @@ -40,6 +43,12 @@ using thrust_exec_policy_t = */ class exec_policy : public thrust_exec_policy_t { public: + /** + * @brief Construct a new execution policy object + * + * @param stream The stream on which to allocate temporary memory + * @param mr The resource to use for allocating temporary memory + */ explicit exec_policy(cuda_stream_view stream = cuda_stream_default, rmm::mr::device_memory_resource* mr = mr::get_current_device_resource()) : thrust_exec_policy_t( @@ -50,6 +59,9 @@ class exec_policy : public thrust_exec_policy_t { #if THRUST_VERSION >= 101600 +/** + * @brief Asynchronous execution policy for allocations using thrust + */ using thrust_exec_policy_nosync_t = thrust::detail::execute_with_allocator, thrust::cuda_cub::execute_on_stream_nosync_base>; @@ -72,9 +84,11 @@ class exec_policy_nosync : public thrust_exec_policy_nosync_t { #else -using thrust_exec_policy_nosync_t = thrust_exec_policy_t; -using exec_policy_nosync = exec_policy; - +using thrust_exec_policy_nosync_t = + thrust_exec_policy_t; ///< When used with Thrust < 1.16.0, thrust_exec_policy_nosync_t is an + ///< alias for thrust_exec_policy_t +using exec_policy_nosync = + exec_policy; ///< When used with Thrust < 1.16.0, exec_policy_nosync is an alias for exec_policy #endif } // namespace rmm diff --git a/include/rmm/logger.hpp b/include/rmm/logger.hpp index d0d6f8fa4..4e17878bd 100644 --- a/include/rmm/logger.hpp +++ b/include/rmm/logger.hpp @@ -43,9 +43,11 @@ inline std::string default_log_filename() return (filename == nullptr) ? 
std::string{"rmm_log.txt"} : std::string{filename}; } -// Simple wrapper around a spdlog::logger that performs RMM-specific initialization +/** + * @brief Simple wrapper around a spdlog::logger that performs RMM-specific initialization + */ struct logger_wrapper { - spdlog::logger logger_; + spdlog::logger logger_; ///< The underlying logger logger_wrapper() : logger_{"RMM", @@ -70,8 +72,14 @@ struct logger_wrapper { * @brief Represent a size in number of bytes. */ struct bytes { - std::size_t value; - + std::size_t value; ///< The size in bytes + + /** + * @brief Construct a new bytes object + * + * @param os The output stream + * @param value The size in bytes + */ friend std::ostream& operator<<(std::ostream& os, bytes const& value) { static std::array units{"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"}; @@ -113,5 +121,8 @@ inline spdlog::logger& logger() } // namespace rmm +// Doxygen doesn't like this because we're overloading something from fmt +//! @cond Doxygen_Suppress template <> struct fmt::formatter : fmt::ostream_formatter {}; +//! @endcond diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index 1937e7ef0..2d9695be2 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -90,12 +90,14 @@ class callback_memory_resource final : public device_memory_resource { { } - callback_memory_resource() = delete; - ~callback_memory_resource() override = default; - callback_memory_resource(callback_memory_resource const&) = delete; - callback_memory_resource& operator=(callback_memory_resource const&) = delete; - callback_memory_resource(callback_memory_resource&&) noexcept = default; - callback_memory_resource& operator=(callback_memory_resource&&) noexcept = default; + callback_memory_resource() = delete; + ~callback_memory_resource() override = default; + callback_memory_resource(callback_memory_resource const&) = delete; + callback_memory_resource& operator=(callback_memory_resource const&) = delete; + callback_memory_resource(callback_memory_resource&&) noexcept = + default; ///< @default_move_constructor + callback_memory_resource& operator=(callback_memory_resource&&) noexcept = + default; ///< @default_move_assignment{callback_memory_resource} private: void* do_allocate(std::size_t bytes, cuda_stream_view stream) override diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp index d41eae63e..329d8f29a 100644 --- a/include/rmm/mr/device/cuda_async_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp @@ -98,7 +98,7 @@ class cuda_async_memory_resource final : public device_memory_resource { RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes), "Requested IPC memory handle type not supported"); pool_props.location.type = cudaMemLocationTypeDevice; - pool_props.location.id = rmm::detail::current_device().value(); + pool_props.location.id = rmm::get_current_cuda_device().value(); cudaMemPool_t cuda_pool_handle{}; RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props)); pool_ = cuda_async_view_memory_resource{cuda_pool_handle}; diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index 191e4741d..c685cd75f 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ 
b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -60,7 +60,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { }()} { // Check if cudaMallocAsync Memory pool supported - auto const device = rmm::detail::current_device(); + auto const device = rmm::get_current_cuda_device(); int cuda_pool_supported{}; auto result = cudaDeviceGetAttribute(&cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, device.value()); @@ -77,11 +77,15 @@ class cuda_async_view_memory_resource final : public device_memory_resource { [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; } #endif - cuda_async_view_memory_resource() = default; - cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) = default; - cuda_async_view_memory_resource(cuda_async_view_memory_resource&&) = default; - cuda_async_view_memory_resource& operator=(cuda_async_view_memory_resource const&) = default; - cuda_async_view_memory_resource& operator=(cuda_async_view_memory_resource&&) = default; + cuda_async_view_memory_resource() = default; + cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) = + default; ///< @default_copy_constructor + cuda_async_view_memory_resource(cuda_async_view_memory_resource&&) = + default; ///< @default_move_constructor + cuda_async_view_memory_resource& operator=(cuda_async_view_memory_resource const&) = + default; ///< @default_copy_assignment{cuda_async_view_memory_resource} + cuda_async_view_memory_resource& operator=(cuda_async_view_memory_resource&&) = + default; ///< @default_move_assignment{cuda_async_view_memory_resource} /** * @brief Query whether the resource supports use of non-null CUDA streams for diff --git a/include/rmm/mr/device/cuda_memory_resource.hpp b/include/rmm/mr/device/cuda_memory_resource.hpp index 7a5385f4e..b0bf9ae09 100644 --- a/include/rmm/mr/device/cuda_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_memory_resource.hpp @@ -29,12 +29,14 @@ namespace rmm::mr { */ class cuda_memory_resource final : public device_memory_resource { public: - cuda_memory_resource() = default; - ~cuda_memory_resource() override = default; - cuda_memory_resource(cuda_memory_resource const&) = default; - cuda_memory_resource(cuda_memory_resource&&) = default; - cuda_memory_resource& operator=(cuda_memory_resource const&) = default; - cuda_memory_resource& operator=(cuda_memory_resource&&) = default; + cuda_memory_resource() = default; + ~cuda_memory_resource() override = default; + cuda_memory_resource(cuda_memory_resource const&) = default; ///< @default_copy_constructor + cuda_memory_resource(cuda_memory_resource&&) = default; ///< @default_move_constructor + cuda_memory_resource& operator=(cuda_memory_resource const&) = + default; ///< @default_copy_assignment{cuda_memory_resource} + cuda_memory_resource& operator=(cuda_memory_resource&&) = + default; ///< @default_move_assignment{cuda_memory_resource} /** * @brief Query whether the resource supports use of non-null CUDA streams for diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp index 53575e5ce..f071717c0 100644 --- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp +++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp @@ -15,15 +15,16 @@ */ #pragma once +#include #include #include #include #include -#include - #include +#include + #include #include #include @@ -288,17 +289,25 @@ class stream_ordered_memory_resource : 
public crtp, public device_ stream_event_pair get_event(cuda_stream_view stream) { if (stream.is_per_thread_default()) { - // Create a thread-local shared event wrapper. Shared pointers in the thread and in each MR - // instance ensures it is destroyed cleaned up only after all are finished with it. - thread_local auto event_tls = std::make_shared(); - default_stream_events.insert(event_tls); - return stream_event_pair{stream.value(), event_tls->event}; + // Create a thread-local shared event wrapper for each device. Shared pointers in the thread + // and in each MR instance ensure the wrappers are destroyed only after all are finished + // with them. + thread_local std::vector> events_tls( + rmm::get_num_cuda_devices()); + auto event = [&, device_id = this->device_id_]() { + if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; } + + auto event = std::make_shared(); + this->default_stream_events.insert(event); + return (events_tls[device_id.value()] = std::move(event))->event; + }(); + return stream_event_pair{stream.value(), event}; } // We use cudaStreamLegacy as the event map key for the default stream for consistency between // PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the // user explicitly passes it, so it is used as the default location for the free list - // at construction. For consistency, the same key is used for null stream free lists in non-PTDS - // mode. + // at construction. For consistency, the same key is used for null stream free lists in + // non-PTDS mode. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-cstyle-cast) auto* const stream_to_store = stream.is_default() ? cudaStreamLegacy : stream.value(); @@ -496,11 +505,13 @@ class stream_ordered_memory_resource : public crtp, public device_ // bidirectional mapping between non-default streams and events std::unordered_map stream_events_; - // shared pointers to events keeps the events alive as long as either the thread that created them - // or the MR that is using them exists. + // shared pointers to events keeps the events alive as long as either the thread that created + // them or the MR that is using them exists. 
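The per-device, thread-local event cache above only matters when the CUDA per-thread default stream is used with stream-ordered resources; a hedged sketch of the pattern it enables (the pool records its device at construction, so each thread gets a per-device event rather than one shared across devices):

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

void ptds_allocation()
{
  rmm::mr::cuda_memory_resource upstream;
  // The pool captures the device that is current at construction (device_id_).
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool{&upstream};

  // On the per-thread default stream, the pool uses this thread's cached
  // event for its own device.
  void* ptr = pool.allocate(1024, rmm::cuda_stream_per_thread);
  pool.deallocate(ptr, 1024, rmm::cuda_stream_per_thread);
}
```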
std::set> default_stream_events; std::mutex mtx_; // mutex for thread-safe access -}; // namespace detail + + rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()}; +}; // namespace detail } // namespace rmm::mr::detail diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 4778d9bda..5f511e393 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -81,12 +81,15 @@ namespace rmm::mr { */ class device_memory_resource { public: - device_memory_resource() = default; - virtual ~device_memory_resource() = default; - device_memory_resource(device_memory_resource const&) = default; - device_memory_resource& operator=(device_memory_resource const&) = default; - device_memory_resource(device_memory_resource&&) noexcept = default; - device_memory_resource& operator=(device_memory_resource&&) noexcept = default; + device_memory_resource() = default; + virtual ~device_memory_resource() = default; + device_memory_resource(device_memory_resource const&) = default; ///< @default_copy_constructor + device_memory_resource(device_memory_resource&&) noexcept = + default; ///< @default_move_constructor + device_memory_resource& operator=(device_memory_resource const&) = + default; ///< @default_copy_assignment{device_memory_resource} + device_memory_resource& operator=(device_memory_resource&&) noexcept = + default; ///< @default_move_assignment{device_memory_resource} /** * @brief Allocates memory of size at least \p bytes. @@ -121,7 +124,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param stream Stream on which to perform deallocation diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp index 4a46cb660..48f0513d8 100644 --- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp @@ -109,14 +109,13 @@ class failure_callback_resource_adaptor final : public device_memory_resource { ~failure_callback_resource_adaptor() override = default; failure_callback_resource_adaptor(failure_callback_resource_adaptor const&) = delete; failure_callback_resource_adaptor& operator=(failure_callback_resource_adaptor const&) = delete; - failure_callback_resource_adaptor(failure_callback_resource_adaptor&&) noexcept = default; + failure_callback_resource_adaptor(failure_callback_resource_adaptor&&) noexcept = + default; ///< @default_move_constructor failure_callback_resource_adaptor& operator=(failure_callback_resource_adaptor&&) noexcept = - default; + default; ///< @default_move_assignment{failure_callback_resource_adaptor} /** - * @brief Return pointer to the upstream resource. - * - * @return Upstream* Pointer to the upstream resource. 
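For context, a hedged sketch of how `failure_callback_resource_adaptor` is typically wired up, with the callback signature taken from this header (returning true asks the adaptor to retry the allocation, false rethrows):

```cpp
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/failure_callback_resource_adaptor.hpp>

#include <cstddef>

// Retry a bounded number of times before letting the exception propagate.
bool on_failure(std::size_t bytes, void* arg)
{
  auto* retries = static_cast<int*>(arg);
  return (*retries)-- > 0;
}

void use_failure_callback()
{
  int retries = 2;
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::failure_callback_resource_adaptor<rmm::mr::cuda_memory_resource> mr{
    &upstream, on_failure, &retries};

  void* ptr = mr.allocate(1024);
  mr.deallocate(ptr, 1024);
}
```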
+ * @briefreturn{Pointer to the upstream resource} */ Upstream* get_upstream() const noexcept { return upstream_; } diff --git a/include/rmm/mr/device/fixed_size_memory_resource.hpp b/include/rmm/mr/device/fixed_size_memory_resource.hpp index 4954415c3..a24ac6ddd 100644 --- a/include/rmm/mr/device/fixed_size_memory_resource.hpp +++ b/include/rmm/mr/device/fixed_size_memory_resource.hpp @@ -50,10 +50,10 @@ class fixed_size_memory_resource friend class detail::stream_ordered_memory_resource, detail::fixed_size_free_list>; - // A block is the fixed size this resource alloates - static constexpr std::size_t default_block_size = 1 << 20; // 1 MiB - // This is the number of blocks that the pool starts out with, and also the number of - // blocks by which the pool grows when all of its current blocks are allocated + static constexpr std::size_t default_block_size = 1 << 20; ///< Default allocation block size + + /// The number of blocks that the pool starts out with, and also the number of + /// blocks by which the pool grows when all of its current blocks are allocated static constexpr std::size_t default_blocks_to_preallocate = 128; /** @@ -121,11 +121,11 @@ class fixed_size_memory_resource [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; } protected: - using free_list = detail::fixed_size_free_list; - using block_type = free_list::block_type; + using free_list = detail::fixed_size_free_list; ///< The free list type + using block_type = free_list::block_type; ///< The type of block managed by the free list using typename detail::stream_ordered_memory_resource, detail::fixed_size_free_list>::split_block; - using lock_guard = std::lock_guard; + using lock_guard = std::lock_guard; ///< Type of lock used to synchronize access /** * @brief Get the (fixed) size of allocations supported by this memory resource @@ -142,6 +142,7 @@ class fixed_size_memory_resource * strategy (see `size_to_grow()`). * * @param size The minimum size to allocate + * @param blocks The set of blocks from which to allocate * @param stream The stream on which the memory is to be used. * @return block_type The allocated block */ @@ -154,7 +155,6 @@ class fixed_size_memory_resource /** * @brief Allocate blocks from upstream to expand the suballocation pool. * - * @param size The minimum size to allocate * @param stream The stream on which the memory is to be used. * @return block_type The allocated block */ @@ -182,7 +182,6 @@ class fixed_size_memory_resource * * @param block The block to allocate from. * @param size The size in bytes of the requested allocation. - * @param stream_event The stream and associated event on which the allocation will be used. * @return A pair comprising the allocated pointer and any unallocated remainder of the input * block. */ @@ -196,7 +195,6 @@ class fixed_size_memory_resource * * @param ptr The pointer to the memory to free. * @param size The size of the memory to free. Must be equal to the original allocation size. - * @param stream The stream-event pair for the stream on which the memory was last used. * @return The (now freed) block associated with `p`. The caller is expected to return the block * to the pool. 
*/
@@ -217,7 +215,8 @@
* @param stream the stream being executed on * @return std::pair with available and free memory for resource */
- [[nodiscard]] std::pair<std::size_t, std::size_t> do_get_mem_info(cuda_stream_view) const override
+ [[nodiscard]] std::pair<std::size_t, std::size_t> do_get_mem_info( + [[maybe_unused]] cuda_stream_view stream) const override
{ return std::make_pair(0, 0); }
@@ -272,7 +271,7 @@ }
private:
- Upstream* upstream_mr_; // The resource from which to allocate new blocks
+ Upstream* upstream_mr_; // The resource from which to allocate new blocks
std::size_t const block_size_; // size of blocks this MR allocates
std::size_t const upstream_chunk_size_; // size of chunks allocated from heap MR
diff --git a/include/rmm/mr/device/limiting_resource_adaptor.hpp b/include/rmm/mr/device/limiting_resource_adaptor.hpp
index 6d1843e82..895c404b0 100644
--- a/include/rmm/mr/device/limiting_resource_adaptor.hpp
+++ b/include/rmm/mr/device/limiting_resource_adaptor.hpp
@@ -44,7 +44,8 @@ class limiting_resource_adaptor final : public device_memory_resource {
* @throws `rmm::logic_error` if `upstream == nullptr` *
* @param upstream The resource used for allocating/deallocating device memory
- * @param allocation_limit Maximum memory allowed for this allocator.
+ * @param allocation_limit Maximum memory allowed for this allocator
+ * @param alignment Alignment in bytes for the start of each allocated buffer
*/ limiting_resource_adaptor(Upstream* upstream, std::size_t allocation_limit,
@@ -57,17 +58,17 @@
{ RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); }
- limiting_resource_adaptor() = delete;
- ~limiting_resource_adaptor() override = default;
- limiting_resource_adaptor(limiting_resource_adaptor const&) = delete;
- limiting_resource_adaptor(limiting_resource_adaptor&&) noexcept = default;
- limiting_resource_adaptor& operator=(limiting_resource_adaptor const&) = delete;
- limiting_resource_adaptor& operator=(limiting_resource_adaptor&&) noexcept = default;
+ limiting_resource_adaptor() = delete;
+ ~limiting_resource_adaptor() override = default;
+ limiting_resource_adaptor(limiting_resource_adaptor const&) = delete;
+ limiting_resource_adaptor(limiting_resource_adaptor&&) noexcept = + default; ///< @default_move_constructor
+ limiting_resource_adaptor& operator=(limiting_resource_adaptor const&) = delete;
+ limiting_resource_adaptor& operator=(limiting_resource_adaptor&&) noexcept = + default; ///< @default_move_assignment{limiting_resource_adaptor}
/** - * @brief Return pointer to the upstream resource. - * - * @return Upstream* Pointer to the upstream resource. + * @briefreturn{Pointer to the upstream resource} */
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }
diff --git a/include/rmm/mr/device/logging_resource_adaptor.hpp b/include/rmm/mr/device/logging_resource_adaptor.hpp
index 71487705a..0ff9e950b 100644
--- a/include/rmm/mr/device/logging_resource_adaptor.hpp
+++ b/include/rmm/mr/device/logging_resource_adaptor.hpp
@@ -98,6 +98,20 @@ class logging_resource_adaptor final : public device_memory_resource { init_logger(auto_flush); }
+ /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy
+ * allocation requests and logging information about each allocation/free to
+ * the given logging `sinks`. + *
+ * The log output will be written using CSV formatting.
+ * + * @throws `rmm::logic_error` if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + * @param sinks A list of logging sinks to which log output will be written. + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. + */ logging_resource_adaptor(Upstream* upstream, spdlog::sinks_init_list sinks, bool auto_flush = false) @@ -108,12 +122,14 @@ class logging_resource_adaptor final : public device_memory_resource { init_logger(auto_flush); } - logging_resource_adaptor() = delete; - ~logging_resource_adaptor() override = default; - logging_resource_adaptor(logging_resource_adaptor const&) = delete; - logging_resource_adaptor& operator=(logging_resource_adaptor const&) = delete; - logging_resource_adaptor(logging_resource_adaptor&&) noexcept = default; - logging_resource_adaptor& operator=(logging_resource_adaptor&&) noexcept = default; + logging_resource_adaptor() = delete; + ~logging_resource_adaptor() override = default; + logging_resource_adaptor(logging_resource_adaptor const&) = delete; + logging_resource_adaptor& operator=(logging_resource_adaptor const&) = delete; + logging_resource_adaptor(logging_resource_adaptor&&) noexcept = + default; ///< @default_move_constructor + logging_resource_adaptor& operator=(logging_resource_adaptor&&) noexcept = + default; ///< @default_move_assignment{logging_resource_adaptor} /** * @brief Return pointer to the upstream resource. @@ -301,8 +317,8 @@ class logging_resource_adaptor final : public device_memory_resource { std::shared_ptr logger_; ///< spdlog logger object - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests + Upstream* upstream_; ///< The upstream resource used for satisfying + ///< allocation requests }; /** diff --git a/include/rmm/mr/device/managed_memory_resource.hpp b/include/rmm/mr/device/managed_memory_resource.hpp index 50317720a..4a0f7701a 100644 --- a/include/rmm/mr/device/managed_memory_resource.hpp +++ b/include/rmm/mr/device/managed_memory_resource.hpp @@ -29,12 +29,14 @@ namespace rmm::mr { */ class managed_memory_resource final : public device_memory_resource { public: - managed_memory_resource() = default; - ~managed_memory_resource() override = default; - managed_memory_resource(managed_memory_resource const&) = default; - managed_memory_resource(managed_memory_resource&&) = default; - managed_memory_resource& operator=(managed_memory_resource const&) = default; - managed_memory_resource& operator=(managed_memory_resource&&) = default; + managed_memory_resource() = default; + ~managed_memory_resource() override = default; + managed_memory_resource(managed_memory_resource const&) = default; ///< @default_copy_constructor + managed_memory_resource(managed_memory_resource&&) = default; ///< @default_move_constructor + managed_memory_resource& operator=(managed_memory_resource const&) = + default; ///< @default_copy_assignment{managed_memory_resource} + managed_memory_resource& operator=(managed_memory_resource&&) = + default; ///< @default_move_assignment{managed_memory_resource} /** * @brief Query whether the resource supports use of non-null streams for diff --git a/include/rmm/mr/device/owning_wrapper.hpp b/include/rmm/mr/device/owning_wrapper.hpp index d7c7b8c46..eca0b3207 100644 --- a/include/rmm/mr/device/owning_wrapper.hpp +++ b/include/rmm/mr/device/owning_wrapper.hpp @@ -73,7 +73,8 @@ auto make_resource(std::tuple...> const& upstreams, A template class 
owning_wrapper : public device_memory_resource { public: - using upstream_tuple = std::tuple...>; + using upstream_tuple = + std::tuple...>; ///< Tuple of upstream memory resources /** * @brief Constructs the wrapped resource using the provided upstreams and any additional @@ -116,14 +117,12 @@ class owning_wrapper : public device_memory_resource { } /** - * @brief Returns a constant reference to the wrapped resource. - * + * @briefreturn{A constant reference to the wrapped resource} */ [[nodiscard]] Resource const& wrapped() const noexcept { return *wrapped_; } /** - * @brief Returns reference to the wrapped resource. - * + * @briefreturn{A reference to the wrapped resource} */ [[nodiscard]] Resource& wrapped() noexcept { return *wrapped_; } @@ -136,9 +135,7 @@ class owning_wrapper : public device_memory_resource { } /** - * @brief Query whether the resource supports the get_mem_info API. - * - * @return true if the wrapped resource supports get_mem_info, false otherwise. + * @briefreturn{true if the wrapped resource supports get_mem_info, false otherwise} */ [[nodiscard]] bool supports_get_mem_info() const noexcept override { diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 4ddbd874a..aa7217758 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -88,13 +88,19 @@ inline device_memory_resource* initial_resource() return &mr; } +/** + * @briefreturn{Reference to the lock} + */ inline std::mutex& map_lock() { static std::mutex map_lock; return map_lock; } -// Must have default visibility, see: https://github.com/rapidsai/rmm/issues/826 +// This symbol must have default visibility, see: https://github.com/rapidsai/rmm/issues/826 +/** + * @briefreturn{Reference to the map from device id -> resource} + */ RMM_EXPORT inline auto& get_map() { static std::map device_id_to_resource; @@ -121,7 +127,7 @@ RMM_EXPORT inline auto& get_map() * undefined if used while the active CUDA device is a different device from the one that was active * when the device_memory_resource was created. * - * @param id The id of the target device + * @param device_id The id of the target device * @return Pointer to the current `device_memory_resource` for device `id` */ inline device_memory_resource* get_per_device_resource(cuda_device_id device_id) @@ -156,7 +162,7 @@ inline device_memory_resource* get_per_device_resource(cuda_device_id device_id) * undefined if used while the active CUDA device is a different device from the one that was active * when the device_memory_resource was created. 
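A sketch of the per-device workflow these functions document, under the caveats just stated about resource lifetime and the active device (illustrative only):

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

void configure_device_zero()
{
  // The resources must outlive all allocations made against them;
  // static lifetime is one simple way to guarantee that.
  static rmm::mr::cuda_memory_resource upstream;
  static rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool{&upstream};

  rmm::cuda_device_id const dev0{0};
  rmm::cuda_set_device_raii active{dev0};         // create/use with device 0 active
  rmm::mr::set_per_device_resource(dev0, &pool);  // returns the previous resource

  // Later, with device 0 active, this returns &pool:
  auto* current = rmm::mr::get_current_device_resource();
  (void)current;
}
```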
* - * @param id The id of the target device + * @param device_id The id of the target device * @param new_mr If not `nullptr`, pointer to new `device_memory_resource` to use as new resource * for `id` * @return Pointer to the previous memory resource for `id` @@ -196,7 +202,7 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id, */ inline device_memory_resource* get_current_device_resource() { - return get_per_device_resource(rmm::detail::current_device()); + return get_per_device_resource(rmm::get_current_cuda_device()); } /** @@ -225,6 +231,6 @@ inline device_memory_resource* get_current_device_resource() */ inline device_memory_resource* set_current_device_resource(device_memory_resource* new_mr) { - return set_per_device_resource(rmm::detail::current_device(), new_mr); + return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } } // namespace rmm::mr diff --git a/include/rmm/mr/device/polymorphic_allocator.hpp b/include/rmm/mr/device/polymorphic_allocator.hpp index 5c87ef7f7..a52ec14d1 100644 --- a/include/rmm/mr/device/polymorphic_allocator.hpp +++ b/include/rmm/mr/device/polymorphic_allocator.hpp @@ -43,7 +43,7 @@ namespace rmm::mr { template class polymorphic_allocator { public: - using value_type = T; + using value_type = T; ///< T, the value type of objects allocated by this allocator /** * @brief Construct a `polymorphic_allocator` using the return value of * `rmm::mr::get_current_device_resource()` as the underlying memory resource. @@ -148,7 +148,9 @@ bool operator!=(polymorphic_allocator const& lhs, polymorphic_allocator co template class stream_allocator_adaptor { public: - using value_type = typename std::allocator_traits::value_type; + using value_type = + typename std::allocator_traits::value_type; ///< The value type of objects allocated + ///< by this allocator stream_allocator_adaptor() = delete; @@ -187,8 +189,8 @@ class stream_allocator_adaptor { */ template struct rebind { - using other = - stream_allocator_adaptor::template rebind_alloc>; + using other = stream_allocator_adaptor::template rebind_alloc>; ///< The type to bind to }; /** @@ -212,14 +214,12 @@ class stream_allocator_adaptor { void deallocate(value_type* ptr, std::size_t num) { alloc_.deallocate(ptr, num, stream()); } /** - * @brief Returns the underlying stream on which calls to the underlying allocator are made. 
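To show how `polymorphic_allocator` and `stream_allocator_adaptor` compose, a brief sketch (the adaptor is constructed directly from the allocator and stream, as documented above):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/mr/device/polymorphic_allocator.hpp>

void adapted_allocation()
{
  rmm::cuda_stream stream;
  rmm::mr::polymorphic_allocator<int> alloc;  // uses get_current_device_resource()

  // Bind the stream once; the adaptor forwards it on each allocate/deallocate.
  rmm::mr::stream_allocator_adaptor<rmm::mr::polymorphic_allocator<int>> adapted{alloc,
                                                                                 stream.view()};

  int* ptr = adapted.allocate(128);  // allocates space for 128 ints on `stream`
  adapted.deallocate(ptr, 128);
}
```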
- * + * @briefreturn{The stream on which calls to the underlying allocator are made} */ [[nodiscard]] cuda_stream_view stream() const noexcept { return stream_; } /** - * @brief Returns the underlying stream-ordered allocator - * + * @briefreturn{The underlying allocator} */ [[nodiscard]] Allocator underlying_allocator() const noexcept { return alloc_; } diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index 8955fd958..703ce8ea7 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -143,11 +143,11 @@ class pool_memory_resource final [[nodiscard]] std::size_t pool_size() const noexcept { return current_pool_size_; } protected: - using free_list = detail::coalescing_free_list; - using block_type = free_list::block_type; + using free_list = detail::coalescing_free_list; ///< The free list implementation + using block_type = free_list::block_type; ///< The type of block returned by the free list using typename detail::stream_ordered_memory_resource, detail::coalescing_free_list>::split_block; - using lock_guard = std::lock_guard; + using lock_guard = std::lock_guard; ///< Type of lock used to synchronize access /** * @brief Get the maximum size of allocations supported by this memory resource @@ -306,7 +306,6 @@ class pool_memory_resource final * * @param block The block to allocate from. * @param size The size in bytes of the requested allocation. - * @param stream_event The stream and associated event on which the allocation will be used. * @return A pair comprising the allocated pointer and any unallocated remainder of the input * block. */ @@ -329,7 +328,6 @@ class pool_memory_resource final * * @param ptr The pointer to the memory to free. * @param size The size of the memory to free. Must be equal to the original allocation size. - * @param stream The stream-event pair for the stream on which the memory was last used. * @return The (now freed) block associated with `p`. The caller is expected to return the block * to the pool. 
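A minimal sketch of constructing the pool resource whose internals are documented above (the initial size is arbitrary, and at this version the size parameters are optional):

```cpp
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

void pool_usage()
{
  rmm::mr::cuda_memory_resource upstream;
  // Start with a 64 MiB pool; the pool grows from upstream as needed.
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool{&upstream, 64u << 20};

  void* ptr = pool.allocate(1024);
  pool.deallocate(ptr, 1024);
}
```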
*/ diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp index c3fe3010f..7af75593d 100644 --- a/include/rmm/mr/device/statistics_resource_adaptor.hpp +++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp @@ -40,17 +40,24 @@ template class statistics_resource_adaptor final : public device_memory_resource { public: // can be a std::shared_mutex once C++17 is adopted - using read_lock_t = std::shared_lock; - using write_lock_t = std::unique_lock; - + using read_lock_t = + std::shared_lock; ///< Type of lock used to synchronize read access + using write_lock_t = + std::unique_lock; ///< Type of lock used to synchronize write access /** * @brief Utility struct for counting the current, peak, and total value of a number */ struct counter { - int64_t value{0}; // Current value - int64_t peak{0}; // Max value of `value` - int64_t total{0}; // Sum of all added values + int64_t value{0}; ///< Current value + int64_t peak{0}; ///< Max value of `value` + int64_t total{0}; ///< Sum of all added values + /** + * @brief Add `val` to the current value and update the peak value if necessary + * + * @param val Value to add + * @return Reference to this object + */ counter& operator+=(int64_t val) { value += val; @@ -59,6 +66,12 @@ class statistics_resource_adaptor final : public device_memory_resource { return *this; } + /** + * @brief Subtract `val` from the current value and update the peak value if necessary + * + * @param val Value to subtract + * @return Reference to this object + */ counter& operator-=(int64_t val) { value -= val; @@ -79,17 +92,17 @@ class statistics_resource_adaptor final : public device_memory_resource { RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } - statistics_resource_adaptor() = delete; - ~statistics_resource_adaptor() override = default; - statistics_resource_adaptor(statistics_resource_adaptor const&) = delete; - statistics_resource_adaptor& operator=(statistics_resource_adaptor const&) = delete; - statistics_resource_adaptor(statistics_resource_adaptor&&) noexcept = default; - statistics_resource_adaptor& operator=(statistics_resource_adaptor&&) noexcept = default; + statistics_resource_adaptor() = delete; + ~statistics_resource_adaptor() override = default; + statistics_resource_adaptor(statistics_resource_adaptor const&) = delete; + statistics_resource_adaptor& operator=(statistics_resource_adaptor const&) = delete; + statistics_resource_adaptor(statistics_resource_adaptor&&) noexcept = + default; ///< @default_move_constructor + statistics_resource_adaptor& operator=(statistics_resource_adaptor&&) noexcept = + default; ///< @default_move_assignment{statistics_resource_adaptor} /** - * @brief Return pointer to the upstream resource. - * - * @return Upstream* Pointer to the upstream resource. 
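A hedged usage sketch for the statistics adaptor and its `counter` struct; the accessor name `get_bytes_counter` is assumed from this header's public API:

```cpp
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

#include <cstdio>

void track_statistics()
{
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::statistics_resource_adaptor<rmm::mr::cuda_memory_resource> stats{&upstream};

  void* ptr   = stats.allocate(4096);
  auto bytes  = stats.get_bytes_counter();  // counter with value/peak/total
  std::printf("current: %lld peak: %lld total: %lld\n",
              static_cast<long long>(bytes.value),
              static_cast<long long>(bytes.peak),
              static_cast<long long>(bytes.total));
  stats.deallocate(ptr, 4096);
}
```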
+ * @briefreturn{Pointer to the upstream resource} */ Upstream* get_upstream() const noexcept { return upstream_; } diff --git a/include/rmm/mr/device/thread_safe_resource_adaptor.hpp b/include/rmm/mr/device/thread_safe_resource_adaptor.hpp index 37053cd07..0b8570985 100644 --- a/include/rmm/mr/device/thread_safe_resource_adaptor.hpp +++ b/include/rmm/mr/device/thread_safe_resource_adaptor.hpp @@ -35,7 +35,7 @@ namespace rmm::mr { template class thread_safe_resource_adaptor final : public device_memory_resource { public: - using lock_t = std::lock_guard; + using lock_t = std::lock_guard; ///< Type of lock used to synchronize access /** * @brief Construct a new thread safe resource adaptor using `upstream` to satisfy diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp index 2b2aee0dd..de2f25043 100644 --- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp +++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp @@ -38,9 +38,9 @@ namespace rmm::mr { template class thrust_allocator : public thrust::device_malloc_allocator { public: - using Base = thrust::device_malloc_allocator; - using pointer = typename Base::pointer; - using size_type = typename Base::size_type; + using Base = thrust::device_malloc_allocator; ///< The base type of this allocator + using pointer = typename Base::pointer; ///< The pointer type + using size_type = typename Base::size_type; ///< The size type /** * @brief Provides the type of a `thrust_allocator` instantiated with another @@ -50,7 +50,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ template struct rebind { - using other = thrust_allocator; + using other = thrust_allocator; ///< The type to bind to }; /** @@ -113,12 +113,12 @@ class thrust_allocator : public thrust::device_malloc_allocator { } /** - * @brief Returns the device memory resource used by this allocator. + * @briefreturn{The device memory resource used by this} */ [[nodiscard]] device_memory_resource* resource() const noexcept { return _mr; } /** - * @brief Returns the stream used by this allocator. + * @briefreturn{The stream used by this allocator} */ [[nodiscard]] cuda_stream_view stream() const noexcept { return _stream; } diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index 0bbdc7468..b87218d12 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -48,9 +48,10 @@ template class tracking_resource_adaptor final : public device_memory_resource { public: // can be a std::shared_mutex once C++17 is adopted - using read_lock_t = std::shared_lock; - using write_lock_t = std::unique_lock; - + using read_lock_t = + std::shared_lock; ///< Type of lock used to synchronize read access + using write_lock_t = + std::unique_lock; ///< Type of lock used to synchronize write access /** * @brief Information stored about an allocation. 
Includes the size * and a stack trace if the `tracking_resource_adaptor` was initialized @@ -58,10 +59,16 @@ class tracking_resource_adaptor final : public device_memory_resource { * */ struct allocation_info { - std::unique_ptr strace; - std::size_t allocation_size; + std::unique_ptr strace; ///< Stack trace of the allocation + std::size_t allocation_size; ///< Size of the allocation allocation_info() = delete; + /** + * @brief Construct a new allocation info object + * + * @param size Size of the allocation + * @param capture_stack If true, capture the stack trace for the allocation + */ allocation_info(std::size_t size, bool capture_stack) : strace{[&]() { return capture_stack ? std::make_unique() : nullptr; @@ -84,17 +91,17 @@ class tracking_resource_adaptor final : public device_memory_resource { RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } - tracking_resource_adaptor() = delete; - ~tracking_resource_adaptor() override = default; - tracking_resource_adaptor(tracking_resource_adaptor const&) = delete; - tracking_resource_adaptor& operator=(tracking_resource_adaptor const&) = delete; - tracking_resource_adaptor(tracking_resource_adaptor&&) noexcept = default; - tracking_resource_adaptor& operator=(tracking_resource_adaptor&&) noexcept = default; + tracking_resource_adaptor() = delete; + ~tracking_resource_adaptor() override = default; + tracking_resource_adaptor(tracking_resource_adaptor const&) = delete; + tracking_resource_adaptor(tracking_resource_adaptor&&) noexcept = + default; ///< @default_move_constructor + tracking_resource_adaptor& operator=(tracking_resource_adaptor const&) = delete; + tracking_resource_adaptor& operator=(tracking_resource_adaptor&&) noexcept = + default; ///< @default_move_assignment{tracking_resource_adaptor} /** - * @brief Return pointer to the upstream resource. - * - * @return Upstream* Pointer to the upstream resource. + * @briefreturn{Pointer to the upstream resource} */ Upstream* get_upstream() const noexcept { return upstream_; } diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index c0fe85594..3279bfb3f 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -46,12 +46,14 @@ namespace rmm::mr { */ class host_memory_resource { public: - host_memory_resource() = default; - virtual ~host_memory_resource() = default; - host_memory_resource(host_memory_resource const&) = default; - host_memory_resource& operator=(host_memory_resource const&) = default; - host_memory_resource(host_memory_resource&&) noexcept = default; - host_memory_resource& operator=(host_memory_resource&&) noexcept = default; + host_memory_resource() = default; + virtual ~host_memory_resource() = default; + host_memory_resource(host_memory_resource const&) = default; ///< @default_copy_constructor + host_memory_resource(host_memory_resource&&) noexcept = default; ///< @default_move_constructor + host_memory_resource& operator=(host_memory_resource const&) = + default; ///< @default_copy_assignment{host_memory_resource} + host_memory_resource& operator=(host_memory_resource&&) noexcept = + default; ///< @default_move_assignment{host_memory_resource} /** * @brief Allocates memory on the host of size at least `bytes` bytes. @@ -84,7 +86,6 @@ class host_memory_resource { * that was passed to the `allocate` call that returned `ptr`. * @param alignment Alignment of the allocation. 
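For the tracking adaptor documented above, a brief sketch: with `capture_stacks = true`, each outstanding allocation records the stack trace described by `allocation_info` (the `get_allocated_bytes` accessor is assumed from this header):

```cpp
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/tracking_resource_adaptor.hpp>

#include <cstdio>

void find_leaks()
{
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::tracking_resource_adaptor<rmm::mr::cuda_memory_resource> tracking{
    &upstream, /*capture_stacks=*/true};

  void* ptr = tracking.allocate(2048);
  // Anything still outstanding here is a candidate leak.
  std::printf("outstanding bytes: %zu\n", tracking.get_allocated_bytes());
  tracking.deallocate(ptr, 2048);
}
```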
   * that was passed to the `allocate` call that returned `ptr`.
-   * @param stream Stream on which to perform deallocation
   */
  void deallocate(void* ptr, std::size_t bytes, std::size_t alignment = alignof(std::max_align_t))
  {
diff --git a/include/rmm/mr/host/new_delete_resource.hpp b/include/rmm/mr/host/new_delete_resource.hpp
index ffd3345bb..68cccc320 100644
--- a/include/rmm/mr/host/new_delete_resource.hpp
+++ b/include/rmm/mr/host/new_delete_resource.hpp
@@ -30,12 +30,14 @@ namespace rmm::mr {
  */
 class new_delete_resource final : public host_memory_resource {
  public:
-  new_delete_resource()                                      = default;
-  ~new_delete_resource() override                            = default;
-  new_delete_resource(new_delete_resource const&)            = default;
-  new_delete_resource(new_delete_resource&&)                 = default;
-  new_delete_resource& operator=(new_delete_resource const&) = default;
-  new_delete_resource& operator=(new_delete_resource&&)      = default;
+  new_delete_resource()           = default;
+  ~new_delete_resource() override = default;
+  new_delete_resource(new_delete_resource const&) = default;  ///< @default_copy_constructor
+  new_delete_resource(new_delete_resource&&)      = default;  ///< @default_move_constructor
+  new_delete_resource& operator=(new_delete_resource const&) =
+    default;  ///< @default_copy_assignment{new_delete_resource}
+  new_delete_resource& operator=(new_delete_resource&&) =
+    default;  ///< @default_move_assignment{new_delete_resource}
 
  private:
  /**
diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp
index 3e0c5e3e6..fb28ebfb7 100644
--- a/include/rmm/mr/host/pinned_memory_resource.hpp
+++ b/include/rmm/mr/host/pinned_memory_resource.hpp
@@ -24,7 +24,7 @@
 
 namespace rmm::mr {
 
-/*
+/**
  * @brief A `host_memory_resource` that uses `cudaMallocHost` to allocate
  * pinned/page-locked host memory.
* @@ -32,12 +32,14 @@ namespace rmm::mr { */ class pinned_memory_resource final : public host_memory_resource { public: - pinned_memory_resource() = default; - ~pinned_memory_resource() override = default; - pinned_memory_resource(pinned_memory_resource const&) = default; - pinned_memory_resource(pinned_memory_resource&&) = default; - pinned_memory_resource& operator=(pinned_memory_resource const&) = default; - pinned_memory_resource& operator=(pinned_memory_resource&&) = default; + pinned_memory_resource() = default; + ~pinned_memory_resource() override = default; + pinned_memory_resource(pinned_memory_resource const&) = default; ///< @default_copy_constructor + pinned_memory_resource(pinned_memory_resource&&) = default; ///< @default_move_constructor + pinned_memory_resource& operator=(pinned_memory_resource const&) = + default; ///< @default_copy_assignment{pinned_memory_resource} + pinned_memory_resource& operator=(pinned_memory_resource&&) = + default; ///< @default_move_assignment{pinned_memory_resource} private: /** diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b26a6dd43..3506bf9a9 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -14,11 +14,9 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(rmm_version 23.08.00) +set(rmm_version 23.10.00) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) -include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(../fetch_rapids.cmake) project( rmm-python @@ -58,5 +56,7 @@ endif() include(rapids-cython) rapids_cython_init() +add_compile_definitions("SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") + add_subdirectory(rmm/_cuda) add_subdirectory(rmm/_lib) diff --git a/python/docs/Makefile b/python/docs/Makefile index d4bb2cbb9..72d1dcd85 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -3,7 +3,7 @@ # You can set these variables from the command line, and also # from the environment for the first two. -SPHINXOPTS ?= +SPHINXOPTS = -n -v -W --keep-going SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build diff --git a/python/docs/api.rst b/python/docs/api.rst index 73cd5dd81..b229d8214 100644 --- a/python/docs/api.rst +++ b/python/docs/api.rst @@ -15,6 +15,7 @@ Memory Resources .. automodule:: rmm.mr :members: + :inherited-members: :undoc-members: :show-inheritance: @@ -28,6 +29,7 @@ Memory Allocators .. automodule:: rmm.allocators.numba :members: + :inherited-members: :undoc-members: :show-inheritance: diff --git a/python/docs/basics.md b/python/docs/basics.md index 0c47073c1..997745f00 100644 --- a/python/docs/basics.md +++ b/python/docs/basics.md @@ -91,7 +91,7 @@ example, enabling the `ManagedMemoryResource` tells RMM to use > :warning: The default resource must be set for any device **before** > allocating any device memory on that device. Setting or changing the > resource after device allocations have been made can lead to unexpected -> behaviour or crashes. See [Multiple Devices](#multiple-devices) +> behaviour or crashes. As another example, `PoolMemoryResource` allows you to allocate a large "pool" of device memory up-front. Subsequent allocations will diff --git a/python/docs/conf.py b/python/docs/conf.py index f6f3adf43..17c012194 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,9 +23,9 @@ # built documents. # # The short X.Y version. -version = "23.08" +version = "23.10" # The full version, including alpha/beta/rc tags. 
-release = "23.08.00" +release = "23.10.00" # -- General configuration --------------------------------------------------- @@ -35,6 +35,7 @@ # ones. extensions = [ + "sphinxcontrib.jquery", "sphinx.ext.intersphinx", "sphinx.ext.autodoc", "sphinx.ext.autosummary", @@ -113,7 +114,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] +html_static_path = [] # -- Options for HTMLHelp output ------------------------------------------ @@ -179,7 +180,10 @@ # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {"https://docs.python.org/": None} +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "numba": ("https://numba.readthedocs.io/en/stable", None), +} # Config numpydoc numpydoc_show_inherited_class_members = True @@ -187,6 +191,11 @@ autoclass_content = "init" +nitpick_ignore = [ + ("py:class", "size_t"), + ("py:class", "void"), +] + def setup(app): app.add_js_file("copybutton_pydocs.js") diff --git a/python/pyproject.toml b/python/pyproject.toml index 58ac1d5ad..f83e4fec4 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -17,7 +17,7 @@ build-backend = "setuptools.build_meta" requires = [ "cmake>=3.26.4", "cuda-python>=11.7.1,<12.0a0", - "cython>=0.29,<0.30", + "cython>=3.0.0", "ninja", "scikit-build>=0.13.1", "setuptools>=61.0.0", @@ -27,7 +27,7 @@ requires = [ [project] name = "rmm" -version = "23.08.00" +version = "23.10.00" description = "rmm - RAPIDS Memory Manager" readme = { file = "README.md", content-type = "text/markdown" } authors = [ diff --git a/python/rmm/__init__.py b/python/rmm/__init__.py index a5467251e..52d232dc2 100644 --- a/python/rmm/__init__.py +++ b/python/rmm/__init__.py @@ -14,6 +14,15 @@ from rmm import mr from rmm._lib.device_buffer import DeviceBuffer +from rmm._lib.logger import ( + flush_logger, + get_flush_level, + get_logging_level, + logging_level, + set_flush_level, + set_logging_level, + should_log, +) from rmm.mr import disable_logging, enable_logging, get_log_filenames from rmm.rmm import ( RMMError, @@ -25,15 +34,22 @@ __all__ = [ "DeviceBuffer", - "RMMError", "disable_logging", + "RMMError", "enable_logging", + "flush_logger", + "get_flush_level", "get_log_filenames", + "get_logging_level", "is_initialized", + "logging_level", "mr", "register_reinitialize_hook", "reinitialize", + "set_flush_level", + "set_logging_level", + "should_log", "unregister_reinitialize_hook", ] -__version__ = "23.08.00" +__version__ = "23.10.00" diff --git a/python/rmm/_cuda/stream.pxd b/python/rmm/_cuda/stream.pxd index 6aa4e0b24..3c3d3aa6f 100644 --- a/python/rmm/_cuda/stream.pxd +++ b/python/rmm/_cuda/stream.pxd @@ -26,8 +26,8 @@ cdef class Stream: @staticmethod cdef Stream _from_cudaStream_t(cudaStream_t s, object owner=*) - cdef cuda_stream_view view(self) nogil except * - cdef void c_synchronize(self) nogil except * - cdef bool c_is_default(self) nogil except * + cdef cuda_stream_view view(self) except * nogil + cdef void c_synchronize(self) except * nogil + cdef bool c_is_default(self) except * nogil cdef void _init_with_new_cuda_stream(self) except * cdef void _init_from_stream(self, Stream stream) except * diff --git a/python/rmm/_cuda/stream.pyx b/python/rmm/_cuda/stream.pyx index d60dde4e1..4795cbb9f 100644 --- a/python/rmm/_cuda/stream.pyx +++ 
b/python/rmm/_cuda/stream.pyx
@@ -48,7 +48,7 @@ cdef class Stream:
             self._init_from_cupy_stream(obj)
 
     @staticmethod
-    cdef Stream _from_cudaStream_t(cudaStream_t s, object owner=None):
+    cdef Stream _from_cudaStream_t(cudaStream_t s, object owner=None) except *:
         """
         Construct a Stream from a cudaStream_t.
         """
@@ -57,13 +57,13 @@ cdef class Stream:
         obj._owner = owner
         return obj
 
-    cdef cuda_stream_view view(self) nogil except *:
+    cdef cuda_stream_view view(self) except * nogil:
         """
         Generate a rmm::cuda_stream_view from this Stream instance
         """
         return cuda_stream_view(<cudaStream_t>(self._cuda_stream))
 
-    cdef void c_synchronize(self) nogil except *:
+    cdef void c_synchronize(self) except * nogil:
         """
         Synchronize the CUDA stream. This function *must* be called in a
         `with nogil` block
@@ -77,7 +77,7 @@ cdef class Stream:
         with nogil:
             self.c_synchronize()
 
-    cdef bool c_is_default(self) nogil except *:
+    cdef bool c_is_default(self) except * nogil:
         """
         Check if we are the default CUDA stream
         """
diff --git a/python/rmm/_lib/CMakeLists.txt b/python/rmm/_lib/CMakeLists.txt
index 9e90d7e99..852dd87c4 100644
--- a/python/rmm/_lib/CMakeLists.txt
+++ b/python/rmm/_lib/CMakeLists.txt
@@ -12,10 +12,12 @@
 # the License.
 # =============================================================================
 
-set(cython_sources device_buffer.pyx lib.pyx memory_resource.pyx cuda_stream.pyx
+set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx
     torch_allocator.pyx)
 set(linked_libraries rmm::rmm)
 
 # Build all of the Cython targets
 rapids_cython_create_modules(SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES
                              "${linked_libraries}" CXX)
+# The cdef public functions in this file need to have a C ABI
+target_compile_definitions(torch_allocator PRIVATE CYTHON_EXTERN_C=extern\ "C")
diff --git a/python/rmm/_lib/cuda_stream.pxd b/python/rmm/_lib/cuda_stream.pxd
index 1eed1cefb..e224cf9af 100644
--- a/python/rmm/_lib/cuda_stream.pxd
+++ b/python/rmm/_lib/cuda_stream.pxd
@@ -33,5 +33,5 @@ cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil:
 @cython.final
 cdef class CudaStream:
     cdef unique_ptr[cuda_stream] c_obj
-    cdef cudaStream_t value(self) nogil except *
-    cdef bool is_valid(self) nogil except *
+    cdef cudaStream_t value(self) except * nogil
+    cdef bool is_valid(self) except * nogil
diff --git a/python/rmm/_lib/cuda_stream.pyx b/python/rmm/_lib/cuda_stream.pyx
index fb35ec11f..0861f0663 100644
--- a/python/rmm/_lib/cuda_stream.pyx
+++ b/python/rmm/_lib/cuda_stream.pyx
@@ -27,8 +27,8 @@ cdef class CudaStream:
     def __cinit__(self):
         self.c_obj.reset(new cuda_stream())
 
-    cdef cudaStream_t value(self) nogil except *:
+    cdef cudaStream_t value(self) except * nogil:
         return self.c_obj.get()[0].value()
 
-    cdef bool is_valid(self) nogil except *:
+    cdef bool is_valid(self) except * nogil:
         return self.c_obj.get()[0].is_valid()
diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd
index 364dbb2c0..3d5f29f9a 100644
--- a/python/rmm/_lib/device_buffer.pxd
+++ b/python/rmm/_lib/device_buffer.pxd
@@ -56,7 +56,7 @@ cdef class DeviceBuffer:
 
     @staticmethod
     cdef DeviceBuffer c_to_device(const unsigned char[::1] b,
-                                  Stream stream=*)
+                                  Stream stream=*) except *
     cpdef copy_to_host(self, ary=*, Stream stream=*)
     cpdef copy_from_host(self, ary, Stream stream=*)
     cpdef copy_from_device(self, cuda_ary, Stream stream=*)
diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx
index c2bfd1459..d248d01ab 100644
--- a/python/rmm/_lib/device_buffer.pyx
+++ b/python/rmm/_lib/device_buffer.pyx
@@ -172,7 +172,7 @@ cdef class DeviceBuffer:
     @staticmethod
     cdef DeviceBuffer c_to_device(const unsigned char[::1] b,
-                                  Stream stream=DEFAULT_STREAM):
+                                  Stream stream=DEFAULT_STREAM) except *:
         """Calls ``to_device`` function on arguments provided"""
         return to_device(b, stream)
 
@@ -382,7 +382,7 @@ cdef void _copy_async(const void* src,
                       void* dst,
                       size_t count,
                       ccudart.cudaMemcpyKind kind,
-                      cuda_stream_view stream) nogil except *:
+                      cuda_stream_view stream) except * nogil:
     """
     Asynchronously copy data between host and/or device pointers.
diff --git a/python/rmm/_lib/logger.pyx b/python/rmm/_lib/logger.pyx
new file mode 100644
index 000000000..029bbdd79
--- /dev/null
+++ b/python/rmm/_lib/logger.pyx
@@ -0,0 +1,260 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+from libcpp cimport bool
+
+
+cdef extern from "spdlog/common.h" namespace "spdlog::level" nogil:
+    cpdef enum logging_level "spdlog::level::level_enum":
+        """
+        The debug logging level for RMM.
+
+        Debug logging prints messages to a log file. See
+        `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+        for more information.
+
+        Valid levels, in decreasing order of verbosity, are TRACE, DEBUG,
+        INFO, WARN, ERR, CRITICAL, and OFF. Default is INFO.
+
+        Examples
+        --------
+        >>> import rmm
+        >>> rmm.logging_level.DEBUG
+        <logging_level.DEBUG: 1>
+        >>> rmm.logging_level.DEBUG.value
+        1
+        >>> rmm.logging_level.DEBUG.name
+        'DEBUG'
+
+        See Also
+        --------
+        set_logging_level : Set the debug logging level
+        get_logging_level : Get the current debug logging level
+        """
+        TRACE "spdlog::level::trace"
+        DEBUG "spdlog::level::debug"
+        INFO "spdlog::level::info"
+        WARN "spdlog::level::warn"
+        ERR "spdlog::level::err"
+        CRITICAL "spdlog::level::critical"
+        OFF "spdlog::level::off"
+
+
+cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil:
+    cdef cppclass spdlog_logger "spdlog::logger":
+        spdlog_logger() except +
+        void set_level(logging_level level)
+        logging_level level()
+        void flush() except +
+        void flush_on(logging_level level)
+        logging_level flush_level()
+        bool should_log(logging_level msg_level)
+
+
+cdef extern from "rmm/logger.hpp" namespace "rmm" nogil:
+    cdef spdlog_logger& logger() except +
+
+
+def _validate_level_type(level):
+    if not isinstance(level, logging_level):
+        raise TypeError("level must be an instance of the logging_level enum")
+
+
+def should_log(level):
+    """
+    Check if a message at the given level would be logged.
+
+    A message at the given level would be logged if the current debug logging
+    level is set to a level at least as verbose as the given level,
+    *and* the RMM module is compiled for a logging level at least as verbose.
+    If these conditions are not both met, this function will return False.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Parameters
+    ----------
+    level : logging_level
+        The debug logging level. Valid values are instances of the
+        ``logging_level`` enum.
+
+    Returns
+    -------
+    should_log : bool
+        True if a message at the given level would be logged, False otherwise.
+
+    Raises
+    ------
+    TypeError
+        If the logging level is not an instance of the ``logging_level`` enum.
+    """
+    _validate_level_type(level)
+    return logger().should_log(level)
+
+
+def set_logging_level(level):
+    """
+    Set the debug logging level.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Parameters
+    ----------
+    level : logging_level
+        The debug logging level. Valid values are instances of the
+        ``logging_level`` enum.
+
+    Raises
+    ------
+    TypeError
+        If the logging level is not an instance of the ``logging_level`` enum.
+
+    See Also
+    --------
+    get_logging_level : Get the current debug logging level.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.set_logging_level(rmm.logging_level.WARN)  # set logging level to warn
+    """
+    _validate_level_type(level)
+    logger().set_level(level)
+
+    if not should_log(level):
+        warnings.warn(f"RMM will not log logging_level.{level.name}. This "
+                      "may be because the C++ library is compiled for a "
+                      "less-verbose logging level.")
+
+
+def get_logging_level():
+    """
+    Get the current debug logging level.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Returns
+    -------
+    level : logging_level
+        The current debug logging level, an instance of the ``logging_level``
+        enum.
+
+    See Also
+    --------
+    set_logging_level : Set the debug logging level.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.get_logging_level()  # get current logging level
+    <logging_level.INFO: 2>
+    """
+    return logging_level(logger().level())
+
+
+def flush_logger():
+    """
+    Flush the debug logger. This will cause any buffered log messages to
+    be written to the log file.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    See Also
+    --------
+    set_flush_level : Set the flush level for the debug logger.
+    get_flush_level : Get the current debug logging flush level.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.flush_logger()  # flush the logger
+    """
+    logger().flush()
+
+
+def set_flush_level(level):
+    """
+    Set the flush level for the debug logger. Messages of this level or higher
+    will automatically flush to the file.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Parameters
+    ----------
+    level : logging_level
+        The debug logging level. Valid values are instances of the
+        ``logging_level`` enum.
+
+    Raises
+    ------
+    TypeError
+        If the logging level is not an instance of the ``logging_level`` enum.
+
+    See Also
+    --------
+    get_flush_level : Get the current debug logging flush level.
+    flush_logger : Flush the logger.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.set_flush_level(rmm.logging_level.WARN)  # set flush level to warn
+    """
+    _validate_level_type(level)
+    logger().flush_on(level)
+
+    if not should_log(level):
+        warnings.warn(f"RMM will not log logging_level.{level.name}. This "
+                      "may be because the C++ library is compiled for a "
+                      "less-verbose logging level.")
+
+
+def get_flush_level():
+    """
+    Get the current debug logging flush level for the RMM logger. Messages of
+    this level or higher will automatically flush to the file.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Returns
+    -------
+    logging_level
+        The current flush level, an instance of the ``logging_level``
+        enum.
+
+    See Also
+    --------
+    set_flush_level : Set the flush level for the logger.
+    flush_logger : Flush the logger.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.get_flush_level()  # get current flush level
+    <logging_level.INFO: 2>
+    """
+    return logging_level(logger().flush_level())
diff --git a/python/rmm/_lib/memory_resource.pxd b/python/rmm/_lib/memory_resource.pxd
index 5bb3746bc..0770fb8ed 100644
--- a/python/rmm/_lib/memory_resource.pxd
+++ b/python/rmm/_lib/memory_resource.pxd
@@ -69,6 +69,9 @@ cdef class CallbackMemoryResource(DeviceMemoryResource):
     cdef object _allocate_func
     cdef object _deallocate_func
 
+cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor):
+    pass
+
 cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor):
     cdef object _log_file_name
     cpdef get_file_name(self)
diff --git a/python/rmm/_lib/memory_resource.pyx b/python/rmm/_lib/memory_resource.pyx
index 774db374a..ce7f45e19 100644
--- a/python/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/_lib/memory_resource.pyx
@@ -14,10 +14,14 @@
 import os
 import warnings
 
+# This import is needed for Cython typing in translate_python_except_to_cpp
+# See https://github.com/cython/cython/issues/5589
+from builtins import BaseException
 from collections import defaultdict
 
 cimport cython
 from cython.operator cimport dereference as deref
+from libc.stddef cimport size_t
 from libc.stdint cimport int8_t, int64_t, uintptr_t
 from libcpp cimport bool
 from libcpp.memory cimport make_unique, unique_ptr
@@ -37,7 +41,7 @@ from rmm._lib.per_device_resource cimport (
 # Transparent handle of a C++ exception
 ctypedef pair[int, string] CppExcept
 
-cdef CppExcept translate_python_except_to_cpp(err: BaseException):
+cdef CppExcept translate_python_except_to_cpp(err: BaseException) noexcept:
     """Translate a Python exception into a C++ exception handle
 
     The returned exception handle can then be thrown by `throw_cpp_except()`,
@@ -160,6 +164,16 @@ cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \
                                 size_t allocation_size,
                                 device_memory_resource* bin_resource) except +
 
+cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource):
+        limiting_resource_adaptor(
+            Upstream* upstream_mr,
+            size_t allocation_limit) except +
+
+        size_t get_allocated_bytes() except +
+        size_t get_allocation_limit() except +
+
 cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \
         namespace "rmm::mr" nogil:
     cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource):
@@ -213,18 +227,41 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \
 
 cdef class DeviceMemoryResource:
 
     cdef device_memory_resource* get_mr(self):
+        """Get the underlying C++ memory resource object."""
         return self.c_obj.get()
 
     def allocate(self, size_t nbytes):
+        """Allocate ``nbytes`` bytes of memory.
+
+        Parameters
+        ----------
+        nbytes : size_t
+            The size of the allocation in bytes
+        """
         return self.c_obj.get().allocate(nbytes)
 
     def deallocate(self, uintptr_t ptr, size_t nbytes):
+        """Deallocate memory pointed to by ``ptr`` of size ``nbytes``.
+
+        Parameters
+        ----------
+        ptr : uintptr_t
+            Pointer to be deallocated
+        nbytes : size_t
+            Size of the allocation in bytes
+        """
         self.c_obj.get().deallocate(<void*>(ptr), nbytes)
 
 # See the note about `no_gc_clear` in `device_buffer.pyx`.
 @cython.no_gc_clear
 cdef class UpstreamResourceAdaptor(DeviceMemoryResource):
+    """Parent class for all memory resources that track an upstream.
+
+    Upstream resource tracking requires maintaining a reference to the
+    upstream mr so that it is kept alive and may be accessed by any
+    downstream resource adaptors.
+    """
 
     def __cinit__(self, DeviceMemoryResource upstream_mr, *args, **kwargs):
 
@@ -526,7 +563,10 @@ cdef void* _allocate_callback_wrapper(
     size_t nbytes,
     cuda_stream_view stream,
     void* ctx
-) nogil:
+    # Note that this function is specifically designed to rethrow Python
+    # exceptions as C++ exceptions when called as a callback from C++, so it
+    # is noexcept from Cython's perspective.
+) noexcept nogil:
     cdef CppExcept err
     with gil:
         try:
@@ -540,7 +580,7 @@ cdef void _deallocate_callback_wrapper(
     size_t nbytes,
     cuda_stream_view stream,
     void* ctx
-) with gil:
+) except * with gil:
     (<object>ctx)(<uintptr_t>(ptr), nbytes)
 
 
@@ -619,6 +659,61 @@ def _append_id(filename, id):
     return f"{name}.dev{id}{ext}"
 
 
+cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor):
+
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        size_t allocation_limit
+    ):
+        self.c_obj.reset(
+            new limiting_resource_adaptor[device_memory_resource](
+                upstream_mr.get_mr(),
+                allocation_limit
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        size_t allocation_limit
+    ):
+        """
+        Memory resource that limits the total allocations possible from its
+        upstream memory resource.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The upstream memory resource.
+        allocation_limit : size_t
+            Maximum memory allowed for this allocator.
+        """
+        pass
+
+    def get_allocated_bytes(self) -> size_t:
+        """
+        Query the number of bytes that have been allocated. Note that this
+        cannot be used to know how large an allocation is possible, due to
+        both possible fragmentation and internal page sizes and alignment
+        that are not tracked by this allocator.
+        """
+        return (<limiting_resource_adaptor[device_memory_resource]*>(
+            self.c_obj.get())
+        )[0].get_allocated_bytes()
+
+    def get_allocation_limit(self) -> size_t:
+        """
+        Query the maximum number of bytes that this allocator is allowed to
+        allocate. This is the limit on the allocator and not a representation
+        of the underlying device. The device may not be able to support this
+        limit.
+        """
+        return (<limiting_resource_adaptor[device_memory_resource]*>(
+            self.c_obj.get())
+        )[0].get_allocation_limit()
+
+
 cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor):
     def __cinit__(
         self,
@@ -796,7 +891,10 @@ cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor):
         self.c_obj.get()))[0].log_outstanding_allocations()
 
 
-cdef bool _oom_callback_function(size_t bytes, void *callback_arg) nogil:
+# Note that this function is specifically designed to rethrow Python exceptions
+# as C++ exceptions when called as a callback from C++, so it is noexcept from
+# Cython's perspective.
+cdef bool _oom_callback_function(size_t bytes, void *callback_arg) noexcept nogil:
     cdef CppExcept err
     with gil:
         try:
diff --git a/python/rmm/allocators/numba.py b/python/rmm/allocators/numba.py
index 18a010e1c..5e87b87b6 100644
--- a/python/rmm/allocators/numba.py
+++ b/python/rmm/allocators/numba.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import ctypes
+import inspect
 
 from cuda.cuda import CUdeviceptr, cuIpcGetMemHandle
 from numba import config, cuda
@@ -112,6 +113,12 @@ def get_ipc_handle(self, memory):
         )
 
     def get_memory_info(self):
+        """Returns ``(free, total)`` memory in bytes in the context.
+
+        This implementation raises `NotImplementedError` because the
+        allocation will be performed using rmm's currently set default mr,
+        which may be a pool allocator.
+ """ raise NotImplementedError() @property @@ -119,6 +126,16 @@ def interface_version(self): return 1 +# The parent class docstrings contain references without fully qualified names, +# so we need to replace them here for our Sphinx docs to render properly. +for _, method in inspect.getmembers(RMMNumbaManager, inspect.isfunction): + if method.__doc__ is not None: + method.__doc__ = method.__doc__.replace( + ":class:`BaseCUDAMemoryManager`", + ":class:`numba.cuda.BaseCUDAMemoryManager`", + ) + + # Enables the use of RMM for Numba via an environment variable setting, # NUMBA_CUDA_MEMORY_MANAGER=rmm. See: # https://numba.readthedocs.io/en/stable/cuda/external-memory.html#environment-variable diff --git a/python/rmm/mr.py b/python/rmm/mr.py index ea0a0cf8c..4f6b801f5 100644 --- a/python/rmm/mr.py +++ b/python/rmm/mr.py @@ -19,11 +19,13 @@ DeviceMemoryResource, FailureCallbackResourceAdaptor, FixedSizeMemoryResource, + LimitingResourceAdaptor, LoggingResourceAdaptor, ManagedMemoryResource, PoolMemoryResource, StatisticsResourceAdaptor, TrackingResourceAdaptor, + UpstreamResourceAdaptor, _flush_logs, _initialize, disable_logging, @@ -45,12 +47,14 @@ "CudaMemoryResource", "DeviceMemoryResource", "FixedSizeMemoryResource", + "LimitingResourceAdaptor", "LoggingResourceAdaptor", "ManagedMemoryResource", "PoolMemoryResource", "StatisticsResourceAdaptor", "TrackingResourceAdaptor", "FailureCallbackResourceAdaptor", + "UpstreamResourceAdaptor", "_flush_logs", "_initialize", "set_per_device_resource", diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 70aafe601..fd537749b 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -16,6 +16,7 @@ import gc import os import pickle +import warnings from itertools import product import numpy as np @@ -586,6 +587,41 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): array_tester("u1", 2 * nelem, alloc) # should trigger release +@pytest.mark.parametrize( + "mr", + [ + rmm.mr.CudaMemoryResource, + pytest.param( + rmm.mr.CudaAsyncMemoryResource, + marks=pytest.mark.skipif( + not _CUDAMALLOC_ASYNC_SUPPORTED, + reason="cudaMallocAsync not supported", + ), + ), + ], +) +def test_limiting_resource_adaptor(mr): + cuda_mr = mr() + + allocation_limit = 1 << 20 + num_buffers = 2 + buffer_size = allocation_limit // num_buffers + + mr = rmm.mr.LimitingResourceAdaptor( + cuda_mr, allocation_limit=allocation_limit + ) + assert mr.get_allocation_limit() == allocation_limit + + rmm.mr.set_current_device_resource(mr) + + buffers = [rmm.DeviceBuffer(size=buffer_size) for _ in range(num_buffers)] + + assert mr.get_allocated_bytes() == sum(b.size for b in buffers) + + with pytest.raises(MemoryError): + rmm.DeviceBuffer(size=1) + + def test_statistics_resource_adaptor(stats_mr): buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)] @@ -907,3 +943,35 @@ def test_rmm_device_buffer_copy(cuda_ary, make_copy): result = db_copy.copy_to_host() np.testing.assert_equal(expected, result) + + +@pytest.mark.parametrize("level", rmm.logging_level) +def test_valid_logging_level(level): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="RMM will not log logging_level.TRACE." + ) + warnings.filterwarnings( + "ignore", message="RMM will not log logging_level.DEBUG." 
+ ) + rmm.set_logging_level(level) + assert rmm.get_logging_level() == level + rmm.set_logging_level(rmm.logging_level.INFO) # reset to default + + rmm.set_flush_level(level) + assert rmm.get_flush_level() == level + rmm.set_flush_level(rmm.logging_level.INFO) # reset to default + + rmm.should_log(level) + + +@pytest.mark.parametrize( + "level", ["INFO", 3, "invalid", 100, None, 1.2345, [1, 2, 3]] +) +def test_invalid_logging_level(level): + with pytest.raises(TypeError): + rmm.set_logging_level(level) + with pytest.raises(TypeError): + rmm.set_flush_level(level) + with pytest.raises(TypeError): + rmm.should_log(level) diff --git a/scripts/doxygen.sh b/scripts/doxygen.sh new file mode 100755 index 000000000..c584723ae --- /dev/null +++ b/scripts/doxygen.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. +############################## +# RMM doxygen warnings check # +############################## + +# skip if doxygen is not installed +if ! [ -x "$(command -v doxygen)" ]; then + echo -e "warning: doxygen is not installed" + exit 0 +fi + +# Utility to return version as number for comparison +function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; } + +# doxygen supported version 1.9.1 +DOXYGEN_VERSION=`doxygen --version` +if [ ! $(version "$DOXYGEN_VERSION") -eq $(version "1.9.1") ] ; then + echo -e "warning: Unsupported doxygen version $DOXYGEN_VERSION" + echo -e "Expecting doxygen version 1.9.1" + exit 0 +fi + +# Run doxygen, ignore missing tag files error +TAG_ERROR1="error: Tag file '.*.tag' does not exist or is not a file. Skipping it..." +TAG_ERROR2="error: cannot open tag file .*.tag for writing" +DOXYGEN_STDERR=`cd doxygen && { cat Doxyfile ; echo QUIET = YES; echo GENERATE_HTML = NO; } | doxygen - 2>&1 | sed "/\($TAG_ERROR1\|$TAG_ERROR2\)/d"` +RETVAL=$? + +if [ "$RETVAL" != "0" ] || [ ! -z "$DOXYGEN_STDERR" ]; then + echo -e "$DOXYGEN_STDERR" + RETVAL=1 #because return value is not generated by doxygen 1.8.20 +fi + +exit $RETVAL diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 24e9d240d..36c3aa043 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at
@@ -31,7 +31,11 @@ function(ConfigureTestInternal TEST_NAME)
              PROPERTIES POSITION_INDEPENDENT_CODE ON
                         RUNTIME_OUTPUT_DIRECTORY "$<TARGET_FILE_DIR:${TEST_NAME}>"
                         CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
-                        INSTALL_RPATH "\$ORIGIN/../../../lib")
+                        INSTALL_RPATH "\$ORIGIN/../../../lib"
+                        CXX_STANDARD 17
+                        CXX_STANDARD_REQUIRED ON
+                        CUDA_STANDARD 17
+                        CUDA_STANDARD_REQUIRED ON)
 
   target_compile_definitions(${TEST_NAME} PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}")
   target_compile_options(${TEST_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-Wall -Werror
diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp
index 23d0c1b85..48967d06a 100644
--- a/tests/mr/device/arena_mr_tests.cpp
+++ b/tests/mr/device/arena_mr_tests.cpp
@@ -486,7 +486,7 @@ TEST_F(ArenaTest, SizeSmallerThanSuperblockSize)  // NOLINT
 
 TEST_F(ArenaTest, AllocateNinetyPercent)  // NOLINT
 {
-  EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto)
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
     auto const free = rmm::detail::available_device_memory().first;
     auto const ninety_percent =
       rmm::detail::align_up(static_cast<std::size_t>(static_cast<double>(free) * 0.9),
@@ -497,7 +497,7 @@ TEST_F(ArenaTest, SmallMediumLarge)  // NOLINT
 {
-  EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto)
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
     arena_mr mr(rmm::mr::get_current_device_resource());
     auto* small  = mr.allocate(256);
     auto* medium = mr.allocate(64_MiB);
@@ -511,7 +511,7 @@ TEST_F(ArenaTest, Defragment)  // NOLINT
 {
-  EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto)
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
     auto const arena_size = superblock::minimum_size * 4;
     arena_mr mr(rmm::mr::get_current_device_resource(), arena_size);
     std::vector<std::thread> threads;
diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp
index 86cb6f106..209429b4b 100644
--- a/tests/mr/device/cuda_async_view_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_view_mr_tests.cpp
@@ -31,7 +31,7 @@ TEST(PoolTest, UsePool)
 {
   cudaMemPool_t memPool{};
   RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
-    &memPool, rmm::detail::current_device().value()));
+    &memPool, rmm::get_current_cuda_device().value()));
 
   const auto pool_init_size{100};
   cuda_async_view_mr mr{memPool};
@@ -44,7 +44,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
 {
   cudaMemPoolProps poolProps = {};
   poolProps.allocType        = cudaMemAllocationTypePinned;
-  poolProps.location.id      = rmm::detail::current_device().value();
+  poolProps.location.id      = rmm::get_current_cuda_device().value();
   poolProps.location.type    = cudaMemLocationTypeDevice;
 
   cudaMemPool_t memPool{};
diff --git a/tests/mr/device/failure_callback_mr_tests.cpp b/tests/mr/device/failure_callback_mr_tests.cpp
index 11c119304..bb5484c69 100644
--- a/tests/mr/device/failure_callback_mr_tests.cpp
+++ b/tests/mr/device/failure_callback_mr_tests.cpp
@@ -40,7 +40,7 @@ bool failure_handler(std::size_t /*bytes*/, void* arg)
     retried = true;
     return true;  // First time we request an allocation retry
   }
-  return false; // Second time we let the adaptor throw std::bad_alloc
+  return false;  // Second time we let the adaptor throw std::bad_alloc
 }
 
 TEST(FailureCallbackTest, RetryAllocationOnce)
diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp
index c5df1951c..4a234d2f9 100644
--- a/tests/mr/device/pool_mr_tests.cpp
+++ b/tests/mr/device/pool_mr_tests.cpp
@@ -14,10 +14,12 @@
  * limitations under the License.
  */
 
+#include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/aligned.hpp>
 #include <rmm/detail/cuda_util.hpp>
 #include <rmm/detail/error.hpp>
+#include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/limiting_resource_adaptor.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
@@ -100,7 +102,7 @@ TEST(PoolTest, ForceGrowth)
     EXPECT_NO_THROW(mr.allocate(1000));
     EXPECT_THROW(mr.allocate(4000), rmm::out_of_memory);  // too much
     EXPECT_NO_THROW(mr.allocate(500));
-    EXPECT_NO_THROW(mr.allocate(2000)); // fits
+    EXPECT_NO_THROW(mr.allocate(2000));  // fits
   }
 }
 
@@ -150,5 +152,42 @@ TEST(PoolTest, UpstreamDoesntSupportMemInfo)
   mr2.deallocate(ptr, 1024);
 }
 
+TEST(PoolTest, MultidevicePool)
+{
+  using MemoryResource = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>;
+
+  // Get the number of cuda devices
+  int num_devices = rmm::get_num_cuda_devices();
+
+  // only run on multidevice systems
+  if (num_devices >= 2) {
+    rmm::mr::cuda_memory_resource general_mr;
+
+    // initializing pool_memory_resource of multiple devices
+    int devices      = 2;
+    size_t pool_size = 1024;
+    std::vector<std::shared_ptr<MemoryResource>> mrs;
+
+    for (int i = 0; i < devices; ++i) {
+      RMM_CUDA_TRY(cudaSetDevice(i));
+      auto mr = std::make_shared<MemoryResource>(&general_mr, pool_size, pool_size);
+      rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, mr.get());
+      mrs.emplace_back(mr);
+    }
+
+    {
+      RMM_CUDA_TRY(cudaSetDevice(0));
+      rmm::device_buffer buf_a(16, rmm::cuda_stream_per_thread, mrs[0].get());
+
+      {
+        RMM_CUDA_TRY(cudaSetDevice(1));
+        rmm::device_buffer buf_b(16, rmm::cuda_stream_per_thread, mrs[1].get());
+      }
+
+      RMM_CUDA_TRY(cudaSetDevice(0));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace rmm::test
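
Taken together, the logger.pyx bindings in this patch give Python direct control over RMM's spdlog-based debug log. A minimal usage sketch of the exported functions (the level choices and printed message are illustrative only):

    import rmm

    # Levels below the compiled C++ logging level are compiled out, so
    # should_log() can be False even for levels the Python API accepts.
    if rmm.should_log(rmm.logging_level.DEBUG):
        print("DEBUG messages will be written to the RMM debug log")

    rmm.set_logging_level(rmm.logging_level.WARN)  # only WARN and above
    assert rmm.get_logging_level() == rmm.logging_level.WARN

    rmm.set_flush_level(rmm.logging_level.ERR)     # auto-flush on ERR or higher
    rmm.flush_logger()                             # force out buffered messages
    rmm.set_logging_level(rmm.logging_level.INFO)  # restore the default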
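The new LimitingResourceAdaptor can be exercised from Python much as test_limiting_resource_adaptor does above; a condensed sketch of the same flow (the limit and buffer sizes are arbitrary):

    import rmm

    limit = 1 << 20  # 1 MiB cap, chosen only for illustration
    mr = rmm.mr.LimitingResourceAdaptor(
        rmm.mr.CudaMemoryResource(), allocation_limit=limit
    )
    rmm.mr.set_current_device_resource(mr)

    buf = rmm.DeviceBuffer(size=512 * 1024)
    assert mr.get_allocation_limit() == limit
    assert mr.get_allocated_bytes() >= buf.size

    try:
        rmm.DeviceBuffer(size=limit)  # would exceed the cap
    except MemoryError:
        pass  # the adaptor refuses allocations past the limit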
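The MultidevicePool C++ test has a rough Python counterpart built from existing pieces; a sketch assuming at least two GPUs and cuda-python for device switching (the pool sizes are illustrative, and the error codes returned by cudaSetDevice are ignored for brevity):

    import rmm
    from cuda.cudart import cudaSetDevice  # cuda-python runtime API

    pools = []
    for device in range(2):
        cudaSetDevice(device)
        # one pool per device, carved out of the plain CUDA resource
        pool = rmm.mr.PoolMemoryResource(
            rmm.mr.CudaMemoryResource(),
            initial_pool_size=1 << 20,
            maximum_pool_size=1 << 20,
        )
        rmm.mr.set_per_device_resource(device, pool)
        pools.append(pool)

    cudaSetDevice(0)
    buf = rmm.DeviceBuffer(size=16)  # served from device 0's pool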
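Finally, the numba allocator touched above plugs RMM into Numba's external memory management interface; a short sketch of enabling it (the array contents are arbitrary):

    import numpy as np
    from numba import cuda
    from rmm.allocators.numba import RMMNumbaManager

    # Route Numba device allocations through the current RMM resource;
    # setting NUMBA_CUDA_MEMORY_MANAGER=rmm in the environment before
    # startup achieves the same thing, per the comment in numba.py.
    cuda.set_memory_manager(RMMNumbaManager)

    arr = cuda.to_device(np.arange(10))  # backed by RMM, not cudaMalloc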