Patch summary: move the wheel test job out of build-and-test.yml into the
reusable workflow test-wheel.yml, add a `local-ctk` matrix axis (1 = mini CTK,
0 = CTK wheels), and rename the docs workflow input `build_ctk_ver` to
`build-ctk-ver`.
Line breaks and indentation were restored from a whitespace-collapsed copy; if
context matching fails, apply with `git apply --ignore-whitespace`.

diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 15d33376..b19c6d6b 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -198,6 +198,9 @@ jobs:
           - "12.6.2"
           - "12.0.1"
           - "11.8.0"
+        local-ctk:
+          - 1  # use mini CTK
+          - 0  # use CTK wheels
         runner:
           - default
         include:
@@ -205,174 +208,23 @@ jobs:
             python-version: "3.12"
             cuda-version: "12.6.2"
             runner: H100
-    name: Test (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }}, Runner ${{ matrix.runner }})
+    name: Test (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }}, Runner ${{ matrix.runner }}, ${{ (matrix.local-ctk == '1' && 'local CTK') || 'CTK wheels' }})
     # The build stage could fail but we want the CI to keep moving.
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     permissions:
-      id-token: write  # This is required for configure-aws-credentials
       contents: read  # This is required for actions/checkout
-    runs-on: ${{ (matrix.runner == 'default' && matrix.host-platform == 'linux-64' && 'linux-amd64-gpu-v100-latest-1') ||
-                 (matrix.runner == 'default' && matrix.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') ||
-                 (matrix.runner == 'H100' && 'linux-amd64-gpu-h100-latest-1-testing') }}
-    # Our self-hosted runners require a container
-    # TODO: use a different (nvidia?) container
-    container:
-      options: -u root --security-opt seccomp=unconfined --shm-size 16g
-      image: ubuntu:22.04
-      env:
-        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
     needs:
       - build
-    defaults:
-      run:
-        shell: bash --noprofile --norc -xeuo pipefail {0}
-    steps:
-      - name: Ensure GPU is working
-        run: nvidia-smi
-
-      - name: Checkout ${{ github.event.repository.name }}
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Set environment variables
-        run: |
-          PYTHON_VERSION_FORMATTED=$(echo '${{ matrix.python-version }}' | tr -d '.')
-          if [[ "${{ matrix.host-platform }}" == linux* ]]; then
-            REPO_DIR=$(pwd)
-          elif [[ "${{ matrix.host-platform }}" == win* ]]; then
-            PWD=$(pwd)
-            REPO_DIR=$(cygpath -w $PWD)
-          fi
-
-          BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ needs.build.outputs.BUILD_CTK_VER }})"
-          TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ matrix.cuda-version }})"
-          if [[ $BUILD_CUDA_MAJOR != $TEST_CUDA_MAJOR ]]; then
-            SKIP_CUDA_BINDINGS_TEST=1
-          else
-            SKIP_CUDA_BINDINGS_TEST=0
-          fi
-
-          # make outputs from the previous job as env vars
-          CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ matrix.host-platform }}"
-          echo "PYTHON_VERSION_FORMATTED=${PYTHON_VERSION_FORMATTED}" >> $GITHUB_ENV
-          echo "CUDA_CORE_ARTIFACT_BASENAME=${CUDA_CORE_ARTIFACT_BASENAME}" >> $GITHUB_ENV
-          echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
-          echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV
-          CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ needs.build.outputs.BUILD_CTK_VER }}-${{ matrix.host-platform }}"
-          echo "CUDA_BINDINGS_ARTIFACT_BASENAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}" >> $GITHUB_ENV
-          echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
-          echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
-          echo "SKIP_CUDA_BINDINGS_TEST=${SKIP_CUDA_BINDINGS_TEST}" >> $GITHUB_ENV
-
-      - name: Install dependencies
-        uses: ./.github/actions/install_unix_deps
-        continue-on-error: false
-        with:
-          # gcc for Cython tests, jq/wget for artifact fetching
-          dependencies: "build-essential jq wget"
-          dependent_exes: "gcc jq wget"
-
-      - name: Download cuda.bindings build artifacts
-        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0'}}
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
-          path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
-
-      - name: Download cuda.bindings build artifacts from the prior branch
-        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '1'}}
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          # See https://github.com/cli/cli/blob/trunk/docs/install_linux.md#debian-ubuntu-linux-raspberry-pi-os-apt.
-          # gh is needed for artifact fetching.
-          mkdir -p -m 755 /etc/apt/keyrings \
-            && out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \
-            && cat $out | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \
-            && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
-            && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
-            && apt update \
-            && apt install gh -y
-
-          OLD_BRANCH=$(cat .github/BACKPORT_BRANCH)
-          OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ matrix.host-platform }}*"
-          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "CI: Build and test" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
-          gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python
-          ls -al $OLD_BASENAME
-          mkdir -p "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"
-          mv $OLD_BASENAME/*.whl "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"/
-
-      - name: Display structure of downloaded cuda.bindings artifacts
-        run: |
-          pwd
-          ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR
-
-      - name: Download cuda.core build artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
-          path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
-
-      - name: Display structure of downloaded cuda.core build artifacts
-        run: |
-          pwd
-          ls -lahR $CUDA_CORE_ARTIFACTS_DIR
-
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-        env:
-          # we use self-hosted runners on which setup-python behaves weirdly...
-          AGENT_TOOLSDIRECTORY: "/opt/hostedtoolcache"
-
-      - name: Set up mini CTK
-        uses: ./.github/actions/fetch_ctk
-        continue-on-error: false
-        with:
-          host-platform: ${{ matrix.host-platform }}
-          cuda-version: ${{ matrix.cuda-version }}
-
-      - name: Run cuda.bindings tests
-        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
-        run: |
-          ls $CUDA_PATH
-
-          pushd "${CUDA_BINDINGS_ARTIFACTS_DIR}"
-          pip install *.whl
-          popd
-
-          pushd ./cuda_bindings
-          pip install -r requirements.txt
-          pytest -rxXs tests/
-          if [[ "${{ matrix.host-platform }}" == linux* ]]; then
-            bash tests/cython/build_tests.sh
-          elif [[ "${{ matrix.host-platform }}" == win* ]]; then
-            # TODO: enable this once win-64 runners are up
-            exit 1
-          fi
-          pytest -rxXs tests/cython
-          popd
-
-      - name: Run cuda.core tests
-        run: |
-          # If build/test majors match: cuda.bindings is installed in the previous step.
-          # If mismatch: cuda.bindings is installed from the backport branch.
-          if [[ "${SKIP_CUDA_BINDINGS_TEST}" == 1 ]]; then
-            pushd "${CUDA_BINDINGS_ARTIFACTS_DIR}"
-            pip install *.whl
-            popd
-          fi
-          TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ matrix.cuda-version }})"
-          pushd "${CUDA_CORE_ARTIFACTS_DIR}"
-          pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}"]
-          popd
-
-          pushd ./cuda_core
-          pip install -r "tests/requirements-cu${TEST_CUDA_MAJOR}.txt"
-          pytest -rxXs tests/
-          popd
+    secrets: inherit
+    uses:
+      ./.github/workflows/test-wheel.yml
+    with:
+      host-platform: ${{ matrix.host-platform }}
+      python-version: ${{ matrix.python-version }}
+      build-ctk-ver: ${{ needs.build.outputs.BUILD_CTK_VER }}
+      cuda-version: ${{ matrix.cuda-version }}
+      local-ctk: ${{ matrix.local-ctk }}
+      runner: ${{ matrix.runner }}
 
   doc:
     name: Docs
@@ -388,7 +240,7 @@ jobs:
     uses:
       ./.github/workflows/build-docs.yml
     with:
-      build_ctk_ver: ${{ needs.build.outputs.BUILD_CTK_VER }}
+      build-ctk-ver: ${{ needs.build.outputs.BUILD_CTK_VER }}
 
   checks:
     name: Check job status
diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml
index cafb1fc9..f88e38bf 100644
--- a/.github/workflows/build-docs.yml
+++ b/.github/workflows/build-docs.yml
@@ -3,7 +3,7 @@ name: "CI: Build and update docs"
 on:
   workflow_call:
     inputs:
-      build_ctk_ver:
+      build-ctk-ver:
         type: string
         required: true
 
@@ -53,7 +53,7 @@ jobs:
         continue-on-error: false
         with:
           host-platform: linux-64
-          cuda-version: ${{ inputs.build_ctk_ver }}
+          cuda-version: ${{ inputs.build-ctk-ver }}
 
       - name: Set environment variables
         run: |
@@ -65,7 +65,7 @@ jobs:
           echo "CUDA_CORE_ARTIFACT_BASENAME=${CUDA_CORE_ARTIFACT_BASENAME}" >> $GITHUB_ENV
           echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
           echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV
-          CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ inputs.build_ctk_ver }}-linux-64"
+          CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ inputs.build-ctk-ver }}-linux-64"
           echo "CUDA_BINDINGS_ARTIFACT_BASENAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}" >> $GITHUB_ENV
           echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
           echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
diff --git a/.github/workflows/test-wheel.yml b/.github/workflows/test-wheel.yml
new file mode 100644
index 00000000..a021ff34
--- /dev/null
+++ b/.github/workflows/test-wheel.yml
@@ -0,0 +1,201 @@
+name: "CI: Test wheels"
+
+on:
+  workflow_call:
+    inputs:
+      host-platform:
+        type: string
+        required: true
+      python-version:
+        type: string
+        required: true
+      build-ctk-ver:
+        type: string
+        required: true
+      cuda-version:
+        type: string
+        required: true
+      local-ctk:
+        type: string
+        required: true
+      runner:
+        type: string
+        required: true
+
+jobs:
+  test:
+    # The build stage could fail but we want the CI to keep moving.
+    if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
+    runs-on: ${{ (inputs.runner == 'default' && inputs.host-platform == 'linux-64' && 'linux-amd64-gpu-v100-latest-1') ||
+                 (inputs.runner == 'default' && inputs.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') ||
+                 (inputs.runner == 'H100' && 'linux-amd64-gpu-h100-latest-1-testing') }}
+    # Our self-hosted runners require a container
+    # TODO: use a different (nvidia?) container
+    container:
+      options: -u root --security-opt seccomp=unconfined --shm-size 16g
+      image: ubuntu:22.04
+      env:
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+    defaults:
+      run:
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+    steps:
+      - name: Ensure GPU is working
+        run: nvidia-smi
+
+      - name: Checkout ${{ github.event.repository.name }}
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set environment variables
+        run: |
+          PYTHON_VERSION_FORMATTED=$(echo '${{ inputs.python-version }}' | tr -d '.')
+          if [[ "${{ inputs.host-platform }}" == linux* ]]; then
+            REPO_DIR=$(pwd)
+          elif [[ "${{ inputs.host-platform }}" == win* ]]; then
+            PWD=$(pwd)
+            REPO_DIR=$(cygpath -w $PWD)
+          fi
+
+          BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ inputs.build-ctk-ver }})"
+          TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
+          if [[ $BUILD_CUDA_MAJOR != $TEST_CUDA_MAJOR ]]; then
+            SKIP_CUDA_BINDINGS_TEST=1
+          else
+            SKIP_CUDA_BINDINGS_TEST=0
+          fi
+
+          # make outputs from the previous job as env vars
+          CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ inputs.host-platform }}"
+          echo "PYTHON_VERSION_FORMATTED=${PYTHON_VERSION_FORMATTED}" >> $GITHUB_ENV
+          echo "CUDA_CORE_ARTIFACT_BASENAME=${CUDA_CORE_ARTIFACT_BASENAME}" >> $GITHUB_ENV
+          echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
+          echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV
+          CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ inputs.build-ctk-ver }}-${{ inputs.host-platform }}"
+          echo "CUDA_BINDINGS_ARTIFACT_BASENAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}" >> $GITHUB_ENV
+          echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
+          echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
+          echo "SKIP_CUDA_BINDINGS_TEST=${SKIP_CUDA_BINDINGS_TEST}" >> $GITHUB_ENV
+
+      - name: Install dependencies
+        uses: ./.github/actions/install_unix_deps
+        continue-on-error: false
+        with:
+          # gcc for Cython tests, jq/wget for artifact fetching
+          dependencies: "build-essential jq wget"
+          dependent_exes: "gcc jq wget"
+
+      - name: Download cuda.bindings build artifacts
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0'}}
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
+          path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+
+      - name: Download cuda.bindings build artifacts from the prior branch
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '1'}}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # See https://github.com/cli/cli/blob/trunk/docs/install_linux.md#debian-ubuntu-linux-raspberry-pi-os-apt.
+          # gh is needed for artifact fetching.
+          mkdir -p -m 755 /etc/apt/keyrings \
+            && out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \
+            && cat $out | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \
+            && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
+            && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
+            && apt update \
+            && apt install gh -y
+
+          OLD_BRANCH=$(cat .github/BACKPORT_BRANCH)
+          OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
+          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "CI: Build and test" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
+          gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python
+          ls -al $OLD_BASENAME
+          mkdir -p "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"
+          mv $OLD_BASENAME/*.whl "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"/
+
+      - name: Display structure of downloaded cuda.bindings artifacts
+        run: |
+          pwd
+          ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR
+
+      - name: Download cuda.core build artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
+          path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
+
+      - name: Display structure of downloaded cuda.core build artifacts
+        run: |
+          pwd
+          ls -lahR $CUDA_CORE_ARTIFACTS_DIR
+
+      - name: Set up Python ${{ inputs.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ inputs.python-version }}
+        env:
+          # we use self-hosted runners on which setup-python behaves weirdly...
+          AGENT_TOOLSDIRECTORY: "/opt/hostedtoolcache"
+
+      - name: Set up mini CTK
+        if: ${{ inputs.local-ctk == '1' }}
+        uses: ./.github/actions/fetch_ctk
+        continue-on-error: false
+        with:
+          host-platform: ${{ inputs.host-platform }}
+          cuda-version: ${{ inputs.cuda-version }}
+
+      - name: Run cuda.bindings tests
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
+        run: |
+          pushd "${CUDA_BINDINGS_ARTIFACTS_DIR}"
+          if [[ "${{ inputs.local-ctk }}" == 1 ]]; then
+            ls $CUDA_PATH
+            pip install *.whl
+          else
+            pip install $(ls *.whl)[all]
+          fi
+          popd
+
+          pushd ./cuda_bindings
+          pip install -r requirements.txt
+          pytest -rxXs tests/
+
+          # It is a bit convoluted to run the Cython tests against CTK wheels,
+          # so let's just skip them.
+          if [[ "${{ inputs.local-ctk }}" == 1 ]]; then
+            if [[ "${{ inputs.host-platform }}" == linux* ]]; then
+              bash tests/cython/build_tests.sh
+            elif [[ "${{ inputs.host-platform }}" == win* ]]; then
+              # TODO: enable this once win-64 runners are up
+              exit 1
+            fi
+            pytest -rxXs tests/cython
+          fi
+          popd
+
+      - name: Run cuda.core tests
+        run: |
+          # If build/test majors match: cuda.bindings is installed in the previous step.
+          # If mismatch: cuda.bindings is installed from the backport branch.
+          if [[ "${SKIP_CUDA_BINDINGS_TEST}" == 1 ]]; then
+            pushd "${CUDA_BINDINGS_ARTIFACTS_DIR}"
+            if [[ "${{ inputs.local-ctk }}" == 1 ]]; then
+              pip install *.whl
+            else
+              pip install $(ls *.whl)[all]
+            fi
+            popd
+          fi
+          TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
+          pushd "${CUDA_CORE_ARTIFACTS_DIR}"
+          pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}"]
+          popd
+
+          pushd ./cuda_core
+          pip install -r "tests/requirements-cu${TEST_CUDA_MAJOR}.txt"
+          pytest -rxXs tests/
+          popd
diff --git a/cuda_core/tests/requirements-cu11.txt b/cuda_core/tests/requirements-cu11.txt
index 8fb37e92..d9bd566c 100644
--- a/cuda_core/tests/requirements-cu11.txt
+++ b/cuda_core/tests/requirements-cu11.txt
@@ -1,3 +1,4 @@
 pytest
 # TODO: remove this hack once cupy has a cp313 build
 cupy-cuda11x; python_version < "3.13"
+nvidia-cuda-runtime-cu11  # headers consumed by CuPy
diff --git a/cuda_core/tests/requirements-cu12.txt b/cuda_core/tests/requirements-cu12.txt
index 2e82e12d..18f67360 100644
--- a/cuda_core/tests/requirements-cu12.txt
+++ b/cuda_core/tests/requirements-cu12.txt
@@ -1,3 +1,4 @@
 pytest
 # TODO: remove this hack once cupy has a cp313 build
 cupy-cuda12x; python_version < "3.13"
+nvidia-cuda-runtime-cu12  # headers consumed by CuPy