ci: Fixes for pre-built wheels (#214)

* build: Allow NGC builds

Signed-off-by: oliver könig <okoenig@nvidia.com>

* reduce grid

Signed-off-by: oliver könig <okoenig@nvidia.com>

* update grid

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* upgrade cuda action

Signed-off-by: oliver könig <okoenig@nvidia.com>

* remove test

Signed-off-by: oliver könig <okoenig@nvidia.com>

* py3.8

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* exclude

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* torch-version

Signed-off-by: oliver könig <okoenig@nvidia.com>

* py3.8/torch2.1/cuda12.3

Signed-off-by: oliver könig <okoenig@nvidia.com>

* Update publish.yml

* fix grid

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* cuda11.8

Signed-off-by: oliver könig <okoenig@nvidia.com>

* no hopper for 118

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

---------

Signed-off-by: oliver könig <okoenig@nvidia.com>
This commit is contained in:
oliver könig
2025-10-14 07:05:47 +02:00
committed by GitHub
parent f8f41145da
commit 93b3c28fa8
8 changed files with 337 additions and 34 deletions

View File

@@ -53,6 +53,11 @@ jobs:
ref: ${{ inputs.release-version }}
submodules: recursive
- name: Checkout build scripts
uses: actions/checkout@v4
with:
path: build-scripts/
- name: Set up Python
uses: actions/setup-python@v5
with:
@@ -82,7 +87,7 @@ jobs:
- name: Install CUDA ${{ inputs.cuda-version }}
if: ${{ inputs.cuda-version != 'cpu' }}
uses: Jimver/cuda-toolkit@v0.2.26
uses: Jimver/cuda-toolkit@v0.2.28
id: cuda-toolkit
with:
cuda: ${{ inputs.cuda-version }}
@@ -109,8 +114,8 @@ jobs:
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# This code is ugly, maybe there's a better way to do this.
export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
minv = {'2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \
maxv = {'2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \
minv = {'2.1': 121, '2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \
maxv = {'2.1': 121, '2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \
print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
)
if [[ ${{ inputs.torch-version }} == *"dev"* ]]; then
@@ -156,39 +161,24 @@ jobs:
- name: Build wheel
id: build_wheel
env:
CXX11_ABI: ${{ inputs.cxx11_abi }}
MATRIX_TORCH_VERSION: ${{ env.MATRIX_TORCH_VERSION}}
WHEEL_CUDA_VERSION: ${{ env.WHEEL_CUDA_VERSION }}
MATRIX_PYTHON_VERSION: ${{ env.MATRIX_PYTHON_VERSION }}
DG_USE_LOCAL_VERSION: ${{ inputs.use-local-version && '1' || '0' }}
run: |
# We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
# https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
# However this still fails so I'm using a newer version of setuptools
pip install setuptools==75.8.0
pip install ninja packaging wheel
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Limit MAX_JOBS otherwise the github runner goes OOM
# nvcc 11.8 can compile with 2 jobs, but nvcc 12.3 goes OOM
export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2)
export NVCC_THREADS=2
export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX"
export DG_USE_LOCAL_VERSION=${{ inputs.use-local-version && '1' || '0' }}
# 5h timeout since GH allows max 6h and we want some buffer
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?
if [ $EXIT_CODE -eq 0 ]; then
tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ inputs.cxx11_abi }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
fi
EXIT_CODE=$(bash build-scripts/.github/scripts/build.sh | tail -n 1)
# Store the exit code as a step output (GITHUB_OUTPUT) for later steps
echo "build_exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
# Exit with the build's exit code so that a failed or timed-out build fails this step
exit $EXIT_CODE
- name: Log Built Wheels
run: |
ls dist
- name: Log build logs after timeout
if: always() && steps.build_wheel.outputs.build_exit_code == 124
run: |

View File

@@ -0,0 +1,139 @@
# Reusable workflow: builds a wheel inside a caller-supplied container image
# and, when requested, attaches the wheel to an existing GitHub release.
name: "~Build wheel template"

on:
  workflow_call:
    inputs:
      runs-on:
        description: "The runner to use for the build"
        required: true
        type: string
      container-image:
        description: "Container image"
        required: true
        type: string
      upload-to-release:
        description: "Whether to upload the built wheel to the release"
        required: false
        type: boolean
        default: false
      release-version:
        description: "Git ref to check out and build; also the release tag to upload to"
        required: false
        type: string

defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

jobs:
  build-wheel:
    runs-on: ${{ inputs.runs-on }}
    name: Build wheel (${{ inputs.container-image }})
    env:
      # Passed through the environment instead of being expanded inside the
      # `run:` scripts, so the value can never be interpreted as shell code.
      CONTAINER_IMAGE: ${{ inputs.container-image }}
    steps:
      # The Docker data directory is parked in the workspace while the build
      # volume is mounted over /var/lib/docker/, then moved back afterwards.
      - name: Move /var/lib/docker/
        run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

      # NOTE(review): this action is referenced by branch (`@master`);
      # consider pinning it to a release tag or commit SHA.
      - name: Maximize build space
        uses: easimon/maximize-build-space@master
        with:
          root-reserve-mb: 5120
          temp-reserve-mb: 32
          swap-size-mb: 10240
          remove-dotnet: "true"
          remove-android: "true"
          remove-haskell: "true"
          remove-codeql: "true"
          build-mount-path: "/var/lib/docker/"

      - name: Restore /var/lib/docker/
        run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

      # Source tree at the requested ref (workspace root).
      - name: Checkout source
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.release-version }}
          submodules: recursive

      # Default ref of this repository, checked out into build-scripts/ so the
      # build script is available regardless of the ref checked out above.
      - name: Checkout build scripts
        uses: actions/checkout@v4
        with:
          path: build-scripts/

      - name: Report free disk space
        run: |
          echo "Free space:"
          df -h

      - name: Pull the container
        run: docker pull "$CONTAINER_IMAGE"

      # Runs a small probe script inside the container and appends the
      # detected versions to GITHUB_ENV (bind-mounted into the container) so
      # that later steps can read them.
      - name: Set CUDA and PyTorch versions
        run: |
          # Overwrite (not append) so the script is always exactly this text.
          cat <<'EOF' > script.sh
          #!/bin/bash
          set -eoxu pipefail
          echo "MATRIX_CUDA_VERSION=$(echo "$CUDA_VERSION" | awk -F. '{print $1 $2}')" >> "$GITHUB_ENV"
          echo "MATRIX_TORCH_VERSION=$NVIDIA_PYTORCH_VERSION" >> "$GITHUB_ENV"
          echo "WHEEL_CUDA_VERSION=$(echo "$CUDA_VERSION" | awk -F. '{print $1}')" >> "$GITHUB_ENV"
          echo "MATRIX_PYTHON_VERSION=$(python -c 'import sys; print("{}{}".format(sys.version_info[0], sys.version_info[1]))')" >> "$GITHUB_ENV"
          echo "CXX11_ABI=$(python -c 'import torch; print(str(torch._C._GLIBCXX_USE_CXX11_ABI).upper())')" >> "$GITHUB_ENV"
          cat "$GITHUB_ENV"
          EOF
          docker run \
            --rm \
            --shm-size=64g \
            --workdir /workspace \
            --volume "$(pwd):/workspace" \
            --volume "$GITHUB_ENV:$GITHUB_ENV" \
            -e GITHUB_ENV="$GITHUB_ENV" \
            "$CONTAINER_IMAGE" bash /workspace/script.sh

      - name: Build wheel
        id: build_wheel
        env:
          CXX11_ABI: ${{ env.CXX11_ABI }}
          MATRIX_TORCH_VERSION: ${{ env.MATRIX_TORCH_VERSION }}
          WHEEL_CUDA_VERSION: ${{ env.WHEEL_CUDA_VERSION }}
          MATRIX_PYTHON_VERSION: ${{ env.MATRIX_PYTHON_VERSION }}
        run: |
          # Only the last stdout line of the build script is kept.
          # NOTE(review): build.sh is expected to print its exit code as that
          # last line (the non-container build workflow consumes it the same
          # way) - confirm against build.sh.
          EXIT_CODE=$(docker run \
            --rm \
            --shm-size=64g \
            --workdir /workspace \
            --volume "$(pwd):/workspace" \
            --volume "$GITHUB_ENV:$GITHUB_ENV" \
            -e PIP_CONSTRAINT= \
            -e GITHUB_ENV="$GITHUB_ENV" \
            -e CXX11_ABI="$CXX11_ABI" \
            -e MATRIX_TORCH_VERSION="$MATRIX_TORCH_VERSION" \
            -e WHEEL_CUDA_VERSION="$WHEEL_CUDA_VERSION" \
            -e MATRIX_PYTHON_VERSION="$MATRIX_PYTHON_VERSION" \
            "$CONTAINER_IMAGE" bash /workspace/build-scripts/.github/scripts/build.sh | tail -n 1)
          # Publish the exit code as a step output and fail the step when the
          # build did not succeed, instead of silently discarding the value.
          echo "build_exit_code=${EXIT_CODE}" | tee -a "$GITHUB_OUTPUT"
          exit "${EXIT_CODE}"

      - name: Log Built Wheels
        run: |
          ls dist

      # The release lookup is only needed for its upload URL, so it is skipped
      # when no upload was requested.
      - name: Get Release with tag
        id: get_current_release
        if: inputs.upload-to-release
        uses: joutvhu/get-release@v1
        with:
          tag_name: ${{ inputs.release-version }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # `wheel_name` is read from the job environment; no step in this file
      # sets it directly.
      # NOTE(review): presumably build.sh writes it to GITHUB_ENV - confirm.
      - name: Upload Release Asset
        id: upload_release_asset
        if: inputs.upload-to-release
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ steps.get_current_release.outputs.upload_url }}
          asset_path: ./dist/${{ env.wheel_name }}
          asset_name: ${{ env.wheel_name }}
          asset_content_type: application/*

View File

@@ -0,0 +1,34 @@
# Entry-point workflow that calls the reusable container build workflow,
# either manually with explicit inputs or on push with the fallbacks below.
name: Build wheels in a container

on:
  workflow_dispatch:
    inputs:
      runs-on:
        description: "The runner to use for the build"
        required: true
        type: string
        default: ubuntu-22.04
      container-image:
        description: "Container image"
        required: true
        type: string
      upload-to-release:
        description: "Whether to upload the built wheel to the release"
        required: false
        type: boolean
        default: false
      release-version:
        description: "Git ref to build; also the release tag to upload to"
        required: false
        type: string
  # NOTE(review): an unfiltered `push` trigger runs this workflow on every
  # push to any branch, using the fallback values in `with:` below. Confirm
  # this is intended and not a leftover from testing.
  push:

jobs:
  build-wheels:
    uses: ./.github/workflows/_build_in_container.yml
    with:
      # `inputs.*` is empty on push events, so every value falls back to the
      # literal on the right-hand side of `||`.
      # NOTE(review): the image tag and 'v2.2.5' are hard-coded fallbacks.
      runs-on: ${{ inputs.runs-on || 'ubuntu-22.04' }}
      container-image: ${{ inputs.container-image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
      upload-to-release: ${{ inputs.upload-to-release || false }}
      release-version: ${{ inputs.release-version || 'v2.2.5' }}

View File

@@ -41,9 +41,9 @@ jobs:
# Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
os: [ubuntu-22.04]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1", "2.8.0"]
cuda-version: ["12.9.1"]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
torch-version: ["2.5.1", "2.6.0", "2.7.1", "2.8.0"]
cuda-version: ["12.9.1", "13.0.0"]
# We need separate wheels that either use the C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
@@ -51,9 +51,14 @@ jobs:
cxx11_abi: ["FALSE", "TRUE"]
exclude:
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# Pytorch < 2.5 does not support Python 3.13
- torch-version: "2.4.0"
python-version: "3.13"
include:
- os: "ubuntu-22.04"
cxx11_abi: "FALSE"
cuda-version: "12.1.0"
python-version: "3.8"
torch-version: "2.1.0"
uses: ./.github/workflows/_build.yml
with:
runs-on: ${{ matrix.os }}
@@ -64,6 +69,35 @@ jobs:
release-version: ${{ needs.setup_release.outputs.release-version }}
upload-to-release: true
use-local-version: false
check_for_ngc_images:
runs-on: ubuntu-latest
outputs:
images: ${{ steps.check_for_ngc_images.outputs.IMAGES }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Check for NGC PyTorch images
id: check_for_ngc_images
run: |
bash ./.github/scripts/check_for_ngc_images.sh
echo "IMAGES=$(cat ngc_images.json| jq -cr)" | tee -a $GITHUB_OUTPUT
build_ngc_wheels:
name: Build Wheel for NGC PyTorch
needs: [setup_release, check_for_ngc_images]
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04]
container-image: ${{ fromJson(needs.check_for_ngc_images.outputs.images) }}
uses: ./.github/workflows/_build_in_container.yml
with:
runs-on: ${{ matrix.os }}
container-image: ${{ matrix.container-image }}
release-version: ${{ needs.setup_release.outputs.release-version }}
upload-to-release: true
publish_package:
name: Publish package