chore: Build and store bdist wheels (#181)

* build: Minor tweeks for wheel build

Signed-off-by: oliver könig <okoenig@nvidia.com>

* ci: Workflows for wheel build

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* build: Add CachedWheel

Signed-off-by: oliver könig <okoenig@nvidia.com>

* add version to init

Signed-off-by: oliver könig <okoenig@nvidia.com>

* revert

Signed-off-by: oliver könig <okoenig@nvidia.com>

* revert

Signed-off-by: oliver könig <okoenig@nvidia.com>

* revert

Signed-off-by: oliver könig <okoenig@nvidia.com>

* v2

Signed-off-by: oliver könig <okoenig@nvidia.com>

* update

Signed-off-by: oliver könig <okoenig@nvidia.com>

* test

Signed-off-by: oliver könig <okoenig@nvidia.com>

* from packaging.version import parse

Signed-off-by: oliver könig <okoenig@nvidia.com>

* local version

Signed-off-by: oliver könig <okoenig@nvidia.com>

* remove file

Signed-off-by: oliver könig <okoenig@nvidia.com>

* revert

Signed-off-by: oliver könig <okoenig@nvidia.com>

* Updates and lint

* revert missing cudaextension args

Signed-off-by: oliver könig <okoenig@nvidia.com>

* Add timeout

* fix DG settings

Signed-off-by: oliver könig <okoenig@nvidia.com>

* DG_USE_LOCAL_VERSION

Signed-off-by: oliver könig <okoenig@nvidia.com>

* Update version

* Detect local changes

* Minor fix

* Revert CUTLASS

* Unify options

---------

Signed-off-by: oliver könig <okoenig@nvidia.com>
Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
This commit is contained in:
oliver könig
2025-10-10 12:23:40 +02:00
committed by GitHub
parent 6e74faad5c
commit 9f196058ae
5 changed files with 491 additions and 21 deletions

227
.github/workflows/_build.yml vendored Normal file
View File

@@ -0,0 +1,227 @@
name: ~Build wheel template
on:
workflow_call:
inputs:
runs-on:
description: "The runner to use for the build"
required: true
type: string
python-version:
description: "The Python version to use for the build"
required: true
type: string
cuda-version:
description: "The CUDA version to use for the build"
required: true
type: string
torch-version:
description: "The PyTorch version to use for the build"
required: true
type: string
cxx11_abi:
description: "The C++11 ABI to use for the build"
required: true
type: string
upload-to-release:
description: "Upload wheel to this release"
required: false
type: boolean
default: false
release-version:
description: "Upload wheel to this release"
required: false
type: string
use-local-version:
description: "Use local version"
required: false
type: boolean
default: false
defaults:
run:
shell: bash -x -e -u -o pipefail {0}
jobs:
build-wheel:
runs-on: ${{ inputs.runs-on }}
name: Build wheel (${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }})
steps:
- name: Checkout
uses: actions/checkout@v4
with:
ref: ${{ inputs.release-version }}
submodules: recursive
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python-version }}
- name: Set CUDA and PyTorch versions
run: |
echo "MATRIX_CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
echo "MATRIX_TORCH_VERSION=$(echo ${{ inputs.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
echo "WHEEL_CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1'})" >> $GITHUB_ENV
echo "MATRIX_PYTHON_VERSION=$(echo ${{ inputs.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
- name: Free up disk space
if: ${{ runner.os == 'Linux' }}
# https://github.com/easimon/maximize-build-space/blob/master/action.yml
# https://github.com/easimon/maximize-build-space/tree/test-report
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
- name: Set up swap space
if: runner.os == 'Linux'
uses: pierotofy/set-swap-space@v1.0
with:
swap-size-gb: 10
- name: Install CUDA ${{ inputs.cuda-version }}
if: ${{ inputs.cuda-version != 'cpu' }}
uses: Jimver/cuda-toolkit@v0.2.26
id: cuda-toolkit
with:
cuda: ${{ inputs.cuda-version }}
linux-local-args: '["--toolkit"]'
# default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1
# method: ${{ (inputs.cuda-version == '11.8.0' || inputs.cuda-version == '12.1.0') && 'network' || 'local' }}
method: "network"
- name: Install additional CUDA libraries
run: |
CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1 "-" $2'})
sudo apt-get update
sudo apt-get install -y libcusparse-$CUDA_VERSION libcusolver-$CUDA_VERSION
sudo apt-get clean
- name: Install PyTorch ${{ inputs.torch-version }}+cu${{ inputs.cuda-version }}
run: |
pip install --upgrade pip
# With python 3.13 and torch 2.5.1, unless we update typing-extensions, we get error
# AttributeError: attribute '__default__' of 'typing.ParamSpec' objects is not writable
pip install typing-extensions==4.12.2
# We want to figure out the CUDA version to download pytorch
# e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# This code is ugly, maybe there's a better way to do this.
export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
minv = {'2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \
maxv = {'2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \
print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
)
if [[ ${{ inputs.torch-version }} == *"dev"* ]]; then
# pip install --no-cache-dir --pre torch==${{ inputs.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
# Can't use --no-deps because we need cudnn etc.
# Hard-coding this version of pytorch-triton for torch 2.6.0.dev20241001
pip install jinja2
pip install https://download.pytorch.org/whl/nightly/pytorch_triton-3.1.0%2Bcf34004b8a-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl
pip install --no-cache-dir --pre https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}/torch-${{ inputs.torch-version }}%2Bcu${TORCH_CUDA_VERSION}-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl
else
pip install --no-cache-dir torch==${{ inputs.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
fi
nvcc --version
python --version
python -c "import torch; print('PyTorch:', torch.__version__)"
python -c "import torch; print('CUDA:', torch.version.cuda)"
python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
- name: Restore build cache
uses: actions/cache/restore@v4
with:
path: build.tar
key: build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-${{ github.run_number }}-${{ github.run_attempt }}
restore-keys: |
build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-
- name: Unpack build cache
run: |
echo ::group::Adjust timestamps
sudo find / -exec touch -t 197001010000 {} + || true
echo ::endgroup::
if [ -f build.tar ]; then
find . -mindepth 1 -maxdepth 1 ! -name 'build.tar' -exec rm -rf {} +
tar -xpvf build.tar -C .
else
echo "No build.tar found, skipping"
fi
ls -al ./
ls -al build/ || true
ls -al csrc/ || true
- name: Build wheel
id: build_wheel
run: |
# We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
# https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
# However this still fails so I'm using a newer version of setuptools
pip install setuptools==75.8.0
pip install ninja packaging wheel
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Limit MAX_JOBS otherwise the github runner goes OOM
# nvcc 11.8 can compile with 2 jobs, but nvcc 12.3 goes OOM
export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2)
export NVCC_THREADS=2
export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX"
export DG_USE_LOCAL_VERSION=${{ inputs.use-local-version && '1' || '0' }}
# 5h timeout since GH allows max 6h and we want some buffer
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?
if [ $EXIT_CODE -eq 0 ]; then
tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ inputs.cxx11_abi }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
fi
# Store exit code in GitHub env for later steps
echo "build_exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
# Do not fail the job if timeout killed the build
exit $EXIT_CODE
- name: Log build logs after timeout
if: always() && steps.build_wheel.outputs.build_exit_code == 124
run: |
ls -al ./
tar -cvf build.tar . --atime-preserve=replace
- name: Save build cache timeout
if: always() && steps.build_wheel.outputs.build_exit_code == 124
uses: actions/cache/save@v4
with:
key: build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-${{ github.run_number }}-${{ github.run_attempt }}
path: build.tar
- name: Log Built Wheels
run: |
ls dist
- name: Get Release with tag
id: get_current_release
uses: joutvhu/get-release@v1
with:
tag_name: ${{ inputs.release-version }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Upload Release Asset
id: upload_release_asset
if: inputs.upload-to-release
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.get_current_release.outputs.upload_url }}
asset_path: ./dist/${{env.wheel_name}}
asset_name: ${{env.wheel_name}}
asset_content_type: application/*

53
.github/workflows/build.yml vendored Normal file
View File

@@ -0,0 +1,53 @@
name: Build wheels
on:
workflow_dispatch:
inputs:
runs-on:
description: "The runner to use for the build"
required: true
type: string
default: ubuntu-22.04
python-version:
description: "The Python version to use for the build"
required: true
type: string
cuda-version:
description: "The CUDA version to use for the build"
required: true
type: string
torch-version:
description: "The PyTorch version to use for the build"
required: true
type: string
cxx11_abi:
description: "Enable torch flag C++11 ABI (TRUE/FALSE)"
required: true
type: string
upload-to-release:
description: "Upload wheel to this release"
required: false
type: boolean
default: false
release-version:
description: "Upload wheel to this release"
required: false
type: string
use-local-version:
description: "Use local version"
required: false
type: boolean
default: false
jobs:
build-wheels:
uses: ./.github/workflows/_build.yml
with:
runs-on: ${{ inputs.runs-on }}
python-version: ${{ inputs.python-version }}
cuda-version: ${{ inputs.cuda-version }}
torch-version: ${{ inputs.torch-version }}
cxx11_abi: ${{ inputs.cxx11_abi }}
upload-to-release: ${{ inputs.upload-to-release }}
release-version: ${{ inputs.release-version }}
use-local-version: ${{ inputs.use-local-version }}

95
.github/workflows/publish.yml vendored Normal file
View File

@@ -0,0 +1,95 @@
# This workflow will:
# - Create a new Github release
# - Build wheels for supported architectures
# - Deploy the wheels to the Github release
# - Release the static code to PyPi
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
name: Build wheels and deploy
on:
create:
tags:
- v*
jobs:
setup_release:
name: Create Release
runs-on: ubuntu-latest
outputs:
release-version: ${{ steps.extract_branch.outputs.branch }}
steps:
- name: Get the tag version
id: extract_branch
run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
shell: bash
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.extract_branch.outputs.branch }}
release_name: ${{ steps.extract_branch.outputs.branch }}
build_wheels:
name: Build Wheel
needs: setup_release
strategy:
fail-fast: false
matrix:
# Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
os: [ubuntu-22.04]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1", "2.8.0"]
cuda-version: ["12.9.1"]
# We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
# when building without C++11 ABI and using it on nvcr images.
cxx11_abi: ["FALSE", "TRUE"]
exclude:
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# Pytorch < 2.5 does not support Python 3.13
- torch-version: "2.4.0"
python-version: "3.13"
uses: ./.github/workflows/_build.yml
with:
runs-on: ${{ matrix.os }}
python-version: ${{ matrix.python-version }}
cuda-version: ${{ matrix.cuda-version }}
torch-version: ${{ matrix.torch-version }}
cxx11_abi: ${{ matrix.cxx11_abi }}
release-version: ${{ needs.setup_release.outputs.release-version }}
upload-to-release: true
use-local-version: false
publish_package:
name: Publish package
needs: [build_wheels]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install dependencies
run: |
pip install ninja packaging wheel twine
# Install latest setuptools with support for pypi metadata 2.2 (improved compat w/ uv)
pip install setuptools==75.8.0
# We don't want to download anything CUDA-related here
pip install torch --index-url https://download.pytorch.org/whl/cpu
- name: Build core package
env:
DG_USE_LOCAL_VERSION: "0"
DG_SKIP_CUDA_BUILD: "1"
run: |
python setup.py sdist --dist-dir=dist
- name: Deploy
env:
TWINE_USERNAME: "__token__"
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
run: |
python -m twine upload dist/*

View File

@@ -83,3 +83,5 @@ deep_gemm_cpp.init(
os.path.dirname(os.path.abspath(__file__)), # Library root directory path
_find_cuda_home() # CUDA home
)
__version__ = '2.1.1'

135
setup.py
View File

@@ -1,15 +1,36 @@
import ast
import os
import setuptools
import re
import shutil
import setuptools
import subprocess
import sys
import torch
import platform
import urllib
import urllib.error
import urllib.request
from setuptools import find_packages
from setuptools.command.build_py import build_py
from packaging.version import parse
from pathlib import Path
from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
current_dir = os.path.dirname(os.path.realpath(__file__))
DG_SKIP_CUDA_BUILD = int(os.getenv('DG_SKIP_CUDA_BUILD', '0')) == 1
DG_FORCE_BUILD = int(os.getenv('DG_FORCE_BUILD', '0')) == 1
DG_USE_LOCAL_VERSION = int(os.getenv('DG_USE_LOCAL_VERSION', '1')) == 1
DG_JIT_USE_RUNTIME_API = int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')) == 1
# Compiler flags
cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations',
f'-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}']
if DG_JIT_USE_RUNTIME_API:
cxx_flags.append('-DDG_JIT_USE_RUNTIME_API')
# Sources
current_dir = os.path.dirname(os.path.realpath(__file__))
sources = ['csrc/python_api.cpp']
build_include_dirs = [
f'{CUDA_HOME}/include',
@@ -28,9 +49,68 @@ third_party_include_dirs = [
'third-party/cutlass/include/cutlass',
]
# Use runtime API
if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')):
cxx_flags.append('-DDG_JIT_USE_RUNTIME_API')
# Release
base_wheel_url = 'https://github.com/DeepSeek-AI/DeepGEMM/releases/download/{tag_name}/{wheel_name}'
def get_package_version():
with open(Path(current_dir) / 'deep_gemm' / '__init__.py', 'r') as f:
version_match = re.search(r'^__version__\s*=\s*(.*)$', f.read(), re.MULTILINE)
public_version = ast.literal_eval(version_match.group(1))
revision = ''
if DG_USE_LOCAL_VERSION:
# noinspection PyBroadException
try:
status_cmd = ['git', 'status', '--porcelain']
status_output = subprocess.check_output(status_cmd).decode('ascii').strip()
if status_output:
print(f'Warning: Git working directory is not clean. Uncommitted changes:\n{status_output}')
assert False, 'Git working directory is not clean'
cmd = ['git', 'rev-parse', '--short', 'HEAD']
revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
except:
revision = '+local'
return f'{public_version}{revision}'
def get_platform():
if sys.platform.startswith('linux'):
return f'linux_{platform.uname().machine}'
else:
raise ValueError('Unsupported platform: {}'.format(sys.platform))
def get_wheel_url():
torch_version = parse(torch.__version__)
torch_version = f'{torch_version.major}.{torch_version.minor}'
python_version = f'cp{sys.version_info.major}{sys.version_info.minor}'
platform_name = get_platform()
deep_gemm_version = get_package_version()
cxx11_abi = int(torch._C._GLIBCXX_USE_CXX11_ABI)
# Determine the version numbers that will be used to determine the correct wheel
# We're using the CUDA version used to build torch, not the one currently installed
cuda_version = parse(torch.version.cuda)
cuda_version = f'{cuda_version.major}'
# Determine wheel URL based on CUDA version, torch version, python version and OS
wheel_filename = f'deep_gemm-{deep_gemm_version}+cu{cuda_version}-torch{torch_version}-cxx11abi{cxx11_abi}-{python_version}-{platform_name}.whl'
wheel_url = base_wheel_url.format(tag_name=f'v{deep_gemm_version}', wheel_name=wheel_filename)
return wheel_url, wheel_filename
def get_ext_modules():
if DG_SKIP_CUDA_BUILD:
return []
return [CUDAExtension(name='deep_gemm_cpp',
sources=sources,
include_dirs=build_include_dirs,
libraries=build_libraries,
library_dirs=build_library_dirs,
extra_compile_args=cxx_flags)]
class CustomBuildPy(build_py):
@@ -72,18 +152,37 @@ class CustomBuildPy(build_py):
shutil.copytree(src_dir, dst_dir)
if __name__ == '__main__':
# noinspection PyBroadException
try:
cmd = ['git', 'rev-parse', '--short', 'HEAD']
revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
except:
revision = ''
class CachedWheelsCommand(_bdist_wheel):
def run(self):
if DG_FORCE_BUILD or DG_USE_LOCAL_VERSION:
return super().run()
wheel_url, wheel_filename = get_wheel_url()
print(f'Try to download wheel from URL: {wheel_url}')
try:
with urllib.request.urlopen(wheel_url, timeout=1) as response:
with open(wheel_filename, 'wb') as out_file:
data = response.read()
out_file.write(data)
# Make the archive
if not os.path.exists(self.dist_dir):
os.makedirs(self.dist_dir)
impl_tag, abi_tag, plat_tag = self.get_tag()
archive_basename = f'{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}'
wheel_path = os.path.join(self.dist_dir, archive_basename + '.whl')
os.rename(wheel_filename, wheel_path)
except (urllib.error.HTTPError, urllib.error.URLError):
print('Precompiled wheel not found. Building from source...')
# If the wheel could not be downloaded, build from source
super().run()
if __name__ == '__main__':
# noinspection PyTypeChecker
setuptools.setup(
name='deep_gemm',
version='2.1.0' + revision,
version=get_package_version(),
packages=find_packages('.'),
package_data={
'deep_gemm': [
@@ -92,16 +191,10 @@ if __name__ == '__main__':
'include/cutlass/**/*',
]
},
ext_modules=[
CUDAExtension(name='deep_gemm_cpp',
sources=sources,
include_dirs=build_include_dirs,
libraries=build_libraries,
library_dirs=build_library_dirs,
extra_compile_args=cxx_flags)
],
ext_modules=get_ext_modules(),
zip_safe=False,
cmdclass={
'build_py': CustomBuildPy,
'bdist_wheel': CachedWheelsCommand,
},
)