From 9f196058ae05f4f392fe71d12841e4b9df736a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 10 Oct 2025 12:23:40 +0200 Subject: [PATCH] chore: Build and store bdist wheels (#181) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * build: Minor tweaks for wheel build Signed-off-by: oliver könig * ci: Workflows for wheel build Signed-off-by: oliver könig * fix Signed-off-by: oliver könig * fix Signed-off-by: oliver könig * build: Add CachedWheel Signed-off-by: oliver könig * add version to init Signed-off-by: oliver könig * revert Signed-off-by: oliver könig * revert Signed-off-by: oliver könig * revert Signed-off-by: oliver könig * v2 Signed-off-by: oliver könig * update Signed-off-by: oliver könig * test Signed-off-by: oliver könig * from packaging.version import parse Signed-off-by: oliver könig * local version Signed-off-by: oliver könig * remove file Signed-off-by: oliver könig * revert Signed-off-by: oliver könig * Updates and lint * revert missing cudaextension args Signed-off-by: oliver könig * Add timeout * fix DG settings Signed-off-by: oliver könig * DG_USE_LOCAL_VERSION Signed-off-by: oliver könig * Update version * Detect local changes * Minor fix * Revert CUTLASS * Unify options --------- Signed-off-by: oliver könig Co-authored-by: Chenggang Zhao --- .github/workflows/_build.yml | 227 ++++++++++++++++++++++++++++++++++ .github/workflows/build.yml | 53 ++++++++ .github/workflows/publish.yml | 95 ++++++++++++++ deep_gemm/__init__.py | 2 + setup.py | 135 ++++++++++++++++---- 5 files changed, 491 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/_build.yml create mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml new file mode 100644 index 0000000..cff8013 --- /dev/null +++ b/.github/workflows/_build.yml @@ -0,0 +1,227 @@ +name: ~Build wheel template + +on: + 
workflow_call: + inputs: + runs-on: + description: "The runner to use for the build" + required: true + type: string + python-version: + description: "The Python version to use for the build" + required: true + type: string + cuda-version: + description: "The CUDA version to use for the build" + required: true + type: string + torch-version: + description: "The PyTorch version to use for the build" + required: true + type: string + cxx11_abi: + description: "The C++11 ABI to use for the build" + required: true + type: string + upload-to-release: + description: "Upload wheel to this release" + required: false + type: boolean + default: false + release-version: + description: "Upload wheel to this release" + required: false + type: string + use-local-version: + description: "Use local version" + required: false + type: boolean + default: false + +defaults: + run: + shell: bash -x -e -u -o pipefail {0} + +jobs: + build-wheel: + runs-on: ${{ inputs.runs-on }} + name: Build wheel (${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}) + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ inputs.release-version }} + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} + + - name: Set CUDA and PyTorch versions + run: | + echo "MATRIX_CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV + echo "MATRIX_TORCH_VERSION=$(echo ${{ inputs.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV + echo "WHEEL_CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1'})" >> $GITHUB_ENV + echo "MATRIX_PYTHON_VERSION=$(echo ${{ inputs.python-version }} | awk -F \. 
{'print $1 $2'})" >> $GITHUB_ENV + + - name: Free up disk space + if: ${{ runner.os == 'Linux' }} + # https://github.com/easimon/maximize-build-space/blob/master/action.yml + # https://github.com/easimon/maximize-build-space/tree/test-report + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + + - name: Set up swap space + if: runner.os == 'Linux' + uses: pierotofy/set-swap-space@v1.0 + with: + swap-size-gb: 10 + + - name: Install CUDA ${{ inputs.cuda-version }} + if: ${{ inputs.cuda-version != 'cpu' }} + uses: Jimver/cuda-toolkit@v0.2.26 + id: cuda-toolkit + with: + cuda: ${{ inputs.cuda-version }} + linux-local-args: '["--toolkit"]' + # default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1 + # method: ${{ (inputs.cuda-version == '11.8.0' || inputs.cuda-version == '12.1.0') && 'network' || 'local' }} + method: "network" + + - name: Install additional CUDA libraries + run: | + CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1 "-" $2'}) + sudo apt-get update + sudo apt-get install -y libcusparse-$CUDA_VERSION libcusolver-$CUDA_VERSION + sudo apt-get clean + + - name: Install PyTorch ${{ inputs.torch-version }}+cu${{ inputs.cuda-version }} + run: | + pip install --upgrade pip + # With python 3.13 and torch 2.5.1, unless we update typing-extensions, we get error + # AttributeError: attribute '__default__' of 'typing.ParamSpec' objects is not writable + pip install typing-extensions==4.12.2 + # We want to figure out the CUDA version to download pytorch + # e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116 + # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix + # This code is ugly, maybe there's a better way to do this. 
+ export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \ + minv = {'2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \ + maxv = {'2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \ + print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \ + ) + if [[ ${{ inputs.torch-version }} == *"dev"* ]]; then + # pip install --no-cache-dir --pre torch==${{ inputs.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} + # Can't use --no-deps because we need cudnn etc. + # Hard-coding this version of pytorch-triton for torch 2.6.0.dev20241001 + pip install jinja2 + pip install https://download.pytorch.org/whl/nightly/pytorch_triton-3.1.0%2Bcf34004b8a-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl + pip install --no-cache-dir --pre https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}/torch-${{ inputs.torch-version }}%2Bcu${TORCH_CUDA_VERSION}-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl + else + pip install --no-cache-dir torch==${{ inputs.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} + fi + nvcc --version + python --version + python -c "import torch; print('PyTorch:', torch.__version__)" + python -c "import torch; print('CUDA:', torch.version.cuda)" + python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" + + - name: Restore build cache + uses: actions/cache/restore@v4 + with: + path: build.tar + key: build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-${{ github.run_number }}-${{ github.run_attempt }} + restore-keys: | + build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}- + + - name: Unpack build cache + run: | + echo 
::group::Adjust timestamps + sudo find / -exec touch -t 197001010000 {} + || true + echo ::endgroup:: + + if [ -f build.tar ]; then + find . -mindepth 1 -maxdepth 1 ! -name 'build.tar' -exec rm -rf {} + + tar -xpvf build.tar -C . + else + echo "No build.tar found, skipping" + fi + + ls -al ./ + ls -al build/ || true + ls -al csrc/ || true + + - name: Build wheel + id: build_wheel + run: | + # We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6 + # https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810 + # However this still fails so I'm using a newer version of setuptools + pip install setuptools==75.8.0 + pip install ninja packaging wheel + export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH + export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH + # Limit MAX_JOBS otherwise the github runner goes OOM + # nvcc 11.8 can compile with 2 jobs, but nvcc 12.3 goes OOM + + export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2) + export NVCC_THREADS=2 + export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX" + export DG_USE_LOCAL_VERSION=${{ inputs.use-local-version && '1' || '0' }} + + # 5h timeout since GH allows max 6h and we want some buffer + EXIT_CODE=0 + timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$? 
+ + if [ $EXIT_CODE -eq 0 ]; then + tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ inputs.cxx11_abi }} + wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2") + ls dist/*whl |xargs -I {} mv {} dist/${wheel_name} + echo "wheel_name=${wheel_name}" >> $GITHUB_ENV + fi + + # Store exit code as a step output (GITHUB_OUTPUT) for later steps + echo "build_exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT" + + # Propagate the exit code; on timeout (124) the always()-guarded steps below still save the build cache + exit $EXIT_CODE + + - name: Log build logs after timeout + if: always() && steps.build_wheel.outputs.build_exit_code == 124 + run: | + ls -al ./ + tar -cvf build.tar . --atime-preserve=replace + + - name: Save build cache timeout + if: always() && steps.build_wheel.outputs.build_exit_code == 124 + uses: actions/cache/save@v4 + with: + key: build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-${{ github.run_number }}-${{ github.run_attempt }} + path: build.tar + + - name: Log Built Wheels + run: | + ls dist + + - name: Get Release with tag + id: get_current_release + uses: joutvhu/get-release@v1 + with: + tag_name: ${{ inputs.release-version }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload Release Asset + id: upload_release_asset + if: inputs.upload-to-release + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.get_current_release.outputs.upload_url }} + asset_path: ./dist/${{env.wheel_name}} + asset_name: ${{env.wheel_name}} + asset_content_type: application/* diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..ee250aa --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,53 @@ +name: Build wheels + +on: + workflow_dispatch: + inputs: + runs-on: + description: "The runner to use for the build" + required: true + type: string + default: ubuntu-22.04 + 
python-version: + description: "The Python version to use for the build" + required: true + type: string + cuda-version: + description: "The CUDA version to use for the build" + required: true + type: string + torch-version: + description: "The PyTorch version to use for the build" + required: true + type: string + cxx11_abi: + description: "Enable torch flag C++11 ABI (TRUE/FALSE)" + required: true + type: string + upload-to-release: + description: "Upload wheel to this release" + required: false + type: boolean + default: false + release-version: + description: "Upload wheel to this release" + required: false + type: string + use-local-version: + description: "Use local version" + required: false + type: boolean + default: false + +jobs: + build-wheels: + uses: ./.github/workflows/_build.yml + with: + runs-on: ${{ inputs.runs-on }} + python-version: ${{ inputs.python-version }} + cuda-version: ${{ inputs.cuda-version }} + torch-version: ${{ inputs.torch-version }} + cxx11_abi: ${{ inputs.cxx11_abi }} + upload-to-release: ${{ inputs.upload-to-release }} + release-version: ${{ inputs.release-version }} + use-local-version: ${{ inputs.use-local-version }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..a7b3e6b --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,95 @@ +# This workflow will: +# - Create a new Github release +# - Build wheels for supported architectures +# - Deploy the wheels to the Github release +# - Release the static code to PyPi +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Build wheels and deploy + +on: + create: + tags: + - v* + +jobs: + setup_release: + name: Create Release + runs-on: ubuntu-latest + outputs: + release-version: ${{ steps.extract_branch.outputs.branch }} + steps: + - name: Get the tag version + id: extract_branch + run: echo ::set-output 
name=branch::${GITHUB_REF#refs/tags/} + shell: bash + - name: Create Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ steps.extract_branch.outputs.branch }} + release_name: ${{ steps.extract_branch.outputs.branch }} + + build_wheels: + name: Build Wheel + needs: setup_release + strategy: + fail-fast: false + matrix: + # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the + # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. + os: [ubuntu-22.04] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1", "2.8.0"] + cuda-version: ["12.9.1"] + # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. + # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. + # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs) + # when building without C++11 ABI and using it on nvcr images. 
+ cxx11_abi: ["FALSE", "TRUE"] + exclude: + # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix + # Pytorch < 2.5 does not support Python 3.13 + - torch-version: "2.4.0" + python-version: "3.13" + uses: ./.github/workflows/_build.yml + with: + runs-on: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} + cuda-version: ${{ matrix.cuda-version }} + torch-version: ${{ matrix.torch-version }} + cxx11_abi: ${{ matrix.cxx11_abi }} + release-version: ${{ needs.setup_release.outputs.release-version }} + upload-to-release: true + use-local-version: false + + publish_package: + name: Publish package + needs: [build_wheels] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install ninja packaging wheel twine + # Install latest setuptools with support for pypi metadata 2.2 (improved compat w/ uv) + pip install setuptools==75.8.0 + # We don't want to download anything CUDA-related here + pip install torch --index-url https://download.pytorch.org/whl/cpu + - name: Build core package + env: + DG_USE_LOCAL_VERSION: "0" + DG_SKIP_CUDA_BUILD: "1" + run: | + python setup.py sdist --dist-dir=dist + - name: Deploy + env: + TWINE_USERNAME: "__token__" + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: | + python -m twine upload dist/* diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index da3403e..a4633ae 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -83,3 +83,5 @@ deep_gemm_cpp.init( os.path.dirname(os.path.abspath(__file__)), # Library root directory path _find_cuda_home() # CUDA home ) + +__version__ = '2.1.1' diff --git a/setup.py b/setup.py index e5b9665..01a72eb 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,36 @@ +import ast import os -import setuptools +import re import shutil +import setuptools import subprocess +import sys import torch +import platform +import 
urllib +import urllib.error +import urllib.request from setuptools import find_packages from setuptools.command.build_py import build_py +from packaging.version import parse +from pathlib import Path from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel -current_dir = os.path.dirname(os.path.realpath(__file__)) + +DG_SKIP_CUDA_BUILD = int(os.getenv('DG_SKIP_CUDA_BUILD', '0')) == 1 +DG_FORCE_BUILD = int(os.getenv('DG_FORCE_BUILD', '0')) == 1 +DG_USE_LOCAL_VERSION = int(os.getenv('DG_USE_LOCAL_VERSION', '1')) == 1 +DG_JIT_USE_RUNTIME_API = int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')) == 1 + +# Compiler flags cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations', f'-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'] +if DG_JIT_USE_RUNTIME_API: + cxx_flags.append('-DDG_JIT_USE_RUNTIME_API') + +# Sources +current_dir = os.path.dirname(os.path.realpath(__file__)) sources = ['csrc/python_api.cpp'] build_include_dirs = [ f'{CUDA_HOME}/include', @@ -28,9 +49,68 @@ third_party_include_dirs = [ 'third-party/cutlass/include/cutlass', ] -# Use runtime API -if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')): - cxx_flags.append('-DDG_JIT_USE_RUNTIME_API') +# Release +base_wheel_url = 'https://github.com/DeepSeek-AI/DeepGEMM/releases/download/{tag_name}/{wheel_name}' + + +def get_package_version(): + with open(Path(current_dir) / 'deep_gemm' / '__init__.py', 'r') as f: + version_match = re.search(r'^__version__\s*=\s*(.*)$', f.read(), re.MULTILINE) + public_version = ast.literal_eval(version_match.group(1)) + + revision = '' + if DG_USE_LOCAL_VERSION: + # noinspection PyBroadException + try: + status_cmd = ['git', 'status', '--porcelain'] + status_output = subprocess.check_output(status_cmd).decode('ascii').strip() + if status_output: + print(f'Warning: Git working directory is not clean. 
Uncommitted changes:\n{status_output}') + assert False, 'Git working directory is not clean' + + cmd = ['git', 'rev-parse', '--short', 'HEAD'] + revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip() + except: + revision = '+local' + return f'{public_version}{revision}' + + +def get_platform(): + if sys.platform.startswith('linux'): + return f'linux_{platform.uname().machine}' + else: + raise ValueError('Unsupported platform: {}'.format(sys.platform)) + + +def get_wheel_url(): + torch_version = parse(torch.__version__) + torch_version = f'{torch_version.major}.{torch_version.minor}' + python_version = f'cp{sys.version_info.major}{sys.version_info.minor}' + platform_name = get_platform() + deep_gemm_version = get_package_version() + cxx11_abi = int(torch._C._GLIBCXX_USE_CXX11_ABI) + + # Determine the version numbers that will be used to determine the correct wheel + # We're using the CUDA version used to build torch, not the one currently installed + cuda_version = parse(torch.version.cuda) + cuda_version = f'{cuda_version.major}' + + # Determine wheel URL based on CUDA version, torch version, python version and OS + wheel_filename = f'deep_gemm-{deep_gemm_version}+cu{cuda_version}-torch{torch_version}-cxx11abi{cxx11_abi}-{python_version}-{platform_name}.whl' + wheel_url = base_wheel_url.format(tag_name=f'v{deep_gemm_version}', wheel_name=wheel_filename) + return wheel_url, wheel_filename + + +def get_ext_modules(): + if DG_SKIP_CUDA_BUILD: + return [] + + return [CUDAExtension(name='deep_gemm_cpp', + sources=sources, + include_dirs=build_include_dirs, + libraries=build_libraries, + library_dirs=build_library_dirs, + extra_compile_args=cxx_flags)] class CustomBuildPy(build_py): @@ -72,18 +152,37 @@ class CustomBuildPy(build_py): shutil.copytree(src_dir, dst_dir) -if __name__ == '__main__': - # noinspection PyBroadException - try: - cmd = ['git', 'rev-parse', '--short', 'HEAD'] - revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip() - 
except: - revision = '' +class CachedWheelsCommand(_bdist_wheel): + def run(self): + if DG_FORCE_BUILD or DG_USE_LOCAL_VERSION: + return super().run() + wheel_url, wheel_filename = get_wheel_url() + print(f'Try to download wheel from URL: {wheel_url}') + try: + with urllib.request.urlopen(wheel_url, timeout=1) as response: + with open(wheel_filename, 'wb') as out_file: + data = response.read() + out_file.write(data) + + # Make the archive + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f'{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}' + wheel_path = os.path.join(self.dist_dir, archive_basename + '.whl') + os.rename(wheel_filename, wheel_path) + except (urllib.error.HTTPError, urllib.error.URLError): + print('Precompiled wheel not found. Building from source...') + # If the wheel could not be downloaded, build from source + super().run() + + +if __name__ == '__main__': # noinspection PyTypeChecker setuptools.setup( name='deep_gemm', - version='2.1.0' + revision, + version=get_package_version(), packages=find_packages('.'), package_data={ 'deep_gemm': [ @@ -92,16 +191,10 @@ if __name__ == '__main__': 'include/cutlass/**/*', ] }, - ext_modules=[ - CUDAExtension(name='deep_gemm_cpp', - sources=sources, - include_dirs=build_include_dirs, - libraries=build_libraries, - library_dirs=build_library_dirs, - extra_compile_args=cxx_flags) - ], + ext_modules=get_ext_modules(), zip_safe=False, cmdclass={ 'build_py': CustomBuildPy, + 'bdist_wheel': CachedWheelsCommand, }, )