From 9f196058ae05f4f392fe71d12841e4b9df736a02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Fri, 10 Oct 2025 12:23:40 +0200
Subject: [PATCH] chore: Build and store bdist wheels (#181)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* build: Minor tweaks for wheel build

Signed-off-by: oliver könig <okoenig@nvidia.com>

* ci: Workflows for wheel build

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix

Signed-off-by: oliver könig <okoenig@nvidia.com>

* build: Add CachedWheel

Signed-off-by: oliver könig <okoenig@nvidia.com>

* add version to init

Signed-off-by: oliver könig <okoenig@nvidia.com>

* revert

Signed-off-by: oliver könig <okoenig@nvidia.com>

* revert

Signed-off-by: oliver könig <okoenig@nvidia.com>

* revert

Signed-off-by: oliver könig <okoenig@nvidia.com>

* v2

Signed-off-by: oliver könig <okoenig@nvidia.com>

* update

Signed-off-by: oliver könig <okoenig@nvidia.com>

* test

Signed-off-by: oliver könig <okoenig@nvidia.com>

* from packaging.version import parse

Signed-off-by: oliver könig <okoenig@nvidia.com>

* local version

Signed-off-by: oliver könig <okoenig@nvidia.com>

* remove file

Signed-off-by: oliver könig <okoenig@nvidia.com>

* revert

Signed-off-by: oliver könig <okoenig@nvidia.com>

* Updates and lint

* revert missing cudaextension args

Signed-off-by: oliver könig <okoenig@nvidia.com>

* Add timeout

* fix DG settings

Signed-off-by: oliver könig <okoenig@nvidia.com>

* DG_USE_LOCAL_VERSION

Signed-off-by: oliver könig <okoenig@nvidia.com>

* Update version

* Detect local changes

* Minor fix

* Revert CUTLASS

* Unify options

---------

Signed-off-by: oliver könig <okoenig@nvidia.com>
Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
---
 .github/workflows/_build.yml  | 227 ++++++++++++++++++++++++++++++++++
 .github/workflows/build.yml   |  53 ++++++++
 .github/workflows/publish.yml |  95 ++++++++++++++
 deep_gemm/__init__.py         |   2 +
 setup.py                      | 135 ++++++++++++++++----
 5 files changed, 491 insertions(+), 21 deletions(-)
 create mode 100644 .github/workflows/_build.yml
 create mode 100644 .github/workflows/build.yml
 create mode 100644 .github/workflows/publish.yml

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
new file mode 100644
index 0000000..cff8013
--- /dev/null
+++ b/.github/workflows/_build.yml
@@ -0,0 +1,227 @@
+name: ~Build wheel template
+
+on:
+  workflow_call:
+    inputs:
+      runs-on:
+        description: "The runner to use for the build"
+        required: true
+        type: string
+      python-version:
+        description: "The Python version to use for the build"
+        required: true
+        type: string
+      cuda-version:
+        description: "The CUDA version to use for the build"
+        required: true
+        type: string
+      torch-version:
+        description: "The PyTorch version to use for the build"
+        required: true
+        type: string
+      cxx11_abi:
+        description: "The C++11 ABI to use for the build"
+        required: true
+        type: string
+      upload-to-release:
+        description: "Whether to upload the built wheel to the release"
+        required: false
+        type: boolean
+        default: false
+      release-version:
+        description: "Release tag to check out and upload the wheel to"
+        required: false
+        type: string
+      use-local-version:
+        description: "Use local version"
+        required: false
+        type: boolean
+        default: false
+
+defaults:
+  run:
+    shell: bash -x -e -u -o pipefail {0}
+
+jobs:
+  build-wheel:
+    runs-on: ${{ inputs.runs-on }}
+    name: Build wheel (${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }})
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.release-version }}
+          submodules: recursive
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ inputs.python-version }}
+
+      - name: Set CUDA and PyTorch versions
+        run: |
+          echo "MATRIX_CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
+          echo "MATRIX_TORCH_VERSION=$(echo ${{ inputs.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
+          echo "WHEEL_CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1'})" >> $GITHUB_ENV
+          echo "MATRIX_PYTHON_VERSION=$(echo ${{ inputs.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
+
+      - name: Free up disk space
+        if: ${{ runner.os == 'Linux' }}
+        # https://github.com/easimon/maximize-build-space/blob/master/action.yml
+        # https://github.com/easimon/maximize-build-space/tree/test-report
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+
+      - name: Set up swap space
+        if: runner.os == 'Linux'
+        uses: pierotofy/set-swap-space@v1.0
+        with:
+          swap-size-gb: 10
+
+      - name: Install CUDA ${{ inputs.cuda-version }}
+        if: ${{ inputs.cuda-version != 'cpu' }}
+        uses: Jimver/cuda-toolkit@v0.2.26
+        id: cuda-toolkit
+        with:
+          cuda: ${{ inputs.cuda-version }}
+          linux-local-args: '["--toolkit"]'
+          # default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1
+          # method: ${{ (inputs.cuda-version == '11.8.0' || inputs.cuda-version == '12.1.0') && 'network' || 'local' }}
+          method: "network"
+
+      - name: Install additional CUDA libraries
+        run: |
+          CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1 "-" $2'})
+          sudo apt-get update
+          sudo apt-get install -y libcusparse-$CUDA_VERSION libcusolver-$CUDA_VERSION 
+          sudo apt-get clean
+
+      - name: Install PyTorch ${{ inputs.torch-version }}+cu${{ inputs.cuda-version }}
+        run: |
+          pip install --upgrade pip
+          # With python 3.13 and torch 2.5.1, unless we update typing-extensions, we get error
+          # AttributeError: attribute '__default__' of 'typing.ParamSpec' objects is not writable
+          pip install typing-extensions==4.12.2
+          # We want to figure out the CUDA version to download pytorch
+          # e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116
+          # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+          # This code is ugly, maybe there's a better way to do this.
+          export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
+            minv = {'2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \
+            maxv = {'2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \
+            print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
+          )
+          if [[ ${{ inputs.torch-version }} == *"dev"* ]]; then
+            # pip install --no-cache-dir --pre torch==${{ inputs.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
+            # Can't use --no-deps because we need cudnn etc.
+            # Hard-coding this version of pytorch-triton for torch 2.6.0.dev20241001
+            pip install jinja2
+            pip install https://download.pytorch.org/whl/nightly/pytorch_triton-3.1.0%2Bcf34004b8a-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl
+            pip install --no-cache-dir --pre https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}/torch-${{ inputs.torch-version }}%2Bcu${TORCH_CUDA_VERSION}-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl
+          else
+            pip install --no-cache-dir torch==${{ inputs.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
+          fi
+          nvcc --version
+          python --version
+          python -c "import torch; print('PyTorch:', torch.__version__)"
+          python -c "import torch; print('CUDA:', torch.version.cuda)"
+          python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
+
+      - name: Restore build cache
+        uses: actions/cache/restore@v4
+        with:
+          path: build.tar
+          key: build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-${{ github.run_number }}-${{ github.run_attempt }}
+          restore-keys: |
+            build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-
+
+      - name: Unpack build cache
+        run: |
+          echo ::group::Adjust timestamps
+          sudo find / -exec touch -t 197001010000 {} + || true
+          echo ::endgroup::
+
+          if [ -f build.tar ]; then
+            find . -mindepth 1 -maxdepth 1 ! -name 'build.tar' -exec rm -rf {} +
+            tar -xpvf build.tar -C .
+          else
+            echo "No build.tar found, skipping"
+          fi
+
+          ls -al ./
+          ls -al build/ || true
+          ls -al csrc/ || true
+
+      - name: Build wheel
+        id: build_wheel
+        run: |
+          # We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
+          # https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
+          # However this still fails so I'm using a newer version of setuptools
+          pip install setuptools==75.8.0
+          pip install ninja packaging wheel
+          export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
+          export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+          # Limit MAX_JOBS otherwise the github runner goes OOM
+          # nvcc 11.8 can compile with 2 jobs, but nvcc 12.3 goes OOM
+
+          export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2)
+          export NVCC_THREADS=2
+          export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX"
+          export DG_USE_LOCAL_VERSION=${{ inputs.use-local-version && '1' || '0' }}
+
+          # 5h timeout since GH allows max 6h and we want some buffer
+          EXIT_CODE=0
+          timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?
+
+          if [ $EXIT_CODE -eq 0 ]; then
+            tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ inputs.cxx11_abi }}
+            wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
+            ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
+            echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          fi
+
+          # Store exit code as a step output for later steps
+          echo "build_exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
+
+          # Propagate the exit code; the cache steps below use always() so they still run after a timeout
+          exit $EXIT_CODE
+
+      - name: Log build logs after timeout
+        if: always() && steps.build_wheel.outputs.build_exit_code == 124
+        run: |
+          ls -al ./
+          tar -cvf build.tar . --atime-preserve=replace
+
+      - name: Save build cache timeout
+        if: always() && steps.build_wheel.outputs.build_exit_code == 124
+        uses: actions/cache/save@v4
+        with:
+          key: build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-${{ github.run_number }}-${{ github.run_attempt }}
+          path: build.tar
+
+      - name: Log Built Wheels
+        run: |
+          ls dist
+
+      - name: Get Release with tag
+        id: get_current_release
+        uses: joutvhu/get-release@v1
+        with:
+          tag_name: ${{ inputs.release-version }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Upload Release Asset
+        id: upload_release_asset
+        if: inputs.upload-to-release
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.get_current_release.outputs.upload_url }}
+          asset_path: ./dist/${{env.wheel_name}}
+          asset_name: ${{env.wheel_name}}
+          asset_content_type: application/*
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..ee250aa
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,53 @@
+name: Build wheels
+
+on:
+  workflow_dispatch:
+    inputs:
+      runs-on:
+        description: "The runner to use for the build"
+        required: true
+        type: string
+        default: ubuntu-22.04
+      python-version:
+        description: "The Python version to use for the build"
+        required: true
+        type: string
+      cuda-version:
+        description: "The CUDA version to use for the build"
+        required: true
+        type: string
+      torch-version:
+        description: "The PyTorch version to use for the build"
+        required: true
+        type: string
+      cxx11_abi:
+        description: "Enable torch flag C++11 ABI (TRUE/FALSE)"
+        required: true
+        type: string
+      upload-to-release:
+        description: "Whether to upload the built wheel to the release"
+        required: false
+        type: boolean
+        default: false
+      release-version:
+        description: "Release tag to build from and upload the wheel to"
+        required: false
+        type: string
+      use-local-version:
+        description: "Use local version"
+        required: false
+        type: boolean
+        default: false
+
+jobs:
+  build-wheels:
+    uses: ./.github/workflows/_build.yml
+    with:
+      runs-on: ${{ inputs.runs-on }}
+      python-version: ${{ inputs.python-version }}
+      cuda-version: ${{ inputs.cuda-version }}
+      torch-version: ${{ inputs.torch-version }}
+      cxx11_abi: ${{ inputs.cxx11_abi }}
+      upload-to-release: ${{ inputs.upload-to-release }}
+      release-version: ${{ inputs.release-version }}
+      use-local-version: ${{ inputs.use-local-version }}
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..a7b3e6b
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,95 @@
+# This workflow will:
+# - Create a new Github release
+# - Build wheels for supported architectures
+# - Deploy the wheels to the Github release
+# - Release the static code to PyPi
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Build wheels and deploy
+
+on:
+  push:  # `create` ignores the `tags` filter and would also fire for new branches
+    tags:
+      - "v*"
+
+jobs:
+  setup_release:
+    name: Create Release
+    runs-on: ubuntu-latest
+    outputs:
+      release-version: ${{ steps.extract_branch.outputs.branch }}
+    steps:
+      - name: Get the tag version
+        id: extract_branch
+        run: echo "branch=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT"
+        shell: bash
+      - name: Create Release
+        id: create_release
+        uses: actions/create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ steps.extract_branch.outputs.branch }}
+          release_name: ${{ steps.extract_branch.outputs.branch }}
+
+  build_wheels:
+    name: Build Wheel
+    needs: setup_release
+    strategy:
+      fail-fast: false
+      matrix:
+        # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the
+        # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
+        os: [ubuntu-22.04]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]  # no 3.8: torch >= 2.5 ships no cp38 wheels
+        torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1", "2.8.0"]
+        cuda-version: ["12.9.1"]
+        # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
+        # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
+        # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
+        # when building without C++11 ABI and using it on nvcr images.
+        cxx11_abi: ["FALSE", "TRUE"]
+        exclude:
+          # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+          # Pytorch < 2.5 does not support Python 3.13
+          - torch-version: "2.4.0"
+            python-version: "3.13"
+    uses: ./.github/workflows/_build.yml
+    with:
+      runs-on: ${{ matrix.os }}
+      python-version: ${{ matrix.python-version }}
+      cuda-version: ${{ matrix.cuda-version }}
+      torch-version: ${{ matrix.torch-version }}
+      cxx11_abi: ${{ matrix.cxx11_abi }}
+      release-version: ${{ needs.setup_release.outputs.release-version }}
+      upload-to-release: true
+      use-local-version: false
+
+  publish_package:
+    name: Publish package
+    needs: [build_wheels]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: |
+          pip install ninja packaging wheel twine
+          # Install latest setuptools with support for pypi metadata 2.2 (improved compat w/ uv)
+          pip install setuptools==75.8.0
+          # We don't want to download anything CUDA-related here
+          pip install torch --index-url https://download.pytorch.org/whl/cpu
+      - name: Build core package
+        env:
+          DG_USE_LOCAL_VERSION: "0"
+          DG_SKIP_CUDA_BUILD: "1"
+        run: |
+          python setup.py sdist --dist-dir=dist
+      - name: Deploy
+        env:
+          TWINE_USERNAME: "__token__"
+          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+        run: |
+          python -m twine upload dist/*
diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py
index da3403e..a4633ae 100644
--- a/deep_gemm/__init__.py
+++ b/deep_gemm/__init__.py
@@ -83,3 +83,5 @@ deep_gemm_cpp.init(
     os.path.dirname(os.path.abspath(__file__)), # Library root directory path
     _find_cuda_home()                           # CUDA home
 )
+
+__version__ = '2.1.1'
diff --git a/setup.py b/setup.py
index e5b9665..01a72eb 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,36 @@
+import ast
 import os
-import setuptools
+import re
 import shutil
+import setuptools
 import subprocess
+import sys
 import torch
+import platform
+import urllib
+import urllib.error
+import urllib.request
 from setuptools import find_packages
 from setuptools.command.build_py import build_py
+from packaging.version import parse
+from pathlib import Path
 from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME
+from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 
-current_dir = os.path.dirname(os.path.realpath(__file__))
+
+DG_SKIP_CUDA_BUILD = int(os.getenv('DG_SKIP_CUDA_BUILD', '0')) == 1
+DG_FORCE_BUILD = int(os.getenv('DG_FORCE_BUILD', '0')) == 1
+DG_USE_LOCAL_VERSION = int(os.getenv('DG_USE_LOCAL_VERSION', '1')) == 1
+DG_JIT_USE_RUNTIME_API = int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')) == 1
+
+# Compiler flags
 cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations',
              f'-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}']
+if DG_JIT_USE_RUNTIME_API:
+    cxx_flags.append('-DDG_JIT_USE_RUNTIME_API')
+
+# Sources
+current_dir = os.path.dirname(os.path.realpath(__file__))
 sources = ['csrc/python_api.cpp']
 build_include_dirs = [
     f'{CUDA_HOME}/include',
@@ -28,9 +49,68 @@ third_party_include_dirs = [
     'third-party/cutlass/include/cutlass',
 ]
 
-# Use runtime API
-if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')):
-    cxx_flags.append('-DDG_JIT_USE_RUNTIME_API')
+# Release
+base_wheel_url = 'https://github.com/DeepSeek-AI/DeepGEMM/releases/download/{tag_name}/{wheel_name}'
+
+
+def get_package_version():
+    """Return the version from `deep_gemm/__init__.py`, plus `+<git rev>` (or `+local`) for local builds."""
+    with open(Path(current_dir) / 'deep_gemm' / '__init__.py', 'r') as f:
+        version_match = re.search(r'^__version__\s*=\s*(.*)$', f.read(), re.MULTILINE)
+    public_version = ast.literal_eval(version_match.group(1))
+
+    revision = ''
+    if DG_USE_LOCAL_VERSION:
+        # noinspection PyBroadException
+        try:
+            status_cmd = ['git', 'status', '--porcelain']
+            status_output = subprocess.check_output(status_cmd).decode('ascii').strip()
+            if status_output:
+                print(f'Warning: Git working directory is not clean. Uncommitted changes:\n{status_output}')
+                raise RuntimeError('Git working directory is not clean')  # `assert` vanishes under `-O`
+            cmd = ['git', 'rev-parse', '--short', 'HEAD']
+            revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
+        except Exception:  # Not a bare `except`: let Ctrl-C / SystemExit propagate
+            revision = '+local'
+    return f'{public_version}{revision}'
+
+
+def get_platform():
+    # Only Linux is handled; any other `sys.platform` is rejected up front
+    if not sys.platform.startswith('linux'):
+        raise ValueError('Unsupported platform: {}'.format(sys.platform))
+    return f'linux_{platform.uname().machine}'
+
+
+def get_wheel_url():
+    """Return `(url, filename)` of the prebuilt release wheel matching this torch/CUDA/Python/platform."""
+    torch_version = parse(torch.__version__)
+    torch_version = f'{torch_version.major}.{torch_version.minor}'
+    python_version = f'cp{sys.version_info.major}{sys.version_info.minor}'
+    platform_name = get_platform()
+    deep_gemm_version = get_package_version()
+    cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()
+
+    # We're using the CUDA version used to build torch, not the one currently installed
+    cuda_version = parse(torch.version.cuda)
+    cuda_version = f'{cuda_version.major}'
+
+    # Must mirror the CI rename: `<name>-<ver>+cu<X>torch<Y>cxx11abi<TRUE|FALSE>-<py>-<abi>-<platform>.whl`
+    wheel_filename = f'deep_gemm-{deep_gemm_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl'
+    wheel_url = base_wheel_url.format(tag_name=f'v{deep_gemm_version}', wheel_name=wheel_filename)
+    return wheel_url, wheel_filename
+
+
+def get_ext_modules():
+    # With `DG_SKIP_CUDA_BUILD` set, no extension module is declared at all
+    if DG_SKIP_CUDA_BUILD:
+        return []
+
+    extension = CUDAExtension(name='deep_gemm_cpp', sources=sources,
+                              include_dirs=build_include_dirs,
+                              libraries=build_libraries, library_dirs=build_library_dirs,
+                              extra_compile_args=cxx_flags)
+    return [extension]
 
 
 class CustomBuildPy(build_py):
@@ -72,18 +152,37 @@ class CustomBuildPy(build_py):
             shutil.copytree(src_dir, dst_dir)
 
 
-if __name__ == '__main__':
-    # noinspection PyBroadException
-    try:
-        cmd = ['git', 'rev-parse', '--short', 'HEAD']
-        revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
-    except:
-        revision = ''
+class CachedWheelsCommand(_bdist_wheel):
+    """`bdist_wheel` that first tries to fetch a prebuilt release wheel, building only as a fallback."""
+    def run(self):
+        if DG_FORCE_BUILD or DG_USE_LOCAL_VERSION:
+            return super().run()
 
+        wheel_url, wheel_filename = get_wheel_url()
+        print(f'Try to download wheel from URL: {wheel_url}')
+        try:
+            with urllib.request.urlopen(wheel_url, timeout=1) as response:
+                with open(wheel_filename, 'wb') as out_file:
+                    data = response.read()
+                    out_file.write(data)
+
+            # Make the archive
+            os.makedirs(self.dist_dir, exist_ok=True)
+            impl_tag, abi_tag, plat_tag = self.get_tag()
+            archive_basename = f'{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}'
+            wheel_path = os.path.join(self.dist_dir, archive_basename + '.whl')
+            os.rename(wheel_filename, wheel_path)
+        except OSError:  # Covers HTTPError/URLError and socket timeouts raised mid-read
+            print('Precompiled wheel not found. Building from source...')
+            # If the wheel could not be downloaded, build from source
+            super().run()
+
+
+if __name__ == '__main__':
     # noinspection PyTypeChecker
     setuptools.setup(
         name='deep_gemm',
-        version='2.1.0' + revision,
+        version=get_package_version(),
         packages=find_packages('.'),
         package_data={
             'deep_gemm': [
@@ -92,16 +191,10 @@ if __name__ == '__main__':
                 'include/cutlass/**/*',
             ]
         },
-        ext_modules=[
-            CUDAExtension(name='deep_gemm_cpp',
-                          sources=sources,
-                          include_dirs=build_include_dirs,
-                          libraries=build_libraries,
-                          library_dirs=build_library_dirs,
-                          extra_compile_args=cxx_flags)
-        ],
+        ext_modules=get_ext_modules(),
         zip_safe=False,
         cmdclass={
             'build_py': CustomBuildPy,
+            'bdist_wheel': CachedWheelsCommand,
         },
     )