docker: docker-aware precompiled wheel support (#21127)

Signed-off-by: dougbtv <dosmith@redhat.com>
2025-07-29 17:45:19 -04:00
parent a33ea28b1b
commit a1873db23d
3 changed files with 68 additions and 27 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -209,16 +209,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 # Flag to control whether to use pre-built vLLM wheels
-ARG VLLM_USE_PRECOMPILED
+ARG VLLM_USE_PRECOMPILED=""
 # TODO: setup.py only checks VLLM_USE_PRECOMPILED for truthiness, so even VLLM_USE_PRECOMPILED=0 is taken as "true"; this should be fixed
 ENV VLLM_USE_PRECOMPILED=""
 RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
        export VLLM_USE_PRECOMPILED=1 && \
        echo "Using precompiled wheels"; \
    else \
        unset VLLM_USE_PRECOMPILED && \
        echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
    fi
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -235,6 +226,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
        && sccache --show-stats; \
@@ -248,9 +241,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
        export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi
 # When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
 RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
        echo "Cleaning up extra wheels in dist/..." && \
        # Identify the most recent manylinux1_x86_64 wheel
        KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
        if [ -n "$KEEP_WHEEL" ]; then \
            echo "Keeping wheel: $KEEP_WHEEL"; \
            find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
        fi; \
    fi
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
--- a/setup.py
+++ b/setup.py
@@ -7,6 +7,7 @@ import json
 import logging
 import os
 import re
 import shutil
 import subprocess
 import sys
 from pathlib import Path
@@ -297,6 +298,10 @@ class repackage_wheel(build_ext):
            ]).decode("utf-8")
            upstream_main_commit = json.loads(resp_json)["sha"]
            # In Docker build context, .git may be immutable or missing.
            if envs.VLLM_DOCKER_BUILD_CONTEXT:
                return upstream_main_commit
            # Check if the upstream_main_commit exists in the local repo
            try:
                subprocess.check_output(
@@ -357,19 +362,48 @@ class repackage_wheel(build_ext):
            # create a temporary directory to store the wheel
            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
            wheel_path = os.path.join(temp_dir, wheel_filename)
            print(f"Downloading wheel from {wheel_location} to {wheel_path}")
            from urllib.request import urlretrieve
            try:
                urlretrieve(wheel_location, filename=wheel_path)
            except Exception as e:
                from setuptools.errors import SetupError
                raise SetupError(
                    f"Failed to get vLLM wheel from {wheel_location}") from e
        # During a docker build: determine correct filename, copy wheel.
        if envs.VLLM_DOCKER_BUILD_CONTEXT:
            dist_dir = "/workspace/dist"
            os.makedirs(dist_dir, exist_ok=True)
            # Determine correct wheel filename from METADATA
            with zipfile.ZipFile(wheel_path, "r") as z:
                metadata_file = next(
                    (n for n in z.namelist()
                     if n.endswith(".dist-info/METADATA")),
                    None,
                )
                if not metadata_file:
                    raise RuntimeError(
                        "Could not find METADATA in precompiled wheel.")
                metadata = z.read(metadata_file).decode()
                version_line = next((line for line in metadata.splitlines()
                                     if line.startswith("Version: ")), None)
                if not version_line:
                    raise RuntimeError(
                        "Could not determine version from METADATA.")
                version = version_line.split(": ")[1].strip()
            # Build correct filename using internal version
            arch_tag = "cp38-abi3-manylinux1_x86_64"
            corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl"
            final_wheel_path = os.path.join(dist_dir, corrected_wheel_name)
            print(f"Docker build context detected, copying precompiled wheel "
                  f"({version}) to {final_wheel_path}")
            shutil.copy2(wheel_path, final_wheel_path)
            return
        # Unzip the wheel when not in Docker context
        with zipfile.ZipFile(wheel_path) as wheel:
            files_to_copy = [
                "vllm/_C.abi3.so",
@@ -378,15 +412,9 @@ class repackage_wheel(build_ext):
                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                "vllm/cumem_allocator.abi3.so",
                # "vllm/_version.py", # not available in nightly wheels yet
            ]
            file_members = list(
                filter(lambda x: x.filename in files_to_copy, wheel.filelist))
            # vllm_flash_attn python code:
            # Regex from
            #  `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
            compiled_regex = re.compile(
                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
            file_members += list(
@@ -403,11 +431,8 @@ class repackage_wheel(build_ext):
                    package_data[package_name] = []
                wheel.extract(file)
-                if file_name.endswith(".py"):
+                if not file_name.endswith(".py"):
-                    # python files shouldn't be added to package_data
+                    package_data[package_name].append(file_name)
                    continue
                package_data[package_name].append(file_name)
 def _no_device() -> bool:
@@ -415,6 +440,9 @@ def _no_device() -> bool:
 def _is_cuda() -> bool:
    # Report whether this build should be treated as a CUDA build.
    # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
    # (this early return happens before torch.version.cuda is consulted).
    if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT:
        return True
    # Otherwise all three must hold: the target device is "cuda", torch
    # reports a CUDA version, and neither _is_neuron() nor _is_tpu() is true.
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
            and not (_is_neuron() or _is_tpu()))
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -68,6 +68,7 @@ if TYPE_CHECKING:
    MAX_JOBS: Optional[str] = None
    NVCC_THREADS: Optional[str] = None
    VLLM_USE_PRECOMPILED: bool = False
    VLLM_DOCKER_BUILD_CONTEXT: bool = False
    VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
    VLLM_NO_DEPRECATION_WARNING: bool = False
    VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -222,8 +223,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # If set, vllm will use precompiled binaries (*.so)
    "VLLM_USE_PRECOMPILED":
-    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
+    lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
-        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+    ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
    # Used to mark that setup.py is running in a Docker build context,
    # in order to force the use of precompiled binaries.
    "VLLM_DOCKER_BUILD_CONTEXT":
    lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
    ("1", "true"),
    # Whether to force using nightly wheel in python build.
    # This is used for testing the nightly wheel in python build.