From d1af8b7be9c5ad9d2926ce215771e9cd7279147b Mon Sep 17 00:00:00 2001 From: Doug Smith Date: Sun, 10 Aug 2025 19:29:02 -0400 Subject: [PATCH] enable Docker-aware precompiled wheel setup (#22106) Signed-off-by: dougbtv --- docker/Dockerfile | 15 ++-- setup.py | 185 +++++++++++++++++++++++++--------------------- vllm/envs.py | 11 ++- 3 files changed, 116 insertions(+), 95 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 04a63f5d6..85f55cac8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED -# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed -ENV VLLM_USE_PRECOMPILED="" -RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ - export VLLM_USE_PRECOMPILED=1 && \ - echo "Using precompiled wheels"; \ - else \ - unset VLLM_USE_PRECOMPILED && \ - echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ - fi +ARG VLLM_USE_PRECOMPILED="" # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ + export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi diff --git a/setup.py b/setup.py index e374fcb81..7f6c78712 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ import json import logging import os import re +import shutil import subprocess import sys from pathlib import Path @@ -281,10 +282,81 @@ class cmake_build_ext(build_ext): self.copy_file(file, dst_file) -class repackage_wheel(build_ext): +class precompiled_build_ext(build_ext): + """Disables extension building when using precompiled binaries.""" + + def run(self) -> None: + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + def build_extensions(self) -> None: + print("Skipping build_ext: using precompiled extensions.") + return + + +class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" - def get_base_commit_in_main_branch(self) -> str: + @staticmethod + def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: + import tempfile + import zipfile + + temp_dir = None + try: + if not os.path.isfile(wheel_url_or_path): + wheel_filename = wheel_url_or_path.split("/")[-1] + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + print(f"Downloading wheel from {wheel_url_or_path} " + f"to {wheel_path}") + from urllib.request import urlretrieve + urlretrieve(wheel_url_or_path, filename=wheel_path) + else: + wheel_path = wheel_url_or_path + print(f"Using existing wheel at {wheel_path}") + + package_data_patch = {} + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + ] + + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members = list( + filter(lambda x: x.filename in files_to_copy, + wheel.filelist)) + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) + + for file in file_members: + print(f"[extract] {file.filename}") + target_path = os.path.join(".", file.filename) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with wheel.open(file.filename) as src, open( + target_path, "wb") as dst: + shutil.copyfileobj(src, dst) + + pkg = os.path.dirname(file.filename).replace("/", ".") + package_data_patch.setdefault(pkg, []).append( + os.path.basename(file.filename)) + + return package_data_patch + finally: + if temp_dir is not None: + print(f"Removing temporary directory {temp_dir}") + shutil.rmtree(temp_dir) + + @staticmethod + def get_base_commit_in_main_branch() -> str: # Force to use the nightly wheel. This is mainly used for CI testing. if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: return "nightly" @@ -297,6 +369,10 @@ class repackage_wheel(build_ext): ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] + # In Docker build context, .git may be immutable or missing. + if envs.VLLM_DOCKER_BUILD_CONTEXT: + return upstream_main_commit + # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -329,86 +405,6 @@ class repackage_wheel(build_ext): "wheel may not be compatible with your dev branch: %s", err) return "nightly" - def run(self) -> None: - assert _is_cuda( - ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is None: - base_commit = self.get_base_commit_in_main_branch() - wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - # Fallback to nightly wheel if latest commit wheel is unavailable, - # in this rare case, the nightly release CI hasn't finished on main. - if not is_url_available(wheel_location): - wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - import zipfile - - if os.path.isfile(wheel_location): - wheel_path = wheel_location - print(f"Using existing wheel={wheel_path}") - else: - # Download the wheel from a given URL, assume - # the filename is the last part of the URL - wheel_filename = wheel_location.split("/")[-1] - - import tempfile - - # create a temporary directory to store the wheel - temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") - wheel_path = os.path.join(temp_dir, wheel_filename) - - print(f"Downloading wheel from {wheel_location} to {wheel_path}") - - from urllib.request import urlretrieve - - try: - urlretrieve(wheel_location, filename=wheel_path) - except Exception as e: - from setuptools.errors import SetupError - - raise SetupError( - f"Failed to get vLLM wheel from {wheel_location}") from e - - with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - # "vllm/_version.py", # not available in nightly wheels yet - ] - - file_members = list( - filter(lambda x: x.filename in files_to_copy, wheel.filelist)) - - # vllm_flash_attn python code: - # Regex from - # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` - compiled_regex = re.compile( - r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") - file_members += list( - filter(lambda x: compiled_regex.match(x.filename), - wheel.filelist)) - - for file in file_members: - print(f"Extracting and including {file.filename} " - "from existing wheel") - package_name = os.path.dirname(file.filename).replace("/", ".") - file_name = os.path.basename(file.filename) - - if package_name not in package_data: - package_data[package_name] = [] - - wheel.extract(file) - if file_name.endswith(".py"): - # python files shouldn't be added to package_data - continue - - package_data[package_name].append(file_name) - def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" @@ -639,6 +635,29 @@ package_data = { ] } +# If using precompiled, extract and patch package_data (in advance of setup) +if envs.VLLM_USE_PRECOMPILED: + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is not None: + wheel_url = wheel_location + else: + base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() + wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + from urllib.request import urlopen + try: + with urlopen(wheel_url) as resp: + if resp.status != 200: + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + except Exception as e: + print(f"[warn] Falling back to nightly wheel: {e}") + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( + wheel_url) + for pkg, files in patch.items(): + package_data.setdefault(pkg, []).extend(files) + if _no_device(): ext_modules = [] @@ -647,7 +666,7 @@ if not ext_modules: else: cmdclass = { "build_ext": - repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext } setup( diff --git a/vllm/envs.py b/vllm/envs.py index f81f6dacd..c26c7f215 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -70,6 +70,7 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False + VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False CMAKE_BUILD_TYPE: Optional[str] = None @@ -234,8 +235,14 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( - os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in + ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + + # Used to mark that setup.py is running in a Docker build context, + # in order to force the use of precompiled binaries. + "VLLM_DOCKER_BUILD_CONTEXT": + lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in + ("1", "true"), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build.