docker: docker-aware precompiled wheel support (#21127)

Signed-off-by: dougbtv <dosmith@redhat.com>
This commit is contained in:
Doug Smith
2025-07-29 17:45:19 -04:00
committed by GitHub
parent a33ea28b1b
commit a1873db23d
3 changed files with 68 additions and 27 deletions

View File

@@ -209,16 +209,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0 ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels # Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED ARG VLLM_USE_PRECOMPILED=""
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
ENV VLLM_USE_PRECOMPILED=""
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
export VLLM_USE_PRECOMPILED=1 && \
echo "Using precompiled wheels"; \
else \
unset VLLM_USE_PRECOMPILED && \
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
fi
# if USE_SCCACHE is set, use sccache to speed up compilation # if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
@@ -235,6 +226,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \ && export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \ && export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \ && sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \ && sccache --show-stats; \
@@ -248,9 +241,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Clean any existing CMake artifacts # Clean any existing CMake artifacts
rm -rf .deps && \ rm -rf .deps && \
mkdir -p .deps && \ mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi fi
# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
echo "Cleaning up extra wheels in dist/..." && \
# Identify the most recent manylinux1_x86_64 wheel
KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
if [ -n "$KEEP_WHEEL" ]; then \
echo "Keeping wheel: $KEEP_WHEEL"; \
find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
fi; \
fi
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py

View File

@@ -7,6 +7,7 @@ import json
import logging import logging
import os import os
import re import re
import shutil
import subprocess import subprocess
import sys import sys
from pathlib import Path from pathlib import Path
@@ -297,6 +298,10 @@ class repackage_wheel(build_ext):
]).decode("utf-8") ]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"] upstream_main_commit = json.loads(resp_json)["sha"]
# In Docker build context, .git may be immutable or missing.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
return upstream_main_commit
# Check if the upstream_main_commit exists in the local repo # Check if the upstream_main_commit exists in the local repo
try: try:
subprocess.check_output( subprocess.check_output(
@@ -357,19 +362,48 @@ class repackage_wheel(build_ext):
# create a temporary directory to store the wheel # create a temporary directory to store the wheel
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename) wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_location} to {wheel_path}") print(f"Downloading wheel from {wheel_location} to {wheel_path}")
from urllib.request import urlretrieve from urllib.request import urlretrieve
try: try:
urlretrieve(wheel_location, filename=wheel_path) urlretrieve(wheel_location, filename=wheel_path)
except Exception as e: except Exception as e:
from setuptools.errors import SetupError from setuptools.errors import SetupError
raise SetupError( raise SetupError(
f"Failed to get vLLM wheel from {wheel_location}") from e f"Failed to get vLLM wheel from {wheel_location}") from e
# During a docker build: determine correct filename, copy wheel.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
dist_dir = "/workspace/dist"
os.makedirs(dist_dir, exist_ok=True)
# Determine correct wheel filename from METADATA
with zipfile.ZipFile(wheel_path, "r") as z:
metadata_file = next(
(n for n in z.namelist()
if n.endswith(".dist-info/METADATA")),
None,
)
if not metadata_file:
raise RuntimeError(
"Could not find METADATA in precompiled wheel.")
metadata = z.read(metadata_file).decode()
version_line = next((line for line in metadata.splitlines()
if line.startswith("Version: ")), None)
if not version_line:
raise RuntimeError(
"Could not determine version from METADATA.")
version = version_line.split(": ")[1].strip()
# Build correct filename using internal version
arch_tag = "cp38-abi3-manylinux1_x86_64"
corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl"
final_wheel_path = os.path.join(dist_dir, corrected_wheel_name)
print(f"Docker build context detected, copying precompiled wheel "
f"({version}) to {final_wheel_path}")
shutil.copy2(wheel_path, final_wheel_path)
return
# Unzip the wheel when not in Docker context
with zipfile.ZipFile(wheel_path) as wheel: with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [ files_to_copy = [
"vllm/_C.abi3.so", "vllm/_C.abi3.so",
@@ -378,15 +412,9 @@ class repackage_wheel(build_ext):
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so", "vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet
] ]
file_members = list( file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist)) filter(lambda x: x.filename in files_to_copy, wheel.filelist))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
compiled_regex = re.compile( compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members += list( file_members += list(
@@ -403,11 +431,8 @@ class repackage_wheel(build_ext):
package_data[package_name] = [] package_data[package_name] = []
wheel.extract(file) wheel.extract(file)
if file_name.endswith(".py"): if not file_name.endswith(".py"):
# python files shouldn't be added to package_data package_data[package_name].append(file_name)
continue
package_data[package_name].append(file_name)
def _no_device() -> bool: def _no_device() -> bool:
@@ -415,6 +440,9 @@ def _no_device() -> bool:
def _is_cuda() -> bool: def _is_cuda() -> bool:
# Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT:
return True
has_cuda = torch.version.cuda is not None has_cuda = torch.version.cuda is not None
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
and not (_is_neuron() or _is_tpu())) and not (_is_neuron() or _is_tpu()))

View File

@@ -68,6 +68,7 @@ if TYPE_CHECKING:
MAX_JOBS: Optional[str] = None MAX_JOBS: Optional[str] = None
NVCC_THREADS: Optional[str] = None NVCC_THREADS: Optional[str] = None
VLLM_USE_PRECOMPILED: bool = False VLLM_USE_PRECOMPILED: bool = False
VLLM_DOCKER_BUILD_CONTEXT: bool = False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
VLLM_NO_DEPRECATION_WARNING: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -222,8 +223,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vllm will use precompiled binaries (*.so) # If set, vllm will use precompiled binaries (*.so)
"VLLM_USE_PRECOMPILED": "VLLM_USE_PRECOMPILED":
lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
# Used to mark that setup.py is running in a Docker build context,
# in order to force the use of precompiled binaries.
"VLLM_DOCKER_BUILD_CONTEXT":
lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
("1", "true"),
# Whether to force using nightly wheel in python build. # Whether to force using nightly wheel in python build.
# This is used for testing the nightly wheel in python build. # This is used for testing the nightly wheel in python build.