docker: docker-aware precompiled wheel support (#21127)
Signed-off-by: dougbtv <dosmith@redhat.com>
This commit is contained in:
@@ -209,16 +209,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
|
|||||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||||
|
|
||||||
# Flag to control whether to use pre-built vLLM wheels
|
# Flag to control whether to use pre-built vLLM wheels
|
||||||
ARG VLLM_USE_PRECOMPILED
|
ARG VLLM_USE_PRECOMPILED=""
|
||||||
# TODO: setup.py treats any non-empty VLLM_USE_PRECOMPILED value as true, so even "=0" is taken as "true"; this should be fixed
|
|
||||||
ENV VLLM_USE_PRECOMPILED=""
|
|
||||||
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
|
|
||||||
export VLLM_USE_PRECOMPILED=1 && \
|
|
||||||
echo "Using precompiled wheels"; \
|
|
||||||
else \
|
|
||||||
unset VLLM_USE_PRECOMPILED && \
|
|
||||||
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
@@ -235,6 +226,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
||||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||||
&& export CMAKE_BUILD_TYPE=Release \
|
&& export CMAKE_BUILD_TYPE=Release \
|
||||||
|
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||||
|
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||||
&& sccache --show-stats \
|
&& sccache --show-stats \
|
||||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||||
&& sccache --show-stats; \
|
&& sccache --show-stats; \
|
||||||
@@ -248,9 +241,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
|||||||
# Clean any existing CMake artifacts
|
# Clean any existing CMake artifacts
|
||||||
rm -rf .deps && \
|
rm -rf .deps && \
|
||||||
mkdir -p .deps && \
|
mkdir -p .deps && \
|
||||||
|
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
|
||||||
|
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
|
||||||
|
RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
|
||||||
|
echo "Cleaning up extra wheels in dist/..." && \
|
||||||
|
# Identify the most recent manylinux1_x86_64 wheel
|
||||||
|
KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
|
||||||
|
if [ -n "$KEEP_WHEEL" ]; then \
|
||||||
|
echo "Keeping wheel: $KEEP_WHEEL"; \
|
||||||
|
find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
|
||||||
|
fi; \
|
||||||
|
fi
|
||||||
|
|
||||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||||
# sync the default value with .buildkite/check-wheel-size.py
|
# sync the default value with .buildkite/check-wheel-size.py
|
||||||
|
|||||||
58
setup.py
58
setup.py
@@ -7,6 +7,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -297,6 +298,10 @@ class repackage_wheel(build_ext):
|
|||||||
]).decode("utf-8")
|
]).decode("utf-8")
|
||||||
upstream_main_commit = json.loads(resp_json)["sha"]
|
upstream_main_commit = json.loads(resp_json)["sha"]
|
||||||
|
|
||||||
|
# In Docker build context, .git may be immutable or missing.
|
||||||
|
if envs.VLLM_DOCKER_BUILD_CONTEXT:
|
||||||
|
return upstream_main_commit
|
||||||
|
|
||||||
# Check if the upstream_main_commit exists in the local repo
|
# Check if the upstream_main_commit exists in the local repo
|
||||||
try:
|
try:
|
||||||
subprocess.check_output(
|
subprocess.check_output(
|
||||||
@@ -357,19 +362,48 @@ class repackage_wheel(build_ext):
|
|||||||
# create a temporary directory to store the wheel
|
# create a temporary directory to store the wheel
|
||||||
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
|
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
|
||||||
wheel_path = os.path.join(temp_dir, wheel_filename)
|
wheel_path = os.path.join(temp_dir, wheel_filename)
|
||||||
|
|
||||||
print(f"Downloading wheel from {wheel_location} to {wheel_path}")
|
print(f"Downloading wheel from {wheel_location} to {wheel_path}")
|
||||||
|
|
||||||
from urllib.request import urlretrieve
|
from urllib.request import urlretrieve
|
||||||
|
|
||||||
try:
|
try:
|
||||||
urlretrieve(wheel_location, filename=wheel_path)
|
urlretrieve(wheel_location, filename=wheel_path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
from setuptools.errors import SetupError
|
from setuptools.errors import SetupError
|
||||||
|
|
||||||
raise SetupError(
|
raise SetupError(
|
||||||
f"Failed to get vLLM wheel from {wheel_location}") from e
|
f"Failed to get vLLM wheel from {wheel_location}") from e
|
||||||
|
|
||||||
|
# During a docker build: determine correct filename, copy wheel.
|
||||||
|
if envs.VLLM_DOCKER_BUILD_CONTEXT:
|
||||||
|
dist_dir = "/workspace/dist"
|
||||||
|
os.makedirs(dist_dir, exist_ok=True)
|
||||||
|
# Determine correct wheel filename from METADATA
|
||||||
|
with zipfile.ZipFile(wheel_path, "r") as z:
|
||||||
|
metadata_file = next(
|
||||||
|
(n for n in z.namelist()
|
||||||
|
if n.endswith(".dist-info/METADATA")),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
if not metadata_file:
|
||||||
|
raise RuntimeError(
|
||||||
|
"Could not find METADATA in precompiled wheel.")
|
||||||
|
metadata = z.read(metadata_file).decode()
|
||||||
|
version_line = next((line for line in metadata.splitlines()
|
||||||
|
if line.startswith("Version: ")), None)
|
||||||
|
if not version_line:
|
||||||
|
raise RuntimeError(
|
||||||
|
"Could not determine version from METADATA.")
|
||||||
|
version = version_line.split(": ")[1].strip()
|
||||||
|
|
||||||
|
# Build correct filename using internal version
|
||||||
|
arch_tag = "cp38-abi3-manylinux1_x86_64"
|
||||||
|
corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl"
|
||||||
|
final_wheel_path = os.path.join(dist_dir, corrected_wheel_name)
|
||||||
|
|
||||||
|
print(f"Docker build context detected, copying precompiled wheel "
|
||||||
|
f"({version}) to {final_wheel_path}")
|
||||||
|
shutil.copy2(wheel_path, final_wheel_path)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Unzip the wheel when not in Docker context
|
||||||
with zipfile.ZipFile(wheel_path) as wheel:
|
with zipfile.ZipFile(wheel_path) as wheel:
|
||||||
files_to_copy = [
|
files_to_copy = [
|
||||||
"vllm/_C.abi3.so",
|
"vllm/_C.abi3.so",
|
||||||
@@ -378,15 +412,9 @@ class repackage_wheel(build_ext):
|
|||||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||||
"vllm/cumem_allocator.abi3.so",
|
"vllm/cumem_allocator.abi3.so",
|
||||||
# "vllm/_version.py", # not available in nightly wheels yet
|
|
||||||
]
|
]
|
||||||
|
|
||||||
file_members = list(
|
file_members = list(
|
||||||
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
|
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
|
||||||
|
|
||||||
# vllm_flash_attn python code:
|
|
||||||
# Regex from
|
|
||||||
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
|
|
||||||
compiled_regex = re.compile(
|
compiled_regex = re.compile(
|
||||||
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
|
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
|
||||||
file_members += list(
|
file_members += list(
|
||||||
@@ -403,11 +431,8 @@ class repackage_wheel(build_ext):
|
|||||||
package_data[package_name] = []
|
package_data[package_name] = []
|
||||||
|
|
||||||
wheel.extract(file)
|
wheel.extract(file)
|
||||||
if file_name.endswith(".py"):
|
if not file_name.endswith(".py"):
|
||||||
# python files shouldn't be added to package_data
|
package_data[package_name].append(file_name)
|
||||||
continue
|
|
||||||
|
|
||||||
package_data[package_name].append(file_name)
|
|
||||||
|
|
||||||
|
|
||||||
def _no_device() -> bool:
|
def _no_device() -> bool:
|
||||||
@@ -415,6 +440,9 @@ def _no_device() -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def _is_cuda() -> bool:
|
def _is_cuda() -> bool:
|
||||||
|
# Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
|
||||||
|
if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT:
|
||||||
|
return True
|
||||||
has_cuda = torch.version.cuda is not None
|
has_cuda = torch.version.cuda is not None
|
||||||
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
|
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
|
||||||
and not (_is_neuron() or _is_tpu()))
|
and not (_is_neuron() or _is_tpu()))
|
||||||
|
|||||||
11
vllm/envs.py
11
vllm/envs.py
@@ -68,6 +68,7 @@ if TYPE_CHECKING:
|
|||||||
MAX_JOBS: Optional[str] = None
|
MAX_JOBS: Optional[str] = None
|
||||||
NVCC_THREADS: Optional[str] = None
|
NVCC_THREADS: Optional[str] = None
|
||||||
VLLM_USE_PRECOMPILED: bool = False
|
VLLM_USE_PRECOMPILED: bool = False
|
||||||
|
VLLM_DOCKER_BUILD_CONTEXT: bool = False
|
||||||
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
|
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
|
||||||
VLLM_NO_DEPRECATION_WARNING: bool = False
|
VLLM_NO_DEPRECATION_WARNING: bool = False
|
||||||
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
|
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
|
||||||
@@ -222,8 +223,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
|
|
||||||
# If set, vllm will use precompiled binaries (*.so)
|
# If set, vllm will use precompiled binaries (*.so)
|
||||||
"VLLM_USE_PRECOMPILED":
|
"VLLM_USE_PRECOMPILED":
|
||||||
lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
|
lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
|
||||||
os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
|
("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
|
||||||
|
|
||||||
|
# Used to mark that setup.py is running in a Docker build context,
|
||||||
|
# in order to force the use of precompiled binaries.
|
||||||
|
"VLLM_DOCKER_BUILD_CONTEXT":
|
||||||
|
lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
|
||||||
|
("1", "true"),
|
||||||
|
|
||||||
# Whether to force using nightly wheel in python build.
|
# Whether to force using nightly wheel in python build.
|
||||||
# This is used for testing the nightly wheel in python build.
|
# This is used for testing the nightly wheel in python build.
|
||||||
|
|||||||
Reference in New Issue
Block a user