Compare commits
3 Commits
v0.11.0rc5
...
v0.11.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b8b302cde4 | ||
|
|
f71952c1c4 | ||
|
|
d1007767c5 |
@@ -48,7 +48,7 @@ steps:
|
|||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
|
|||||||
@@ -13,8 +13,13 @@ ARG PYTHON_VERSION=3.12
|
|||||||
# private registries that use different repository naming conventions.
|
# private registries that use different repository naming conventions.
|
||||||
#
|
#
|
||||||
# Example:
|
# Example:
|
||||||
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||||
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
|
||||||
|
# Important: We build with an old version of Ubuntu to maintain broad
|
||||||
|
# compatibility with other Linux OSes. The main reason for this is that the
|
||||||
|
# glibc version is baked into the distro, and binaries built with one glibc
|
||||||
|
# version are not backwards compatible with OSes that use an earlier version.
|
||||||
|
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||||
# TODO: Restore to base image after the FlashInfer AOT wheel is fixed
|
# TODO: Restore to base image after the FlashInfer AOT wheel is fixed
|
||||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||||
|
|
||||||
@@ -75,34 +80,19 @@ ARG TARGETPLATFORM
|
|||||||
ARG INSTALL_KV_CONNECTORS=false
|
ARG INSTALL_KV_CONNECTORS=false
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
ARG DEADSNAKES_MIRROR_URL
|
|
||||||
ARG DEADSNAKES_GPGKEY_URL
|
|
||||||
ARG GET_PIP_URL
|
ARG GET_PIP_URL
|
||||||
|
|
||||||
# Install Python and other dependencies
|
# Install system dependencies and uv, then create Python virtual environment
|
||||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \
|
||||||
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
|
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
||||||
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
|
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
|
||||||
mkdir -p -m 0755 /etc/apt/keyrings ; \
|
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
|
||||||
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
|
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
|
||||||
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
|
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
|
||||||
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
|
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
|
||||||
fi ; \
|
|
||||||
else \
|
|
||||||
for i in 1 2 3; do \
|
|
||||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
|
||||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
|
||||||
done ; \
|
|
||||||
fi \
|
|
||||||
&& apt-get update -y \
|
|
||||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
|
||||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
|
||||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
|
||||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
|
||||||
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
|
|
||||||
&& python3 --version && python3 -m pip --version
|
&& python3 --version && python3 -m pip --version
|
||||||
|
|
||||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||||
@@ -111,9 +101,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
|
|||||||
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
|
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
|
||||||
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||||
|
|
||||||
# Install uv for faster pip installs
|
# Activate virtual environment and add uv to PATH
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
|
||||||
python3 -m pip install uv
|
ENV VIRTUAL_ENV="/opt/venv"
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||||
@@ -142,7 +132,7 @@ WORKDIR /workspace
|
|||||||
COPY requirements/common.txt requirements/common.txt
|
COPY requirements/common.txt requirements/common.txt
|
||||||
COPY requirements/cuda.txt requirements/cuda.txt
|
COPY requirements/cuda.txt requirements/cuda.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/cuda.txt \
|
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
|
||||||
# cuda arch list used by torch
|
# cuda arch list used by torch
|
||||||
@@ -172,7 +162,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
|||||||
ENV UV_LINK_MODE=copy
|
ENV UV_LINK_MODE=copy
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/build.txt \
|
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
@@ -269,7 +259,7 @@ COPY requirements/lint.txt requirements/lint.txt
|
|||||||
COPY requirements/test.txt requirements/test.txt
|
COPY requirements/test.txt requirements/test.txt
|
||||||
COPY requirements/dev.txt requirements/dev.txt
|
COPY requirements/dev.txt requirements/dev.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/dev.txt \
|
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
|
||||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
#################### DEV IMAGE ####################
|
#################### DEV IMAGE ####################
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 127 KiB After Width: | Height: | Size: 119 KiB |
@@ -29,7 +29,6 @@ from vllm.utils.flashinfer import (can_use_trtllm_attention,
|
|||||||
flashinfer_disable_q_quantization,
|
flashinfer_disable_q_quantization,
|
||||||
supports_trtllm_attention,
|
supports_trtllm_attention,
|
||||||
use_trtllm_attention)
|
use_trtllm_attention)
|
||||||
from vllm.v1.attention.backends.flash_attn import use_cascade_attention
|
|
||||||
# yapf conflicts with isort for this block
|
# yapf conflicts with isort for this block
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
from vllm.v1.attention.backends.utils import (AttentionCGSupport,
|
from vllm.v1.attention.backends.utils import (AttentionCGSupport,
|
||||||
@@ -677,7 +676,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
|||||||
# TODO: The cascade wrapper currently does not support setting
|
# TODO: The cascade wrapper currently does not support setting
|
||||||
# kv cache dtype to something different from query dtype.
|
# kv cache dtype to something different from query dtype.
|
||||||
return False
|
return False
|
||||||
return use_cascade_attention(*args, **kwargs)
|
# TODO: Cascade attention doesn't work, disable it for now
|
||||||
|
# return use_cascade_attention(*args, **kwargs)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class FlashInferImpl(AttentionImpl):
|
class FlashInferImpl(AttentionImpl):
|
||||||
|
|||||||
Reference in New Issue
Block a user