Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 71e7e8179b | |||
| 8310d82f6e | |||
| 1f12853509 | |||
| a4e75cc67a |
@@ -15,27 +15,22 @@ FROM vllm/vllm-openai-rocm:nightly
|
||||
# ROCm / vLLM runtime configuration, declared in a single ENV instruction.
# Every continued line except the last must end with a backslash; a line
# without one terminates the instruction, and a bare KEY=value line after it
# is not a valid Dockerfile instruction. VLLM_ROCM_USE_AITER is therefore
# declared exactly once.
# NOTE(review): the tuning flags below VLLM_ROCM_USE_AITER may have been
# intended for removal -- confirm against the target revision.
ENV PYTORCH_ROCM_ARCH=gfx942 \
    AITER_ROCM_ARCH=gfx942 \
    GPU_ARCHS=gfx942 \
    VLLM_ROCM_USE_AITER=1 \
    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
    VLLM_ROCM_USE_AITER_RMSNORM=0 \
    HSA_ENABLE_SDMA=0 \
    HSA_NO_SCRATCH_RECLAIM=1 \
    OMP_NUM_THREADS=1
|
||||
|
||||
# --- Copy and apply DFlash patches ---
|
||||
# Run the DFlash patch script directly from the build context through a bind
# mount, so the script is never written into an image layer. (A COPY followed
# by `rm` in a later RUN still leaves the file in the COPY layer.)
# Requires BuildKit.
RUN --mount=type=bind,source=patches/patch_dflash_rocm.py,target=/tmp/patch_dflash_rocm.py \
    python3 /tmp/patch_dflash_rocm.py
|
||||
#COPY patches/patch_dflash_rocm.py /tmp/patch_dflash_rocm.py
|
||||
#RUN python3 /tmp/patch_dflash_rocm.py && rm /tmp/patch_dflash_rocm.py
|
||||
|
||||
# --- Pre-download DFlash draft models ---
|
||||
# These are needed for speculative decoding and must be local paths.
|
||||
# Baking them into the image avoids runtime downloads/mounts.
|
||||
# Pass HF_TOKEN build arg if the models are gated.
|
||||
ARG HF_TOKEN=
|
||||
# Download the DFlash draft model into the image unless the directory is
# already present.
#
# For gated repos, prefer a BuildKit secret, which is not recorded in the
# image history:
#   HF_TOKEN=... docker build --secret id=hf_token,env=HF_TOKEN .
# The HF_TOKEN build arg keeps working as a fallback, but build-arg values are
# visible in `docker history`. The secret mount is optional: when no secret is
# supplied the file does not exist and the build-arg value (if any) is used.
RUN --mount=type=secret,id=hf_token \
    if [ -s /run/secrets/hf_token ]; then \
        HF_TOKEN="$(cat /run/secrets/hf_token)"; \
        export HF_TOKEN; \
    fi; \
    if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
        pip install --no-cache-dir huggingface_hub && \
        python3 -c "from huggingface_hub import snapshot_download; snapshot_download(\"z-lab/Kimi-K2.5-DFlash\", local_dir=\"/opt/draft-models/Kimi-K2.5-DFlash\")" && \
        rm -rf /root/.cache/huggingface; \
    fi
|
||||
#RUN bash -c 'if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
|
||||
# pip install --no-cache-dir huggingface_hub && \
|
||||
# python3 -c "from huggingface_hub import snapshot_download; snapshot_download(\"z-lab/Kimi-K2.5-DFlash\", local_dir=\"/opt/draft-models/Kimi-K2.5-DFlash\")" && \
|
||||
# rm -rf /root/.cache/huggingface; \
|
||||
# fi'
|
||||
|
||||
# Patch tool and reasoning parsers for Eagle
|
||||
#COPY kimi_k2_tool_parser.py /usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/kimi_k2_tool_parser.py
|
||||
@@ -45,4 +40,16 @@ RUN bash -c 'if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
|
||||
# Patch serving layer: flush reasoning→content on finish_reason=length
|
||||
#COPY serving.py /usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/chat_completion/serving.py
|
||||
|
||||
# --- Upgrade xgrammar to bleeding edge for tool-call constrained decoding ---
|
||||
# Kimi K2 drops optional tool-call params with older xgrammar; upgrading fixes
|
||||
# the grammar matcher so it doesn't prematurely terminate optional fields.
|
||||
#
|
||||
# IMPORTANT: --no-deps prevents pip from nuking the ROCm torch build and
|
||||
# other vLLM-pinned dependencies. xgrammar's only runtime deps that matter
|
||||
# (torch, numpy, etc.) are already in the image. Build from git main for
|
||||
# nightly; pin to a release (e.g. xgrammar==0.1.33) if preferred.
|
||||
# Git ref (branch, tag, or commit SHA) of xgrammar to install. Defaults to
# main, matching the previous hard-coded behaviour; pass
# --build-arg XGRAMMAR_REF=<tag-or-sha> for a reproducible build. Changing the
# value also invalidates the layer cache, which a moving `main` does not.
ARG XGRAMMAR_REF=main
# apache-tvm-ffi is installed explicitly because --no-deps stops pip from
# resolving any of xgrammar's dependencies.
RUN pip install --no-cache-dir apache-tvm-ffi && \
    pip install --no-cache-dir --force-reinstall --no-deps \
        "xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@${XGRAMMAR_REF}"
|
||||
|
||||
# Start the vLLM OpenAI-compatible API server. Exec (JSON-array) form runs
# python3 directly, without a /bin/sh wrapper; arguments given to `docker run`
# (or a CMD) are appended to this command.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
|
||||
Reference in New Issue
Block a user