4 Commits

Author SHA1 Message Date
71e7e8179b python is more than just a disaster 2026-04-23 13:45:50 +00:00
8310d82f6e python is a disaster 2026-04-23 13:28:30 +00:00
1f12853509 i swear to god if this fixes tool calling, i will lose my shit 2026-04-23 13:20:43 +00:00
a4e75cc67a no patches, just raw 2026-04-23 10:07:34 +00:00

View File

@@ -15,27 +15,22 @@ FROM vllm/vllm-openai-rocm:nightly
# Environment variables baked into the image; every *_ARCH / *_ARCHS variable
# below is set to gfx942.
# NOTE(review): this listing comes from a diff view whose +/- markers are
# missing. The final `VLLM_ROCM_USE_AITER=1` line repeats an earlier key and is
# not joined to the instruction by a `\` continuation -- presumably it is the
# new tail of the ENV instruction, replacing everything from the first
# `VLLM_ROCM_USE_AITER=1 \` down to `OMP_NUM_THREADS=1`. Confirm against the
# commit before treating either variant as current.
ENV PYTORCH_ROCM_ARCH=gfx942 \
AITER_ROCM_ARCH=gfx942 \
GPU_ARCHS=gfx942 \
VLLM_ROCM_USE_AITER=1 \
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
VLLM_ROCM_USE_AITER_RMSNORM=0 \
HSA_ENABLE_SDMA=0 \
HSA_NO_SCRATCH_RECLAIM=1 \
OMP_NUM_THREADS=1
VLLM_ROCM_USE_AITER=1
# --- Copy and apply DFlash patches ---
# The patch script is copied to /tmp, executed once at build time, and removed
# in the same RUN so it does not remain in that layer.
# NOTE(review): both the active COPY/RUN pair and its commented-out twin appear
# here because the diff markers were lost. Presumably the commit disabled this
# step (see the "no patches, just raw" commit) -- confirm which form is current.
COPY patches/patch_dflash_rocm.py /tmp/patch_dflash_rocm.py
RUN python3 /tmp/patch_dflash_rocm.py && rm /tmp/patch_dflash_rocm.py
#COPY patches/patch_dflash_rocm.py /tmp/patch_dflash_rocm.py
#RUN python3 /tmp/patch_dflash_rocm.py && rm /tmp/patch_dflash_rocm.py
# --- Pre-download DFlash draft models ---
# These are needed for speculative decoding and must be local paths.
# Baking them into the image avoids runtime downloads/mounts.
# Pass HF_TOKEN build arg if the models are gated.
# NOTE(review): values passed with --build-arg are recorded in the image
# history (`docker history`); a BuildKit secret mount
# (`RUN --mount=type=secret,...`) would keep the token out of the image.
# NOTE(review): the RUN below never names HF_TOKEN explicitly; presumably the
# download picks it up from the build-arg environment -- confirm.
ARG HF_TOKEN=
# Skip the download when the target directory already exists. Otherwise install
# huggingface_hub, fetch the z-lab/Kimi-K2.5-DFlash snapshot into
# /opt/draft-models, and delete the Hugging Face cache in the same layer.
# NOTE(review): the active RUN and the commented-out copy after it are the two
# sides of a diff whose markers were lost -- confirm which one is current.
RUN bash -c 'if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
pip install --no-cache-dir huggingface_hub && \
python3 -c "from huggingface_hub import snapshot_download; snapshot_download(\"z-lab/Kimi-K2.5-DFlash\", local_dir=\"/opt/draft-models/Kimi-K2.5-DFlash\")" && \
rm -rf /root/.cache/huggingface; \
fi'
#RUN bash -c 'if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
# pip install --no-cache-dir huggingface_hub && \
# python3 -c "from huggingface_hub import snapshot_download; snapshot_download(\"z-lab/Kimi-K2.5-DFlash\", local_dir=\"/opt/draft-models/Kimi-K2.5-DFlash\")" && \
# rm -rf /root/.cache/huggingface; \
# fi'
# Patch tool and reasoning parsers for Eagle
#COPY kimi_k2_tool_parser.py /usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/kimi_k2_tool_parser.py
@@ -45,4 +40,16 @@ RUN bash -c 'if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
# Patch serving layer: flush reasoning→content on finish_reason=length
#COPY serving.py /usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/chat_completion/serving.py
# --- Upgrade xgrammar to bleeding edge for tool-call constrained decoding ---
# Kimi K2 drops optional tool-call params with older xgrammar; upgrading fixes
# the grammar matcher so it doesn't prematurely terminate optional fields.
#
# IMPORTANT: --no-deps prevents pip from nuking the ROCm torch build and
# other vLLM-pinned dependencies. xgrammar's only runtime deps that matter
# (torch, numpy, etc.) are already in the image.
#
# XGRAMMAR_REF selects the git ref that is built. It defaults to `main`
# (nightly behaviour); pass `--build-arg XGRAMMAR_REF=<tag-or-commit>` to get
# a reproducible build without editing this file. To use a PyPI release
# instead, replace the git requirement with e.g. xgrammar==0.1.33.
ARG XGRAMMAR_REF=main
# The requirement is double-quoted (not single-quoted) so the shell expands
# ${XGRAMMAR_REF}; build args are visible to RUN as environment variables.
RUN pip install --no-cache-dir apache-tvm-ffi && \
    pip install --no-cache-dir --force-reinstall --no-deps \
    "xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@${XGRAMMAR_REF}"
# Start the container by running the vllm.entrypoints.openai.api_server module
# with python3. Exec (JSON-array) form: no shell wraps the process, and
# arguments given to `docker run` are appended to this command.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]