python is more than just a disaster

python is a disaster
i swear to god if this fixes tool calling, i will lose my shit
2026-04-23 13:45:50 +00:00 · 2026-04-23 13:28:30 +00:00 · 2026-04-23 13:20:43 +00:00 · 2026-04-23 10:07:34 +00:00
1 changed file with 20 additions and 13 deletions
--- a/Dockerfile.kimi26-dflash
+++ b/Dockerfile.kimi26-dflash
@@ -15,27 +15,22 @@ FROM vllm/vllm-openai-rocm:nightly
 ENV PYTORCH_ROCM_ARCH=gfx942 \
    AITER_ROCM_ARCH=gfx942 \
    GPU_ARCHS=gfx942 \
-    VLLM_ROCM_USE_AITER=1 \
-    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
-    VLLM_ROCM_USE_AITER_RMSNORM=0 \
-    HSA_ENABLE_SDMA=0 \
-    HSA_NO_SCRATCH_RECLAIM=1 \
-    OMP_NUM_THREADS=1
+    VLLM_ROCM_USE_AITER=1

 # --- Copy and apply DFlash patches ---
-COPY patches/patch_dflash_rocm.py /tmp/patch_dflash_rocm.py
-RUN python3 /tmp/patch_dflash_rocm.py && rm /tmp/patch_dflash_rocm.py
+#COPY patches/patch_dflash_rocm.py /tmp/patch_dflash_rocm.py
+#RUN python3 /tmp/patch_dflash_rocm.py && rm /tmp/patch_dflash_rocm.py

 # --- Pre-download DFlash draft models ---
 # These are needed for speculative decoding and must be local paths.
 # Baking them into the image avoids runtime downloads/mounts.
 # Pass HF_TOKEN build arg if the models are gated.
 ARG HF_TOKEN=
-RUN bash -c 'if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
-        pip install --no-cache-dir huggingface_hub && \
-        python3 -c "from huggingface_hub import snapshot_download; snapshot_download(\"z-lab/Kimi-K2.5-DFlash\", local_dir=\"/opt/draft-models/Kimi-K2.5-DFlash\")" && \
-        rm -rf /root/.cache/huggingface; \
-    fi'
+#RUN bash -c 'if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
+#        pip install --no-cache-dir huggingface_hub && \
+#        python3 -c "from huggingface_hub import snapshot_download; snapshot_download(\"z-lab/Kimi-K2.5-DFlash\", local_dir=\"/opt/draft-models/Kimi-K2.5-DFlash\")" && \
+#        rm -rf /root/.cache/huggingface; \
+#    fi'

 # Patch tool and reasoning parsers for Eagle
 #COPY kimi_k2_tool_parser.py /usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/kimi_k2_tool_parser.py
@@ -45,4 +40,16 @@ RUN bash -c 'if [ ! -d "/opt/draft-models/Kimi-K2.5-DFlash" ]; then \
 # Patch serving layer: flush reasoning→content on finish_reason=length
 #COPY serving.py /usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/chat_completion/serving.py

+# --- Upgrade xgrammar for tool-call constrained decoding ---
+# Kimi K2 drops optional tool-call params with older xgrammar; upgrading fixes
+# the grammar matcher so it doesn't prematurely terminate optional fields.
+# IMPORTANT: --no-deps prevents pip from nuking the ROCm torch build and other
+# vLLM-pinned deps; torch, numpy, etc. are already in the image, and
+# apache-tvm-ffi is installed explicitly first (presumably --no-deps would skip it).
+# XGRAMMAR_REF: git ref to build; default main (nightly), override with --build-arg to pin.
+ARG XGRAMMAR_REF=main
+RUN pip install --no-cache-dir apache-tvm-ffi && \
+    pip install --no-cache-dir --force-reinstall --no-deps \
+    "xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@${XGRAMMAR_REF}"
+
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
Author	SHA1	Message	Date
biondizzle	71e7e8179b	python is more than just a disaster	2026-04-23 13:45:50 +00:00
biondizzle	8310d82f6e	python is a disaster	2026-04-23 13:28:30 +00:00
biondizzle	1f12853509	i swear to god if this fixes tool calling, i will lose my shit	2026-04-23 13:20:43 +00:00
biondizzle	a4e75cc67a	no patches, just raw	2026-04-23 10:07:34 +00:00