Add triton_kernels for MoE support (vLLM v0.19.0)

- Add build-triton-kernels stage to fetch triton_kernels from Triton v3.6.0
- Install to site-packages for vLLM to find at runtime
- Resolves: No module named 'triton_kernels.matmul_ogs'
- Image tag: gh200-vllm-tfa:v0.19.0-tfa
This commit is contained in:
2026-04-06 16:39:56 +00:00
parent 643d5589a3
commit e6cc28a942
2 changed files with 143 additions and 5 deletions

View File

@@ -1,13 +1,16 @@
# ==============================================================================
# ⚠️⚠️⚠️ WORKING BUILD - DO NOT TOUCH ⚠️⚠️⚠️
# Triton Kernels Build (TFA) - vLLM v0.19.0 + triton_kernels
# ==============================================================================
# Build #43 succeeded on 2026-04-03 with these exact versions:
# - vLLM: v0.18.2rc0
# - flashinfer: v0.6.7
# This branch adds triton_kernels from Triton v3.6.0 for MoE support.
#
# Based on working Build #43 (v0.18.2rc0) with vLLM upgraded to v0.19.0:
# - vLLM: v0.19.0
# - flashinfer: v0.6.6 (NOTE: Build #43 above used v0.6.7 — confirm this downgrade is intentional)
# - flash-attention: hopper branch
# - lmcache: dev branch
# - infinistore: main
# - triton: 3.6.0 (PyPI wheel)
# - triton_kernels: v3.6.0 (from Triton repo)
# - Base: nvcr.io/nvidia/pytorch:26.03-py3 (PyTorch 2.11.0a0, CUDA 13.2.0)
#
# HARD RULES:
@@ -16,7 +19,7 @@
# 3. CLEAR ALL CHANGES WITH MIKE BEFORE MAKING THEM
# 4. ONE BUILD AT A TIME - Mike reports failure → I assess → I report
#
# If you need to modify this file, ask Mike first.
# Image tag: gh200-vllm-tfa:v0.19.0-tfa
# ==============================================================================
# ---------- Builder Base ----------
@@ -79,6 +82,11 @@ FROM build-base AS build-triton
# Pre-fetch the pinned triton wheel for aarch64 into /wheels so a later stage
# can install it without hitting the network again. --only-binary=:all: forces
# a prebuilt manylinux wheel (no source build); --no-deps keeps transitive
# dependencies out of /wheels so only triton itself is staged.
RUN mkdir -p /wheels && \
pip download triton==3.6.0 --platform manylinux_2_27_aarch64 --only-binary=:all: --no-deps -d /wheels
# Install triton_kernels from the Triton repo (v3.6.0) for MoE support.
# vLLM v0.19.0 imports triton_kernels.matmul_ogs at runtime.
# NOTE(review): assumes git is present in build-base — confirm.
FROM build-base AS build-triton-kernels
# --target installs a plain package tree under /wheels so the runtime stage
# can COPY /wheels/triton_kernels straight into site-packages.
# --no-deps: only the triton_kernels directory is ever copied out of this
# stage, so installing transitive deps (including a second triton) here is
# wasted work and risks a resolver conflict with the pinned triton==3.6.0.
# --no-cache-dir keeps the pip cache out of this layer (hadolint DL3042).
RUN pip install --no-cache-dir --no-deps --target=/wheels \
    git+https://github.com/triton-lang/triton.git@v3.6.0#subdirectory=python/triton_kernels
# Skip xformers - vLLM has built-in FlashAttention kernels
# xformers requires TORCH_STABLE_ONLY which needs PyTorch headers not in 2.9.0
# FROM build-base AS build-xformers
@@ -191,6 +199,7 @@ FROM base AS vllm-openai
# Stage the wheels produced by the builder stages into ./wheels; presumably
# a later (not visible in this hunk) step pip-installs them — TODO confirm.
COPY --from=build-flash-attention /wheels/* wheels/
COPY --from=build-flashinfer /wheels/* wheels/
COPY --from=build-triton /wheels/* wheels/
# triton_kernels was installed with pip --target (a plain package tree, not a
# wheel), so it is copied directly into site-packages instead of ./wheels.
# NOTE(review): the python3.12 dist-packages path is hardcoded — verify it
# matches the Python version shipped in the nvcr base image.
COPY --from=build-triton-kernels /wheels/triton_kernels /usr/local/lib/python3.12/dist-packages/triton_kernels
COPY --from=build-vllm /wheels/* wheels/
COPY --from=build-lmcache /wheels/* wheels/
COPY --from=build-infinistore /wheels/* wheels/