From ffc1a5c6a824e70694bc99ec96aca9d9e9770af0 Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Tue, 19 May 2026 04:28:04 +0000
Subject: [PATCH] Fix workspace_shapes: remove wrong assertion, compute output dim from K

The framework may pass K in different forms (packed or unpacked).
Use max(K*2, hidden_dim) to handle both cases.
---
NOTE(review): with packed K (K == hidden_dim // 2), max(K*2, hidden_dim)
evaluates to hidden_dim. With unpacked K (K == hidden_dim), it evaluates to
2 * hidden_dim rather than hidden_dim; confirm that this is the intended
output shape for the unpacked case.

 vllm/patches/fused_moe/experts/cutedsl_moe.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/patches/fused_moe/experts/cutedsl_moe.py b/vllm/patches/fused_moe/experts/cutedsl_moe.py
index 5a8398ad..67a0d887 100644
--- a/vllm/patches/fused_moe/experts/cutedsl_moe.py
+++ b/vllm/patches/fused_moe/experts/cutedsl_moe.py
@@ -271,9 +271,13 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular):
         # Our runner manages its own workspace internally (pre-allocated buffers)
         workspace1 = (0,)
         workspace2 = (0,)
-        # K is packed (K//2 for uint8), so output uses hidden_dim
-        assert self.hidden_dim == K * 2
-        output = (M, self.hidden_dim)
+        # K is the packed dimension from w1.shape[-1].
+        # For NVFP4 uint8 packed weights, K_packed = K_logical // 2.
+        # The output of the L2 GEMM is hidden_dim (unpacked).
+        # If K == hidden_dim, weights are BF16 (not packed).
+        # If K == hidden_dim // 2, weights are NVFP4 packed.
+        output_dim = max(K * 2, self.hidden_dim)
+        output = (M, output_dim)
         return (workspace1, workspace2, output)
 
     def apply(