From ffc1a5c6a824e70694bc99ec96aca9d9e9770af0 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 19 May 2026 04:28:04 +0000
Subject: [PATCH] Fix workspace_shapes: remove wrong assertion, compute output
 dim from K

The framework may pass K in different forms (packed or unpacked).
Use max(K*2, hidden_dim) to handle both cases.
---
 vllm/patches/fused_moe/experts/cutedsl_moe.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/patches/fused_moe/experts/cutedsl_moe.py b/vllm/patches/fused_moe/experts/cutedsl_moe.py
index 5a8398ad..67a0d887 100644
--- a/vllm/patches/fused_moe/experts/cutedsl_moe.py
+++ b/vllm/patches/fused_moe/experts/cutedsl_moe.py
@@ -271,9 +271,13 @@ class CuTeDSLMoEExperts(mk.FusedMoEExpertsModular):
         # Our runner manages its own workspace internally (pre-allocated buffers)
         workspace1 = (0,)
         workspace2 = (0,)
-        # K is packed (K//2 for uint8), so output uses hidden_dim
-        assert self.hidden_dim == K * 2
-        output = (M, self.hidden_dim)
+        # K is the packed dimension from w1.shape[-1].
+        # For NVFP4 uint8 packed weights, K_packed = K_logical // 2.
+        # The output of the L2 GEMM is hidden_dim (unpacked).
+        # If K == hidden_dim, weights are BF16 (not packed).
+        # If K == hidden_dim // 2, weights are NVFP4 packed.
+        output_dim = max(K * 2, self.hidden_dim)
+        output = (M, output_dim)
         return (workspace1, workspace2, output)
 
     def apply(