From 7256070dd373b16e853b486b3cd7ff93ab10be90 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sun, 17 May 2026 21:24:43 +0000
Subject: [PATCH] FIX Bug 26: quantize slot tokens, not padded buffer

The runner was quantizing the padded_hidden (4096 rows) and then
taking x_sf[:num_slots] (first 48 rows). This only got scales for
expert 0 (the first 48 rows of the padded buffer), not the scales
for tokens scattered across padded positions (expert 1 at row 128, etc).

Fix: quantize slot_hidden (sorted tokens, num_slots rows) to get
correct per-token x_sf, then scatter x_fp4 into padded FP4 buffer
for the GEMM. The scale assembly now receives the correct x_sf.

Added hidden_fp4 and activated_fp4 padded buffers for FP4 scatter.
---
 vllm/nvfp4_cutedsl.py | 49 ++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/vllm/nvfp4_cutedsl.py b/vllm/nvfp4_cutedsl.py
index f989651d..139e62cf 100644
--- a/vllm/nvfp4_cutedsl.py
+++ b/vllm/nvfp4_cutedsl.py
@@ -166,9 +166,15 @@ class CuTeDSLMoERunner:
                 'hidden': torch.zeros(
                     padded_max_slots, self.hidden_size, dtype=torch.bfloat16, device=self.device
                 ),
+                'hidden_fp4': torch.zeros(
+                    padded_max_slots, self.hidden_size // 2, dtype=torch.float4_e2m1fn_x2, device=self.device
+                ),
                 'activated': torch.zeros(
                     padded_max_slots, self.intermediate_size, dtype=torch.bfloat16, device=self.device
                 ),
+                'activated_fp4': torch.zeros(
+                    padded_max_slots, self.intermediate_size // 2, dtype=torch.float4_e2m1fn_x2, device=self.device
+                ),
             })
         self._shared_bufs = CuTeDSLMoERunner._shared_padded_bufs[device_key]
         
@@ -406,35 +412,40 @@ class CuTeDSLMoERunner:
         padded_expert_offsets[1:self.num_experts + 1] = padded_tokens_per_expert.cumsum(0)
         total_padded_slots = padded_expert_offsets[self.num_experts]
         
-        # -- Gather hidden states into slot order, scatter into padded layout --
-        # Each expert's tokens go at [padded_expert_offsets[e], padded_expert_offsets[e] + tokens_per_expert[e])
-        # Padding rows between tokens_per_expert and padded_tokens_per_expert are zero.
+        # -- Gather hidden states into slot order, compute padded_dst --
         slot_hidden = hidden_states[sorted_token_ids]
-        padded_hidden = self._shared_bufs['hidden']
-        padded_hidden.zero_()
-        # scatter: padded_hidden[padded_expert_offsets[expert_assign] + local_row] = slot_hidden
         row_indices = self._row_indices_buf[:num_slots]
         expert_assign = torch.searchsorted(
             expert_offsets[1:], row_indices, right=True
         ).clamp(max=self.num_experts - 1)
         local_row = row_indices - expert_offsets[expert_assign]
         padded_dst = padded_expert_offsets[expert_assign] + local_row
-        padded_hidden[padded_dst] = slot_hidden
         
         # === L1: gate + up ===
-        x_fp4, x_sf = quantize_activation_nvfp4(
-            padded_hidden, self._l1_activation_global_scale
+        # Quantize slot_hidden (sorted tokens), NOT padded_hidden.
+        # padded_hidden is padded with zeros; quantizing it produces
+        # x_sf rows at padded positions, but x_sf[:num_slots] would
+        # only get scales for the first num_slots PADDED rows (expert 0),
+        # not the scattered token positions. Quantizing slot_hidden
+        # gives x_sf with num_slots rows (one per token), which the
+        # scale assembly correctly scatters into padded layout.
+        slot_x_fp4, slot_x_sf = quantize_activation_nvfp4(
+            slot_hidden, self._l1_activation_global_scale
         )
+        # Scatter x_fp4 into padded layout for the GEMM
+        padded_x_fp4 = self._shared_bufs['hidden_fp4']
+        padded_x_fp4.zero_()
+        padded_x_fp4[padded_dst] = slot_x_fp4
         
         l1_scale_a = self._assemble_scales_cudagraph_safe(
-            x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
+            slot_x_sf, expert_offsets[:self.num_experts + 1],
             padded_expert_offsets,
             self._padded_x_sf_buf_l1, self._per_expert_scale_bufs_l1
         )
         l1_gsa = self._l1_gsa_buf.fill_(self._l1_activation_global_scale)
         
         l1_out = run_nvfp4_grouped_gemm(
-            mat_a=x_fp4, mat_b=self._l1_mat_b,
+            mat_a=padded_x_fp4, mat_b=self._l1_mat_b,
             scale_a=l1_scale_a, scale_b=self._l1_scale_b,
             expert_offsets=padded_expert_offsets[1:self.num_experts + 1],
             global_scale_a=l1_gsa, global_scale_b=self._l1_gsb,
@@ -454,23 +465,23 @@ class CuTeDSLMoERunner:
         activated = gate_silu * up
         
         # === L2: down ===
-        padded_activated = self._shared_bufs['activated']
-        padded_activated.zero_()
-        padded_activated[padded_dst] = activated
-        
-        l2_x_fp4, l2_x_sf = quantize_activation_nvfp4(
-            padded_activated, self._l2_activation_global_scale
+        # Quantize activated (per-token), scatter into padded FP4 buffer
+        slot_l2_x_fp4, slot_l2_x_sf = quantize_activation_nvfp4(
+            activated, self._l2_activation_global_scale
         )
+        padded_activated_fp4 = self._shared_bufs['activated_fp4']
+        padded_activated_fp4.zero_()
+        padded_activated_fp4[padded_dst] = slot_l2_x_fp4
         
         l2_scale_a = self._assemble_scales_cudagraph_safe(
-            l2_x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
+            slot_l2_x_sf, expert_offsets[:self.num_experts + 1],
             padded_expert_offsets,
             self._padded_x_sf_buf_l2, self._per_expert_scale_bufs_l2
         )
         l2_gsa = self._l2_gsa_buf.fill_(self._l2_activation_global_scale)
         
         l2_out = run_nvfp4_grouped_gemm(
-            mat_a=l2_x_fp4, mat_b=self._l2_mat_b,
+            mat_a=padded_activated_fp4, mat_b=self._l2_mat_b,
             scale_a=l2_scale_a, scale_b=self._l2_scale_b,
             expert_offsets=padded_expert_offsets[1:self.num_experts + 1],
             global_scale_a=l2_gsa, global_scale_b=self._l2_gsb,