Fix cudaErrorStreamCaptureUnsupported: no dynamic GPU-tensor slicing

Slicing a buffer with a GPU scalar (e.g. buf[:gpu_scalar]) requires reading the scalar back to the host, a synchronizing operation that is not permitted during CUDA stream capture. Use the full pre-allocated buffers instead of dynamic slices; the GEMM only reads the rows indicated by expert_offsets, so the extra zero padding is ignored. Also pass x_sf[:num_slots] and l2_x_sf[:num_slots] (Python int slicing, which is cudagraph-safe) to scale assembly for L1 and L2 respectively, so that it only processes the scale data of real tokens.
2026-05-17 17:24:26 +00:00
parent 11b5aa5e37
commit 3f2f4e1882
1 changed files with 4 additions and 4 deletions
--- a/vllm/nvfp4_cutedsl.py
+++ b/vllm/nvfp4_cutedsl.py
@@ -405,7 +405,7 @@ class CuTeDSLMoERunner:
        # Each expert's tokens go at [padded_expert_offsets[e], padded_expert_offsets[e] + tokens_per_expert[e])
        # Padding rows between tokens_per_expert and padded_tokens_per_expert are zero.
        slot_hidden = hidden_states[sorted_token_ids]
-        padded_hidden = self._shared_bufs['hidden'][:total_padded_slots]
+        padded_hidden = self._shared_bufs['hidden']
        padded_hidden.zero_()
        # scatter: padded_hidden[padded_expert_offsets[expert_assign] + local_row] = slot_hidden
        row_indices = self._row_indices_buf[:num_slots]
@@ -422,7 +422,7 @@ class CuTeDSLMoERunner:
        )
        
        l1_scale_a = self._assemble_scales_cudagraph_safe(
-            x_sf, expert_offsets[:self.num_experts + 1],
+            x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
            padded_expert_offsets,
            self._padded_x_sf_buf_l1, self._per_expert_scale_bufs_l1
        )
@@ -444,7 +444,7 @@ class CuTeDSLMoERunner:
        activated = torch.nn.functional.silu(gate) * up
        
        # === L2: down ===
-        padded_activated = self._shared_bufs['activated'][:total_padded_slots]
+        padded_activated = self._shared_bufs['activated']
        padded_activated.zero_()
        padded_activated[padded_dst] = activated
        
@@ -453,7 +453,7 @@ class CuTeDSLMoERunner:
        )
        
        l2_scale_a = self._assemble_scales_cudagraph_safe(
-            l2_x_sf, expert_offsets[:self.num_experts + 1],
+            l2_x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
            padded_expert_offsets,
            self._padded_x_sf_buf_l2, self._per_expert_scale_bufs_l2
        )