From 3f2f4e1882b34c104e0a35b22c101ae8b31f5eb4 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sun, 17 May 2026 17:24:26 +0000
Subject: [PATCH] Fix cudaErrorStreamCaptureUnsupported: no dynamic GPU-tensor
 slicing

Slicing a buffer with a GPU scalar (e.g. buf[:gpu_scalar]) requires a
device-to-host sync to read the slice bound, which is not permitted
during CUDA stream capture. Use the full pre-allocated buffers instead
of dynamic slices. The GEMM only reads the rows indicated by
expert_offsets, so the zero padding is ignored.

Also pass x_sf[:num_slots] and l2_x_sf[:num_slots] (Python int slicing,
cudagraph-safe) to scale assembly so that it only processes real token
scale data in both the L1 and L2 stages.
---
 vllm/nvfp4_cutedsl.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/nvfp4_cutedsl.py b/vllm/nvfp4_cutedsl.py
index 8d79e254..1e640129 100644
--- a/vllm/nvfp4_cutedsl.py
+++ b/vllm/nvfp4_cutedsl.py
@@ -405,7 +405,7 @@ class CuTeDSLMoERunner:
         # Each expert's tokens go at [padded_expert_offsets[e], padded_expert_offsets[e] + tokens_per_expert[e])
         # Padding rows between tokens_per_expert and padded_tokens_per_expert are zero.
         slot_hidden = hidden_states[sorted_token_ids]
-        padded_hidden = self._shared_bufs['hidden'][:total_padded_slots]
+        padded_hidden = self._shared_bufs['hidden']
         padded_hidden.zero_()
         # scatter: padded_hidden[padded_expert_offsets[expert_assign] + local_row] = slot_hidden
         row_indices = self._row_indices_buf[:num_slots]
@@ -422,7 +422,7 @@ class CuTeDSLMoERunner:
         )
         
         l1_scale_a = self._assemble_scales_cudagraph_safe(
-            x_sf, expert_offsets[:self.num_experts + 1],
+            x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
             padded_expert_offsets,
             self._padded_x_sf_buf_l1, self._per_expert_scale_bufs_l1
         )
@@ -444,7 +444,7 @@ class CuTeDSLMoERunner:
         activated = torch.nn.functional.silu(gate) * up
         
         # === L2: down ===
-        padded_activated = self._shared_bufs['activated'][:total_padded_slots]
+        padded_activated = self._shared_bufs['activated']
         padded_activated.zero_()
         padded_activated[padded_dst] = activated
         
@@ -453,7 +453,7 @@ class CuTeDSLMoERunner:
         )
         
         l2_scale_a = self._assemble_scales_cudagraph_safe(
-            l2_x_sf, expert_offsets[:self.num_experts + 1],
+            l2_x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
             padded_expert_offsets,
             self._padded_x_sf_buf_l2, self._per_expert_scale_bufs_l2
         )