From 3f2f4e1882b34c104e0a35b22c101ae8b31f5eb4 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 17 May 2026 17:24:26 +0000 Subject: [PATCH] Fix cudaErrorStreamCaptureUnsupported: no dynamic GPU-tensor slicing Slicing a buffer with a GPU scalar (e.g. buf[:gpu_scalar]) forces a device-to-host sync to read the slice bound on the host, which is not permitted during CUDA stream capture. Use full pre-allocated buffers instead of dynamic slices. The GEMM only reads rows indicated by expert_offsets, ignoring the zero padding. Also pass x_sf[:num_slots] (Python int slicing, cudagraph-safe) to scale assembly so it only processes real token scale data. --- vllm/nvfp4_cutedsl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/nvfp4_cutedsl.py b/vllm/nvfp4_cutedsl.py index 8d79e254..1e640129 100644 --- a/vllm/nvfp4_cutedsl.py +++ b/vllm/nvfp4_cutedsl.py @@ -405,7 +405,7 @@ class CuTeDSLMoERunner: # Each expert's tokens go at [padded_expert_offsets[e], padded_expert_offsets[e] + tokens_per_expert[e]) # Padding rows between tokens_per_expert and padded_tokens_per_expert are zero. 
slot_hidden = hidden_states[sorted_token_ids] - padded_hidden = self._shared_bufs['hidden'][:total_padded_slots] + padded_hidden = self._shared_bufs['hidden'] padded_hidden.zero_() # scatter: padded_hidden[padded_expert_offsets[expert_assign] + local_row] = slot_hidden row_indices = self._row_indices_buf[:num_slots] @@ -422,7 +422,7 @@ class CuTeDSLMoERunner: ) l1_scale_a = self._assemble_scales_cudagraph_safe( - x_sf, expert_offsets[:self.num_experts + 1], + x_sf[:num_slots], expert_offsets[:self.num_experts + 1], padded_expert_offsets, self._padded_x_sf_buf_l1, self._per_expert_scale_bufs_l1 ) @@ -444,7 +444,7 @@ class CuTeDSLMoERunner: activated = torch.nn.functional.silu(gate) * up # === L2: down === - padded_activated = self._shared_bufs['activated'][:total_padded_slots] + padded_activated = self._shared_bufs['activated'] padded_activated.zero_() padded_activated[padded_dst] = activated @@ -453,7 +453,7 @@ class CuTeDSLMoERunner: ) l2_scale_a = self._assemble_scales_cudagraph_safe( - l2_x_sf, expert_offsets[:self.num_experts + 1], + l2_x_sf[:num_slots], expert_offsets[:self.num_experts + 1], padded_expert_offsets, self._padded_x_sf_buf_l2, self._per_expert_scale_bufs_l2 )