Fix cudaErrorStreamCaptureUnsupported: no dynamic GPU-tensor slicing

Slicing a buffer with a GPU scalar (e.g. buf[:gpu_scalar]) is not
permitted during CUDA stream capture, so the dynamic slices fail with
cudaErrorStreamCaptureUnsupported. Use the full pre-allocated buffers
instead of dynamic slices. The buffers are zeroed before the scatter,
and the GEMM only reads rows indicated by expert_offsets, so the extra
zero padding is ignored.

Also pass x_sf[:num_slots] and l2_x_sf[:num_slots] (Python int slicing,
cudagraph-safe) to scale assembly for both L1 and L2 so it only
processes real token scale data.
This commit is contained in:
2026-05-17 17:24:26 +00:00
parent 11b5aa5e37
commit 3f2f4e1882

View File

@@ -405,7 +405,7 @@ class CuTeDSLMoERunner:
# Each expert's tokens go at [padded_expert_offsets[e], padded_expert_offsets[e] + tokens_per_expert[e])
# Padding rows between tokens_per_expert and padded_tokens_per_expert are zero.
slot_hidden = hidden_states[sorted_token_ids]
- padded_hidden = self._shared_bufs['hidden'][:total_padded_slots]
+ padded_hidden = self._shared_bufs['hidden']
padded_hidden.zero_()
# scatter: padded_hidden[padded_expert_offsets[expert_assign] + local_row] = slot_hidden
row_indices = self._row_indices_buf[:num_slots]
@@ -422,7 +422,7 @@ class CuTeDSLMoERunner:
)
l1_scale_a = self._assemble_scales_cudagraph_safe(
- x_sf, expert_offsets[:self.num_experts + 1],
+ x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
padded_expert_offsets,
self._padded_x_sf_buf_l1, self._per_expert_scale_bufs_l1
)
@@ -444,7 +444,7 @@ class CuTeDSLMoERunner:
activated = torch.nn.functional.silu(gate) * up
# === L2: down ===
- padded_activated = self._shared_bufs['activated'][:total_padded_slots]
+ padded_activated = self._shared_bufs['activated']
padded_activated.zero_()
padded_activated[padded_dst] = activated
@@ -453,7 +453,7 @@ class CuTeDSLMoERunner:
)
l2_scale_a = self._assemble_scales_cudagraph_safe(
- l2_x_sf, expert_offsets[:self.num_experts + 1],
+ l2_x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
padded_expert_offsets,
self._padded_x_sf_buf_l2, self._per_expert_scale_bufs_l2
)