Fix cudaErrorStreamCaptureUnsupported: no dynamic GPU-tensor slicing
Slicing a buffer with a GPU scalar (e.g. buf[:gpu_scalar]) is not permitted during CUDA stream capture, because the slice bound must first be read back to the host. Use the full pre-allocated buffers instead of dynamic slices; the GEMM only reads the rows indicated by expert_offsets and ignores the zero padding. Also pass x_sf[:num_slots] (slicing with a Python int, which is CUDA-graph-safe) to scale assembly so that it processes only real token scale data.
This commit is contained in:
@@ -405,7 +405,7 @@ class CuTeDSLMoERunner:
|
||||
# Each expert's tokens go at [padded_expert_offsets[e], padded_expert_offsets[e] + tokens_per_expert[e])
|
||||
# Padding rows between tokens_per_expert and padded_tokens_per_expert are zero.
|
||||
slot_hidden = hidden_states[sorted_token_ids]
|
||||
padded_hidden = self._shared_bufs['hidden'][:total_padded_slots]
|
||||
padded_hidden = self._shared_bufs['hidden']
|
||||
padded_hidden.zero_()
|
||||
# scatter: padded_hidden[padded_expert_offsets[expert_assign] + local_row] = slot_hidden
|
||||
row_indices = self._row_indices_buf[:num_slots]
|
||||
@@ -422,7 +422,7 @@ class CuTeDSLMoERunner:
|
||||
)
|
||||
|
||||
l1_scale_a = self._assemble_scales_cudagraph_safe(
|
||||
x_sf, expert_offsets[:self.num_experts + 1],
|
||||
x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
|
||||
padded_expert_offsets,
|
||||
self._padded_x_sf_buf_l1, self._per_expert_scale_bufs_l1
|
||||
)
|
||||
@@ -444,7 +444,7 @@ class CuTeDSLMoERunner:
|
||||
activated = torch.nn.functional.silu(gate) * up
|
||||
|
||||
# === L2: down ===
|
||||
padded_activated = self._shared_bufs['activated'][:total_padded_slots]
|
||||
padded_activated = self._shared_bufs['activated']
|
||||
padded_activated.zero_()
|
||||
padded_activated[padded_dst] = activated
|
||||
|
||||
@@ -453,7 +453,7 @@ class CuTeDSLMoERunner:
|
||||
)
|
||||
|
||||
l2_scale_a = self._assemble_scales_cudagraph_safe(
|
||||
l2_x_sf, expert_offsets[:self.num_experts + 1],
|
||||
l2_x_sf[:num_slots], expert_offsets[:self.num_experts + 1],
|
||||
padded_expert_offsets,
|
||||
self._padded_x_sf_buf_l2, self._per_expert_scale_bufs_l2
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user