diff --git a/vllm/nvfp4_cutedsl.py b/vllm/nvfp4_cutedsl.py index 8d79e254..1e640129 100644 --- a/vllm/nvfp4_cutedsl.py +++ b/vllm/nvfp4_cutedsl.py @@ -405,7 +405,7 @@ class CuTeDSLMoERunner: # Each expert's tokens go at [padded_expert_offsets[e], padded_expert_offsets[e] + tokens_per_expert[e]) # Padding rows between tokens_per_expert and padded_tokens_per_expert are zero. slot_hidden = hidden_states[sorted_token_ids] - padded_hidden = self._shared_bufs['hidden'][:total_padded_slots] + padded_hidden = self._shared_bufs['hidden'] padded_hidden.zero_() # scatter: padded_hidden[padded_expert_offsets[expert_assign] + local_row] = slot_hidden row_indices = self._row_indices_buf[:num_slots] @@ -422,7 +422,7 @@ class CuTeDSLMoERunner: ) l1_scale_a = self._assemble_scales_cudagraph_safe( - x_sf, expert_offsets[:self.num_experts + 1], + x_sf[:num_slots], expert_offsets[:self.num_experts + 1], padded_expert_offsets, self._padded_x_sf_buf_l1, self._per_expert_scale_bufs_l1 ) @@ -444,7 +444,7 @@ class CuTeDSLMoERunner: activated = torch.nn.functional.silu(gate) * up # === L2: down === - padded_activated = self._shared_bufs['activated'][:total_padded_slots] + padded_activated = self._shared_bufs['activated'] padded_activated.zero_() padded_activated[padded_dst] = activated @@ -453,7 +453,7 @@ class CuTeDSLMoERunner: ) l2_scale_a = self._assemble_scales_cudagraph_safe( - l2_x_sf, expert_offsets[:self.num_experts + 1], + l2_x_sf[:num_slots], expert_offsets[:self.num_experts + 1], padded_expert_offsets, self._padded_x_sf_buf_l2, self._per_expert_scale_bufs_l2 )