diff --git a/vllm/nvfp4_cutedsl.py b/vllm/nvfp4_cutedsl.py index b7fd25e7..44a16349 100644 --- a/vllm/nvfp4_cutedsl.py +++ b/vllm/nvfp4_cutedsl.py @@ -162,10 +162,8 @@ class CuTeDSLMoERunner: padded_expert_offsets.zero_() padded_expert_offsets[1:] = padded_rows_per_expert.cumsum(0) - total_padded_rows = padded_expert_offsets[-1] - - # Reset the padded scales buffer - padded_scales = self._padded_scales_buf[:total_padded_rows, :padded_cols] + # Use the FULL pre-allocated scales buffer (no GPU scalar slicing) + padded_scales = self._padded_scales_buf padded_scales.zero_() # Build index mapping: for each row in x_sf, which expert does it belong to?