From 103fd451ce056f0f55bace4e7e2fc557f49dcf89 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 16 May 2026 18:50:35 +0000 Subject: [PATCH] fix: use full padded_scales_buf (no GPU scalar slicing in cudagraph) Slicing the buffer with a GPU scalar (buf[:gpu_scalar, :]) forces a device-to-host sync, which triggers cudaErrorStreamCaptureInvalidated during CUDA graph capture. Always use the full pre-allocated buffer; extra rows are zeros. --- vllm/nvfp4_cutedsl.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/nvfp4_cutedsl.py b/vllm/nvfp4_cutedsl.py index b7fd25e7..44a16349 100644 --- a/vllm/nvfp4_cutedsl.py +++ b/vllm/nvfp4_cutedsl.py @@ -162,10 +162,8 @@ class CuTeDSLMoERunner: padded_expert_offsets.zero_() padded_expert_offsets[1:] = padded_rows_per_expert.cumsum(0) - total_padded_rows = padded_expert_offsets[-1] - - # Reset the padded scales buffer - padded_scales = self._padded_scales_buf[:total_padded_rows, :padded_cols] + # Use the FULL pre-allocated scales buffer (no GPU scalar slicing) + padded_scales = self._padded_scales_buf padded_scales.zero_() # Build index mapping: for each row in x_sf, which expert does it belong to?