[Perf] Optimize redundant sync for pooling model, 3.7% Throughput Improvement (#39113)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-09 02:12:23 -04:00
parent 9e78555743
commit ed2f282bc8
1 changed files with 16 additions and 9 deletions
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1046,6 +1046,13 @@ class GPUModelRunner(
    def _sync_device(self) -> None:
        torch.accelerator.synchronize()

+    def _get_or_create_async_output_copy_stream(self) -> torch.cuda.Stream:
+        stream = self.async_output_copy_stream  # reuse the cached stream, if any
+        if stream is None:  # nothing cached yet: create one now
+            stream = torch.cuda.Stream()
+            self.async_output_copy_stream = stream  # cache for later calls
+        return stream
+
    def _update_states(self, scheduler_output: "SchedulerOutput") -> Callable | None:
        """Update the cached states and the persistent batch with the scheduler
        output.
@@ -3152,21 +3159,21 @@ class GPUModelRunner(
            model_runner_output.pooler_output = [None] * num_reqs
            return model_runner_output

-        if self.use_async_scheduling:
-            return AsyncGPUPoolingModelRunnerOutput(
-                model_runner_output=model_runner_output,
+        if not current_platform.is_cuda_alike():
+            # cpu/xpu runners cannot use the CUDA stream/event-based wrapper.
+            model_runner_output.pooler_output = _copy_pooler_output_to_cpu(
                raw_pooler_output=raw_pooler_output,
                finished_mask=finished_mask,
-                async_output_copy_stream=self.async_output_copy_stream,
            )
+            self._sync_device()
+            return model_runner_output

-        model_runner_output.pooler_output = _copy_pooler_output_to_cpu(
+        return AsyncGPUPoolingModelRunnerOutput(
+            model_runner_output=model_runner_output,
            raw_pooler_output=raw_pooler_output,
            finished_mask=finished_mask,
+            async_output_copy_stream=self._get_or_create_async_output_copy_stream(),
        )
-        self._sync_device()
-
-        return model_runner_output

    def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int:
        # Pad tokens to multiple of tensor_parallel_size when
@@ -4333,7 +4340,7 @@ class GPUModelRunner(
                sampled_token_ids=sampler_output.sampled_token_ids,
                logprobs_tensors=sampler_output.logprobs_tensors,
                invalid_req_indices=invalid_req_indices,
-                async_output_copy_stream=self.async_output_copy_stream,
+                async_output_copy_stream=self._get_or_create_async_output_copy_stream(),
                vocab_size=self.input_batch.vocab_size,
            )
        with record_function_or_nullcontext(