From 417fd28fb125cbb166ef3ada187d06d0c8dd0d30 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 4 Mar 2026 10:53:17 -0800
Subject: [PATCH] [Model Runner V2] Fix pooling (#36019)

Signed-off-by: Nick Hill
---
 vllm/v1/worker/gpu/async_utils.py  | 2 +-
 vllm/v1/worker/gpu/model_runner.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py
index f87459efa..7f270c2b8 100644
--- a/vllm/v1/worker/gpu/async_utils.py
+++ b/vllm/v1/worker/gpu/async_utils.py
@@ -95,8 +95,8 @@ class AsyncPoolingOutput(AsyncModelRunnerOutput):
         self.copy_event.record(copy_stream)
 
     def get_output(self) -> ModelRunnerOutput:
+        pooler_output = list(self.pooler_output_cpu.unbind(dim=0))
         self.copy_event.synchronize()
-        pooler_output = self.pooler_output_cpu.unbind(dim=0)
         if self.is_valid_cpu is not None:
             is_valid_cpu = self.is_valid_cpu.tolist()
             for i, is_valid in enumerate(is_valid_cpu):
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 203d31195..9f802ed76 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -1117,7 +1117,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             # The prior execute_model call must have failed.
             return None
 
-        input_batch, _, _, _, hidden_states, _, kv_connector_output = (
+        input_batch, _, _, _, hidden_states, _, kv_connector_output, _ = (
             self.execute_model_state
         )
         self.execute_model_state = None