diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py index f87459efa..7f270c2b8 100644 --- a/vllm/v1/worker/gpu/async_utils.py +++ b/vllm/v1/worker/gpu/async_utils.py @@ -95,8 +95,8 @@ class AsyncPoolingOutput(AsyncModelRunnerOutput): self.copy_event.record(copy_stream) def get_output(self) -> ModelRunnerOutput: + pooler_output = list(self.pooler_output_cpu.unbind(dim=0)) self.copy_event.synchronize() - pooler_output = self.pooler_output_cpu.unbind(dim=0) if self.is_valid_cpu is not None: is_valid_cpu = self.is_valid_cpu.tolist() for i, is_valid in enumerate(is_valid_cpu): diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 203d31195..9f802ed76 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -1117,7 +1117,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # The prior execute_model call must have failed. return None - input_batch, _, _, _, hidden_states, _, kv_connector_output = ( + input_batch, _, _, _, hidden_states, _, kv_connector_output, _ = ( self.execute_model_state ) self.execute_model_state = None