diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index de8e168e4..b1f9cbf5e 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -263,7 +263,6 @@ class AsyncGPUPoolingModelRunnerOutput(AsyncModelRunnerOutput):
         async_output_copy_stream: torch.cuda.Stream,
     ):
         self._model_runner_output = model_runner_output
-        self._finished_mask = finished_mask

         # Event on the copy stream so we can synchronize the non-blocking copy.
         self.async_copy_ready_event = torch.Event()
@@ -276,11 +275,15 @@ class AsyncGPUPoolingModelRunnerOutput(AsyncModelRunnerOutput):
         default_stream = torch.cuda.current_stream()
         with torch.cuda.stream(async_output_copy_stream):
             async_output_copy_stream.wait_stream(default_stream)
-            self._raw_pooler_output_cpu = json_map_leaves(
+            raw_pooler_output_cpu = json_map_leaves(
                 lambda x: None if x is None else x.to("cpu", non_blocking=True),
                 self._raw_pooler_output,
             )
             self.async_copy_ready_event.record()
+        self._model_runner_output.pooler_output = [
+            out if include else None
+            for out, include in zip(raw_pooler_output_cpu, finished_mask)
+        ]

     def get_output(self) -> ModelRunnerOutput:
         """Copy the device tensors to the host and return a ModelRunnerOutput.
@@ -290,11 +293,6 @@ class AsyncGPUPoolingModelRunnerOutput(AsyncModelRunnerOutput):

         # Release the device tensors once the copy has completed.
         del self._raw_pooler_output
-
-        self._model_runner_output.pooler_output = [
-            out if include else None
-            for out, include in zip(self._raw_pooler_output_cpu, self._finished_mask)
-        ]
         return self._model_runner_output

@@ -2537,8 +2535,7 @@ class GPUModelRunner(

         model = cast(VllmModelForPooling, self.model)
         raw_pooler_output: PoolerOutput = model.pooler(
-            hidden_states=hidden_states,
-            pooling_metadata=pooling_metadata,
+            hidden_states=hidden_states, pooling_metadata=pooling_metadata
         )

         finished_mask = [
@@ -2568,12 +2565,12 @@ class GPUModelRunner(
             lambda x: None if x is None else x.to("cpu", non_blocking=True),
             raw_pooler_output,
         )
-        self._sync_device()
-
         model_runner_output.pooler_output = [
             out if include else None
             for out, include in zip(raw_pooler_output, finished_mask)
         ]
+        self._sync_device()
+
         return model_runner_output

     def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int:
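
For context, here is a minimal standalone sketch of the copy-stream/event pattern this change relies on: the copy stream waits on the default stream so the source tensors are fully written, the device-to-host copy is issued with `non_blocking=True`, and an event is recorded so the consumer can defer synchronization until the host values are actually needed. The helper name `copy_to_cpu_async` is illustrative only, not part of vLLM.

```python
import torch

def copy_to_cpu_async(
    src: torch.Tensor, copy_stream: torch.cuda.Stream
) -> tuple[torch.Tensor, torch.cuda.Event]:
    """Start a non-blocking D2H copy on a side stream; return (dst, ready event)."""
    default_stream = torch.cuda.current_stream()
    with torch.cuda.stream(copy_stream):
        # Make the work on the default stream that produced `src` visible
        # to the copy stream before the copy is issued.
        copy_stream.wait_stream(default_stream)
        dst = src.to("cpu", non_blocking=True)
        ready = torch.cuda.Event()
        ready.record()  # fires once the copy has finished
    return dst, ready

if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    x = torch.randn(4, 8, device="cuda")
    x_cpu, ready = copy_to_cpu_async(x, stream)
    # ... overlap other host/device work here ...
    ready.synchronize()  # block only when the host values are needed
    assert torch.equal(x_cpu, x.cpu())
```

This is also why the list comprehension can move into `__init__` in the diff above: building the masked list only stores references to the still-in-flight CPU tensors and never reads their values, so `get_output` remains the only place that has to wait on the event.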