diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fc57fed20..380a1567b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -9,7 +9,6 @@ from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, cast -import numpy as np import torch import vllm.envs as envs @@ -32,7 +31,6 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va from vllm.usage.usage_lib import UsageContext from vllm.utils.async_utils import cancel_task_threadsafe from vllm.utils.collection_utils import as_list -from vllm.utils.math_utils import cdiv from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError @@ -495,6 +493,7 @@ class AsyncLLM(EngineClient): log_stats = self.log_stats logger_manager = self.logger_manager input_processor = self.input_processor + chunk_size = envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE async def output_handler(): try: @@ -510,15 +509,10 @@ class AsyncLLM(EngineClient): # Split outputs into chunks of at most # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the # event loop for too long. - if num_outputs <= envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: - slices = (outputs.outputs,) - else: - slices = np.array_split( - outputs.outputs, - cdiv(num_outputs, envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), - ) - - for i, outputs_slice in enumerate(slices): + engine_core_outputs = outputs.outputs + for start in range(0, num_outputs, chunk_size): + end = start + chunk_size + outputs_slice = engine_core_outputs[start:end] # 2) Process EngineCoreOutputs. processed_outputs = output_processor.process_outputs( outputs_slice, outputs.timestamp, iteration_stats @@ -527,7 +521,7 @@ class AsyncLLM(EngineClient): assert not processed_outputs.request_outputs # Allow other asyncio tasks to run between chunks - if i + 1 < len(slices): + if end < num_outputs: await asyncio.sleep(0) # 3) Abort any reqs that finished due to stop strings.