[Refactor] Remove numpy split in async scheduling (#32034)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -9,7 +9,6 @@ from collections.abc import AsyncGenerator, Iterable, Mapping
|
||||
from copy import copy
|
||||
from typing import Any, cast
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
@@ -32,7 +31,6 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.async_utils import cancel_task_threadsafe
|
||||
from vllm.utils.collection_utils import as_list
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.core_client import EngineCoreClient
|
||||
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
|
||||
@@ -495,6 +493,7 @@ class AsyncLLM(EngineClient):
|
||||
log_stats = self.log_stats
|
||||
logger_manager = self.logger_manager
|
||||
input_processor = self.input_processor
|
||||
chunk_size = envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
|
||||
|
||||
async def output_handler():
|
||||
try:
|
||||
@@ -510,15 +509,10 @@ class AsyncLLM(EngineClient):
|
||||
# Split outputs into chunks of at most
|
||||
# VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
|
||||
# event loop for too long.
|
||||
if num_outputs <= envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
|
||||
slices = (outputs.outputs,)
|
||||
else:
|
||||
slices = np.array_split(
|
||||
outputs.outputs,
|
||||
cdiv(num_outputs, envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE),
|
||||
)
|
||||
|
||||
for i, outputs_slice in enumerate(slices):
|
||||
engine_core_outputs = outputs.outputs
|
||||
for start in range(0, num_outputs, chunk_size):
|
||||
end = start + chunk_size
|
||||
outputs_slice = engine_core_outputs[start:end]
|
||||
# 2) Process EngineCoreOutputs.
|
||||
processed_outputs = output_processor.process_outputs(
|
||||
outputs_slice, outputs.timestamp, iteration_stats
|
||||
@@ -527,7 +521,7 @@ class AsyncLLM(EngineClient):
|
||||
assert not processed_outputs.request_outputs
|
||||
|
||||
# Allow other asyncio tasks to run between chunks
|
||||
if i + 1 < len(slices):
|
||||
if end < num_outputs:
|
||||
await asyncio.sleep(0)
|
||||
|
||||
# 3) Abort any reqs that finished due to stop strings.
|
||||
|
||||
Reference in New Issue
Block a user