[Misc][Refactor] Introduce ExecuteModelData (#4540)

This commit is contained in:
Cody Yu
2024-05-03 17:47:07 -07:00
committed by GitHub
parent 344bf7cd2d
commit bc8ad68455
23 changed files with 355 additions and 511 deletions

View File

@@ -22,8 +22,8 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (MultiModalData, SamplerOutput, Sequence,
-                           SequenceGroup, SequenceGroupMetadata,
-                           SequenceStatus)
+from vllm.sequence import (ExecuteModelRequest, MultiModalData, SamplerOutput,
+                           Sequence, SequenceGroup, SequenceGroupMetadata,
+                           SequenceStatus)
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
@@ -583,12 +583,16 @@ class LLMEngine:
         seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
         if not scheduler_outputs.is_empty():
-            output = self.model_executor.execute_model(
+            execute_model_req = ExecuteModelRequest(
                 seq_group_metadata_list=seq_group_metadata_list,
                 blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                 blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                 blocks_to_copy=scheduler_outputs.blocks_to_copy,
-                num_lookahead_slots=scheduler_outputs.num_lookahead_slots)
+                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
+                running_queue_size=scheduler_outputs.running_queue_size,
+            )
+            output = self.model_executor.execute_model(
+                execute_model_req=execute_model_req)
         else:
             output = []