[Core] Add engine option to return only deltas or final output (#7381)
This commit is contained in:
@@ -19,7 +19,7 @@ from vllm.model_executor.guided_decoding.guided_fields import LLMGuidedOptions
|
||||
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
||||
get_cached_tokenizer)
|
||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
||||
@@ -642,14 +642,12 @@ class LLM:
|
||||
raise ValueError("The lengths of prompts and lora_request "
|
||||
"must be the same.")
|
||||
|
||||
if isinstance(params, list):
|
||||
params = [
|
||||
self._add_guided_processor(param, guided_options)
|
||||
if isinstance(param, SamplingParams) else param
|
||||
for param in params
|
||||
]
|
||||
elif isinstance(params, SamplingParams):
|
||||
params = self._add_guided_processor(params, guided_options)
|
||||
for sp in params if isinstance(params, list) else (params, ):
|
||||
if isinstance(sp, SamplingParams):
|
||||
self._add_guided_processor(sp, guided_options)
|
||||
|
||||
# We only care about the final output
|
||||
sp.output_kind = RequestOutputKind.FINAL_ONLY
|
||||
|
||||
# Add requests to the engine.
|
||||
for i, request_inputs in enumerate(inputs):
|
||||
@@ -709,9 +707,6 @@ class LLM:
|
||||
f"output: {0:.2f} toks/s"),
|
||||
)
|
||||
|
||||
# In the loop below, only finished outputs are used
|
||||
self.llm_engine.step_return_finished_only = True
|
||||
|
||||
# Run the engine.
|
||||
outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
|
||||
total_in_toks = 0
|
||||
@@ -724,6 +719,7 @@ class LLM:
|
||||
if use_tqdm:
|
||||
if isinstance(output, RequestOutput):
|
||||
# Calculate tokens only for RequestOutput
|
||||
assert output.prompt_token_ids is not None
|
||||
total_in_toks += len(output.prompt_token_ids)
|
||||
in_spd = total_in_toks / pbar.format_dict["elapsed"]
|
||||
total_out_toks += sum(
|
||||
@@ -735,9 +731,6 @@ class LLM:
|
||||
f"output: {out_spd:.2f} toks/s")
|
||||
pbar.update(1)
|
||||
|
||||
# Restore original behavior
|
||||
self.llm_engine.step_return_finished_only = False
|
||||
|
||||
if use_tqdm:
|
||||
pbar.close()
|
||||
# Sort the outputs by request ID.
|
||||
|
||||
Reference in New Issue
Block a user