Fix performance when --generation-config is not None (#14223)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-03-04 20:59:22 +01:00
committed by GitHub
parent beebf4742a
commit 9badee53de
4 changed files with 23 additions and 25 deletions

View File

@@ -51,11 +51,12 @@ class OpenAIServingCompletion(OpenAIServing):
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids)
diff_sampling_param = self.model_config.get_diff_sampling_param()
if diff_sampling_param:
self.default_sampling_params = (
self.model_config.get_diff_sampling_param())
if self.default_sampling_params:
logger.info(
"Overwriting default completion sampling param with: %s",
diff_sampling_param)
self.default_sampling_params)
async def create_completion(
self,
@@ -119,17 +120,14 @@ class OpenAIServingCompletion(OpenAIServing):
sampling_params: Union[SamplingParams, BeamSearchParams]
default_max_tokens = self.max_model_len - len(
engine_prompt["prompt_token_ids"])
# Build default sampling params
default_sampling_params = (
self.model_config.get_diff_sampling_param())
if request.use_beam_search:
sampling_params = request.to_beam_search_params(
default_max_tokens, default_sampling_params)
default_max_tokens, self.default_sampling_params)
else:
sampling_params = request.to_sampling_params(
default_max_tokens,
self.model_config.logits_processor_pattern,
default_sampling_params)
self.default_sampling_params)
request_id_item = f"{request_id}-{i}"