[Frontend][Core] Move guided decoding params into sampling params (#8252)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
This commit is contained in:
@@ -110,8 +110,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
|
||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
||||
|
||||
guided_decode_logits_processor = (
|
||||
await self._guided_decode_logits_processor(request, tokenizer))
|
||||
prompts = list(
|
||||
self._tokenize_prompt_input_or_inputs(
|
||||
request,
|
||||
@@ -123,8 +121,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
|
||||
for i, prompt_inputs in enumerate(prompts):
|
||||
sampling_params = request.to_sampling_params(
|
||||
tokenizer,
|
||||
guided_decode_logits_processor,
|
||||
default_max_tokens=self.max_model_len -
|
||||
len(prompt_inputs["prompt_token_ids"]))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user