[Bugfix] Fix OpenAI parallel sampling when using xgrammar (#11637)
Signed-off-by: mgoin <michael@neuralmagic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# noqa: UP007
|
||||
from __future__ import annotations
|
||||
|
||||
+import copy
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
@@ -309,3 +310,7 @@ class XGrammarLogitsProcessor:
|
||||
scores = scores.to(device_type).squeeze()
|
||||
|
||||
return scores
|
||||
|
||||
+def clone(self) -> XGrammarLogitsProcessor:
|
||||
+"""Deepcopy due to per-sequence state in the matchers"""
|
||||
+return copy.deepcopy(self)
|
||||
|
||||
@@ -450,15 +450,16 @@ class SamplingParams(
|
||||
return self._all_stop_token_ids
|
||||
|
||||
def clone(self) -> "SamplingParams":
|
||||
-"""Deep copy excluding LogitsProcessor objects.
|
||||
+"""Deep copy, but maybe not the LogitsProcessor objects.
|
||||
|
||||
-LogitsProcessor objects are excluded because they may contain an
|
||||
-arbitrary, nontrivial amount of data.
|
||||
+LogitsProcessor objects may contain an arbitrary, nontrivial amount of
|
||||
+data that is expensive to copy. However, if not copied, the processor
|
||||
+needs to support parallel decoding for multiple sequences
|
||||
See https://github.com/vllm-project/vllm/issues/3087
|
||||
"""
|
||||
|
||||
logit_processor_refs = None if self.logits_processors is None else {
|
||||
-id(lp): lp
|
||||
+id(lp): lp.clone() if hasattr(lp, 'clone') else lp
|
||||
for lp in self.logits_processors
|
||||
}
|
||||
return copy.deepcopy(self, memo=logit_processor_refs)
|
||||
|
||||
@@ -1372,7 +1372,7 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
|
||||
@staticmethod
|
||||
def add_request(request_id: str, engine, params, **kwargs):
|
||||
original_params = params
|
||||
-params = copy.deepcopy(original_params)
|
||||
+params = original_params.clone()
|
||||
params.n = 1
|
||||
group = ParallelSampleSequenceGroup(request_id)
|
||||
seqs = []
|
||||
|
||||
Reference in New Issue
Block a user