[Bugfix][Spec Decode] Avoid double call of Ngram CPU (#36952)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
This commit is contained in:
@@ -4247,15 +4247,6 @@ class GPUModelRunner(
                 self.input_batch.token_ids_cpu,
                 slot_mappings=slot_mappings,
             )
-        if isinstance(self.drafter, NgramProposer):
-            assert isinstance(sampled_token_ids, list), (
-                "sampled_token_ids should be a python list when ngram is used."
-            )
-            draft_token_ids = self.drafter.propose(
-                sampled_token_ids,
-                self.input_batch.num_tokens_no_spec,
-                self.input_batch.token_ids_cpu,
-            )
         elif spec_config.use_ngram_gpu():
             assert isinstance(self.drafter, NgramProposerGPU)
             (
Reference in New Issue
Block a user