[V1][Spec Decode] Remove deprecated spec decode config params (#15466)

Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
This commit is contained in:
shangmingc
2025-04-01 00:19:35 +08:00
committed by GitHub
parent 09e974d483
commit 239b7befdd
10 changed files with 125 additions and 220 deletions

View File

@@ -3,6 +3,8 @@
tensor parallelism.
"""
+import json
import openai
import pytest
import torch
@@ -33,7 +35,7 @@ SPEC_MODEL = "JackFram/llama-68m"
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative_config",
-str({
+json.dumps({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
@@ -80,7 +82,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative_config",
-str({
+json.dumps({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"max_model_len": 32,