[Core] *Prompt* logprobs support in Multi-step (#8199)
@@ -100,3 +100,95 @@ def test_multi_step_llm(
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)])
def test_multi_step_llm_w_prompt_logprobs(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    max_tokens: int,
    enforce_eager: bool,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
    num_prompt_logprobs: Optional[int],
) -> None:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine.
|
||||
|
||||
Set up a vLLM engine instance w/ single-step scheduling as a ground-truth
|
||||
reference.
|
||||
|
||||
Prompt them with the same example prompts.
|
||||
|
||||
Validate:
|
||||
* All generated logprobs are all very close
|
||||
|
||||
Args:
|
||||
hf_runner: HF transformers model runner fixture
|
||||
vllm_runner: vLLM model runner fixture
|
||||
example_prompts: test fixture providing example prompts
|
||||
model: model under test (same for single- and multi-step engines)
|
||||
dtype: tensor datatype for engine to utilize
|
||||
tp_size: degree of tensor-parallelism
|
||||
max_tokens: the maximum number of tokens to generate
|
||||
enforce_eager
|
||||
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
|
||||
GPU -> CPU output transfer
|
||||
num_prompts: number of example prompts under test
|
||||
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
|
||||
completions endpoint; `None` -> no logprobs
|
||||
num_prompt_logprobs: number of logprobs to return for each prompt token;
|
||||
note that this argument is not supported by the
|
||||
OpenAI completions endpoint.
|
||||
"""
|
||||
|
||||

    # Tile the example prompts until there are at least `num_prompts`, then
    # truncate so exactly `num_prompts` prompts are fed to both engines.
    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts

    # Multi-step engine under test.
    with vllm_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
            use_v2_block_manager=True,
            num_scheduler_steps=num_scheduler_steps,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            prompts,
            max_tokens,
            num_logprobs,
            num_prompt_logprobs=num_prompt_logprobs)

    # Single-step engine (default scheduling) as the ground-truth reference.
    with vllm_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
    ) as vllm_model:
        single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
            prompts,
            max_tokens,
            num_logprobs,
            num_prompt_logprobs=num_prompt_logprobs)

    # Both sides are vLLM engines (no HF reference here); the labels only
    # affect mismatch-report messages.
    check_logprobs_close(
        outputs_0_lst=single_step_vllm_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="vllm_single_step",
        name_1="vllm_multi_step",
    )
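For context on the user-facing behavior this test locks down, here is a minimal sketch (not part of the diff) of requesting prompt logprobs from a multi-step engine through vLLM's offline LLM API. The model name, prompt, step count, and logprob counts are arbitrary illustration values; `num_scheduler_steps`, `logprobs`, and `prompt_logprobs` are the engine and sampling knobs the test drives through `vllm_runner` and `generate_greedy_logprobs`.

# Sketch only (not from the commit): prompt logprobs with multi-step
# scheduling via the offline LLM API. Values below are arbitrary examples.
from vllm import LLM, SamplingParams

llm = LLM(
    model="JackFram/llama-160m",   # example model; the test parametrizes MODELS
    use_v2_block_manager=True,     # mirrored from the test's multi-step engine
    num_scheduler_steps=8,         # >1 enables multi-step scheduling
)

params = SamplingParams(
    temperature=0.0,               # greedy, like generate_greedy_logprobs
    max_tokens=5,
    logprobs=5,                    # top-5 logprobs per generated token
    prompt_logprobs=5,             # top-5 logprobs per prompt token
)

for out in llm.generate(["The future of AI is"], params):
    # RequestOutput.prompt_logprobs holds per-prompt-token logprob dicts
    # (the first entry is None, since the first token has no context).
    print(out.prompt_logprobs)
    # CompletionOutput.logprobs holds per-generated-token logprob dicts.
    print(out.outputs[0].logprobs)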