[Misc] Replace os.environ with monkeypatch in test suite (#14516)
Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
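The diff below drops the `override_backend_env_variable` test helper, which mutated `os.environ` directly, and instead wraps each test body in pytest's `monkeypatch.context()`, setting the attention backend with `m.setenv(STR_BACKEND_ENV_VAR, attention_backend)`. The context manager undoes the override when the block exits, so the environment variable cannot leak into later tests even if the test fails. A minimal, self-contained sketch of the pattern is shown here; the test name and the literal environment variable are illustrative only, while the real tests in this commit use `STR_BACKEND_ENV_VAR` imported from `vllm.utils`:

import os

import pytest


def test_backend_env_is_scoped(monkeypatch: pytest.MonkeyPatch) -> None:
    # Hypothetical test, used only to illustrate the monkeypatch.context() pattern.
    original = os.environ.get("VLLM_ATTENTION_BACKEND")
    with monkeypatch.context() as m:
        # Inside the context the variable is overridden for the code under test.
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
        assert os.environ["VLLM_ATTENTION_BACKEND"] == "FLASH_ATTN"
    # Once the context exits, the original environment is restored automatically.
    assert os.environ.get("VLLM_ATTENTION_BACKEND") == original

Typing the fixture as `monkeypatch: pytest.MonkeyPatch` (available since pytest exposed the class publicly) is a readability change only; the fixture behaves the same either way.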
@@ -7,7 +7,7 @@ from typing import Optional
 
 import pytest
 
-from tests.kernels.utils import override_backend_env_variable
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 from ..models.utils import check_logprobs_close, check_outputs_equal
 
@@ -42,7 +42,7 @@ def test_multi_step_llm(
     num_prompts: int,
     num_logprobs: Optional[int],
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """Test vLLM engine with multi-step scheduling via sync LLM Engine.
 
@@ -70,48 +70,49 @@ def test_multi_step_llm(
       num_logprobs: corresponds to the `logprobs` argument to the OpenAI
           completions endpoint; `None` -> 1 logprob returned.
     """
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
 
-    prompts = example_prompts
-    if len(prompts) < num_prompts:
-        prompts = prompts * ((num_prompts // len(prompts)) + 1)
-    prompts = prompts[:num_prompts]
-    assert len(prompts) == num_prompts
+        prompts = example_prompts
+        if len(prompts) < num_prompts:
+            prompts = prompts * ((num_prompts // len(prompts)) + 1)
+        prompts = prompts[:num_prompts]
+        assert len(prompts) == num_prompts
 
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            enable_chunked_prefill=enable_chunked_prefill,
-            num_scheduler_steps=num_scheduler_steps,
-    ) as vllm_model:
-        vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
-                        if num_logprobs is None else
-                        vllm_model.generate_greedy_logprobs(
-                            prompts, max_tokens, num_logprobs))
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                enable_chunked_prefill=enable_chunked_prefill,
+                num_scheduler_steps=num_scheduler_steps,
+        ) as vllm_model:
+            vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
+                            if num_logprobs is None else
+                            vllm_model.generate_greedy_logprobs(
+                                prompts, max_tokens, num_logprobs))
 
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
-                      if num_logprobs is None else
-                      hf_model.generate_greedy_logprobs_limit(
-                          prompts, max_tokens, num_logprobs))
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
+                          if num_logprobs is None else
+                          hf_model.generate_greedy_logprobs_limit(
+                              prompts, max_tokens, num_logprobs))
 
-    if num_logprobs is None:
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
+        if num_logprobs is None:
+            check_outputs_equal(
+                outputs_0_lst=hf_outputs,
+                outputs_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+            )
+        else:
+            check_logprobs_close(
+                outputs_0_lst=hf_outputs,
+                outputs_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+            )
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs(
     num_logprobs: Optional[int],
    num_prompt_logprobs: Optional[int],
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """Test prompt logprobs with multi-step scheduling via sync LLM Engine.
 
@@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs(
           note that this argument is not supported by the
           OpenAI completions endpoint.
     """
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
 
-    prompts = example_prompts
-    if len(prompts) < num_prompts:
-        prompts = prompts * ((num_prompts // len(prompts)) + 1)
-    prompts = prompts[:num_prompts]
-    assert len(prompts) == num_prompts
+        prompts = example_prompts
+        if len(prompts) < num_prompts:
+            prompts = prompts * ((num_prompts // len(prompts)) + 1)
+        prompts = prompts[:num_prompts]
+        assert len(prompts) == num_prompts
 
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            num_scheduler_steps=num_scheduler_steps,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            prompts,
-            max_tokens,
-            num_logprobs,
-            num_prompt_logprobs=num_prompt_logprobs)
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                num_scheduler_steps=num_scheduler_steps,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs,
+                num_prompt_logprobs=num_prompt_logprobs)
 
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-    ) as vllm_model:
-        single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
-            prompts,
-            max_tokens,
-            num_logprobs,
-            num_prompt_logprobs=num_prompt_logprobs)
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+        ) as vllm_model:
+            single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs,
+                num_prompt_logprobs=num_prompt_logprobs)
 
-    check_logprobs_close(
-        outputs_0_lst=single_step_vllm_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_logprobs_close(
+            outputs_0_lst=single_step_vllm_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
     num_prompts: int,
     num_logprobs: Optional[int],
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
 
@@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
     #
     # The Incorrect scheduling behavior - if it occurs - will cause an exception
     # in the model runner resulting from `do_sample=False`.
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
 
-    assert len(example_prompts) >= 2
-    challenge_prompts = copy.deepcopy(example_prompts)
-    challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
-                            'inference and serving engine for LLMs.\n'
-                            )  # 24 tok
-    challenge_prompts[1] = (
-        'Briefly describe the major milestones in the '
-        'development of artificial intelligence from 1950 to 2020.\n'
-    )  # 30 tok
+        assert len(example_prompts) >= 2
+        challenge_prompts = copy.deepcopy(example_prompts)
+        challenge_prompts[0] = (
+            'vLLM is a high-throughput and memory-efficient '
+            'inference and serving engine for LLMs.\n')  # 24 tok
+        challenge_prompts[1] = (
+            'Briefly describe the major milestones in the '
+            'development of artificial intelligence from 1950 to 2020.\n'
+        )  # 30 tok
 
-    # If necessary, adjust the length of `challenge_prompts` to match
-    # `num_prompts`
-    if len(challenge_prompts) < num_prompts:
-        challenge_prompts = (challenge_prompts *
-                             ((num_prompts // len(challenge_prompts)) + 1))
-    challenge_prompts = challenge_prompts[:num_prompts]
-    assert len(challenge_prompts) == num_prompts
+        # If necessary, adjust the length of `challenge_prompts` to match
+        # `num_prompts`
+        if len(challenge_prompts) < num_prompts:
+            challenge_prompts = (challenge_prompts *
+                                 ((num_prompts // len(challenge_prompts)) + 1))
+        challenge_prompts = challenge_prompts[:num_prompts]
+        assert len(challenge_prompts) == num_prompts
 
-    # Single-step scheduler baseline
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            num_scheduler_steps=num_scheduler_steps,
-            max_model_len=48,
-            max_num_batched_tokens=48,
-            max_num_seqs=4,
-            block_size=16,
-    ) as vllm_model:
-        outputs_baseline = (vllm_model.generate_greedy(
-            challenge_prompts, max_tokens) if num_logprobs is None else
-                            vllm_model.generate_greedy_logprobs(
-                                challenge_prompts, max_tokens, num_logprobs))
+        # Single-step scheduler baseline
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                num_scheduler_steps=num_scheduler_steps,
+                max_model_len=48,
+                max_num_batched_tokens=48,
+                max_num_seqs=4,
+                block_size=16,
+        ) as vllm_model:
+            outputs_baseline = (
+                vllm_model.generate_greedy(challenge_prompts, max_tokens) if
+                num_logprobs is None else vllm_model.generate_greedy_logprobs(
+                    challenge_prompts, max_tokens, num_logprobs))
 
-    # multi-step+"single-step chunked prefill"+APC
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            enable_chunked_prefill=True,
-            enable_prefix_caching=True,
-            num_scheduler_steps=num_scheduler_steps,
-            max_model_len=48,
-            max_num_batched_tokens=48,
-            max_num_seqs=4,
-            block_size=16,
-    ) as vllm_model:
-        outputs_w_features = (vllm_model.generate_greedy(
-            challenge_prompts, max_tokens) if num_logprobs is None else
-                              vllm_model.generate_greedy_logprobs(
-                                  challenge_prompts, max_tokens, num_logprobs))
+        # multi-step+"single-step chunked prefill"+APC
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                enable_chunked_prefill=True,
+                enable_prefix_caching=True,
+                num_scheduler_steps=num_scheduler_steps,
+                max_model_len=48,
+                max_num_batched_tokens=48,
+                max_num_seqs=4,
+                block_size=16,
+        ) as vllm_model:
+            outputs_w_features = (
+                vllm_model.generate_greedy(challenge_prompts, max_tokens) if
+                num_logprobs is None else vllm_model.generate_greedy_logprobs(
+                    challenge_prompts, max_tokens, num_logprobs))
 
-    if num_logprobs is None:
-        # No-logprobs test
-        check_outputs_equal(
-            outputs_0_lst=outputs_baseline,
-            outputs_1_lst=outputs_w_features,
-            name_0="multi-step",
-            name_1="multi-step+features",
-        )
-    else:
-        # Yes-logprobs test
-        check_logprobs_close(
-            outputs_0_lst=outputs_baseline,
-            outputs_1_lst=outputs_w_features,
-            name_0="multi-step",
-            name_1="multi-step+features",
-        )
+        if num_logprobs is None:
+            # No-logprobs test
+            check_outputs_equal(
+                outputs_0_lst=outputs_baseline,
+                outputs_1_lst=outputs_w_features,
+                name_0="multi-step",
+                name_1="multi-step+features",
+            )
+        else:
+            # Yes-logprobs test
+            check_logprobs_close(
+                outputs_0_lst=outputs_baseline,
+                outputs_1_lst=outputs_w_features,
+                name_0="multi-step",
+                name_1="multi-step+features",
+            )