[Misc] Replace os.environ with monkeypatch in test suite (#14516)

Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
This commit is contained in:
Sibi
2025-03-17 11:35:57 +08:00
committed by GitHub
parent 1e799b7ec1
commit a73e183e36
43 changed files with 1900 additions and 1658 deletions

View File

@@ -12,11 +12,10 @@ import pytest
from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ...utils import check_logprobs_close
os.environ["TOKENIZERS_PARALLELISM"] = "true"
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
@@ -55,45 +54,47 @@ def test_models(
backend: str,
tensor_parallel_size: int,
disable_async_output_proc: bool,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
override_backend_env_variable(monkeypatch, backend)
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv(STR_BACKEND_ENV_VAR, backend)
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs,
name_0="fp16_kv_cache",
name_1="fp8_kv_cache",
)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs,
name_0="fp16_kv_cache",
name_1="fp8_kv_cache",
)
@pytest.mark.cpu_model
@@ -119,38 +120,41 @@ def test_cpu_models(
test_model: str,
max_tokens: int,
disable_async_output_proc: bool,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs,
name_0="bf16_kv_cache",
name_1="fp8_kv_cache",
)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs,
name_0="bf16_kv_cache",
name_1="fp8_kv_cache",
)