Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -51,8 +51,9 @@ AITER_MODEL_LIST = [
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -65,8 +66,7 @@ AITER_MODEL_LIST = [
pytest.param(
"openbmb/MiniCPM3-4B",
# fused_moe not supported on CPU
marks=[pytest.mark.core_model,
large_gpu_mark(min_gb=32)],
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
),
pytest.param(
"facebook/opt-125m", # opt
@@ -82,8 +82,9 @@ AITER_MODEL_LIST = [
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -100,16 +101,25 @@ AITER_MODEL_LIST = [
marks=[pytest.mark.cpu_model],
),
pytest.param("swiss-ai/Apertus-8B-2509"), # apertus
])
],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
use_prompt_embeds: bool, monkeypatch) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
use_rocm_aiter: bool,
use_prompt_embeds: bool,
monkeypatch,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
@@ -125,34 +135,37 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds
else None)
prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None
prompt_token_ids = []
for prompt in example_prompts:
token_ids = hf_model.tokenizer(prompt,
return_tensors="pt").input_ids.to(
hf_model.model.device)
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
hf_model.model.device
)
prompt_token_ids.append(token_ids)
if prompt_embeds is not None:
prompt_embeds.append(hf_model.model.get_input_embeddings()(
token_ids).squeeze(0))
prompt_embeds.append(
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
)
with vllm_runner(
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
if prompt_embeds is not None:
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
prompt_embeds, max_tokens, num_logprobs)
prompt_embeds, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,

View File

@@ -11,17 +11,17 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(
model,
load_format="dummy",
model,
load_format="dummy",
) as llm:
if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.
normalizer.cpu().item())
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config.text_config
else:
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.model.normalizer.cpu(
).item())
lambda self: self.model_runner.model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)

View File

@@ -26,11 +26,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,

View File

@@ -24,7 +24,7 @@ SSM_MODELS = [
"tiiuae/falcon-mamba-tiny-dev",
# mamba2-codestral in transformers is broken pending:
# https://github.com/huggingface/transformers/pull/40861
#"yujiepan/mamba2-codestral-v0.1-tiny-random",
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
]
HYBRID_MODELS = [
@@ -65,7 +65,6 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -75,11 +74,13 @@ def test_models(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -109,13 +110,14 @@ def test_batching(
for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:
single_output, = vllm_model.generate_greedy_logprobs([prompt],
max_tokens,
num_logprobs)
(single_output,) = vllm_model.generate_greedy_logprobs(
[prompt], max_tokens, num_logprobs
)
for_loop_outputs.append(single_output)
batched_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=for_loop_outputs,
@@ -134,8 +136,8 @@ def test_chunked_prefill_with_parallel_sampling(
max_tokens: int,
) -> None:
"""
Tests chunked prefill in conjunction with n > 1.
Tests chunked prefill in conjunction with n > 1.
In this case, prefill is populated with decoding tokens and
we test that it doesn't fail.
@@ -143,16 +145,13 @@ def test_chunked_prefill_with_parallel_sampling(
decoding steps inside a chunked prefill forward pass
(where we have both prefill and decode together)
"""
sampling_params = SamplingParams(n=3,
temperature=1,
seed=0,
max_tokens=max_tokens)
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
with vllm_runner(
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -170,10 +169,8 @@ def test_mamba_cache_cg_padding(
batch size. If it's not, a torch RuntimeError will be raised because
tensor dimensions aren't compatible.
"""
vllm_config = EngineArgs(model=model,
trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(
len(example_prompts)):
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
example_prompts.append(example_prompts[0])
try:
@@ -183,7 +180,8 @@ def test_mamba_cache_cg_padding(
pytest.fail(
"Couldn't run batch size which is not equal to a Cuda Graph "
"captured batch size. "
"Could be related to mamba cache not padded correctly")
"Could be related to mamba cache not padded correctly"
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -205,8 +203,10 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily ")
pytest.fail(
"Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily "
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -215,10 +215,10 @@ def test_state_cleanup(
example_prompts,
model: str,
) -> None:
"""
"""
This test is for verifying that the Hybrid state is cleaned up between
steps.
If it's not cleaned, an error would be expected.
"""
try:
@@ -226,8 +226,10 @@ def test_state_cleanup(
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids")
pytest.fail(
"Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids"
)
@multi_gpu_test(num_gpus=2)
@@ -241,15 +243,19 @@ def test_distributed_correctness(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1,
@@ -271,7 +277,6 @@ def test_full_cuda_graph(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -281,11 +286,13 @@ def test_full_cuda_graph(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -298,8 +305,9 @@ def test_full_cuda_graph(
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("cache_dtype_param",
["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
@pytest.mark.parametrize(
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
)
def test_fp32_cache_state(
hf_runner,
vllm_runner,
@@ -310,7 +318,6 @@ def test_fp32_cache_state(
num_logprobs: int,
cache_dtype_param: str,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -320,13 +327,15 @@ def test_fp32_cache_state(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
**{cache_dtype_param: "float32"}) as vllm_model:
with vllm_runner(
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -339,21 +348,23 @@ def test_fp32_cache_state(
# Helper functions for the APC tests
def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1):
return {
'model_name': model,
'enable_prefix_caching': False,
'max_model_len': max_model_len,
'tensor_parallel_size': tensor_parallel_size,
'gpu_memory_utilization': 0.4
"model_name": model,
"enable_prefix_caching": False,
"max_model_len": max_model_len,
"tensor_parallel_size": tensor_parallel_size,
"gpu_memory_utilization": 0.4,
}
def _get_vLLM_output(vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None):
def _get_vLLM_output(
vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None,
):
outs = []
if vllm_model is None:
vllm_model = vllm_runner(**kwargs)
@@ -362,7 +373,8 @@ def _get_vLLM_output(vllm_runner,
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
else:
vllm_output = vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
prompts, max_tokens, num_logprobs
)
outs.append(vllm_output)
return outs, vllm_model
@@ -387,7 +399,6 @@ def test_apc_single_prompt(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -395,29 +406,33 @@ def test_apc_single_prompt(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * example_prompts[0]]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -450,7 +465,6 @@ def test_apc_single_prompt_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -458,30 +472,29 @@ def test_apc_single_prompt_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
generated_prompts = ["The president of the United States is " * MULTIPLE]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -489,18 +502,18 @@ def test_apc_single_prompt_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -534,7 +547,6 @@ def test_apc_multiple_prompts_all_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -542,30 +554,34 @@ def test_apc_multiple_prompts_all_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -598,7 +614,6 @@ def test_apc_multiple_prompts_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -606,34 +621,31 @@ def test_apc_multiple_prompts_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
prompt_text = "The president of the United States is "
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
generated_prompts = [
prompt_text[offset:] * MULTIPLE for offset in prompt_offsets
]
generated_prompts = [prompt_text[offset:] * MULTIPLE for offset in prompt_offsets]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(model, max_model_len,
tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -641,18 +653,18 @@ def test_apc_multiple_prompts_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -686,7 +698,6 @@ def test_apc_multiple_prompts_partial_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -694,30 +705,30 @@ def test_apc_multiple_prompts_partial_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
# Cache only part of all the prompts
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens,
num_logprobs)
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
)
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0][:3],
@@ -726,13 +737,15 @@ def test_apc_multiple_prompts_partial_cached_outputs(
name_1="vllm_partial_cache",
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled

View File

@@ -6,7 +6,9 @@ import json
import pytest
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolCall, MistralToolParser)
MistralToolCall,
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import MistralTokenizer
@@ -33,136 +35,114 @@ SYMBOLIC_LANG_PROMPTS = [
]
# for function calling
TOOLS = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
TOOLS = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
"required": ["city", "state", "unit"],
},
"required": ["city", "state", "unit"]
}
},
},
}, {
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite."
}
}
}
}
}]
{
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite.",
}
},
},
},
},
]
MSGS = [
{"role": "system", "content": "You are an assistant."},
{
"role": "system",
"content": "You are an assistant."
},
{
"role":
"user",
"content":
"Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa
},
{
"role":
"assistant",
"content":
"",
"tool_calls": [{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name":
"rewrite",
"arguments":
'{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa
}
}]
},
{
"role": "tool",
"content":
"{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite"
"role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors"
"content": "",
"tool_calls": [
{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
},
}
],
},
{
"role":
"user",
"content": ("Can you tell me what the temperate"
" will be in Dallas, in fahrenheit?")
}
"role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite",
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors",
},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
"items": {"type": "string", "maxLength": 10},
"minItems": 3,
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
"company": {"type": "string"},
"duration": {"type": "number"},
"position": {"type": "string"},
},
"required": ["company", "position"]
}
}
"required": ["company", "position"],
},
},
},
"required": ["name", "age", "skills", "work_history"]
"required": ["name", "age", "skills", "work_history"],
}
@@ -170,17 +150,25 @@ SAMPLE_JSON_SCHEMA = {
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype,
tokenizer_mode="mistral") as vllm_model:
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -194,27 +182,35 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
max_tokens: int, num_logprobs: int) -> None:
def test_mistral_format(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
) as mistral_format_model:
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_format_outputs,
@@ -226,34 +222,35 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
    """Chat with prompts in symbolic-script languages and verify the decoded
    output contains no U+FFFD replacement character (i.e. no mojibake from the
    mistral tokenizer round-trip)."""
    with vllm_runner(
        model,
        dtype=dtype,
        max_model_len=8192,
        tokenizer_mode="mistral",
        config_format="mistral",
        load_format="mistral",
    ) as vllm_model:
        for prompt in SYMBOLIC_LANG_PROMPTS:
            msg = {"role": "user", "content": prompt}
            outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
            # "\ufffd" is the Unicode replacement character; its presence
            # would indicate the tokenizer produced invalid UTF-8.
            assert "\ufffd" not in outputs[0].outputs[0].text.strip()
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.llm.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat(
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
)
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
@@ -265,10 +262,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
assert parsed_message.tools_called
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
assert parsed_message.tool_calls[
0].function.name == "get_current_weather"
assert parsed_message.tool_calls[
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
assert (
parsed_message.tool_calls[0].function.arguments
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
) # noqa
assert parsed_message.content is None
@@ -297,17 +295,10 @@ def test_mistral_function_call_nested_json():
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
"sub_dict": {
"foo": "bar",
"inner": {
"x": 1,
"y": 2
}
},
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
}
model_output = (
f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
parsed = parser.extract_tool_calls(model_output, None)

View File

@@ -15,62 +15,56 @@ MODELS = [
def test_phimoe_routing_function():
    """Unit-test PhiMoE's custom expert-routing function against hand-computed
    top-k weights/ids for two small gating configurations."""
    from vllm.model_executor.models.phimoe import phimoe_routing_function

    # Each case: a (4, 2) hidden-state tensor plus per-expert gating logits;
    # topk=2 experts are selected, without renormalization.
    test_case = {
        0: {
            "hidden_states": torch.tensor(
                [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
            ).view(4, 2),
            "gating_output": torch.tensor(
                [0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
            ),
            "topk": 2,
            "renormalize": False,
        },
        1: {
            "hidden_states": torch.tensor(
                [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
            ).view(4, 2),
            "gating_output": torch.tensor(
                [0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
            ),
            "topk": 2,
            "renormalize": False,
        },
    }
    # Expected routing results for each case above.
    ground_truth = {
        0: {
            "topk_weights": torch.tensor(
                [1.0, 1.0], dtype=torch.float32, requires_grad=False
            ),
            "topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
        },
        1: {
            "topk_weights": torch.tensor(
                [0.5, 1.0], dtype=torch.float32, requires_grad=False
            ),
            "topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
        },
    }

    for test_id in test_case:
        topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
        # allclose for the float weights, exact equality for the integer ids.
        assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
        assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.")
@pytest.mark.skipif(
condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.",
)
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -87,11 +81,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,