Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -100,35 +100,37 @@ def check_model_outputs(
):
tokenizer = AutoTokenizer.from_pretrained(model.original_model)
if tokenizer.chat_template is not None:
messages = [[{
'role': 'user',
'content': prompt
}] for prompt in prompts]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Run gguf model.
with vllm_runner(model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as gguf_model:
with vllm_runner(
model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size,
) as gguf_model:
gguf_outputs = gguf_model.generate_greedy_logprobs(
prompts[:-1], max_tokens, num_logprobs)
prompts[:-1], max_tokens, num_logprobs
)
# Run unquantized model.
# Should run with tp=1, otherwise the test will stuck at
# nccl initialization.
with vllm_runner(
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as original_model:
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
prompts[:-1], max_tokens, num_logprobs)
prompts[:-1], max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=original_outputs,
@@ -138,12 +140,14 @@ def check_model_outputs(
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", [
pytest.param(test_config, marks=test_config.marks)
for test_config in MODELS
])
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
"model",
[pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -157,12 +161,15 @@ def test_models(
num_logprobs: int,
tp_size: int,
) -> None:
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
num_logprobs, tp_size)
check_model_outputs(
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@@ -178,5 +185,6 @@ def test_distributed(
num_logprobs: int,
tp_size: int,
) -> None:
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
num_logprobs, tp_size)
check_model_outputs(
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
)