Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -11,12 +11,12 @@ from vllm.multimodal.image import rescale_image_size
from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
from ..utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
"cherry_blossom":
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
})
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
"cherry_blossom": "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
}
)
def run_awq_test(
@@ -34,10 +34,13 @@ def run_awq_test(
):
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
@@ -46,42 +49,41 @@ def run_awq_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(
source_model,
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
source_model,
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
) as vllm_model:
source_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs, images=images
)
for prompts, images in inputs_per_image
]
with vllm_runner(
quant_model,
quantization="awq",
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
quant_model,
quantization="awq",
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
default_torch_num_threads=1,
) as vllm_model:
quant_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs=num_logprobs, images=images
)
for prompts, images in inputs_per_image
]
for source_outputs, quant_outputs in zip(source_outputs_per_image,
quant_outputs_per_image):
for source_outputs, quant_outputs in zip(
source_outputs_per_image, quant_outputs_per_image
):
# TODO: Check whether using original CLIPVisionModel can improve
# consistency against HF
check_logprobs_close(
@@ -113,9 +115,16 @@ def run_awq_test(
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
size_factors, dtype, max_tokens, num_logprobs) -> None:
def test_awq_models(
vllm_runner,
image_assets,
source_model,
quant_model,
size_factors,
dtype,
max_tokens,
num_logprobs,
) -> None:
run_awq_test(
vllm_runner,
image_assets,

View File

@@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
"""
from dataclasses import dataclass
import pytest
@@ -24,8 +25,10 @@ class ModelPair:
model_pairs = [
ModelPair(model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
ModelPair(
model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
model_gptq="hxbgsyxh/opt-125m-4bit-128g",
),
]
@@ -43,16 +46,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_bitblas,
dtype=dtype,
quantization="bitblas") as bitblas_model:
with vllm_runner(
model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
) as bitblas_model:
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,

View File

@@ -1,9 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
'''Tests whether bitsandbytes computation is enabled correctly.
"""Tests whether bitsandbytes computation is enabled correctly.
Run `pytest tests/quantization/test_bitsandbytes.py`.
'''
"""
import pytest
from transformers import BitsAndBytesConfig
@@ -15,8 +15,10 @@ from ..utils import check_embeddings_close, check_logprobs_close
models_4bit_to_test = [
("facebook/opt-125m", "quantize opt model inflight"),
("mistralai/Mistral-7B-Instruct-v0.3",
"quantize inflight model with both HF and Mistral format weights")
(
"mistralai/Mistral-7B-Instruct-v0.3",
"quantize inflight model with both HF and Mistral format weights",
),
]
models_4bit_to_embedding_test = [
@@ -28,72 +30,84 @@ models_4bit_to_moe_test = [
]
models_pre_qaunt_4bit_to_test = [
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
'read pre-quantized 4-bit FP4 model'),
('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
(
"PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed",
"read pre-quantized 4-bit FP4 model",
),
("poedator/opt-125m-bnb-4bit", "read pre-quantized 4-bit NF4 opt model"),
]
models_pre_quant_8bit_to_test = [
('meta-llama/Llama-Guard-3-8B-INT8',
'read pre-quantized llama 8-bit model'),
("meta-llama/Llama-Guard-3-8B-INT8", "read pre-quantized llama 8-bit model"),
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, False, hf_model_kwargs)
def test_load_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, False, hf_model_kwargs
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_qaunt_4bit_to_test)
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, True)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_pre_qaunt_4bit_to_test)
def test_load_pre_quant_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, True
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_quant_8bit_to_test)
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, True)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_pre_quant_8bit_to_test)
def test_load_8bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
validate_generated_texts(
hf_runner, vllm_runner, example_prompts[:1], model_name, True
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@multi_gpu_test(num_gpus=2)
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
validate_generated_texts(hf_runner,
vllm_runner,
example_prompts[:1],
model_name,
False,
hf_model_kwargs,
vllm_tp_size=2)
def test_load_tp_4bit_bnb_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
validate_generated_texts(
hf_runner,
vllm_runner,
example_prompts[:1],
model_name,
False,
hf_model_kwargs,
vllm_tp_size=2,
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@multi_gpu_test(num_gpus=2)
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
@@ -115,30 +129,37 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
compare_two_settings(model_name, common_args, pp_args)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
def test_4bit_bnb_moe_model(
hf_runner, vllm_runner, example_prompts, model_name, description
) -> None:
hf_model_kwargs = dict(
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
)
)
with vllm_runner(
model_name,
quantization="bitsandbytes",
enforce_eager=False,
default_torch_num_threads=1,
) as llm:
vllm_outputs = llm.generate_greedy_logprobs(
example_prompts, max_tokens=32, num_logprobs=5
)
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
))
with vllm_runner(model_name,
quantization='bitsandbytes',
enforce_eager=False,
default_torch_num_threads=1) as llm:
vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
max_tokens=32,
num_logprobs=5)
with hf_runner(model_name,
model_kwargs=hf_model_kwargs,
default_torch_num_threads=1) as llm:
with hf_runner(
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
) as llm:
transformers_outputs = llm.generate_greedy_logprobs_limit(
example_prompts, max_tokens=32, num_logprobs=5)
example_prompts, max_tokens=32, num_logprobs=5
)
check_logprobs_close(
outputs_0_lst=transformers_outputs,
outputs_1_lst=vllm_outputs,
@@ -147,10 +168,11 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_4bit_to_embedding_test)
@pytest.mark.skipif(
not is_quant_method_supported("bitsandbytes"),
reason="bitsandbytes is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name, description", models_4bit_to_embedding_test)
@pytest.mark.parametrize("dtype", ["half"])
def test_4bit_bnb_embedding_model(
model_name,
@@ -160,7 +182,6 @@ def test_4bit_bnb_embedding_model(
example_prompts,
dtype: str,
) -> None:
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
@@ -170,22 +191,23 @@ def test_4bit_bnb_embedding_model(
example_prompts = [str(s).strip() for s in example_prompts]
# Inflight 4bit quantization
with vllm_runner(model_name,
runner="pooling",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes",
default_torch_num_threads=1) as vllm_model:
with vllm_runner(
model_name,
runner="pooling",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes",
default_torch_num_threads=1,
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
with hf_runner(
model_name,
dtype=dtype,
model_kwargs=hf_model_kwargs,
is_sentence_transformer=True,
default_torch_num_threads=1,
model_name,
dtype=dtype,
model_kwargs=hf_model_kwargs,
is_sentence_transformer=True,
default_torch_num_threads=1,
) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
@@ -210,23 +232,25 @@ def log_generated_texts(prompts, outputs, runner_name):
return logged_texts
def validate_generated_texts(hf_runner,
vllm_runner,
prompts,
model_name,
pre_quant=False,
hf_model_kwargs=None,
vllm_tp_size=1,
max_tokens=8):
def validate_generated_texts(
hf_runner,
vllm_runner,
prompts,
model_name,
pre_quant=False,
hf_model_kwargs=None,
vllm_tp_size=1,
max_tokens=8,
):
# NOTE: run vLLM first, as it requires a clean process
# when using distributed inference
with vllm_runner(model_name,
quantization=None if pre_quant else 'bitsandbytes',
tensor_parallel_size=vllm_tp_size,
enforce_eager=False,
default_torch_num_threads=1) as llm:
with vllm_runner(
model_name,
quantization=None if pre_quant else "bitsandbytes",
tensor_parallel_size=vllm_tp_size,
enforce_eager=False,
default_torch_num_threads=1,
) as llm:
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
@@ -234,9 +258,9 @@ def validate_generated_texts(hf_runner,
hf_model_kwargs = {}
# Run with HF runner
with hf_runner(model_name,
model_kwargs=hf_model_kwargs,
default_torch_num_threads=1) as llm:
with hf_runner(
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
) as llm:
hf_outputs = llm.generate_greedy(prompts, max_tokens)
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
@@ -245,8 +269,10 @@ def validate_generated_texts(hf_runner,
hf_str = hf_log["generated_text"]
vllm_str = vllm_log["generated_text"]
prompt = hf_log["prompt"]
assert hf_str == vllm_str, (f"Model: {model_name}"
f"Mismatch between HF and vLLM outputs:\n"
f"Prompt: {prompt}\n"
f"HF Output: '{hf_str}'\n"
f"vLLM Output: '{vllm_str}'")
assert hf_str == vllm_str, (
f"Model: {model_name}"
f"Mismatch between HF and vLLM outputs:\n"
f"Prompt: {prompt}\n"
f"HF Output: '{hf_str}'\n"
f"vLLM Output: '{vllm_str}'"
)

View File

@@ -5,6 +5,7 @@
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
"""
import pytest
from tests.quantization.utils import is_quant_method_supported
@@ -14,21 +15,33 @@ from vllm.utils import STR_BACKEND_ENV_VAR
from ..utils import check_logprobs_close
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.",
)
@pytest.mark.parametrize(
"kv_cache_dtype,base_model,test_model",
[
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
(
"fp8_e4m3",
"meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
),
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
(
"fp8_e5m2",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct")
])
(
"fp8_e4m3",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
],
)
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("enforce_eager", [True])
@@ -54,38 +67,39 @@ def test_models(
"""
if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
pytest.skip(
f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv("TOKENIZERS_PARALLELISM", "true")
m.setenv(STR_BACKEND_ENV_VAR, backend)
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
@@ -96,15 +110,18 @@ def test_models(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(),
reason="test for the CPU backend.")
@pytest.mark.skipif(not current_platform.is_cpu(), reason="test for the CPU backend.")
@pytest.mark.parametrize(
"kv_cache_dtype,base_model,test_model",
[
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
])
(
"fp8_e5m2",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
),
],
)
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
def test_cpu_models(
@@ -121,28 +138,30 @@ def test_cpu_models(
numerical sensitive kernels.
"""
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv("TOKENIZERS_PARALLELISM", "true")
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
base_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
test_model,
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS
)
check_logprobs_close(
outputs_0_lst=baseline_outputs,

View File

@@ -100,35 +100,37 @@ def check_model_outputs(
):
tokenizer = AutoTokenizer.from_pretrained(model.original_model)
if tokenizer.chat_template is not None:
messages = [[{
'role': 'user',
'content': prompt
}] for prompt in prompts]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Run gguf model.
with vllm_runner(model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as gguf_model:
with vllm_runner(
model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size,
) as gguf_model:
gguf_outputs = gguf_model.generate_greedy_logprobs(
prompts[:-1], max_tokens, num_logprobs)
prompts[:-1], max_tokens, num_logprobs
)
# Run unquantized model.
# Should run with tp=1, otherwise the test will stuck at
# nccl initialization.
with vllm_runner(
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as original_model:
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
prompts[:-1], max_tokens, num_logprobs)
prompts[:-1], max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=original_outputs,
@@ -138,12 +140,14 @@ def check_model_outputs(
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", [
pytest.param(test_config, marks=test_config.marks)
for test_config in MODELS
])
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
"model",
[pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -157,12 +161,15 @@ def test_models(
num_logprobs: int,
tp_size: int,
) -> None:
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
num_logprobs, tp_size)
check_model_outputs(
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@@ -178,5 +185,6 @@ def test_distributed(
num_logprobs: int,
tp_size: int,
) -> None:
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
num_logprobs, tp_size)
check_model_outputs(
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
)

View File

@@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
"""
from dataclasses import dataclass
import pytest
@@ -41,16 +42,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_gptq,
dtype=dtype,
quantization="bitblas") as bitblas_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="bitblas"
) as bitblas_model:
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,

View File

@@ -9,6 +9,7 @@ Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
"""
import os
import pytest
@@ -26,20 +27,20 @@ MAX_MODEL_LEN = 1024
MODELS = [
# act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
# 8-bit, act_order==True, group_size=channelwise
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
# 4-bit, act_order==True, group_size=128
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main"),
]
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gptq_marlin")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="gptq_marlin is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@@ -55,29 +56,34 @@ def test_models(
model_name, revision = model
# Run marlin.
with vllm_runner(model_name=model_name,
revision=revision,
dtype=dtype,
quantization="marlin",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as gptq_marlin_model:
with vllm_runner(
model_name=model_name,
revision=revision,
dtype=dtype,
quantization="marlin",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as gptq_marlin_model:
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
example_prompts[:-1], max_tokens, num_logprobs
)
_ROPE_DICT.clear() # clear rope cache to avoid rope dtype error
# Run gptq.
# The naive gptq kernel doesn't support bf16 yet.
# Here we always compare fp16/bf16 gpt marlin kernel
# to fp16 gptq kernel.
with vllm_runner(model_name=model_name,
revision=revision,
dtype="half",
quantization="gptq",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as gptq_model:
with vllm_runner(
model_name=model_name,
revision=revision,
dtype="half",
quantization="gptq",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1,
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
example_prompts[:-1], max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,

View File

@@ -6,6 +6,7 @@ Note: GPTQ and Marlin_24 do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
"""
from dataclasses import dataclass
import pytest
@@ -24,15 +25,18 @@ class ModelPair:
model_pairs = [
# 4-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
ModelPair(
model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128",
),
# # 4-bit, group_size == channelwise
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
# model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
# 8-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
ModelPair(
model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128",
),
# # 8-bit, group_size == channelwise
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
# model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
@@ -40,10 +44,12 @@ model_pairs = [
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="Marlin24 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("gptq_marlin_24")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="Marlin24 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@@ -56,16 +62,19 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_marlin,
dtype=dtype,
quantization="gptq_marlin_24") as marlin_24_model:
with vllm_runner(
model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
) as marlin_24_model:
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
with vllm_runner(
model_pair.model_gptq, dtype=dtype, quantization="gptq"
) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=gptq_outputs,

View File

@@ -5,6 +5,7 @@
"""Tests Model Optimizer fp8 models against ground truth generation
Note: these tests will only pass on H100
"""
import os
import pytest
@@ -22,13 +23,13 @@ MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.1-8B-Instruct-FP8": [
"You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
"Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
"The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
'**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
"**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
"The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
"The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
"Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
]
}
@@ -39,10 +40,12 @@ EXPECTED_STRS_MAP = {
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
reason="Prevent unstable test based on golden strings from breaking the build."
)
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
llm = LLM(
@@ -55,12 +58,11 @@ def test_models(example_prompts, model_name) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_name)
formatted_prompts = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
tokenize=False,
add_generation_prompt=True)
tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
tokenize=False,
add_generation_prompt=True,
)
for prompt in example_prompts
]
params = SamplingParams(max_tokens=20, temperature=0)
@@ -78,4 +80,5 @@ def test_models(example_prompts, model_name) -> None:
generated_str = generations[i]
expected_str = expected_strs[i]
assert expected_str == generated_str, (
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
)

View File

@@ -1,8 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# flake8: noqa
"""Tests Quark mxfp4 models against ground truth generation
"""
"""Tests Quark mxfp4 models against ground truth generation"""
import pytest
from vllm import LLM, SamplingParams
@@ -11,13 +11,13 @@ MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]
EXPECTED_STRS_MAP = {
"amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
'\n### Key Features\n\n* **High-throughput Inference**: vLL',
'\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
'\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
'\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
"\n### Key Features\n\n* **High-throughput Inference**: vLL",
"\nArtificial intelligence (AI) has evolved significantly since its inception in the 1",
"Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been",
"A neural network is a machine learning model inspired by the structure of the human brain. It consists of",
"\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol",
"\nThe COVID-19 pandemic has had a profound impact on global economic structures and business",
"The Mona Lisa painting, created by Leonardo da Vinci in the early 16th",
" everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
]
}
@@ -38,4 +38,5 @@ def test_models(example_prompts, model_name) -> None:
output_str = output.outputs[0].text
expected_str = EXPECTED_STRS_MAP[model_name][i]
assert expected_str == output_str, (
f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
f"Expected: {expected_str!r}\nvLLM: {output_str!r}"
)

View File

@@ -4,6 +4,7 @@
"""Tests Model Optimizer nvfp4 models against ground truth generation
Note: these tests will only pass on B200
"""
import os
from typing import List
@@ -21,14 +22,14 @@ MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.3-70B-Instruct-FP4": [
'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
"vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
"Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
"Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
"A neural network is a type of machine learning model inspired by the structure and function of the human brain",
"In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
"The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
"The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
"Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
]
}
@@ -39,11 +40,13 @@ EXPECTED_STRS_MAP = {
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system.")
@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
reason="modelopt_fp4 is not supported on this GPU type.")
reason="Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system."
)
@pytest.mark.skipif(
not is_quant_method_supported("modelopt_fp4"),
reason="modelopt_fp4 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
llm = LLM(
@@ -56,12 +59,11 @@ def test_models(example_prompts, model_name) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_name)
formatted_prompts = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
tokenize=False,
add_generation_prompt=True)
tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
tokenize=False,
add_generation_prompt=True,
)
for prompt in example_prompts
]
params = SamplingParams(max_tokens=20, temperature=0)
@@ -79,4 +81,5 @@ def test_models(example_prompts, model_name) -> None:
generated_str = generations[i]
expected_str = expected_strs[i]
assert expected_str == generated_str, (
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
)