Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -11,12 +11,12 @@ from vllm.multimodal.image import rescale_image_size
|
||||
from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
})
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
"cherry_blossom": "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def run_awq_test(
|
||||
@@ -34,10 +34,13 @@ def run_awq_test(
|
||||
):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
)
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
@@ -46,42 +49,41 @@ def run_awq_test(
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
source_model,
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
source_model,
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
source_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs, images=images
|
||||
)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
quant_model,
|
||||
quantization="awq",
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
quant_model,
|
||||
quantization="awq",
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
quant_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs, images=images
|
||||
)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
for source_outputs, quant_outputs in zip(source_outputs_per_image,
|
||||
quant_outputs_per_image):
|
||||
for source_outputs, quant_outputs in zip(
|
||||
source_outputs_per_image, quant_outputs_per_image
|
||||
):
|
||||
# TODO: Check whether using original CLIPVisionModel can improve
|
||||
# consistency against HF
|
||||
check_logprobs_close(
|
||||
@@ -113,9 +115,16 @@ def run_awq_test(
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@torch.inference_mode()
|
||||
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
|
||||
size_factors, dtype, max_tokens, num_logprobs) -> None:
|
||||
|
||||
def test_awq_models(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
source_model,
|
||||
quant_model,
|
||||
size_factors,
|
||||
dtype,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
) -> None:
|
||||
run_awq_test(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
|
||||
@@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
|
||||
bitblas/GPTQ models are in the top 3 selections of each other.
|
||||
|
||||
Note: bitblas internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for bitblas. As a result, we re-run the
|
||||
result in very slight nondeterminism for bitblas. As a result, we re-run the
|
||||
test up to 3 times to see if we pass.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
@@ -24,8 +25,10 @@ class ModelPair:
|
||||
|
||||
|
||||
model_pairs = [
|
||||
ModelPair(model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
|
||||
model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
|
||||
ModelPair(
|
||||
model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
|
||||
model_gptq="hxbgsyxh/opt-125m-4bit-128g",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@@ -43,16 +46,19 @@ def test_models(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(model_pair.model_bitblas,
|
||||
dtype=dtype,
|
||||
quantization="bitblas") as bitblas_model:
|
||||
with vllm_runner(
|
||||
model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
|
||||
) as bitblas_model:
|
||||
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model_pair.model_gptq, dtype=dtype,
|
||||
quantization="gptq") as gptq_model:
|
||||
with vllm_runner(
|
||||
model_pair.model_gptq, dtype=dtype, quantization="gptq"
|
||||
) as gptq_model:
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=gptq_outputs,
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
'''Tests whether bitsandbytes computation is enabled correctly.
|
||||
"""Tests whether bitsandbytes computation is enabled correctly.
|
||||
|
||||
Run `pytest tests/quantization/test_bitsandbytes.py`.
|
||||
'''
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from transformers import BitsAndBytesConfig
|
||||
@@ -15,8 +15,10 @@ from ..utils import check_embeddings_close, check_logprobs_close
|
||||
|
||||
models_4bit_to_test = [
|
||||
("facebook/opt-125m", "quantize opt model inflight"),
|
||||
("mistralai/Mistral-7B-Instruct-v0.3",
|
||||
"quantize inflight model with both HF and Mistral format weights")
|
||||
(
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
"quantize inflight model with both HF and Mistral format weights",
|
||||
),
|
||||
]
|
||||
|
||||
models_4bit_to_embedding_test = [
|
||||
@@ -28,72 +30,84 @@ models_4bit_to_moe_test = [
|
||||
]
|
||||
|
||||
models_pre_qaunt_4bit_to_test = [
|
||||
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
|
||||
'read pre-quantized 4-bit FP4 model'),
|
||||
('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
|
||||
(
|
||||
"PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed",
|
||||
"read pre-quantized 4-bit FP4 model",
|
||||
),
|
||||
("poedator/opt-125m-bnb-4bit", "read pre-quantized 4-bit NF4 opt model"),
|
||||
]
|
||||
|
||||
models_pre_quant_8bit_to_test = [
|
||||
('meta-llama/Llama-Guard-3-8B-INT8',
|
||||
'read pre-quantized llama 8-bit model'),
|
||||
("meta-llama/Llama-Guard-3-8B-INT8", "read pre-quantized llama 8-bit model"),
|
||||
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True))
|
||||
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
|
||||
model_name, False, hf_model_kwargs)
|
||||
def test_load_4bit_bnb_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
|
||||
validate_generated_texts(
|
||||
hf_runner, vllm_runner, example_prompts[:1], model_name, False, hf_model_kwargs
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_qaunt_4bit_to_test)
|
||||
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
|
||||
model_name, True)
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_pre_qaunt_4bit_to_test)
|
||||
def test_load_pre_quant_4bit_bnb_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
validate_generated_texts(
|
||||
hf_runner, vllm_runner, example_prompts[:1], model_name, True
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_quant_8bit_to_test)
|
||||
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
|
||||
model_name, True)
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_pre_quant_8bit_to_test)
|
||||
def test_load_8bit_bnb_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
validate_generated_texts(
|
||||
hf_runner, vllm_runner, example_prompts[:1], model_name, True
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True))
|
||||
validate_generated_texts(hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts[:1],
|
||||
model_name,
|
||||
False,
|
||||
hf_model_kwargs,
|
||||
vllm_tp_size=2)
|
||||
def test_load_tp_4bit_bnb_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
|
||||
validate_generated_texts(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts[:1],
|
||||
model_name,
|
||||
False,
|
||||
hf_model_kwargs,
|
||||
vllm_tp_size=2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
||||
@@ -115,30 +129,37 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
||||
compare_two_settings(model_name, common_args, pp_args)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
|
||||
def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
def test_4bit_bnb_moe_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
hf_model_kwargs = dict(
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
)
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
quantization="bitsandbytes",
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1,
|
||||
) as llm:
|
||||
vllm_outputs = llm.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens=32, num_logprobs=5
|
||||
)
|
||||
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_use_double_quant=True,
|
||||
))
|
||||
with vllm_runner(model_name,
|
||||
quantization='bitsandbytes',
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1) as llm:
|
||||
vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
|
||||
max_tokens=32,
|
||||
num_logprobs=5)
|
||||
|
||||
with hf_runner(model_name,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
default_torch_num_threads=1) as llm:
|
||||
with hf_runner(
|
||||
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
|
||||
) as llm:
|
||||
transformers_outputs = llm.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens=32, num_logprobs=5)
|
||||
example_prompts, max_tokens=32, num_logprobs=5
|
||||
)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=transformers_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
@@ -147,10 +168,11 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_4bit_to_embedding_test)
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_embedding_test)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_4bit_bnb_embedding_model(
|
||||
model_name,
|
||||
@@ -160,7 +182,6 @@ def test_4bit_bnb_embedding_model(
|
||||
example_prompts,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
# sentence_transformers will strip the input texts, see:
|
||||
@@ -170,22 +191,23 @@ def test_4bit_bnb_embedding_model(
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
# Inflight 4bit quantization
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
gpu_memory_utilization=0.5,
|
||||
quantization="bitsandbytes",
|
||||
default_torch_num_threads=1) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
gpu_memory_utilization=0.5,
|
||||
quantization="bitsandbytes",
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True))
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
|
||||
with hf_runner(
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
is_sentence_transformer=True,
|
||||
default_torch_num_threads=1,
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
is_sentence_transformer=True,
|
||||
default_torch_num_threads=1,
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
@@ -210,23 +232,25 @@ def log_generated_texts(prompts, outputs, runner_name):
|
||||
return logged_texts
|
||||
|
||||
|
||||
def validate_generated_texts(hf_runner,
|
||||
vllm_runner,
|
||||
prompts,
|
||||
model_name,
|
||||
pre_quant=False,
|
||||
hf_model_kwargs=None,
|
||||
vllm_tp_size=1,
|
||||
max_tokens=8):
|
||||
|
||||
def validate_generated_texts(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
prompts,
|
||||
model_name,
|
||||
pre_quant=False,
|
||||
hf_model_kwargs=None,
|
||||
vllm_tp_size=1,
|
||||
max_tokens=8,
|
||||
):
|
||||
# NOTE: run vLLM first, as it requires a clean process
|
||||
# when using distributed inference
|
||||
with vllm_runner(model_name,
|
||||
quantization=None if pre_quant else 'bitsandbytes',
|
||||
tensor_parallel_size=vllm_tp_size,
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1) as llm:
|
||||
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
quantization=None if pre_quant else "bitsandbytes",
|
||||
tensor_parallel_size=vllm_tp_size,
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1,
|
||||
) as llm:
|
||||
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
|
||||
|
||||
@@ -234,9 +258,9 @@ def validate_generated_texts(hf_runner,
|
||||
hf_model_kwargs = {}
|
||||
|
||||
# Run with HF runner
|
||||
with hf_runner(model_name,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
default_torch_num_threads=1) as llm:
|
||||
with hf_runner(
|
||||
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
|
||||
) as llm:
|
||||
hf_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
@@ -245,8 +269,10 @@ def validate_generated_texts(hf_runner,
|
||||
hf_str = hf_log["generated_text"]
|
||||
vllm_str = vllm_log["generated_text"]
|
||||
prompt = hf_log["prompt"]
|
||||
assert hf_str == vllm_str, (f"Model: {model_name}"
|
||||
f"Mismatch between HF and vLLM outputs:\n"
|
||||
f"Prompt: {prompt}\n"
|
||||
f"HF Output: '{hf_str}'\n"
|
||||
f"vLLM Output: '{vllm_str}'")
|
||||
assert hf_str == vllm_str, (
|
||||
f"Model: {model_name}"
|
||||
f"Mismatch between HF and vLLM outputs:\n"
|
||||
f"Prompt: {prompt}\n"
|
||||
f"HF Output: '{hf_str}'\n"
|
||||
f"vLLM Output: '{vllm_str}'"
|
||||
)
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
"""Tests fp8 models against ground truth generation
|
||||
Note: these tests will only pass on L4 GPU.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
@@ -14,21 +15,33 @@ from vllm.utils import STR_BACKEND_ENV_VAR
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.")
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype,base_model,test_model",
|
||||
[
|
||||
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
|
||||
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
|
||||
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
|
||||
(
|
||||
"fp8_e4m3",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
|
||||
),
|
||||
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
|
||||
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct"),
|
||||
(
|
||||
"fp8_e5m2",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
),
|
||||
# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
|
||||
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct")
|
||||
])
|
||||
(
|
||||
"fp8_e4m3",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
),
|
||||
],
|
||||
)
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("enforce_eager", [True])
|
||||
@@ -54,38 +67,39 @@ def test_models(
|
||||
"""
|
||||
|
||||
if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
|
||||
pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
|
||||
|
||||
if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
|
||||
pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("TOKENIZERS_PARALLELISM", 'true')
|
||||
m.setenv("TOKENIZERS_PARALLELISM", "true")
|
||||
m.setenv(STR_BACKEND_ENV_VAR, backend)
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
NUM_LOG_PROBS = 8
|
||||
|
||||
with vllm_runner(
|
||||
base_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype="auto",
|
||||
base_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype="auto",
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
test_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
test_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=baseline_outputs,
|
||||
@@ -96,15 +110,18 @@ def test_models(
|
||||
|
||||
|
||||
@pytest.mark.cpu_model
|
||||
@pytest.mark.skipif(not current_platform.is_cpu(),
|
||||
reason="test for the CPU backend.")
|
||||
@pytest.mark.skipif(not current_platform.is_cpu(), reason="test for the CPU backend.")
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype,base_model,test_model",
|
||||
[
|
||||
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
|
||||
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct"),
|
||||
])
|
||||
(
|
||||
"fp8_e5m2",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
),
|
||||
],
|
||||
)
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
def test_cpu_models(
|
||||
@@ -121,28 +138,30 @@ def test_cpu_models(
|
||||
numerical sensitive kernels.
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("TOKENIZERS_PARALLELISM", 'true')
|
||||
m.setenv("TOKENIZERS_PARALLELISM", "true")
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
NUM_LOG_PROBS = 8
|
||||
|
||||
with vllm_runner(
|
||||
base_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype="auto",
|
||||
base_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype="auto",
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
test_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
test_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=baseline_outputs,
|
||||
|
||||
@@ -100,35 +100,37 @@ def check_model_outputs(
|
||||
):
|
||||
tokenizer = AutoTokenizer.from_pretrained(model.original_model)
|
||||
if tokenizer.chat_template is not None:
|
||||
messages = [[{
|
||||
'role': 'user',
|
||||
'content': prompt
|
||||
}] for prompt in prompts]
|
||||
prompts = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
# Run gguf model.
|
||||
with vllm_runner(model_name=model.gguf_model,
|
||||
enforce_eager=True,
|
||||
tokenizer_name=model.original_model,
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tp_size) as gguf_model:
|
||||
with vllm_runner(
|
||||
model_name=model.gguf_model,
|
||||
enforce_eager=True,
|
||||
tokenizer_name=model.original_model,
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tp_size,
|
||||
) as gguf_model:
|
||||
gguf_outputs = gguf_model.generate_greedy_logprobs(
|
||||
prompts[:-1], max_tokens, num_logprobs)
|
||||
prompts[:-1], max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
# Run unquantized model.
|
||||
# Should run with tp=1, otherwise the test will get stuck at
|
||||
# nccl initialization.
|
||||
with vllm_runner(
|
||||
model_name=model.original_model,
|
||||
enforce_eager=True, # faster tests
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1) as original_model:
|
||||
model_name=model.original_model,
|
||||
enforce_eager=True, # faster tests
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1,
|
||||
) as original_model:
|
||||
original_outputs = original_model.generate_greedy_logprobs(
|
||||
prompts[:-1], max_tokens, num_logprobs)
|
||||
prompts[:-1], max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=original_outputs,
|
||||
@@ -138,12 +140,14 @@ def check_model_outputs(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model", [
|
||||
pytest.param(test_config, marks=test_config.marks)
|
||||
for test_config in MODELS
|
||||
])
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@@ -157,12 +161,15 @@ def test_models(
|
||||
num_logprobs: int,
|
||||
tp_size: int,
|
||||
) -> None:
|
||||
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
|
||||
num_logprobs, tp_size)
|
||||
check_model_outputs(
|
||||
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.")
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [8])
|
||||
@@ -178,5 +185,6 @@ def test_distributed(
|
||||
num_logprobs: int,
|
||||
tp_size: int,
|
||||
) -> None:
|
||||
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
|
||||
num_logprobs, tp_size)
|
||||
check_model_outputs(
|
||||
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
|
||||
)
|
||||
|
||||
@@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
|
||||
bitblas/GPTQ models are in the top 3 selections of each other.
|
||||
|
||||
Note: bitblas internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for bitblas. As a result, we re-run the
|
||||
result in very slight nondeterminism for bitblas. As a result, we re-run the
|
||||
test up to 3 times to see if we pass.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
@@ -41,16 +42,19 @@ def test_models(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(model_pair.model_gptq,
|
||||
dtype=dtype,
|
||||
quantization="bitblas") as bitblas_model:
|
||||
with vllm_runner(
|
||||
model_pair.model_gptq, dtype=dtype, quantization="bitblas"
|
||||
) as bitblas_model:
|
||||
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model_pair.model_gptq, dtype=dtype,
|
||||
quantization="gptq") as gptq_model:
|
||||
with vllm_runner(
|
||||
model_pair.model_gptq, dtype=dtype, quantization="gptq"
|
||||
) as gptq_model:
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=gptq_outputs,
|
||||
|
||||
@@ -9,6 +9,7 @@ Note: Marlin internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for Marlin. As a result, we re-run the test
|
||||
up to 3 times to see if we pass.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
@@ -26,20 +27,20 @@ MAX_MODEL_LEN = 1024
|
||||
MODELS = [
|
||||
# act_order==True, group_size=128
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
|
||||
|
||||
# 8-bit, act_order==True, group_size=channelwise
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
|
||||
|
||||
# 4-bit, act_order==True, group_size=128
|
||||
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
|
||||
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=3)
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin")
|
||||
or current_platform.is_rocm()
|
||||
or not current_platform.is_cuda(),
|
||||
reason="gptq_marlin is not supported on this GPU type.")
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gptq_marlin")
|
||||
or current_platform.is_rocm()
|
||||
or not current_platform.is_cuda(),
|
||||
reason="gptq_marlin is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@@ -55,29 +56,34 @@ def test_models(
|
||||
model_name, revision = model
|
||||
|
||||
# Run marlin.
|
||||
with vllm_runner(model_name=model_name,
|
||||
revision=revision,
|
||||
dtype=dtype,
|
||||
quantization="marlin",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1) as gptq_marlin_model:
|
||||
|
||||
with vllm_runner(
|
||||
model_name=model_name,
|
||||
revision=revision,
|
||||
dtype=dtype,
|
||||
quantization="marlin",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1,
|
||||
) as gptq_marlin_model:
|
||||
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
|
||||
example_prompts[:-1], max_tokens, num_logprobs)
|
||||
example_prompts[:-1], max_tokens, num_logprobs
|
||||
)
|
||||
_ROPE_DICT.clear() # clear rope cache to avoid rope dtype error
|
||||
|
||||
# Run gptq.
|
||||
# The naive gptq kernel doesn't support bf16 yet.
|
||||
# Here we always compare fp16/bf16 gpt marlin kernel
|
||||
# to fp16 gptq kernel.
|
||||
with vllm_runner(model_name=model_name,
|
||||
revision=revision,
|
||||
dtype="half",
|
||||
quantization="gptq",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1) as gptq_model:
|
||||
with vllm_runner(
|
||||
model_name=model_name,
|
||||
revision=revision,
|
||||
dtype="half",
|
||||
quantization="gptq",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1,
|
||||
) as gptq_model:
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(
|
||||
example_prompts[:-1], max_tokens, num_logprobs)
|
||||
example_prompts[:-1], max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=gptq_outputs,
|
||||
|
||||
@@ -6,6 +6,7 @@ Note: GPTQ and Marlin_24 do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
Marlin/GPTQ models are in the top 3 selections of each other.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
@@ -24,15 +25,18 @@ class ModelPair:
|
||||
|
||||
model_pairs = [
|
||||
# 4-bit, group_size == 128
|
||||
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
|
||||
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
|
||||
ModelPair(
|
||||
model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
|
||||
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128",
|
||||
),
|
||||
# # 4-bit, group_size == channelwise
|
||||
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
|
||||
# model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
|
||||
|
||||
# 8-bit, group_size == 128
|
||||
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
|
||||
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
|
||||
ModelPair(
|
||||
model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
|
||||
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128",
|
||||
),
|
||||
# # 8-bit, group_size == channelwise
|
||||
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
|
||||
# model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
|
||||
@@ -40,10 +44,12 @@ model_pairs = [
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=2)
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24")
|
||||
or current_platform.is_rocm()
|
||||
or not current_platform.is_cuda(),
|
||||
reason="Marlin24 is not supported on this GPU type.")
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gptq_marlin_24")
|
||||
or current_platform.is_rocm()
|
||||
or not current_platform.is_cuda(),
|
||||
reason="Marlin24 is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_pair", model_pairs)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [8])
|
||||
@@ -56,16 +62,19 @@ def test_models(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(model_pair.model_marlin,
|
||||
dtype=dtype,
|
||||
quantization="gptq_marlin_24") as marlin_24_model:
|
||||
with vllm_runner(
|
||||
model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
|
||||
) as marlin_24_model:
|
||||
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model_pair.model_gptq, dtype=dtype,
|
||||
quantization="gptq") as gptq_model:
|
||||
with vllm_runner(
|
||||
model_pair.model_gptq, dtype=dtype, quantization="gptq"
|
||||
) as gptq_model:
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=gptq_outputs,
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
"""Tests Model Optimizer fp8 models against ground truth generation
|
||||
Note: these tests will only pass on H100
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
@@ -22,13 +23,13 @@ MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
|
||||
EXPECTED_STRS_MAP = {
|
||||
"nvidia/Llama-3.1-8B-Instruct-FP8": [
|
||||
"You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
|
||||
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
|
||||
'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
|
||||
"Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
|
||||
"The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
|
||||
'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
|
||||
'**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
|
||||
'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
|
||||
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
|
||||
'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
|
||||
"**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
|
||||
"The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
|
||||
"The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
|
||||
"Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -39,10 +40,12 @@ EXPECTED_STRS_MAP = {
|
||||
# the hardware being run on.
|
||||
# Disabled to prevent it from breaking the build
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Prevent unstable test based on golden strings from breaking the build.")
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.")
|
||||
reason="Prevent unstable test based on golden strings from breaking the build."
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name", MODELS)
|
||||
def test_models(example_prompts, model_name) -> None:
|
||||
llm = LLM(
|
||||
@@ -55,12 +58,11 @@ def test_models(example_prompts, model_name) -> None:
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
formatted_prompts = [
|
||||
tokenizer.apply_chat_template([{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
tokenizer.apply_chat_template(
|
||||
[{"role": "user", "content": prompt}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
for prompt in example_prompts
|
||||
]
|
||||
params = SamplingParams(max_tokens=20, temperature=0)
|
||||
@@ -78,4 +80,5 @@ def test_models(example_prompts, model_name) -> None:
|
||||
generated_str = generations[i]
|
||||
expected_str = expected_strs[i]
|
||||
assert expected_str == generated_str, (
|
||||
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
|
||||
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
|
||||
)
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# flake8: noqa
|
||||
"""Tests Quark mxfp4 models against ground truth generation
|
||||
"""
|
||||
"""Tests Quark mxfp4 models against ground truth generation"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
@@ -11,13 +11,13 @@ MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]
|
||||
|
||||
EXPECTED_STRS_MAP = {
|
||||
"amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
|
||||
'\n### Key Features\n\n* **High-throughput Inference**: vLL',
|
||||
'\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
|
||||
'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
|
||||
'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
|
||||
'\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
|
||||
'\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
|
||||
'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
|
||||
"\n### Key Features\n\n* **High-throughput Inference**: vLL",
|
||||
"\nArtificial intelligence (AI) has evolved significantly since its inception in the 1",
|
||||
"Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been",
|
||||
"A neural network is a machine learning model inspired by the structure of the human brain. It consists of",
|
||||
"\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol",
|
||||
"\nThe COVID-19 pandemic has had a profound impact on global economic structures and business",
|
||||
"The Mona Lisa painting, created by Leonardo da Vinci in the early 16th",
|
||||
" everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
|
||||
]
|
||||
}
|
||||
@@ -38,4 +38,5 @@ def test_models(example_prompts, model_name) -> None:
|
||||
output_str = output.outputs[0].text
|
||||
expected_str = EXPECTED_STRS_MAP[model_name][i]
|
||||
assert expected_str == output_str, (
|
||||
f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
|
||||
f"Expected: {expected_str!r}\nvLLM: {output_str!r}"
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
"""Tests Model Optimizer nvfp4 models against ground truth generation
|
||||
Note: these tests will only pass on B200
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
@@ -21,14 +22,14 @@ MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
|
||||
|
||||
EXPECTED_STRS_MAP = {
|
||||
"nvidia/Llama-3.3-70B-Instruct-FP4": [
|
||||
'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
|
||||
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
|
||||
'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
|
||||
'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
|
||||
'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
|
||||
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
|
||||
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
|
||||
'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
|
||||
"vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
|
||||
"Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
|
||||
"Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
|
||||
"A neural network is a type of machine learning model inspired by the structure and function of the human brain",
|
||||
"In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
|
||||
"The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
|
||||
"The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
|
||||
"Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -39,11 +40,13 @@ EXPECTED_STRS_MAP = {
|
||||
# the hardware being run on.
|
||||
# Disabled to prevent it from breaking the build
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Prevent unstable test based on golden strings from breaking the build "
|
||||
" and test input model being too large and hanging the system.")
|
||||
@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
|
||||
reason="modelopt_fp4 is not supported on this GPU type.")
|
||||
reason="Prevent unstable test based on golden strings from breaking the build "
|
||||
" and test input model being too large and hanging the system."
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("modelopt_fp4"),
|
||||
reason="modelopt_fp4 is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name", MODELS)
|
||||
def test_models(example_prompts, model_name) -> None:
|
||||
llm = LLM(
|
||||
@@ -56,12 +59,11 @@ def test_models(example_prompts, model_name) -> None:
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
formatted_prompts = [
|
||||
tokenizer.apply_chat_template([{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
tokenizer.apply_chat_template(
|
||||
[{"role": "user", "content": prompt}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
for prompt in example_prompts
|
||||
]
|
||||
params = SamplingParams(max_tokens=20, temperature=0)
|
||||
@@ -79,4 +81,5 @@ def test_models(example_prompts, model_name) -> None:
|
||||
generated_str = generations[i]
|
||||
expected_str = expected_strs[i]
|
||||
assert expected_str == generated_str, (
|
||||
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
|
||||
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user