# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# flake8: noqa
"""Tests Model Optimizer nvfp4 models against ground truth generation

Note: these tests will only pass on B200
"""
import os

from typing import List

import pytest

from transformers import AutoTokenizer

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams

from vllm.platforms import current_platform

# Allow tokenizers to use worker threads; prompt formatting below is the
# only tokenizer work in this module.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Small context window keeps per-test memory low; the test prompts are short.
MAX_MODEL_LEN = 1024

# Checkpoints exercised by the golden-string comparison test below.
MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]

# Golden generations per model (greedy decoding, 20 tokens — see
# SamplingParams in test_models). NOTE(review): exact-match golden strings
# are sensitive to the fp4 kernels and hardware; per the module docstring
# these presumably match B200 output only.
EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.3-70B-Instruct-FP4": [
        "vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
        "Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
        "A neural network is a type of machine learning model inspired by the structure and function of the human brain",
        "In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
        "The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
        "Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
    ]
}
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp4 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
    reason="Prevent unstable test based on golden strings from breaking the build "
    " and test input model being too large and hanging the system."
)
@pytest.mark.skipif(
    not is_quant_method_supported("modelopt_fp4"),
    reason="modelopt_fp4 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Compare generations from a ModelOpt nvfp4 checkpoint to golden strings.

    Each prompt is chat-formatted and generated individually with greedy
    decoding (20 tokens) so outputs match how the expected strings were
    originally captured; results must match EXPECTED_STRS_MAP exactly.
    """
    llm = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
        quantization="modelopt_fp4",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True,
        )
        for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)
    generations: list[str] = []
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    for prompt in formatted_prompts:
        outputs = llm.generate(prompt, params)
        generations.append(outputs[0].outputs[0].text)
    # Free engine memory before the (potentially verbose) comparison below.
    del llm

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    # Idiomatic pairwise comparison instead of indexing two parallel lists.
    for i, (expected_str, generated_str) in enumerate(zip(expected_strs, generations)):
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
        )
# Execution modes to exercise: enforce_eager=True (pure eager) and
# False (CUDA-graph / compiled path).
EAGER = [True, False]
@pytest.mark.skipif(
    not current_platform.has_device_capability(100),
    reason="modelopt_fp4 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", ["nvidia/Llama-3.1-8B-Instruct-NVFP4"])
@pytest.mark.parametrize("eager", EAGER)
@pytest.mark.parametrize(
    "backend",
    [
        "flashinfer-cudnn",
        "flashinfer-trtllm",  # the small seq_len ensures trtllm_8x4_layout backend is used
        "flashinfer-cutlass",
    ],
)
def test_nvfp4(vllm_runner, model, eager, backend, monkeypatch):
    """Smoke-test greedy generation across nvfp4 GEMM backends.

    Each flashinfer backend must continue the simple counting prompt
    correctly in both eager and compiled execution modes.
    """
    # Select the GEMM backend under test before the engine is built.
    monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", backend)
    counting_prompt = "1 2 3 4 5"
    with vllm_runner(model, enforce_eager=eager) as runner:
        completions = runner.generate_greedy([counting_prompt], max_tokens=2)
        # completions[0][1] is the decoded text of the first request.
        assert completions[0][1] == "1 2 3 4 5 6"