[Kernel][Performance] Enable smaller Scaling Factor tiling for NVFP4 small-batch decoding (#30885)

Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Author: Roberto L. Castro
Date: 2026-01-14 00:22:53 +01:00
Committed by: GitHub
Parent: 2a60ac91d0
Commit: 8ef50d9a6b
9 changed files with 177 additions and 32 deletions
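
For context, a minimal sketch (not vLLM code) of why a smaller scale-factor tile helps small-batch decode. NVFP4 keeps one FP8 scale per 16-element block, and the GEMM kernels pad the scale tensor up to a fixed tile shape; the 128x4 and 8x4 tile shapes below follow the default swizzled layout and the trtllm_8x4_layout referenced in the test added by this PR, while the exact padding rule is an assumption for illustration:

# A minimal sketch, not vLLM code: scale-factor padding under two tilings.
# NVFP4 stores one FP8 scale per 16-element block; the scale tensor is
# padded to the kernel's tile shape (assumed: round M and K/16 up to the tile).

def round_up(x: int, mult: int) -> int:
    return (x + mult - 1) // mult * mult

def padded_scale_entries(m: int, k: int, tile_rows: int, tile_cols: int = 4) -> int:
    """Scale-factor entries after padding M and K/16 to the tile shape."""
    scale_cols = k // 16  # one FP8 scale per 16-element NVFP4 block
    return round_up(m, tile_rows) * round_up(scale_cols, tile_cols)

m, k = 1, 4096  # one decode token against a 4096-wide projection
print(padded_scale_entries(m, k, tile_rows=128))  # 32768 entries, mostly padding
print(padded_scale_entries(m, k, tile_rows=8))    # 2048 entries

Under these assumptions, the 8x4 tiling cuts scale-factor padding by 16x at M=1, which is the small-batch decoding case the title refers to.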


@@ -14,6 +14,8 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform

os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024
@@ -83,3 +85,27 @@ def test_models(example_prompts, model_name) -> None:
    assert expected_str == generated_str, (
        f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
    )

EAGER = [True, False]


@pytest.mark.skipif(
    not current_platform.has_device_capability(100),
    reason="modelopt_fp4 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", ["nvidia/Llama-3.1-8B-Instruct-NVFP4"])
@pytest.mark.parametrize("eager", EAGER)
@pytest.mark.parametrize(
    "backend",
    [
        "flashinfer-cudnn",
        "flashinfer-trtllm",  # the small seq_len ensures the trtllm_8x4_layout backend is used
        "flashinfer-cutlass",
    ],
)
def test_nvfp4(vllm_runner, model, eager, backend, monkeypatch):
    monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", backend)
    with vllm_runner(model, enforce_eager=eager) as llm:
        # Greedy-decode two new tokens; generate_greedy returns
        # (token_ids, text) pairs whose text includes the prompt.
        output = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2)
        assert output[0][1] == "1 2 3 4 5 6"
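
For reference, a hedged usage sketch of the environment variable the test exercises, outside the test harness. VLLM_NVFP4_GEMM_BACKEND, the backend names, and the model name come from the diff above; the rest is standard vLLM API, and the expected continuation is an assumption carried over from the test's assertion:

# Usage sketch mirroring the test above; not part of this commit.
import os

# Pin the NVFP4 GEMM backend before constructing the engine.
os.environ["VLLM_NVFP4_GEMM_BACKEND"] = "flashinfer-trtllm"

from vllm import LLM, SamplingParams

llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-NVFP4")
params = SamplingParams(temperature=0.0, max_tokens=2)  # greedy, two new tokens
out = llm.generate(["1 2 3 4 5"], params)
print(out[0].outputs[0].text)  # expected to continue the sequence with " 6"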