[Kernel] Add ModelOpt FP4 Checkpoint Support (#12520)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-03-11 22:13:11 -07:00
parent 5c538c37b2
commit debd6bbf09
10 changed files with 388 additions and 30 deletions
--- a/tests/models/decoder_only/language/test_nvfp4.py
+++ b/tests/models/decoder_only/language/test_nvfp4.py
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+# flake8: noqa
+"""Tests Model Optimizer nvfp4 models against ground truth generation
+Note: these tests will only pass on B200
+"""
+import os
+from typing import List
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
+
+EXPECTED_STRS_MAP = {
+    "nvidia/Llama-3.3-70B-Instruct-FP4": [
+        'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
+        'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
+        'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
+    ]
+}
+
+
+# This test compares against golden strings for exact match since
+# there is no baseline implementation to compare against
+# and is unstable w.r.t specifics of the fp4 implementation or
+# the hardware being run on.
+# Disabled to prevent it from breaking the build
+@pytest.mark.skip(
+    reason=
+    "Prevent unstable test based on golden strings from breaking the build "
+    " and test input model being too large and hanging the system.")
+@pytest.mark.quant_model
+@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
+                    reason="nvfp4 is not supported on this GPU type.")
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    model = LLM(
+        model=model_name,
+        max_model_len=MAX_MODEL_LEN,
+        trust_remote_code=True,
+        enforce_eager=True,
+        quantization="nvfp4",
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template([{
+            "role": "user",
+            "content": prompt
+        }],
+                                      tokenize=False,
+                                      add_generation_prompt=True)
+        for prompt in example_prompts
+    ]
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations: List[str] = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = model.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del model
+
+    print(model_name, generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    for i in range(len(example_prompts)):
+        generated_str = generations[i]
+        expected_str = expected_strs[i]
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")