[Model] Nemotron Parse 1.1 Support (#30864)
Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
89
tests/models/multimodal/generation/test_nemotron_parse.py
Normal file
89
tests/models/multimodal/generation/test_nemotron_parse.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModel
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from vllm.assets.image import ImageAsset
|
||||
|
||||
from ....conftest import HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import create_new_process_for_each_test
|
||||
|
||||
IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
|
||||
PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that greedy generations from HF and vLLM agree on logprobs.

    Each entry in ``inputs`` is a (prompts, images) pair; the two backends
    are run on every pair and their top-``num_logprobs`` logprobs compared.
    """
    # Run vLLM first so its engine is torn down before HF loads the model
    # (both cannot comfortably share GPU memory at the same time).
    with vllm_runner(
        model,
        dtype=dtype,
        max_num_seqs=64,
        limit_mm_per_prompt={"image": 1},
        trust_remote_code=True,
    ) as vllm_model:
        vllm_results = []
        for prompts, images in inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                )
            )

    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_results = []
        for prompts, images in inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    use_cache=False,  # HF Nemotron Parse crashes here without this
                )
            )

    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None:
    """End-to-end HF-vs-vLLM comparison on a batch of 10 identical image prompts."""
    # A single case: the same prompt/image repeated 10 times to exercise batching.
    batch = ([PROMPT] * 10, [IMAGE] * 10)
    run_test(
        hf_runner,
        vllm_runner,
        inputs=[batch],
        model=model,
        dtype=dtype,
        max_tokens=100,
        num_logprobs=num_logprobs,
    )
|
||||
@@ -40,15 +40,15 @@ def run_radio_test(
|
||||
for image in images
|
||||
]
|
||||
|
||||
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
|
||||
hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
|
||||
|
||||
# RADIO model on HF does not properly handle torch_dtype argument
|
||||
# And relies on args["dtype"] which we have to patch manually:
|
||||
config.args["dtype"] = torch_dtype
|
||||
hf_config.args["dtype"] = torch_dtype
|
||||
|
||||
hf_model = AutoModel.from_pretrained(
|
||||
model_id,
|
||||
config=config,
|
||||
config=hf_config,
|
||||
dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
).to("cuda")
|
||||
@@ -62,13 +62,14 @@ def run_radio_test(
|
||||
hf_model.make_preprocessor_external()
|
||||
|
||||
hf_outputs_per_image = [
|
||||
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
|
||||
hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values
|
||||
]
|
||||
|
||||
radio_config = RadioConfig(
|
||||
model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
|
||||
vllm_config = RadioConfig(
|
||||
model_name=hf_config.args["model"],
|
||||
**hf_config.args,
|
||||
)
|
||||
vllm_model = RadioModel(radio_config)
|
||||
vllm_model = RadioModel(vllm_config)
|
||||
vllm_model.load_weights(hf_model.state_dict())
|
||||
vllm_model = vllm_model.to("cuda", torch_dtype)
|
||||
|
||||
@@ -80,7 +81,8 @@ def run_radio_test(
|
||||
|
||||
cos_similar = nn.CosineSimilarity(dim=-1)
|
||||
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
|
||||
assert cos_similar(vllm_output, hf_output).mean() > 0.99
|
||||
assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99
|
||||
assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -102,6 +102,7 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
||||
# incorrect token ids. So we need use `add_special_tokens=False` here
|
||||
# to leave bos_token to be added by the processor.
|
||||
_ADD_SPECIAL_TOKENS_OVERRIDES = {
|
||||
"nemotron_parse": False,
|
||||
"ovis": False,
|
||||
"ovis2_5": False,
|
||||
"paligemma": False,
|
||||
|
||||
Reference in New Issue
Block a user