# tests/models/multimodal/generation/test_nemotron_parse.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence

import pytest
from transformers import AutoModel

from tests.models.utils import check_logprobs_close
from vllm.assets.image import ImageAsset

from ....conftest import HfRunner, PromptImageInput, VllmRunner
from ....utils import create_new_process_for_each_test

# Single test image shared by every request: the "paper-11" asset, obtained via
# pil_image_ext(ext="png") and converted to RGB.
IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
# Prompt string sent with every image.
# NOTE(review): the tags presumably ask the model for bounding boxes, classes
# and markdown output — confirm against the model's documentation.
PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy logprob outputs of vLLM against the HF reference.

    Every ``(prompts, images)`` case in ``inputs`` is generated with vLLM
    first and then with HF. The per-case results are finally handed, pair by
    pair, to ``check_logprobs_close``.

    Args:
        hf_runner: Runner class used to load ``model`` on the HF side.
        vllm_runner: Runner class used to load ``model`` on the vLLM side.
        inputs: Test cases, each a pair of prompts and their images.
        model: Model identifier passed to both runners.
        dtype: Dtype string passed to both runners.
        max_tokens: Maximum number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
    """
    # vLLM side: the runner context is closed before the HF model is created.
    vllm_results = []
    with vllm_runner(
        model,
        dtype=dtype,
        max_num_seqs=64,
        limit_mm_per_prompt={"image": 1},
        trust_remote_code=True,
    ) as vllm_model:
        for case_prompts, case_images in inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )

    # HF reference side.
    hf_results = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        for case_prompts, case_images in inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                    # HF Nemotron Parse crashes here without this
                    use_cache=False,
                )
            )

    # Both lists were built from the same `inputs`, so they pair up case by case.
    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )


@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None:
    """Run the HF-vs-vLLM comparison on one batch of identical requests."""
    # One test case made of the same prompt/image pair repeated ten times.
    batch_size = 10
    single_case = ([PROMPT] * batch_size, [IMAGE] * batch_size)

    run_test(
        hf_runner,
        vllm_runner,
        inputs=[single_case],
        model=model,
        dtype=dtype,
        max_tokens=100,
        num_logprobs=num_logprobs,
    )
[Model] Nemotron Parse 1.1 Support (#30864) Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> 2026-01-05 23:00:14 +02:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`

			`from collections.abc import Sequence`

			`import pytest`
			`from transformers import AutoModel`

			`from tests.models.utils import check_logprobs_close`
			`from vllm.assets.image import ImageAsset`

			`from ....conftest import HfRunner, PromptImageInput, VllmRunner`
			`from ....utils import create_new_process_for_each_test`

			`IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")`
			`PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"`


			`def run_test(`
			`hf_runner: type[HfRunner],`
			`vllm_runner: type[VllmRunner],`
			`inputs: Sequence[tuple[list[str], PromptImageInput]],`
			`model: str,`
			`*,`
			`dtype: str,`
			`max_tokens: int,`
			`num_logprobs: int,`
			`) -> None:`
			`"""Verify that the inference result is the same between hf and vllm."""`
			`with vllm_runner(`
			`model,`
			`dtype=dtype,`
			`max_num_seqs=64,`
			`limit_mm_per_prompt={"image": 1},`
			`trust_remote_code=True,`
			`) as vllm_model:`
			`vllm_outputs_per_case = [`
			`vllm_model.generate_greedy_logprobs(`
			`prompts,`
			`max_tokens,`
			`num_logprobs=num_logprobs,`
			`images=images,`
			`)`
			`for prompts, images in inputs`
			`]`

			`with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:`
			`hf_outputs_per_case = [`
			`hf_model.generate_greedy_logprobs_limit(`
			`prompts,`
			`max_tokens,`
			`num_logprobs=num_logprobs,`
			`images=images,`
			`use_cache=False, # HF Nemotron Parse crashes here without this`
			`)`
			`for prompts, images in inputs`
			`]`

			`for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):`
			`check_logprobs_close(`
			`outputs_0_lst=hf_outputs,`
			`outputs_1_lst=vllm_outputs,`
			`name_0="hf",`
			`name_1="vllm",`
			`)`


			`@pytest.mark.core_model`
			`@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])`
			`@pytest.mark.parametrize("dtype", ["bfloat16"])`
			`@pytest.mark.parametrize("num_logprobs", [5])`
			`@create_new_process_for_each_test("spawn")`
			`def test_models(`
			`hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int`
			`) -> None:`
			`run_test(`
			`hf_runner,`
			`vllm_runner,`
			`inputs=[`
			`(`
			`[PROMPT] * 10,`
			`[IMAGE] * 10,`
			`),`
			`],`
			`model=model,`
			`dtype=dtype,`
			`max_tokens=100,`
			`num_logprobs=num_logprobs,`
			`)`