[Model] Nemotron Parse 1.1 Support (#30864)

Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
amitz-nv
2026-01-05 23:00:14 +02:00
committed by GitHub
parent af1b07b0c5
commit ee21291825
13 changed files with 1117 additions and 31 deletions

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import pytest
from transformers import AutoModel
from tests.models.utils import check_logprobs_close
from vllm.assets.image import ImageAsset
from ....conftest import HfRunner, PromptImageInput, VllmRunner
from ....utils import create_new_process_for_each_test
# Test fixture: a single document-page image, loaded as PNG and normalized to RGB.
IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
# Task prompt built from special tokens requesting bbox/class prediction and
# markdown output — presumably Nemotron Parse's canonical parse prompt;
# NOTE(review): verify token order against the model card.
PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Verify that the inference result is the same between hf and vllm.

    Args:
        hf_runner: Factory for the HuggingFace reference runner.
        vllm_runner: Factory for the vLLM runner under test.
        inputs: Per-case (prompts, images) pairs; prompts and images are
            aligned element-wise within each case.
        model: Model identifier to load in both runners.
        dtype: Torch dtype name used by both runners.
        max_tokens: Greedy-decoding length per prompt.
        num_logprobs: Number of top logprobs compared per position.
    """
    # Run vLLM first and exit its context before loading the HF model,
    # so the two engines do not hold GPU memory at the same time.
    with vllm_runner(
        model,
        dtype=dtype,
        max_num_seqs=64,
        limit_mm_per_prompt={"image": 1},
        trust_remote_code=True,
    ) as vllm_model:
        vllm_outputs_per_case = []
        for case_prompts, case_images in inputs:
            vllm_outputs_per_case.append(
                vllm_model.generate_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )

    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_outputs_per_case = []
        for case_prompts, case_images in inputs:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                    use_cache=False,  # HF Nemotron Parse crashes here without this
                )
            )

    # Compare HF and vLLM outputs case by case via fuzzy logprob matching.
    for hf_case, vllm_case in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None:
    """Compare HF vs. vLLM greedy logprobs on a batch of identical image prompts."""
    # One case: the same prompt/image repeated to exercise batched decoding.
    batch_size = 10
    test_inputs = [([PROMPT] * batch_size, [IMAGE] * batch_size)]
    run_test(
        hf_runner,
        vllm_runner,
        inputs=test_inputs,
        model=model,
        dtype=dtype,
        max_tokens=100,
        num_logprobs=num_logprobs,
    )

View File

@@ -40,15 +40,15 @@ def run_radio_test(
for image in images
]
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
# RADIO model on HF does not properly handle torch_dtype argument
# And relies on args["dtype"] which we have to patch manually:
config.args["dtype"] = torch_dtype
hf_config.args["dtype"] = torch_dtype
hf_model = AutoModel.from_pretrained(
model_id,
config=config,
config=hf_config,
dtype=torch_dtype,
trust_remote_code=True,
).to("cuda")
@@ -62,13 +62,14 @@ def run_radio_test(
hf_model.make_preprocessor_external()
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values
]
radio_config = RadioConfig(
model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
vllm_config = RadioConfig(
model_name=hf_config.args["model"],
**hf_config.args,
)
vllm_model = RadioModel(radio_config)
vllm_model = RadioModel(vllm_config)
vllm_model.load_weights(hf_model.state_dict())
vllm_model = vllm_model.to("cuda", torch_dtype)
@@ -80,7 +81,8 @@ def run_radio_test(
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99
assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99
@pytest.mark.parametrize(

View File

@@ -102,6 +102,7 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = {
"nemotron_parse": False,
"ovis": False,
"ovis2_5": False,
"paligemma": False,