[Model] Nemotron Parse 1.1 Support (#30864)
Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
89
tests/models/multimodal/generation/test_nemotron_parse.py
Normal file
89
tests/models/multimodal/generation/test_nemotron_parse.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModel
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from vllm.assets.image import ImageAsset
|
||||
|
||||
from ....conftest import HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import create_new_process_for_each_test
|
||||
|
||||
IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
|
||||
PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that greedy generations from HF and vLLM agree on logprobs.

    Each entry in ``inputs`` is a (prompts, images) pair; the two backends
    are run on every pair and their top-``num_logprobs`` logprobs compared.
    """
    # Run vLLM first so its engine is torn down before HF loads the model
    # (both cannot comfortably share GPU memory at the same time).
    with vllm_runner(
        model,
        dtype=dtype,
        max_num_seqs=64,
        limit_mm_per_prompt={"image": 1},
        trust_remote_code=True,
    ) as vllm_model:
        vllm_results = []
        for prompts, images in inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                )
            )

    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_results = []
        for prompts, images in inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    use_cache=False,  # HF Nemotron Parse crashes here without this
                )
            )

    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None:
    """End-to-end HF-vs-vLLM comparison on a batch of 10 identical image prompts."""
    # A single case: the same prompt/image repeated 10 times to exercise batching.
    batch = ([PROMPT] * 10, [IMAGE] * 10)
    run_test(
        hf_runner,
        vllm_runner,
        inputs=[batch],
        model=model,
        dtype=dtype,
        max_tokens=100,
        num_logprobs=num_logprobs,
    )
|
||||
@@ -40,15 +40,15 @@ def run_radio_test(
|
||||
for image in images
|
||||
]
|
||||
|
||||
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
|
||||
hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
|
||||
|
||||
# RADIO model on HF does not properly handle torch_dtype argument
|
||||
# And relies on args["dtype"] which we have to patch manually:
|
||||
config.args["dtype"] = torch_dtype
|
||||
hf_config.args["dtype"] = torch_dtype
|
||||
|
||||
hf_model = AutoModel.from_pretrained(
|
||||
model_id,
|
||||
config=config,
|
||||
config=hf_config,
|
||||
dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
).to("cuda")
|
||||
@@ -62,13 +62,14 @@ def run_radio_test(
|
||||
hf_model.make_preprocessor_external()
|
||||
|
||||
hf_outputs_per_image = [
|
||||
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
|
||||
hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values
|
||||
]
|
||||
|
||||
radio_config = RadioConfig(
|
||||
model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
|
||||
vllm_config = RadioConfig(
|
||||
model_name=hf_config.args["model"],
|
||||
**hf_config.args,
|
||||
)
|
||||
vllm_model = RadioModel(radio_config)
|
||||
vllm_model = RadioModel(vllm_config)
|
||||
vllm_model.load_weights(hf_model.state_dict())
|
||||
vllm_model = vllm_model.to("cuda", torch_dtype)
|
||||
|
||||
@@ -80,7 +81,8 @@ def run_radio_test(
|
||||
|
||||
cos_similar = nn.CosineSimilarity(dim=-1)
|
||||
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
|
||||
assert cos_similar(vllm_output, hf_output).mean() > 0.99
|
||||
assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99
|
||||
assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -102,6 +102,7 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
||||
# incorrect token ids. So we need use `add_special_tokens=False` here
|
||||
# to leave bos_token to be added by the processor.
|
||||
_ADD_SPECIAL_TOKENS_OVERRIDES = {
|
||||
"nemotron_parse": False,
|
||||
"ovis": False,
|
||||
"ovis2_5": False,
|
||||
"paligemma": False,
|
||||
|
||||
Reference in New Issue
Block a user