[VLM] Merged multi-modal processor for LLaVA-NeXT (#11682)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-03 00:39:27 +08:00
parent b6087a6bee
commit 8c38ee7007
14 changed files with 609 additions and 555 deletions
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
@@ -1,70 +0,0 @@
-import pytest
-
-from vllm.inputs import InputContext
-
-from ....utils import build_model_context
-
-
-@pytest.fixture()
-def get_max_llava_next_image_tokens():
-    from vllm.model_executor.models.llava_next import (
-        get_max_llava_next_image_tokens)
-    return get_max_llava_next_image_tokens
-
-
-@pytest.fixture()
-def dummy_data_for_llava_next():
-    from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
-    return dummy_data_for_llava_next
-
-
-@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
-    ([[336, 336]], 1176),
-    ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
-])
-def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
-                                         get_max_llava_next_image_tokens):
-    ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
-
-    # Update the config image_grid_pinpoints
-    # and calculate the resulting max tokens
-    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-
-    actual_max_tokens = get_max_llava_next_image_tokens(
-        InputContext(ctx.model_config))
-
-    assert expected_max_tokens == actual_max_tokens
-
-
-@pytest.mark.parametrize(
-    "gridpoints,expected_size",
-    [
-        # One point; it has to be the largest
-        ([[336, 336]], (336, 336)),
-        # Default for most llava next models; the 2x2 tile is the largest
-        ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
-         (672, 672)),
-        # If two rectangular gridpoints are the same, the more vertical
-        # one has the higher feature count due to newline features
-        ([[336, 672], [672, 336]], (672, 336))
-    ])
-def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
-                                                gridpoints, expected_size):
-    ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
-
-    # Update the config image_grid_pinpoints
-    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-    seq_len = 5000  # bigger than the max feature size for any image
-
-    dummy_data = dummy_data_for_llava_next(
-        ctx,
-        seq_len=seq_len,
-        mm_counts={"image": 1},
-    )
-    seq_data = dummy_data.seq_data
-    mm_data = dummy_data.multi_modal_data
-
-    # The dummy data dims should match the gridpoint with the biggest feat size
-    assert mm_data["image"].height == expected_size[0]
-    assert mm_data["image"].width == expected_size[1]
-    assert len(seq_data.get_token_ids()) >= seq_len