[VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision (#11717)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-01-04 19:40:53 +08:00
Committed by: GitHub
Parent: 300acb8347
Commit: eed11ebee9
31 changed files with 1104 additions and 973 deletions
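
For context, this commit moves LLaVA-NeXT-Video and LLaVA-OneVision onto vLLM's merged multi-modal processor interface, in which a single processor class handles prompt-placeholder expansion and tensor preparation for every supported modality (image and video here) and is registered on the model class through the multi-modal registry. Below is a minimal sketch of that pattern; the class name and hook names are illustrative assumptions based on the processor interface of this period, not code taken from this commit.

# Illustrative sketch only: the hook names are assumptions about vLLM's merged
# multi-modal processor interface around this release; see the files changed
# by this commit for the actual implementation.
from vllm.multimodal.processing import BaseMultiModalProcessor


class LlavaOnevisionMultiModalProcessor(BaseMultiModalProcessor):
    """One processor covering both image and video inputs."""

    def _get_mm_fields_config(self, hf_inputs, hf_processor_mm_kwargs):
        # Map the HF processor's outputs (pixel_values, pixel_values_videos,
        # image_sizes, ...) onto vLLM's multi-modal fields.  (Assumed hook.)
        raise NotImplementedError

    def _get_prompt_replacements(self, mm_items, hf_processor_mm_kwargs,
                                 out_mm_kwargs):
        # Expand each <image>/<video> placeholder into the correct number of
        # feature tokens for the given input.  (Assumed hook.)
        raise NotImplementedError


# The processor is attached to the model class via the registry, e.g.:
#
#   @MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor)
#   class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal):
#       ...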


@@ -427,130 +427,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
        mm_limit=1,
        tensor_parallel_size=1,
    )


def run_chunked_prefill_test(
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Compare inference results with chunked prefill disabled vs. enabled."""
    # NOTE: max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     task="generate",
                     max_model_len=4000,
                     max_num_seqs=4,
                     dtype=dtype,
                     limit_mm_per_prompt={
                         "image": mm_limit,
                         "video": mm_limit
                     },
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend
                     ) as vllm_model:
        outputs_per_case = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images or None,
                                                videos=videos or None)
            for prompts, images, videos in inputs
        ]

    with vllm_runner(
            model,
            task="generate",
            max_model_len=4000,
            max_num_seqs=4,
            dtype=dtype,
            limit_mm_per_prompt={
                "image": mm_limit,
                "video": mm_limit
            },
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enable_chunked_prefill=True,
            # should be small enough to ensure prefilling is chunked
            max_num_batched_tokens=32,
            mm_processor_kwargs={
                "max_pixels": 16 * 28 * 28,
            }) as vllm_model_chunked:
        outputs_per_case_chunked = [
            vllm_model_chunked.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images or None,
                videos=videos or None) for prompts, images, videos in inputs
        ]

    for outputs, outputs_chunked in zip(outputs_per_case,
                                        outputs_per_case_chunked):
        check_logprobs_close(
            outputs_0_lst=outputs,
            outputs_1_lst=outputs_chunked,
            name_0="non_chunked",
            name_1="chunked",
        )


@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [1])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts,
                                        model: str, dtype: str,
                                        max_tokens: int,
                                        num_logprobs: int) -> None:
    """Test Qwen2-VL's chunked prefill with M-RoPE."""
    prompts = [
        qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt)
        for prompt in example_prompts[:1]
    ]

    # 1. Qwen2-VL's M-RoPE only takes effect when the request carries some
    #    multi-modal input, so an image has to be included.
    # 2. However, Qwen2-VL currently does not work properly when chunked
    #    prefill is enabled together with real multi-modal inputs, so a
    #    **zero-length** image is passed as a workaround.
    # This way the test runs with both chunked prefill and M-RoPE active.
    zero_len_image = {
        "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)),
        "image_grid_thw": torch.tensor([[0, 0, 0]])
    }
    images = [zero_len_image] * len(prompts)

    inputs_per_case: List[Tuple[List[str], PromptImageInput,
                                PromptVideoInput]] = [
        (prompts, images, []),
    ]

    run_chunked_prefill_test(
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
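
A side note on the constants in the chunked configuration of run_chunked_prefill_test above: the arithmetic below is only a rough sanity check, and the 28x28 pixels-per-visual-token figure is an assumption about Qwen2-VL's default preprocessing (14-pixel patches with 2x2 spatial merging), not something stated in the diff.

# Rough arithmetic behind the chunked-prefill settings above (assumed
# preprocessing: one visual token per 28x28 pixel block).
max_pixels = 16 * 28 * 28                    # 12544 pixels
max_visual_tokens = max_pixels // (28 * 28)  # at most ~16 visual tokens/image
print(max_visual_tokens)

# With enable_chunked_prefill=True and max_num_batched_tokens=32, any prompt
# longer than 32 tokens is prefilled in several chunks of at most 32 tokens,
# which is the code path the test wants to exercise.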