[CI/Build] Split up models tests (#10069)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2024-11-10 03:39:14 +08:00
Committed by: GitHub
Parent: b09895a618
Commit: 51c2e1fcef

21 changed files with 115 additions and 129 deletions


@@ -56,11 +56,13 @@ def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
     ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
     seq_len = 5000  # bigger than the max feature size for any image

-    seq_data, mm_data = dummy_data_for_llava_next(
+    dummy_data = dummy_data_for_llava_next(
         ctx,
         seq_len=seq_len,
         mm_counts={"image": 1},
     )
+    seq_data = dummy_data.seq_data
+    mm_data = dummy_data.multi_modal_data

     # The dummy data dims should match the gridpoint with the biggest feat size
     assert mm_data["image"].height == expected_size[0]
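The tests above now read attributes off a single returned object instead of unpacking a tuple. A minimal sketch of the shape this implies; the class and field names below mirror the attribute accesses in the diff (`seq_data`, `multi_modal_data`) and are an assumption, not a confirmed vLLM definition:

```python
from typing import Any, NamedTuple, Optional

class DummyData(NamedTuple):
    """Hypothetical stand-in for the object these tests unpack."""
    seq_data: Any                            # placeholder token sequence
    multi_modal_data: Optional[dict] = None  # e.g. {"image": <PIL image>}

result = DummyData(seq_data=[0, 1, 2], multi_modal_data={"image": None})
seq_data = result.seq_data                   # replaces `seq_data, mm_data = ...`
mm_data = result.multi_modal_data
```

Returning a named object means new fields can be added later without breaking every caller that unpacks positionally, which is presumably why the tuple return was retired.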


@@ -131,12 +131,13 @@ def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
         mm_processor_kwargs=None,
     )

-    sequence_data, _, = dummy_data_for_phi3v(
+    dummy_data = dummy_data_for_phi3v(
         ctx=ctx,
         seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
         mm_counts={"image": num_imgs},
         num_crops=num_crops,
     )
+    sequence_data = dummy_data.seq_data
     # Ensure we have the right number of placeholders per num_crops size
     img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
     assert img_tok_count == toks_per_img * num_imgs
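The assertion is simple placeholder arithmetic: each image contributes `toks_per_img` copies of the image token to the dummy sequence. A toy version with made-up values (`_IMAGE_TOKEN_ID` stands in for the real constant):

```python
_IMAGE_TOKEN_ID = -1   # assumed placeholder ID, for illustration only
toks_per_img, num_imgs = 4, 2

# A dummy sequence: ordinary tokens surrounding the image placeholders.
token_ids = [1, 2] + [_IMAGE_TOKEN_ID] * (toks_per_img * num_imgs) + [3]
assert token_ids.count(_IMAGE_TOKEN_ID) == toks_per_img * num_imgs
```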


@@ -86,10 +86,17 @@ def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
     # NOTE: video value is required, but isn't actually used
     # when making the dummy data except for error handling currently
-    seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, {
-        "image": 1,
-        "video": 0
-    }, **mm_processor_kwargs)
+    dummy_data = dummy_data_for_qwen2_vl(
+        ctx=qwen2_vl_context,
+        seq_len=seq_len,
+        mm_counts={
+            "image": 1,
+            "video": 0
+        },
+        **mm_processor_kwargs,
+    )
+    seq_data = dummy_data.seq_data
+    mm_data = dummy_data.multi_modal_data

     # Ensure we have the right number of placeholders for min/max pixel values
     assert seq_data.get_token_ids().count(image_token_id) == token_count


@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Type

 import pytest
 import torch
@@ -19,7 +19,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 def run_awq_test(
     vllm_runner: Type[VllmRunner],
     image_assets: _ImageAssets,
-    models: Tuple[str, str],
+    source_model: str,
+    quant_model: str,
     *,
     size_factors: List[float],
     dtype: str,
@@ -28,8 +29,6 @@ def run_awq_test(
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
-    source_model, quant_model = models
-
     images = [asset.pil_image for asset in image_assets]

     inputs_per_image = [(
@@ -84,8 +83,11 @@ def run_awq_test(
 )
 @pytest.mark.quant_model
 @pytest.mark.parametrize(
-    "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
+    ("source_model", "quant_model"),
+    [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
+)
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -103,12 +105,13 @@ def run_awq_test(
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, models, size_factors,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
size_factors, dtype, max_tokens, num_logprobs) -> None:
run_awq_test(
vllm_runner,
image_assets,
models,
source_model,
quant_model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
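Splitting the packed `models` tuple into two parametrize argnames lets pytest unpack each value tuple positionally, which is why the manual `source_model, quant_model = models` line could be dropped. A standalone sketch of the mechanism (test name and assertion invented for illustration):

```python
import pytest

@pytest.mark.parametrize(
    ("source_model", "quant_model"),
    [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
)
def test_model_pairing(source_model: str, quant_model: str) -> None:
    # Each tuple in the value list is matched positionally against the
    # argnames, so the test body receives two plain strings.
    assert quant_model.startswith(source_model)
```

A side benefit: both strings now appear separately in the generated test ID, instead of a tuple rendered as an opaque `models0`, making failures easier to read.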


@@ -11,21 +11,17 @@ from ....conftest import _ImageAssets
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
 DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
-models = [
-    snapshot_download("OpenGVLab/InternViT-300M-448px",
-                      allow_patterns=DOWNLOAD_PATTERN),
-    snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5",
-                      allow_patterns=DOWNLOAD_PATTERN),
-]


 def run_intern_vit_test(
     image_assets: _ImageAssets,
-    model: str,
+    model_id: str,
     *,
     dtype: str,
     distributed_executor_backend: Optional[str] = None,
 ):
+    model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
+
     img_processor = CLIPImageProcessor.from_pretrained(model)
     images = [asset.pil_image for asset in image_assets]
     pixel_values = [
@@ -67,12 +63,15 @@ def run_intern_vit_test(
     assert cos_similar(vllm_output, hf_output).mean() > 0.99


-@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("model_id", [
+    "OpenGVLab/InternViT-300M-448px",
+    "OpenGVLab/InternViT-6B-448px-V1-5",
+])
 @pytest.mark.parametrize("dtype", [torch.half])
 @torch.inference_mode()
-def test_models(dist_init, image_assets, model, dtype: str) -> None:
+def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
     run_intern_vit_test(
         image_assets,
-        model,
+        model_id,
         dtype=dtype,
     )
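With the module-level `models` list gone, weights are downloaded inside the test body, so merely collecting this file no longer triggers network access; only tests that actually run pay the download cost. A minimal sketch of the same pattern with `huggingface_hub` (helper name invented for illustration):

```python
from huggingface_hub import snapshot_download

DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]

def resolve_model_path(model_id: str) -> str:
    # Fetch lazily, and only the file types the test needs; returns the
    # local directory that from_pretrained-style loaders can consume.
    return snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
```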


@@ -130,8 +130,8 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     #### Extended model tests
     "blip2": VLMTestInfo(
@@ -159,9 +159,9 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16",
marks=[
pytest.mark.skipif(
transformers.__version__.startswith("4.46"),
transformers.__version__ < "4.46.2",
reason="Model broken in HF, see huggingface/transformers#34379"
)
),
]
),
"fuyu": VLMTestInfo(
@@ -185,8 +185,8 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         dtype="bfloat16",
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        marks=[large_gpu_mark(min_gb=48)],
         patch_hf_runner=model_utils.glm_patch_hf_runner,
+        marks=[large_gpu_mark(min_gb=48)],
     ),
     "h2ovl": VLMTestInfo(
         models=[
@@ -205,6 +205,22 @@ VLM_TEST_SETTINGS = {
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
     ),
+    "idefics3": VLMTestInfo(
+        models=["HuggingFaceM4/Idefics3-8B-Llama3"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>",
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        marks=[
+            pytest.mark.skipif(
+                transformers.__version__ < "4.46.0",
+                reason="Model introduced in HF >= 4.46.0"
+            ),
+            large_gpu_mark(min_gb=48),
+        ],
+    ),
     "intern_vl": VLMTestInfo(
         models=[
             "OpenGVLab/InternVL2-1B",
@@ -263,7 +279,6 @@ VLM_TEST_SETTINGS = {
runner_mm_key="videos",
)],
),
# FIXME
"llava_next_video": VLMTestInfo(
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
test_type=VLMTestType.VIDEO,
@@ -275,7 +290,7 @@ VLM_TEST_SETTINGS = {
         image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
         marks=[
             pytest.mark.skipif(
-                transformers.__version__.startswith("4.46"),
+                transformers.__version__ < "4.46.2",
                 reason="Model broken with changes in transformers 4.46"
             )
         ],
@@ -316,6 +331,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
+        marks=[large_gpu_mark(min_gb=48)],
     ),
     "qwen": VLMTestInfo(
         models=["Qwen/Qwen-VL"],
@@ -327,22 +343,6 @@ VLM_TEST_SETTINGS = {
         vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
         prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
     ),
-    "idefics3": VLMTestInfo(
-        models=["HuggingFaceM4/Idefics3-8B-Llama3"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<image>",
-        max_model_len=8192,
-        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
-        marks=[
-            pytest.mark.skipif(
-                transformers.__version__ < "4.46.0",
-                reason="Model introduced in HF >= 4.46.0"
-            ),
-            large_gpu_mark(min_gb=48),
-        ],
-    ),
     ### Tensor parallel / multi-gpu broadcast tests
     "broadcast-chameleon": VLMTestInfo(
         models=["facebook/chameleon-7b"],
@@ -362,7 +362,7 @@ VLM_TEST_SETTINGS = {
reason="Need at least 2 GPUs to run the test.",
),
pytest.mark.skipif(
transformers.__version__.startswith("4.46"),
transformers.__version__ < "4.46.2",
reason="Model broken in HF, see huggingface/transformers#34379"
)
],
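The old `startswith("4.46")` guards skipped every 4.46.x release; comparing against `"4.46.2"` re-enables the releases that carry the upstream fix for huggingface/transformers#34379. Note that `transformers.__version__ < "4.46.2"` is a plain string comparison, which happens to be correct for these values but is lexicographic in general. A sketch of a stricter variant using `packaging`, not part of this commit:

```python
from packaging.version import Version
import transformers

# "4.9.0" < "4.46.2" is False as strings but True as versions, so parsing
# avoids surprises once version components differ in digit count.
TRANSFORMERS_PRE_4_46_2 = Version(transformers.__version__) < Version("4.46.2")
```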