# tests/compile/fullgraph/test_multimodal_compile.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig
from vllm.config.compilation import CompilationMode
from vllm.platforms import current_platform


def test_compile():
    """Check that a default ``VllmConfig`` leaves mm encoder compilation off."""
    default_compilation = VllmConfig().compilation_config
    # The multimodal encoder is only compiled when explicitly requested.
    assert not default_compilation.compile_mm_encoder


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
    """Check that Qwen2.5-VL vision submodules are tagged for compilation.

    The runner is built with ``compile_mm_encoder`` enabled, and the test
    passes when ``num_models_seen`` on the compilation counter increases by
    35 while the runner is alive (see the breakdown next to the ``with``).
    """
    # Keep the engine in this process; otherwise the counter we inspect
    # would not be the one that gets incremented.
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    encoder_compile_on = {
        "mode": CompilationMode.VLLM_COMPILE,
        "compile_mm_encoder": True,
    }

    # NOTE: Qwen2.5-VL has 35 models in total - the LLM backend,
    # Vision Patch Embed, Vision Patch Merger, and then 32 Vision Blocks
    # (one for each layer). In the future, vLLM compilation logic should
    # compile the Vision submodules once and reuse the compiled code for
    # all layers. See https://github.com/vllm-project/vllm/issues/27590
    with compilation_counter.expect(num_models_seen=35), vllm_runner(
        "Qwen/Qwen2.5-VL-3B-Instruct",
        max_model_len=2048,
        gpu_memory_utilization=0.8,
        compilation_config=encoder_compile_on,
    ):
        # Building the runner is the whole test; nothing to generate.
        pass


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):
    """Check that Qwen2.5-VL vision submodules are left uncompiled.

    The runner is built with ``compile_mm_encoder`` set to ``False``, and the
    test passes when ``num_models_seen`` on the compilation counter increases
    by 1 only.
    """
    # Keep the engine in this process; otherwise the counter we inspect
    # would not be the one that gets incremented.
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    encoder_compile_off = {
        "mode": CompilationMode.VLLM_COMPILE,
        "compile_mm_encoder": False,
    }

    with compilation_counter.expect(num_models_seen=1), vllm_runner(
        "Qwen/Qwen2.5-VL-3B-Instruct",
        max_model_len=2048,
        gpu_memory_utilization=0.8,
        compilation_config=encoder_compile_off,
    ):
        # Building the runner is the whole test; nothing to generate.
        pass


# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
# Requires Cuda and 8 gpus as well
@pytest.mark.forked
@pytest.mark.skip(reason="Skipping due to CI resource constraints")
def test_mllama4_vit_compilation(vllm_runner, monkeypatch):
    """Smoke-test compilation of the Mllama4 vision encoder.

    The intent is to verify that the 2 vision submodules
    (Llama4VisionEncoder, Llama4VisionPixelShuffleMLP) are tagged for
    compilation, which together with the LLM backend would mean 3 models
    seen by the compilation counter.

    However, since this model is run with TP=8, the compilation counter does
    not work properly here, so the test expects ``num_models_seen=0`` and in
    practice only checks that building the runner with ``compile_mm_encoder``
    enabled succeeds.
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with (
        # TODO: Since we require TP=8, this messes with the compilation
        # counter. We should fix this in the future, but leave for now
        # to make sure that compilation runs (no crash) with llama vision encoder
        compilation_counter.expect(num_models_seen=0),
        vllm_runner(
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            max_model_len=512,
            gpu_memory_utilization=0.8,
            tensor_parallel_size=8,
            compilation_config={
                "mode": CompilationMode.VLLM_COMPILE,
                "compile_mm_encoder": True,
            },
        ),
    ):
        # Building the runner is the whole test; nothing to generate.
        pass
[Misc][qwen2_5_vl][torch.compile] Enable `supports_torch_compile` on generic nn.Module and demonstrate speedup on Qwen Vision model (#23207) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Lucas Kabela <lucasakabela@gmail.com> 2025-10-28 15:36:43 -07:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`import pytest`

			`from vllm.compilation.counter import compilation_counter`
[Multimodal][torch.compile] Add compilation config field for turning off ViT/MM compile (#28242) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> 2025-11-06 16:16:03 -08:00			`from vllm.config import VllmConfig`
[Misc][qwen2_5_vl][torch.compile] Enable `supports_torch_compile` on generic nn.Module and demonstrate speedup on Qwen Vision model (#23207) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Lucas Kabela <lucasakabela@gmail.com> 2025-10-28 15:36:43 -07:00			`from vllm.config.compilation import CompilationMode`
[CI] Add compile/test_multimodal_compile.py to CI (#28151) Signed-off-by: Yanan Cao <gmagogsfm@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2025-11-05 21:41:47 -08:00			`from vllm.platforms import current_platform`
[Misc][qwen2_5_vl][torch.compile] Enable `supports_torch_compile` on generic nn.Module and demonstrate speedup on Qwen Vision model (#23207) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Lucas Kabela <lucasakabela@gmail.com> 2025-10-28 15:36:43 -07:00

[Multimodal][torch.compile] Add compilation config field for turning off ViT/MM compile (#28242) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> 2025-11-06 16:16:03 -08:00			`def test_compile():`
			`vllm_config = VllmConfig()`
[Misc] Turn off encoder torch compile by default (#28634) Signed-off-by: Roger Wang <hey@rogerw.io> 2025-11-13 08:38:08 -08:00			`# Default configuration does not compile mm encoder`
			`assert not vllm_config.compilation_config.compile_mm_encoder`
[Multimodal][torch.compile] Add compilation config field for turning off ViT/MM compile (#28242) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> 2025-11-06 16:16:03 -08:00

[Misc][qwen2_5_vl][torch.compile] Enable `supports_torch_compile` on generic nn.Module and demonstrate speedup on Qwen Vision model (#23207) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Lucas Kabela <lucasakabela@gmail.com> 2025-10-28 15:36:43 -07:00			`# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073`
			`@pytest.mark.forked`
[CI] Add compile/test_multimodal_compile.py to CI (#28151) Signed-off-by: Yanan Cao <gmagogsfm@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2025-11-05 21:41:47 -08:00			`@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")`
[Misc][qwen2_5_vl][torch.compile] Enable `supports_torch_compile` on generic nn.Module and demonstrate speedup on Qwen Vision model (#23207) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Lucas Kabela <lucasakabela@gmail.com> 2025-10-28 15:36:43 -07:00			`def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):`
			`"""Test that Qwen2.5-VL vision submodules are compiled.`

			`This test verifies that the 3 vision submodules (Qwen2_5_VisionPatchEmbed,`
			`Qwen2_5_VisionBlock, and Qwen2_5_VisionPatchMerger) are properly tagged`
			`for compilation by checking that num_models_seen increases by at least 3.`
			`"""`
			`# Disable multiprocessing so that the counter is in the same process`
			`monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")`

			`with (`
			`# NOTE: Qwen2.5-VL has 35 models in total - the LLM backend`
			`# Vision Patch Embed, Vision Patch Merger, and then 32 Vision Blocks`
			`# (one for each layer) - in the future, we should fix vLLM compilation`
			`# logic to handle this case and only compile the Vision submodules once`
			`# and reuse the compiled code for all layers`
			`# See https://github.com/vllm-project/vllm/issues/27590`
			`compilation_counter.expect(num_models_seen=35),`
			`vllm_runner(`
			`"Qwen/Qwen2.5-VL-3B-Instruct",`
			`max_model_len=2048,`
[Multimodal][torch.compile] Add compilation config field for turning off ViT/MM compile (#28242) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> 2025-11-06 16:16:03 -08:00			`gpu_memory_utilization=0.8,`
[Misc] Turn off encoder torch compile by default (#28634) Signed-off-by: Roger Wang <hey@rogerw.io> 2025-11-13 08:38:08 -08:00			`compilation_config={`
			`"mode": CompilationMode.VLLM_COMPILE,`
			`"compile_mm_encoder": True,`
			`},`
[Misc][qwen2_5_vl][torch.compile] Enable `supports_torch_compile` on generic nn.Module and demonstrate speedup on Qwen Vision model (#23207) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> Signed-off-by: Lucas Kabela <lucasakabela@gmail.com> 2025-10-28 15:36:43 -07:00			`) as _,`
			`):`
			`pass`
[Multimodal][torch.compile] Add compilation config field for turning off ViT/MM compile (#28242) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> 2025-11-06 16:16:03 -08:00

			`# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073`
			`@pytest.mark.forked`
			`@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")`
			`def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):`
			`"""Test that Qwen2.5-VL vision submodules are not compiled when the`
			`config is passed off`
			`"""`
			`# Disable multiprocessing so that the counter is in the same process`
			`monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")`

			`with (`
			`compilation_counter.expect(num_models_seen=1),`
			`vllm_runner(`
			`"Qwen/Qwen2.5-VL-3B-Instruct",`
			`max_model_len=2048,`
			`gpu_memory_utilization=0.8,`
			`compilation_config={`
			`"mode": CompilationMode.VLLM_COMPILE,`
			`"compile_mm_encoder": False,`
			`},`
			`) as _,`
			`):`
			`pass`
[Misc][LLaMa4] Compile LLaMa Vision Encoder (#30709) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> 2026-01-09 19:01:38 -08:00

			`# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073`
			`# Requires Cuda and 8 gpus as well`
			`@pytest.mark.forked`
			`@pytest.mark.skip(reason="Skipping due to CI resource constraints")`
			`def test_mllama4_vit_compilation(vllm_runner, monkeypatch):`
			`"""Test that Mllama4 vision submodules are compiled.`

			`This test verifies that the 2 vision submodules (Llama4VisionEncoder,`
			`Llama4VisionPixelShuffleMLP) are properly tagged`
			`for compilation by checking that num_models_seen increases to 3.`

			`However since we are using TP=8, we compilation_counter will not`
			`work properly so we will just check the run succeeds rn`
			`"""`
			`# Disable multiprocessing so that the counter is in the same process`
			`monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")`

			`with (`
			`monkeypatch.context(),`
			`# TODO: Since we require TP=8, this messes with the compilation`
			`# counter. We should fix this in the future, but leave for now`
			`# to make sure that compilation runs (no crash) with llama vision encoder`
			`compilation_counter.expect(num_models_seen=0),`
			`vllm_runner(`
			`"meta-llama/Llama-4-Scout-17B-16E-Instruct",`
			`max_model_len=512,`
			`gpu_memory_utilization=0.8,`
			`tensor_parallel_size=8,`
			`compilation_config={`
			`"mode": CompilationMode.VLLM_COMPILE,`
			`"compile_mm_encoder": True,`
			`},`
			`),`
			`):`
			`pass`