Enable CUDA graph support for llama 3.2 vision (#14917)

Signed-off-by: Matt Ritter <100659061+mritterfigma@users.noreply.github.com>
2025-03-19 23:29:16 -07:00
parent 2f726b241e
commit a8652f4f0f
3 changed files with 1 addition and 13 deletions
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -215,7 +215,6 @@ def _run_test(
                     max_num_seqs=2,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True,
                     limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
                                          }) as vllm_model:
        vllm_outputs_per_image = [
@@ -425,7 +424,6 @@ def test_bnb_regression(
        dtype=dtype,
        max_model_len=4096,
        max_num_seqs=2,
-        enforce_eager=True,
        quantization="bitsandbytes",
        load_format="bitsandbytes",
    )
@@ -481,7 +479,6 @@ def test_explicit_implicit_prompt(
        max_model_len=4096,
        max_num_seqs=2,
        tensor_parallel_size=1,
-        enforce_eager=True,
    )
    sampling_params = SamplingParams(
        temperature=0,
@@ -513,7 +510,6 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
            max_model_len=4096,
            max_num_seqs=2,
            tensor_parallel_size=1,
-            enforce_eager=True,
            limit_mm_per_prompt={"image":
                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: