Upstream Llama4 Support to Main (#16113)

Signed-off-by: Aston Zhang <22279212+astonzhang@users.noreply.github.com> Signed-off-by: Chris Thi <chris.c.thi@gmail.com> Signed-off-by: drisspg <drisspguessous@gmail.com> Signed-off-by: Jon Swenson <jmswen@gmail.com> Signed-off-by: Keyun Tong <tongkeyun@gmail.com> Signed-off-by: Lu Fang <fanglu@meta.com> Signed-off-by: Xiaodong Wang <xdwang@meta.com> Signed-off-by: Yang Chen <yangche@fb.com> Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com> Signed-off-by: Yong Hoon Shin <yhshin@meta.com> Signed-off-by: Zijing Liu <liuzijing2014@gmail.com> Signed-off-by: Lu Fang <lufang@fb.com> Signed-off-by: Lu Fang <fanglu@fb.com> Signed-off-by: Lucia Fang <fanglu@fb.com> Signed-off-by: Roger Wang <ywang@roblox.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: Lu Fang <fanglu@fb.com> Co-authored-by: Roger Wang <ywang@roblox.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 08:06:27 -07:00
parent 8017c8db7f
commit 55dcce91df
43 changed files with 2436 additions and 155 deletions
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -5,7 +5,9 @@ import re
 from typing import Optional

 import pytest
+from packaging.version import Version
 from transformers import AutoTokenizer
+from transformers import __version__ as TRANSFORMERS_VERSION

 from vllm.multimodal.image import rescale_image_size
 from vllm.platforms import current_platform
@@ -81,6 +83,13 @@ def run_test(
    from transformers import AutoImageProcessor  # noqa: F401
    from transformers import AutoProcessor  # noqa: F401

+    # Once the model repo is updated to support transformers 4.49, we should
+    # be able to run the test in `test_models.py` without the above workaround
+    if Version(TRANSFORMERS_VERSION) >= Version("4.49"):
+        pytest.skip(f"`transformers=={TRANSFORMERS_VERSION}` installed, "
+                    "but `transformers<4.49` is required to run this model. "
+                    "Reason: Cannot run HF implementation")
+
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it