[Model] New model support for microsoft/Phi-4-mini-flash-reasoning (#20702)

commit 2c11a738b3
parent b639327ad9
Author: Congcong Chen <congcongchen@microsoft.com> (committed via GitHub)
Date:   2025-07-12 06:02:10 -07:00

Signed-off-by: Congcong Chen <congcongchen@microsoft.com>

22 changed files with 1869 additions and 41 deletions
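The hunks below wire the new architecture into vLLM's test suite. Pulling together the constraints they encode (V0 engine only, the DIFFERENTIAL_FLASH_ATTN backend, trust_remote_code, and a 10240-token cap), a minimal offline-inference sketch for the new model might look as follows; the prompt and sampling parameters are illustrative, not taken from this commit:

# Hedged sketch: running the newly supported model under the constraints
# this commit encodes. Env vars are read when the engine is built, so set
# them before constructing the LLM.
import os

os.environ["VLLM_USE_V1"] = "0"  # the model is registered v0_only
os.environ["VLLM_ATTENTION_BACKEND"] = "DIFFERENTIAL_FLASH_ATTN"

from vllm import LLM, SamplingParams

llm = LLM(
    model="microsoft/Phi-4-mini-flash-reasoning",
    trust_remote_code=True,   # custom modeling code on the HF Hub
    max_model_len=10240,      # same cap the test registry uses
)
out = llm.generate(["Solve step by step: 12 * 17 ="],
                   SamplingParams(max_tokens=64))
print(out[0].outputs[0].text)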


@@ -248,6 +248,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
                                             trust_remote_code=True,
                                             v0_only=True),
+    "Phi4FlashForCausalLM": _HfExamplesInfo("microsoft/Phi-4-mini-flash-reasoning",  # noqa: E501
+                                            trust_remote_code=True,
+                                            v0_only=True,
+                                            max_model_len=10240),
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",


@@ -103,6 +103,9 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
                _initialize_kv_caches_v1), monkeypatch.context() as m):
         if model_info.v0_only:
             m.setenv("VLLM_USE_V1", "0")
+        if model_arch == "Phi4FlashForCausalLM":
+            # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
+            m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
         LLM(
             model_info.default,
             tokenizer=model_info.tokenizer,
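The test pins the backend through the VLLM_ATTENTION_BACKEND environment variable because, per the comment in the hunk, this architecture only runs on the differential flash attention backend. A minimal sketch of env-gated backend selection, assuming a hypothetical name-to-backend table (vLLM's real selector also weighs hardware, dtype, and head size):

import os

# Hypothetical table for illustration; the real mapping lives in
# vLLM's attention selector.
_BACKENDS = {
    "FLASH_ATTN": "FlashAttentionBackend",
    "DIFFERENTIAL_FLASH_ATTN": "DifferentialFlashAttentionBackend",
}

def pick_backend(default: str = "FLASH_ATTN") -> str:
    # An explicit VLLM_ATTENTION_BACKEND override wins over autodetection.
    name = os.environ.get("VLLM_ATTENTION_BACKEND", default)
    if name not in _BACKENDS:
        raise ValueError(f"unknown attention backend: {name}")
    return _BACKENDS[name]

os.environ["VLLM_ATTENTION_BACKEND"] = "DIFFERENTIAL_FLASH_ATTN"
assert pick_backend() == "DifferentialFlashAttentionBackend"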


@@ -458,6 +458,31 @@ def test_bind_kv_cache():
     assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
     assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
 
+
+def test_bind_kv_cache_kv_sharing():
+    from vllm.attention import Attention
+    ctx = {
+        'layers.0.self_attn': Attention(32, 128, 0.1),
+        'layers.1.self_attn': Attention(32, 128, 0.1),
+        'layers.2.self_attn': Attention(32, 128, 0.1),
+        'layers.3.self_attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = [
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+    ]
+    shared_kv_cache_layers = {
+        'layers.2.self_attn': 'layers.1.self_attn',
+        'layers.3.self_attn': 'layers.0.self_attn'
+    }
+    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
+    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
+    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
+    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[1]
+    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[0]
+
 
 def test_bind_kv_cache_non_attention():
     from vllm.attention import Attention
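The new test exercises cross-layer KV-cache sharing: shared_kv_cache_layers maps a layer to the layer whose cache it reuses, so layers 2 and 3 alias the tensors bound to layers 1 and 0. A minimal self-contained sketch of that binding logic, using a hypothetical Layer stand-in for vllm.attention.Attention (the real bind_kv_cache in vLLM handles more cases):

import torch

class Layer:
    """Hypothetical stand-in: one kv_cache slot per virtual engine."""
    def __init__(self) -> None:
        self.kv_cache: list[torch.Tensor] = []

def bind_kv_cache_sketch(ctx, kv_caches, shared_kv_cache_layers=None):
    shared_kv_cache_layers = shared_kv_cache_layers or {}
    # Layers that own a cache tensor, in deterministic order.
    owners = sorted(name for name in ctx if name not in shared_kv_cache_layers)
    for ve_caches in kv_caches:  # one list of tensors per virtual engine
        for idx, name in enumerate(owners):
            ctx[name].kv_cache.append(ve_caches[idx])
        # A sharing layer aliases the cache bound to its target layer.
        for name, target in shared_kv_cache_layers.items():
            ctx[name].kv_cache.append(ctx[target].kv_cache[-1])

ctx = {f'layers.{i}.self_attn': Layer() for i in range(4)}
kv_cache = [torch.zeros((1, )) for _ in range(4)]
bind_kv_cache_sketch(
    ctx, [kv_cache],
    {'layers.2.self_attn': 'layers.1.self_attn',
     'layers.3.self_attn': 'layers.0.self_attn'})
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[1]
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[0]

This mirrors the identity assertions in the test above: sharing is by object aliasing, not by copying, so a write into a shared cache is visible to both layers.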