[Model] Add LFM2 architecture (#22845)

Signed-off-by: Paul Pak <paulpak58@gmail.com>
2025-08-21 01:35:07 -06:00
parent 31282401b6
commit 2e2000f352
11 changed files with 960 additions and 8 deletions
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -31,6 +31,7 @@ HYBRID_MODELS = [
    "hmellor/tiny-random-BambaForCausalLM",
    "ibm-granite/granite-4.0-tiny-preview",
    "tiiuae/Falcon-H1-0.5B-Base",
+    "LiquidAI/LFM2-1.2B",
 ]

 HF_UNSUPPORTED_MODELS = [
@@ -52,6 +53,7 @@ V1_SUPPORTED_MODELS = [
    "hmellor/tiny-random-BambaForCausalLM",
    "ibm-granite/granite-4.0-tiny-preview",
    "tiiuae/Falcon-H1-0.5B-Base",
+    "LiquidAI/LFM2-1.2B",
 ]

 FULL_CUDA_GRAPH_MODELS = [
@@ -59,6 +61,10 @@ FULL_CUDA_GRAPH_MODELS = [
    "Zyphra/Zamba2-1.2B-instruct",
 ]

+V0_UNSUPPORTED_MODELS = [
+    "LiquidAI/LFM2-1.2B",
+]
+
 # Avoid OOM
 MAX_NUM_SEQS = 4

@@ -94,9 +100,12 @@ def test_models(
        else:
            hf_outputs = None

-    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+    if model not in V0_UNSUPPORTED_MODELS:
+        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+            vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, num_logprobs)
+    else:
+        vllm_v0_outputs = None

    if model in V1_SUPPORTED_MODELS:
        with monkeypatch.context() as m:
@@ -112,7 +121,7 @@ def test_models(
    else:
        vllm_v1_outputs = None

-    if hf_outputs is not None:
+    if hf_outputs is not None and vllm_v0_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v0_outputs,
@@ -122,6 +131,7 @@ def test_models(

    if model in V1_SUPPORTED_MODELS:
        ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
+        assert ref_outputs is not None
        check_logprobs_close(
            outputs_0_lst=ref_outputs,
            outputs_1_lst=vllm_v1_outputs,
@@ -140,6 +150,9 @@ def test_batching(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
+    if model in V0_UNSUPPORTED_MODELS:
+        pytest.skip(
+            f"Unsupported V0 Engine. Skipping `test_batching` on {model}.")

    try:
        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
@@ -392,9 +405,12 @@ def test_full_cuda_graph(
        else:
            hf_outputs = None

-    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+    if model not in V0_UNSUPPORTED_MODELS:
+        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+            vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, num_logprobs)
+    else:
+        vllm_v0_outputs = None

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
@@ -408,7 +424,7 @@ def test_full_cuda_graph(
            vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, num_logprobs)

-    if hf_outputs is not None:
+    if hf_outputs is not None and vllm_v0_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v0_outputs,
@@ -417,6 +433,7 @@ def test_full_cuda_graph(
        )

    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
+    assert ref_outputs is not None
    check_logprobs_close(
        outputs_0_lst=ref_outputs,
        outputs_1_lst=vllm_v1_outputs,