[Model] Add LFM2 architecture (#22845)

Signed-off-by: Paul Pak <paulpak58@gmail.com>
This commit is contained in:
Paul Pak
2025-08-21 01:35:07 -06:00
committed by GitHub
parent 31282401b6
commit 2e2000f352
11 changed files with 960 additions and 8 deletions

View File

@@ -31,6 +31,7 @@ HYBRID_MODELS = [
"hmellor/tiny-random-BambaForCausalLM",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
"LiquidAI/LFM2-1.2B",
]
HF_UNSUPPORTED_MODELS = [
@@ -52,6 +53,7 @@ V1_SUPPORTED_MODELS = [
"hmellor/tiny-random-BambaForCausalLM",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
"LiquidAI/LFM2-1.2B",
]
FULL_CUDA_GRAPH_MODELS = [
@@ -59,6 +61,10 @@ FULL_CUDA_GRAPH_MODELS = [
"Zyphra/Zamba2-1.2B-instruct",
]
# Models for which the V0-engine run is skipped by the tests in this file:
# `test_models` and `test_full_cuda_graph` leave `vllm_v0_outputs` as None
# for them, and `test_batching` calls `pytest.skip`.
V0_UNSUPPORTED_MODELS = [
"LiquidAI/LFM2-1.2B",
]
# Avoid OOM
# (passed as `max_num_seqs` to `vllm_runner` in these tests)
MAX_NUM_SEQS = 4
@@ -94,9 +100,12 @@ def test_models(
else:
hf_outputs = None
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
if model not in V0_UNSUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v0_outputs = None
if model in V1_SUPPORTED_MODELS:
with monkeypatch.context() as m:
@@ -112,7 +121,7 @@ def test_models(
else:
vllm_v1_outputs = None
if hf_outputs is not None:
if hf_outputs is not None and vllm_v0_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
@@ -122,6 +131,7 @@ def test_models(
if model in V1_SUPPORTED_MODELS:
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
assert ref_outputs is not None
check_logprobs_close(
outputs_0_lst=ref_outputs,
outputs_1_lst=vllm_v1_outputs,
@@ -140,6 +150,9 @@ def test_batching(
max_tokens: int,
num_logprobs: int,
) -> None:
if model in V0_UNSUPPORTED_MODELS:
pytest.skip(
f"Unsupported V0 Engine. Skipping `test_batching` on {model}.")
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
@@ -392,9 +405,12 @@ def test_full_cuda_graph(
else:
hf_outputs = None
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
if model not in V0_UNSUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v0_outputs = None
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
@@ -408,7 +424,7 @@ def test_full_cuda_graph(
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
if hf_outputs is not None:
if hf_outputs is not None and vllm_v0_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
@@ -417,6 +433,7 @@ def test_full_cuda_graph(
)
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
assert ref_outputs is not None
check_logprobs_close(
outputs_0_lst=ref_outputs,
outputs_1_lst=vllm_v1_outputs,