[Model] Support Mamba2 (Codestral Mamba) (#9292)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
commit 1f69c4a892
parent 7b623fca0b
tests/models/test_mamba.py
@@ -4,6 +4,7 @@
 Run `pytest tests/models/test_mamba.py`.
 """
 import pytest
+import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from vllm.engine.arg_utils import EngineArgs
@@ -11,7 +12,14 @@ from vllm.sampling_params import SamplingParams
 
 from ...utils import check_outputs_equal
 
-MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"]
+MODELS = [
+    "state-spaces/mamba-130m-hf",
+    "tiiuae/falcon-mamba-tiny-dev",
+    # TODO: Compare to a Mamba2 model. The HF transformers implementation of
+    # Mamba2 is buggy for Codestral as it doesn't handle n_groups.
+    # See https://github.com/huggingface/transformers/pull/35943
+    # "mistralai/Mamba-Codestral-7B-v0.1",
+]
 
 
 # Use lower-level interfaces to create this greedy generator, as mamba will
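Note (not part of the diff): the MODELS list above is consumed by pytest-parametrized tests later in tests/models/test_mamba.py; the decorators are outside this hunk. A minimal sketch of that pattern, with illustrative parameter values:

    import pytest

    MODELS = [
        "state-spaces/mamba-130m-hf",
        "tiiuae/falcon-mamba-tiny-dev",
    ]


    @pytest.mark.parametrize("model", MODELS)
    @pytest.mark.parametrize("dtype", ["float"])
    @pytest.mark.parametrize("max_tokens", [96])
    def test_models(model: str, dtype: str, max_tokens: int) -> None:
        # Each (model, dtype, max_tokens) combination runs as its own test
        # case, so re-adding "mistralai/Mamba-Codestral-7B-v0.1" to MODELS
        # extends coverage to Mamba2 once the upstream transformers fix lands.
        assert model in MODELS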
@@ -21,6 +29,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
 
+    # Set the device (GPU if available, else CPU)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
     # Generate texts from the prompts
     outputs = []
     for prompt in example_prompts:
@@ -29,7 +41,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
         input_ids = inputs["input_ids"].to(model.device)
 
         # Generate text using the model's generate method directly
-        generated_ids = model.generate(input_ids, max_new_tokens=max_tokens)
+        generated_ids = model.generate(input_ids,
+                                       max_new_tokens=max_tokens,
+                                       do_sample=False)
         generated_text = tokenizer.decode(generated_ids[0],
                                           skip_special_tokens=True)
 
@@ -50,7 +64,8 @@ def test_models(
 ) -> None:
     hf_outputs = generate_greedy(model, example_prompts, max_tokens)
 
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    # Set max_num_seqs to keep Codestral from going OOM at fp32
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     # This test is for verifying whether the model's extra_repr
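Aside (not from this commit): max_num_seqs caps how many sequences the engine batches at once, which bounds the preallocated Mamba state cache and peak memory; that is what keeps the fp32 Codestral run from going OOM. A rough sketch of the same knob through the public vLLM API, with an illustrative model and prompt:

    from vllm import LLM, SamplingParams

    llm = LLM(model="state-spaces/mamba-130m-hf",
              dtype="float32",
              max_num_seqs=16)  # cap concurrent sequences to bound memory use
    params = SamplingParams(temperature=0.0, max_tokens=32)  # greedy decoding
    outputs = llm.generate(["The theory of special relativity states"], params)
    print(outputs[0].outputs[0].text)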
@@ -81,7 +96,7 @@ def test_batching(
 ) -> None:
     # To pass the small model tests, we need full precision.
     for_loop_outputs = []
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
         for prompt in example_prompts:
             for_loop_outputs.append(
                 vllm_model.generate_greedy([prompt], max_tokens)[0])
@@ -165,20 +180,22 @@ def test_parallel_sampling(
     max_tokens: int,
 ) -> None:
 
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    # Numerical differences produce slightly different output for these
+    if 'state-spaces' in model:
+        example_prompts.pop(0)
+        example_prompts.pop(0)
+        example_prompts.pop(0)
+
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
         for_loop_outputs = []
         for _ in range(10):
             for_loop_outputs.append(
-                # using example_prompts index 1 instead of 0 since with 0 the
-                # logprobs get really close and the test doesn't pass
-                vllm_model.generate_greedy([example_prompts[1]], max_tokens)
-                [0])
+                vllm_model.generate_greedy(example_prompts, max_tokens)[0])
         sampling_params = SamplingParams(n=10,
                                          temperature=0.001,
                                          seed=0,
                                          max_tokens=max_tokens)
-        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
-                                             sampling_params)
+        n_lt_1_outputs = vllm_model.generate(example_prompts, sampling_params)
         token_ids, texts = n_lt_1_outputs[0]
         n_lt_1_outputs = [(token_id, text)
                           for token_id, text in zip(token_ids, texts)]
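Aside (not from this commit): this test compares n=10 parallel samples at near-zero temperature and fixed seed against repeated greedy runs, allowing for small numerical differences. A rough sketch of that sampling path through the public API, with an illustrative prompt:

    from vllm import LLM, SamplingParams

    llm = LLM(model="tiiuae/falcon-mamba-tiny-dev", max_num_seqs=16)
    params = SamplingParams(n=10, temperature=0.001, seed=0, max_tokens=15)
    result = llm.generate(["The president of the United States is"], params)
    for completion in result[0].outputs:  # one CompletionOutput per parallel sample
        print(repr(completion.text))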
@@ -232,7 +249,7 @@ def test_models_preemption_recompute(
     # Tests that outputs are identical with and w/o preemtions (recompute)
     assert dtype == "float"
 
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
         vllm_model.model.llm_engine.scheduler[
             0].ENABLE_ARTIFICIAL_PREEMPT = True
         preempt_vllm_outputs = vllm_model.generate_greedy(
@@ -283,7 +300,7 @@ def test_state_cleanup(
     # This test is for verifying that the Mamba state is cleaned up between
     # steps, If its not cleaned, an error would be expected.
     try:
-        with vllm_runner(model, dtype=dtype) as vllm_model:
+        with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
             for _ in range(10):
                 vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
     except ValueError: