Add Bamba Model (#10909)

Signed-off-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Author: Yu Chin Fabian Lim
Date: 2025-02-07 07:22:42 +08:00 (committed by GitHub)
Parent: 467a96a541
Commit: aff404571b
17 changed files with 3706 additions and 112 deletions


@@ -8,7 +8,8 @@ from vllm.sampling_params import SamplingParams
 from ...utils import check_outputs_equal
-MODELS = ["ai21labs/Jamba-tiny-dev"]
+# This test is for the hybrid models
+MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]
 @pytest.mark.parametrize("model", MODELS)
@@ -23,6 +24,10 @@ def test_models(
     max_tokens: int,
 ) -> None:
+    # numeric error produces different generation
+    if 'Bamba' in model:
+        example_prompts.pop(3)
+
     with hf_runner(
             model,
             dtype=dtype,
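
For readers unfamiliar with the fixture setup, here is a minimal sketch (not part of the commit) of how the parametrized MODELS list and the per-model prompt filtering interact; the prompt list and the assertion are illustrative stand-ins for vLLM's example_prompts fixture and output comparison.

```python
import pytest

MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]


@pytest.mark.parametrize("model", MODELS)
def test_models_sketch(model: str) -> None:
    # Stand-in for the example_prompts fixture.
    example_prompts = [f"prompt {i}" for i in range(8)]
    # Mirror the diff above: drop prompt 3 for Bamba, where small numeric
    # differences between the HF and vLLM paths change the generation.
    if 'Bamba' in model:
        example_prompts.pop(3)
    assert len(example_prompts) == (7 if 'Bamba' in model else 8)
```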
@@ -108,15 +113,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [10])
+@pytest.mark.parametrize("max_tokens", [7])
 def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
                                 model: str, dtype: str,
                                 max_tokens: int) -> None:
     # numeric error during prefill chunking produces different generation
     # compared to w/o prefill chunking for those examples, removed them for now
-    example_prompts.pop(7)
-    example_prompts.pop(2)
-    example_prompts.pop(1)
+    if 'Jamba' in model:
+        example_prompts.pop(7)
+        example_prompts.pop(2)
+        example_prompts.pop(1)
+    elif 'Bamba' in model:
+        example_prompts.pop(6)
+        example_prompts.pop(3)
+        example_prompts.pop(2)
+        dtype = "half"  # use a different dtype for Bamba
     with hf_runner(
             model,
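
The chunking test now branches per model: Jamba and Bamba drop different prompt indices, and Bamba additionally overrides the dtype to half precision. Below is a self-contained sketch of that pattern factored into a helper; the helper name and signature are invented for illustration and are not part of the commit.

```python
def filter_prompts_for_chunking(model: str, prompts: list[str],
                                dtype: str) -> tuple[list[str], str]:
    """Drop the prompts that are flaky under prefill chunking for a model."""
    prompts = list(prompts)  # do not mutate the caller's fixture
    if 'Jamba' in model:
        for idx in (7, 2, 1):
            prompts.pop(idx)
    elif 'Bamba' in model:
        for idx in (6, 3, 2):
            prompts.pop(idx)
        dtype = "half"  # use a different dtype for Bamba
    return prompts, dtype
```

Note that both the test and this sketch remove the highest index first, so the positions of the remaining prompts are not shifted by the earlier pops.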
@@ -145,7 +156,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [15])
 def test_parallel_sampling(
     vllm_runner,
@@ -249,17 +260,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
     dtype: str,
     example_prompts,
 ) -> None:
-    # This test is for verifying that the Jamba inner state management doesn't
+    # This test is for verifying that the hybrid inner state management doesn't
     # collapse in case where the number of incoming requests and
     # finished_requests_ids is larger than the maximum mamba block capacity.
-    # This could generally happen due to the fact that Jamba does support
+    # This could generally happen due to the fact that hybrid does support
     # statelessness mechanism where it can cleanup new incoming requests in
     # a single step.
     try:
         with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
             vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
     except ValueError:
-        pytest.fail("Jamba inner state wasn't cleaned up properly between"
+        pytest.fail("Hybrid inner state wasn't cleaned up properly between"
                     "steps finished requests registered unnecessarily ")
@@ -271,14 +282,14 @@ def test_state_cleanup(
     dtype: str,
     example_prompts,
 ) -> None:
-    # This test is for verifying that the Jamba state is cleaned up between
+    # This test is for verifying that the Hybrid state is cleaned up between
     # steps, If its not cleaned, an error would be expected.
     try:
         with vllm_runner(model, dtype=dtype) as vllm_model:
             for _ in range(10):
                 vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
     except ValueError:
-        pytest.fail("Jamba inner state wasn't cleaned up between states, "
+        pytest.fail("Hybrid inner state wasn't cleaned up between states, "
                     "could be related to finished_requests_ids")
@@ -324,7 +335,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [64])
-def test_jamba_distributed_produces_identical_generation(
+def test_hybrid_distributed_produces_identical_generation(
         vllm_runner, model: str, dtype: str, max_tokens: int,
         example_prompts) -> None: