[Model] Support Mamba2 (Codestral Mamba) (#9292)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
commit 1f69c4a892
parent 7b623fca0b
tests/models/test_mamba.py
@@ -4,6 +4,7 @@
 Run `pytest tests/models/test_mamba.py`.
 """
 import pytest
+import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from vllm.engine.arg_utils import EngineArgs
@@ -11,7 +12,14 @@ from vllm.sampling_params import SamplingParams
 
 from ...utils import check_outputs_equal
 
-MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"]
+MODELS = [
+    "state-spaces/mamba-130m-hf",
+    "tiiuae/falcon-mamba-tiny-dev",
+    # TODO: Compare to a Mamba2 model. The HF transformers implementation of
+    # Mamba2 is buggy for Codestral as it doesn't handle n_groups.
+    # See https://github.com/huggingface/transformers/pull/35943
+    # "mistralai/Mamba-Codestral-7B-v0.1",
+]
 
 
 # Use lower-level interfaces to create this greedy generator, as mamba will
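Note (not part of the diff): the MODELS list above is consumed by pytest-parametrized tests later in tests/models/test_mamba.py; the decorators are outside this hunk. A minimal sketch of that pattern, with illustrative parameter values:

    import pytest

    MODELS = [
        "state-spaces/mamba-130m-hf",
        "tiiuae/falcon-mamba-tiny-dev",
    ]


    @pytest.mark.parametrize("model", MODELS)
    @pytest.mark.parametrize("dtype", ["float"])
    @pytest.mark.parametrize("max_tokens", [96])
    def test_models(model: str, dtype: str, max_tokens: int) -> None:
        # Each (model, dtype, max_tokens) combination runs as its own test
        # case, so re-adding "mistralai/Mamba-Codestral-7B-v0.1" to MODELS
        # extends coverage to Mamba2 once the upstream transformers fix lands.
        assert model in MODELS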
@@ -21,6 +29,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
 
+    # Set the device (GPU if available, else CPU)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
     # Generate texts from the prompts
     outputs = []
     for prompt in example_prompts:
@@ -29,7 +41,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
         input_ids = inputs["input_ids"].to(model.device)
 
         # Generate text using the model's generate method directly
-        generated_ids = model.generate(input_ids, max_new_tokens=max_tokens)
+        generated_ids = model.generate(input_ids,
+                                       max_new_tokens=max_tokens,
+                                       do_sample=False)
         generated_text = tokenizer.decode(generated_ids[0],
                                           skip_special_tokens=True)
 
@@ -50,7 +64,8 @@ def test_models(
 ) -> None:
     hf_outputs = generate_greedy(model, example_prompts, max_tokens)
 
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    # Set max_num_seqs to keep Codestral from going OOM at fp32
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     # This test is for verifying whether the model's extra_repr
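Aside (not from this commit): max_num_seqs caps how many sequences the engine batches at once, which bounds the preallocated Mamba state cache and peak memory; that is what keeps the fp32 Codestral run from going OOM. A rough sketch of the same knob through the public vLLM API, with an illustrative model and prompt:

    from vllm import LLM, SamplingParams

    llm = LLM(model="state-spaces/mamba-130m-hf",
              dtype="float32",
              max_num_seqs=16)  # cap concurrent sequences to bound memory use
    params = SamplingParams(temperature=0.0, max_tokens=32)  # greedy decoding
    outputs = llm.generate(["The theory of special relativity states"], params)
    print(outputs[0].outputs[0].text)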
@@ -81,7 +96,7 @@ def test_batching(
 ) -> None:
     # To pass the small model tests, we need full precision.
     for_loop_outputs = []
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
         for prompt in example_prompts:
             for_loop_outputs.append(
                 vllm_model.generate_greedy([prompt], max_tokens)[0])
@@ -165,20 +180,22 @@ def test_parallel_sampling(
     max_tokens: int,
 ) -> None:
 
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    # Numerical differences produce slightly different output for these
+    if 'state-spaces' in model:
+        example_prompts.pop(0)
+        example_prompts.pop(0)
+        example_prompts.pop(0)
+
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
         for_loop_outputs = []
         for _ in range(10):
             for_loop_outputs.append(
-                # using example_prompts index 1 instead of 0 since with 0 the
-                # logprobs get really close and the test doesn't pass
-                vllm_model.generate_greedy([example_prompts[1]], max_tokens)
-                [0])
+                vllm_model.generate_greedy(example_prompts, max_tokens)[0])
         sampling_params = SamplingParams(n=10,
                                          temperature=0.001,
                                          seed=0,
                                          max_tokens=max_tokens)
-        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
-                                             sampling_params)
+        n_lt_1_outputs = vllm_model.generate(example_prompts, sampling_params)
         token_ids, texts = n_lt_1_outputs[0]
         n_lt_1_outputs = [(token_id, text)
                           for token_id, text in zip(token_ids, texts)]
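Aside (not from this commit): this test compares n=10 parallel samples at near-zero temperature and fixed seed against repeated greedy runs, allowing for small numerical differences. A rough sketch of that sampling path through the public API, with an illustrative prompt:

    from vllm import LLM, SamplingParams

    llm = LLM(model="tiiuae/falcon-mamba-tiny-dev", max_num_seqs=16)
    params = SamplingParams(n=10, temperature=0.001, seed=0, max_tokens=15)
    result = llm.generate(["The president of the United States is"], params)
    for completion in result[0].outputs:  # one CompletionOutput per parallel sample
        print(repr(completion.text))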
@@ -232,7 +249,7 @@ def test_models_preemption_recompute(
     # Tests that outputs are identical with and w/o preemtions (recompute)
     assert dtype == "float"
 
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
         vllm_model.model.llm_engine.scheduler[
             0].ENABLE_ARTIFICIAL_PREEMPT = True
         preempt_vllm_outputs = vllm_model.generate_greedy(
@@ -283,7 +300,7 @@ def test_state_cleanup(
     # This test is for verifying that the Mamba state is cleaned up between
     # steps, If its not cleaned, an error would be expected.
     try:
-        with vllm_runner(model, dtype=dtype) as vllm_model:
+        with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
             for _ in range(10):
                 vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
     except ValueError: