[Model][MiniMaxText01] Support MiniMaxText01 model inference (#13454)

Signed-off-by: qscqesze <475517977@qq.com> Co-authored-by: qingjun <qingjun@minimaxi.com> Co-authored-by: qscqesze <475517977@qq.com>
2025-04-02 04:23:55 +08:00
parent 93491aefc7
commit 9ef98d527e
11 changed files with 2439 additions and 129 deletions
--- a/vllm/model_executor/models/constant_size_cache.py
+++ b/vllm/model_executor/models/constant_size_cache.py
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Tuple
+
+import torch
+
+from vllm.attention.backends.utils import PAD_SLOT_ID
+
+
+class ConstantSizeCache(ABC):
+    """
+    Abstract base class for managing constant size caches 
+    like Mamba and Minimax.
+    """
+
+    def __init__(self, max_batch_size: int):
+        # Maps between the request id and a dict that maps between the seq_id
+        # and its index inside the cache
+        self.cache_indices_mapping: Dict[str, Dict[int, int]] = {}
+        self.free_cache_indices = list(range(max_batch_size))
+
+    @property
+    @abstractmethod
+    def cache(self) -> Any:
+        """Return the underlying cache tensor(s)"""
+        pass
+
+    @abstractmethod
+    def _copy_cache(self, from_index: int, to_index: int):
+        """Copy cache data from one index to another"""
+        pass
+
+    def current_run_tensors(self, **kwargs) -> Tuple:
+        """
+        Return the tensors for the current run's conv and ssm state.
+        """
+        if "seqlen_agnostic_capture_inputs" not in kwargs:
+            # We get here only on Prefill/Eager mode runs
+            request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
+            finished_requests_ids = kwargs["finished_requests_ids"]
+
+            self._release_finished_requests(finished_requests_ids)
+            state_indices = self._prepare_current_run_cache(
+                request_ids_to_seq_ids, finished_requests_ids)
+
+            state_indices_tensor = torch.as_tensor(state_indices,
+                                                   dtype=torch.int32,
+                                                   device="cuda")
+            cache_tensors = self.cache
+        else:
+            # CUDA graph capturing runs
+            cache_tensors, state_indices_tensor = kwargs[
+                "seqlen_agnostic_capture_inputs"]
+
+        return (cache_tensors, state_indices_tensor)
+
+    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
+        """
+        Copy the relevant state_indices into the CUDA graph input buffer 
+        """
+        assert all(
+            key in kwargs
+            for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
+        finished_requests_ids = kwargs["finished_requests_ids"]
+        request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
+        assert "seqlen_agnostic_capture_inputs" in input_buffers
+        _, input_state_indices_buffer = input_buffers[
+            "seqlen_agnostic_capture_inputs"]
+
+        self._release_finished_requests(finished_requests_ids)
+        state_indices = self._prepare_current_run_cache(
+            request_ids_to_seq_ids, finished_requests_ids)
+        cuda_graph_pad_len = input_state_indices_buffer.shape[0] - len(
+            state_indices)
+        state_indices.extend([PAD_SLOT_ID] * cuda_graph_pad_len)
+
+        input_state_indices_buffer.copy_(
+            torch.as_tensor(state_indices, dtype=torch.int32, device="cuda"))
+
+    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
+        """
+        Provide the CUDA graph capture runs with a buffer in adjusted size.
+        The buffer is used to maintain the Cache during the CUDA graph replay
+        runs.
+        """
+        state_indices_tensor = torch.as_tensor([PAD_SLOT_ID] * batch_size,
+                                               dtype=torch.int32,
+                                               device="cuda")
+        return (self.cache, state_indices_tensor)
+
+    def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int,
+                                      finished_requests_ids) -> int:
+        """
+        Assign (req_id,seq_id) pair to a `destination_index` index, if
+        already occupied, move the occupying index to a free index.
+        """
+        if cur_rid in finished_requests_ids:
+            # set as pad, do not allocate destination index
+            return PAD_SLOT_ID
+        elif cur_rid not in self.cache_indices_mapping:
+            destination_index = self.free_cache_indices.pop()
+            self.cache_indices_mapping[cur_rid] = {seq_id: destination_index}
+            return destination_index
+        elif seq_id not in (seq_ids2indices :=
+                            self.cache_indices_mapping[cur_rid]):
+            # parallel sampling , where n > 1, assume prefill have
+            # already happened, so we copy the
+            # existing cache into the siblings seq_ids caches
+            index_exists = next(iter(seq_ids2indices.values()))
+            # case of decoding n>1, copy prefill cache to decoding indices
+            destination_index = self.free_cache_indices.pop()
+            self._copy_cache(from_index=index_exists,
+                             to_index=destination_index)
+            self.cache_indices_mapping[cur_rid][seq_id] = destination_index
+            return destination_index
+        else:
+            return self.cache_indices_mapping[cur_rid][seq_id]
+
+    def _prepare_current_run_cache(
+            self, request_ids_to_seq_ids: Dict[str, list[int]],
+            finished_requests_ids: List[str]) -> List[int]:
+        return [
+            self._assign_seq_id_to_cache_index(req_id, seq_id,
+                                               finished_requests_ids)
+            for req_id, seq_ids in request_ids_to_seq_ids.items()
+            for seq_id in seq_ids
+        ]
+
+    def _release_finished_requests(self,
+                                   finished_seq_groups_req_ids: List[str]):
+        for req_id in finished_seq_groups_req_ids:
+            if req_id in self.cache_indices_mapping:
+                for seq_id in self.cache_indices_mapping[req_id]:
+                    self.free_cache_indices.append(
+                        self.cache_indices_mapping[req_id][seq_id])
+                self.cache_indices_mapping.pop(req_id)
--- a/vllm/model_executor/models/mamba_cache.py
+++ b/vllm/model_executor/models/mamba_cache.py
@@ -1,12 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0

 from dataclasses import dataclass
-from typing import Dict, List, Tuple
+from typing import Tuple

 import torch

 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import VllmConfig
+from vllm.model_executor.models.constant_size_cache import ConstantSizeCache


@dataclass
@@ -21,7 +22,7 @@ class MambaCacheParams:
                                self.state_indices_tensor)


-class MambaCacheManager:
+class MambaCacheManager(ConstantSizeCache):

    def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype,
                 num_mamba_layers: int, conv_state_shape: Tuple[int, int],
@@ -32,6 +33,9 @@ class MambaCacheManager:
        if not vllm_config.model_config.enforce_eager:
            max_batch_size = vllm_config.pad_for_cudagraph(max_batch_size)

+        # Initialize parent class
+        super().__init__(max_batch_size)
+
        conv_state = torch.empty(size=(num_mamba_layers, max_batch_size) +
                                 conv_state_shape,
                                 dtype=dtype,
@@ -41,126 +45,32 @@ class MambaCacheManager:
                                     dtype=dtype,
                                     device="cuda")

-        self.mamba_cache = (conv_state, temporal_state)
+        self._mamba_cache = (conv_state, temporal_state)

-        # Maps between the request id and a dict that maps between the seq_id
-        # and its index inside the self.mamba_cache
-        self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {}
-        self.free_cache_indices = list(range(max_batch_size))
+    @property
+    def cache(self):
+        return self._mamba_cache
+
+    def _copy_cache(self, from_index: int, to_index: int):
+        for cache_t in self.cache:
+            cache_t[:, to_index].copy_(cache_t[:, from_index],
+                                       non_blocking=True)

    def current_run_tensors(self, **kwargs) -> MambaCacheParams:
        """
        Return the tensors for the current run's conv and ssm state.
        """
-        if "seqlen_agnostic_capture_inputs" not in kwargs:
-            # We get here only on Prefill/Eager mode runs
-            request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
-            finished_requests_ids = kwargs["finished_requests_ids"]
-
-            self._release_finished_requests(finished_requests_ids)
-            state_indices = self._prepare_current_run_mamba_cache(
-                request_ids_to_seq_ids, finished_requests_ids)
-
-            state_indices_tensor = torch.as_tensor(state_indices,
-                                                   dtype=torch.int32,
-                                                   device="cuda")
-            mamba_cache_tensors = self.mamba_cache
-
-        else:
-            # CUDA graph capturing runs
-            (mamba_cache_tensors,
-             state_indices_tensor) = kwargs["seqlen_agnostic_capture_inputs"]
-
-        return MambaCacheParams(mamba_cache_tensors[0], mamba_cache_tensors[1],
+        cache_tensors, state_indices_tensor = super().current_run_tensors(
+            **kwargs)
+        return MambaCacheParams(cache_tensors[0], cache_tensors[1],
                                state_indices_tensor)

-    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
-        """
-        Copy the relevant state_indices into the CUDA graph input buffer 
-        """
-        assert all(
-            key in kwargs
-            for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
-        finished_requests_ids = kwargs["finished_requests_ids"]
-        request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
-        assert "seqlen_agnostic_capture_inputs" in input_buffers
-        _, input_state_indices_buffer = input_buffers[
-            "seqlen_agnostic_capture_inputs"]
-
-        self._release_finished_requests(finished_requests_ids)
-        state_indices = self._prepare_current_run_mamba_cache(
-            request_ids_to_seq_ids, finished_requests_ids)
-        cuda_graph_pad_len = input_state_indices_buffer.shape[0] - len(
-            state_indices)
-        state_indices.extend([PAD_SLOT_ID] * cuda_graph_pad_len)
-
-        input_state_indices_buffer.copy_(
-            torch.as_tensor(state_indices, dtype=torch.int32, device="cuda"))
-
    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
        """
        Provide the CUDA graph capture runs with a buffer in adjusted size.
        The buffer is used to maintain the Mamba Cache during the CUDA graph
        replay runs.
        """
-        state_indices_tensor = torch.as_tensor([PAD_SLOT_ID] * batch_size,
-                                               dtype=torch.int32,
-                                               device="cuda")
-        return (self.mamba_cache, state_indices_tensor)
-
-    def _copy_mamba_cache(self, from_index: int, to_index: int):
-        assert len(self.mamba_cache) > 0
-        for cache_t in self.mamba_cache:
-            cache_t[:, to_index].copy_(cache_t[:, from_index],
-                                       non_blocking=True)
-
-    def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int,
-                                      finished_requests_ids) -> int:
-        """
-        Assign (req_id,seq_id) pair to a `destination_index` index, if
-        already occupied, move the occupying index to a free index.
-        """
-        if cur_rid in finished_requests_ids:
-            # set as pad, do not allocate destination index
-            return PAD_SLOT_ID
-        elif cur_rid not in self.mamba_cache_indices_mapping:
-            destination_index = self.free_cache_indices.pop()
-            self.mamba_cache_indices_mapping[cur_rid] = {
-                seq_id: destination_index
-            }
-            return destination_index
-        elif seq_id not in (seq_ids2indices :=
-                            self.mamba_cache_indices_mapping[cur_rid]):
-            # parallel sampling , where n > 1, assume prefill have
-            # already happened, so we copy the
-            # existing cache into the siblings seq_ids caches
-            index_exists = next(iter(seq_ids2indices.values()))
-            # case of decoding n>1, copy prefill cache to decoding indices
-            destination_index = self.free_cache_indices.pop()
-            self._copy_mamba_cache(from_index=index_exists,
-                                   to_index=destination_index)
-            self.mamba_cache_indices_mapping[cur_rid][
-                seq_id] = destination_index
-            return destination_index
-        else:
-            # already exists
-            return self.mamba_cache_indices_mapping[cur_rid][seq_id]
-
-    def _prepare_current_run_mamba_cache(
-            self, request_ids_to_seq_ids: Dict[str, list[int]],
-            finished_requests_ids: List[str]) -> List[int]:
-        return [
-            self._assign_seq_id_to_cache_index(req_id, seq_id,
-                                               finished_requests_ids)
-            for req_id, seq_ids in request_ids_to_seq_ids.items()
-            for seq_id in seq_ids
-        ]
-
-    def _release_finished_requests(self,
-                                   finished_seq_groups_req_ids: List[str]):
-        for req_id in finished_seq_groups_req_ids:
-            if req_id in self.mamba_cache_indices_mapping:
-                for seq_id in self.mamba_cache_indices_mapping[req_id]:
-                    self.free_cache_indices.append(
-                        self.mamba_cache_indices_mapping[req_id][seq_id])
-                self.mamba_cache_indices_mapping.pop(req_id)
+        return self._mamba_cache, torch.as_tensor([PAD_SLOT_ID] * batch_size,
+                                                  dtype=torch.int32,
+                                                  device="cuda")
--- a/vllm/model_executor/models/minimax_cache.py
+++ b/vllm/model_executor/models/minimax_cache.py
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import dataclass
+
+import torch
+
+from vllm.model_executor.models.constant_size_cache import ConstantSizeCache
+
+
+@dataclass
+class MinimaxCacheParams:
+    minimax_cache: torch.Tensor = torch.Tensor()
+    state_indices_tensor: torch.Tensor = torch.Tensor()
+
+    def at_layer_idx(self, layer_idx):
+        return MinimaxCacheParams(self.minimax_cache[layer_idx, ...],
+                                  self.state_indices_tensor)
+
+
+class MinimaxCacheManager(ConstantSizeCache):
+
+    def __init__(self, dtype, cache_shape):
+        super().__init__(cache_shape[1])  # max_batch_size is cache_shape[1]
+        self._minimax_cache = torch.empty(size=cache_shape,
+                                          dtype=dtype,
+                                          device="cuda")
+
+    @property
+    def cache(self):
+        return self._minimax_cache
+
+    def _copy_cache(self, from_index: int, to_index: int):
+        assert len(self.cache) > 0
+        for cache_t in self.cache:
+            cache_t[:, to_index].copy_(cache_t[:, from_index],
+                                       non_blocking=True)
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -35,6 +35,7 @@ _TEXT_GENERATION_MODELS = {
    "AquilaModel": ("llama", "LlamaForCausalLM"),
    "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
    "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
+    "MiniMaxText01ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"),
    # baichuan-7b, upper case 'C' in the class name
    "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),
    # baichuan-13b, lower case 'c' in the class name