[Kernel] optimize moe_align_block_size for cuda graph and large num_experts (e.g. DeepSeek-V3) (#12222)

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Author:    Jinzhen Lin
Date:      2025-01-21 08:42:16 +08:00
Committer: GitHub
Parent:    06a760d6e8
Commit:    750f4cabfa
2 changed files with 58 additions and 37 deletions


@@ -607,7 +607,7 @@ class ModelConfig:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
-        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama']
         if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
                 and not self.enforce_eager):
             logger.warning(
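
For context, moe_align_block_size is the step that takes the flattened token-to-expert assignments from top-k routing, groups the slots by expert, and pads each expert's slot count up to a multiple of the GEMM block size so the fused MoE kernel can launch whole fixed-size blocks per expert. Below is a minimal pure-PyTorch sketch of that alignment semantics, not the optimized CUDA kernel this commit ships; the function name moe_align_block_size_ref and the exact output layout are assumptions for illustration only.

    import torch

    def moe_align_block_size_ref(topk_ids: torch.Tensor, block_size: int,
                                 num_experts: int):
        # Flatten the (num_tokens, top_k) expert assignments into one slot list.
        flat = topk_ids.flatten()
        num_slots = flat.numel()
        # Tokens per expert, rounded up to a multiple of block_size so the
        # fused MoE GEMM can process whole blocks per expert.
        counts = torch.bincount(flat, minlength=num_experts)
        padded = (counts + block_size - 1) // block_size * block_size
        num_tokens_post_padded = int(padded.sum())
        # Padding slots point one past the last valid slot index.
        sorted_token_ids = torch.full((num_tokens_post_padded,), num_slots,
                                      dtype=torch.int32)
        # One expert id per block of block_size padded slots.
        expert_ids = torch.repeat_interleave(
            torch.arange(num_experts, dtype=torch.int32),
            padded // block_size)
        # A stable sort groups slots by expert while preserving token order.
        order = torch.argsort(flat, stable=True)
        starts = torch.cumsum(counts, 0) - counts    # unpadded group starts
        offsets = torch.cumsum(padded, 0) - padded   # padded group starts
        within = torch.arange(num_slots) - torch.repeat_interleave(starts, counts)
        positions = torch.repeat_interleave(offsets, counts) + within
        sorted_token_ids[positions] = order.to(torch.int32)
        return sorted_token_ids, expert_ids, num_tokens_post_padded

For example, with block_size = 4 and topk_ids = [[0, 2], [2, 1]], the per-expert counts are [1, 1, 2], each padded to a full block of 4, so num_tokens_post_padded is 12 and expert_ids is [0, 1, 2]. Presumably it is the kernel-side optimization of this mapping (in the second changed file, not shown in this hunk) that makes it efficient and CUDA-graph-capturable for expert counts on the order of DeepSeek-V3's 256 routed experts, which is why deepseek_v3 is dropped from MODEL_NOT_SUPPORT_CUDA_GRAPH above.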