Deepseek v3 (#11502)
Some checks failed: the "Create Release / Create Release (push)" workflow run was cancelled.

Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: robertgshaw2-neuralmagic <rshaw@neuralmagic.com>
This commit is contained in:
Simon Mo
2024-12-26 16:09:44 -08:00
committed by GitHub
parent 55fb97f7bd
commit f49777ba62
7 changed files with 886 additions and 60 deletions

View File

@@ -596,6 +596,12 @@ class ModelConfig:
self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
self.max_model_len)
if (self.hf_config.model_type == 'deepseek_v3'
and not self.enforce_eager):
logger.warning("CUDA graph is not supported for Deepseek V3 yet, "
"fallback to the eager mode.")
self.enforce_eager = True
def _verify_bnb_config(self) -> None:
"""
The current version of bitsandbytes (0.44.0) with 8-bit models does not
@@ -712,8 +718,9 @@ class ModelConfig:
def get_head_size(self) -> int:
# TODO remove hard code
if hasattr(self.hf_text_config, "model_type"
) and self.hf_text_config.model_type == 'deepseek_v2':
if hasattr(self.hf_text_config,
"model_type") and (self.hf_text_config.model_type
in ('deepseek_v2', 'deepseek_v3')):
# FlashAttention supports only head_size 32, 64, 128, 256,
# we need to pad head_size 192 to 256
return 256