[New Model] DeepSeek-V3.2 (Rebased to Main) (#25896)

Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: NickLucche <nlucches@redhat.com> Signed-off-by: Yongye Zhu <zyy1102000@gmail.com> Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com> Signed-off-by: Lucia Fang <fanglu@meta.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: mgoin <mgoin64@gmail.com> Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Co-authored-by: Lucia Fang <fanglu@meta.com> Co-authored-by: NickLucche <nlucches@redhat.com> Co-authored-by: Siyuan Fu <siyuanf@nvidia.com> Co-authored-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Xiaozhu Meng <mxz297@gmail.com> Co-authored-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com> Signed-off-by: simon-mo <simon.mo@hey.com>
2025-09-30 05:14:41 -04:00
parent 03df0fb5d2
commit b3230e1ac0
71 changed files with 3915 additions and 218 deletions
--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for v1 MLA backends without GPUModelRunner dependency."""
+from typing import Optional, Union

 import pytest
 import torch
@@ -10,6 +11,7 @@ from tests.v1.attention.utils import (BatchSpec, _Backend,
                                      create_standard_kv_cache_spec,
                                      create_vllm_config,
                                      get_attention_backend)
+from vllm import _custom_ops as ops
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import FullAttentionSpec
@@ -78,7 +80,9 @@ def create_and_prepopulate_kv_cache(
        device: torch.device,
        num_blocks: int,
        common_attn_metadata: CommonAttentionMetadata,
-        randomize_blocks: bool = True) -> torch.Tensor:
+        randomize_blocks: bool = True,
+        kv_cache_dtype: Optional[str] = None,
+        scale: Union[float, torch.Tensor] = 1.0) -> torch.Tensor:
    """Create and prepopulate an MLA KV cache with context data.
    
    Args:
@@ -93,6 +97,11 @@ def create_and_prepopulate_kv_cache(
        common_attn_metadata: Common attention metadata
        randomize_blocks: Whether to randomly permute blocks 
                          or use sequential order
+        kv_cache_dtype: Optional kv cache dtype string. When set to
+                        "fp8_ds_mla" the cache is populated using the
+                        fp8 DeepSeek MLA layout via concat_and_cache_mla.
+        scale: Scaling factor forwarded to concat_and_cache_mla when the
+               fp8 cache layout is requested.
        
    Returns:
        MLA KV cache tensor
@@ -105,23 +114,61 @@ def create_and_prepopulate_kv_cache(
    block_table = common_attn_metadata.block_table_tensor
    slot_mapping = common_attn_metadata.slot_mapping

-    # Create MLA KV cache: (num_blocks, block_size, head_size)
-    kv_cache = torch.empty(num_blocks,
-                           block_size,
-                           head_size,
-                           dtype=dtype,
-                           device=device)
-    kv_cache_flat = kv_cache.view(-1, head_size)
+    use_fp8_ds_mla = kv_cache_dtype == "fp8_ds_mla"
+
+    if use_fp8_ds_mla:
+        if not kv_c_contexts:
+            raise ValueError("kv_c_contexts cannot be empty when using"
+                             " fp8_ds_mla cache dtype")
+        kv_lora_rank = kv_c_contexts[0].shape[-1]
+        rope_dim = k_pe_contexts[0].shape[-1]
+        entry_size = kv_lora_rank + 4 * 4 + 2 * rope_dim
+        kv_cache = torch.zeros(num_blocks,
+                               block_size,
+                               entry_size,
+                               dtype=torch.uint8,
+                               device=device)
+        scale_tensor = (scale
+                        if isinstance(scale, torch.Tensor) else torch.tensor(
+                            scale, dtype=torch.float32, device=device))
+        scale_tensor = scale_tensor.to(device=device, dtype=torch.float32)
+    else:
+        # Create MLA KV cache: (num_blocks, block_size, head_size)
+        kv_cache = torch.empty(num_blocks,
+                               block_size,
+                               head_size,
+                               dtype=dtype,
+                               device=device)
+        kv_cache_flat = kv_cache.view(-1, head_size)

    # Populate the cache with the context tokens
    # Start from block_id=1 since block_id=0 is considered the null block
    start_block_idx = 1
    for i in range(batch_size):
        kv_c_context, k_pe_context = kv_c_contexts[i], k_pe_contexts[i]
-        kv_context = torch.cat([kv_c_context, k_pe_context.squeeze(1)], dim=-1)
+        context_len = kv_c_context.shape[0]
+        if context_len == 0:
+            start_block_idx += cdiv(int(seq_lens[i]), block_size)
+            continue
+
        start = start_block_idx * block_size
-        end = start + kv_context.shape[0]
-        kv_cache_flat[start:end, ...] = kv_context
+
+        if use_fp8_ds_mla:
+            slots = torch.arange(context_len, device=device,
+                                 dtype=torch.long) + start
+            ops.concat_and_cache_mla(
+                kv_c_context,
+                k_pe_context.squeeze(1),
+                kv_cache,
+                slots,
+                kv_cache_dtype="fp8_ds_mla",
+                scale=scale_tensor,
+            )
+        else:
+            kv_context = torch.cat(
+                [kv_c_context, k_pe_context.squeeze(1)], dim=-1)
+            end = start + kv_context.shape[0]
+            kv_cache_flat[start:end, ...] = kv_context

        # Stay block aligned and allocate enough blocks for the new tokens
        start_block_idx += cdiv(int(seq_lens[i]), block_size)