[New Model] DeepSeek-V3.2 (Rebased to Main) (#25896)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Signed-off-by: Lucia Fang <fanglu@meta.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: Lucia Fang <fanglu@meta.com>
Co-authored-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Xiaozhu Meng <mxz297@gmail.com>
Co-authored-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
This commit is contained in:
Yongye Zhu
2025-09-30 05:14:41 -04:00
committed by simon-mo
parent 03df0fb5d2
commit b3230e1ac0
71 changed files with 3915 additions and 218 deletions

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for v1 MLA backends without GPUModelRunner dependency."""
from typing import Optional, Union
import pytest
import torch
@@ -10,6 +11,7 @@ from tests.v1.attention.utils import (BatchSpec, _Backend,
create_standard_kv_cache_spec,
create_vllm_config,
get_attention_backend)
from vllm import _custom_ops as ops
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec
@@ -78,7 +80,9 @@ def create_and_prepopulate_kv_cache(
device: torch.device,
num_blocks: int,
common_attn_metadata: CommonAttentionMetadata,
randomize_blocks: bool = True) -> torch.Tensor:
randomize_blocks: bool = True,
kv_cache_dtype: Optional[str] = None,
scale: Union[float, torch.Tensor] = 1.0) -> torch.Tensor:
"""Create and prepopulate an MLA KV cache with context data.
Args:
@@ -93,6 +97,11 @@ def create_and_prepopulate_kv_cache(
common_attn_metadata: Common attention metadata
randomize_blocks: Whether to randomly permute blocks
or use sequential order
kv_cache_dtype: Optional kv cache dtype string. When set to
"fp8_ds_mla" the cache is populated using the
fp8 DeepSeek MLA layout via concat_and_cache_mla.
scale: Scaling factor forwarded to concat_and_cache_mla when the
fp8 cache layout is requested.
Returns:
MLA KV cache tensor
@@ -105,23 +114,61 @@ def create_and_prepopulate_kv_cache(
block_table = common_attn_metadata.block_table_tensor
slot_mapping = common_attn_metadata.slot_mapping
# Create MLA KV cache: (num_blocks, block_size, head_size)
kv_cache = torch.empty(num_blocks,
block_size,
head_size,
dtype=dtype,
device=device)
kv_cache_flat = kv_cache.view(-1, head_size)
use_fp8_ds_mla = kv_cache_dtype == "fp8_ds_mla"
if use_fp8_ds_mla:
if not kv_c_contexts:
raise ValueError("kv_c_contexts cannot be empty when using"
" fp8_ds_mla cache dtype")
kv_lora_rank = kv_c_contexts[0].shape[-1]
rope_dim = k_pe_contexts[0].shape[-1]
entry_size = kv_lora_rank + 4 * 4 + 2 * rope_dim
kv_cache = torch.zeros(num_blocks,
block_size,
entry_size,
dtype=torch.uint8,
device=device)
scale_tensor = (scale
if isinstance(scale, torch.Tensor) else torch.tensor(
scale, dtype=torch.float32, device=device))
scale_tensor = scale_tensor.to(device=device, dtype=torch.float32)
else:
# Create MLA KV cache: (num_blocks, block_size, head_size)
kv_cache = torch.empty(num_blocks,
block_size,
head_size,
dtype=dtype,
device=device)
kv_cache_flat = kv_cache.view(-1, head_size)
# Populate the cache with the context tokens
# Start from block_id=1 since block_id=0 is considered the null block
start_block_idx = 1
for i in range(batch_size):
kv_c_context, k_pe_context = kv_c_contexts[i], k_pe_contexts[i]
kv_context = torch.cat([kv_c_context, k_pe_context.squeeze(1)], dim=-1)
context_len = kv_c_context.shape[0]
if context_len == 0:
start_block_idx += cdiv(int(seq_lens[i]), block_size)
continue
start = start_block_idx * block_size
end = start + kv_context.shape[0]
kv_cache_flat[start:end, ...] = kv_context
if use_fp8_ds_mla:
slots = torch.arange(context_len, device=device,
dtype=torch.long) + start
ops.concat_and_cache_mla(
kv_c_context,
k_pe_context.squeeze(1),
kv_cache,
slots,
kv_cache_dtype="fp8_ds_mla",
scale=scale_tensor,
)
else:
kv_context = torch.cat(
[kv_c_context, k_pe_context.squeeze(1)], dim=-1)
end = start + kv_context.shape[0]
kv_cache_flat[start:end, ...] = kv_context
# Stay block aligned and allocate enough blocks for the new tokens
start_block_idx += cdiv(int(seq_lens[i]), block_size)