Add tree attention backend for v1 (part 1) (#20401)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
tests/v1/spec_decode/test_tree_attention.py | 299 (new file)
@@ -0,0 +1,299 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math
from typing import Optional

import torch

from tests.v1.attention.utils import (_Backend, create_standard_kv_cache_spec,
                                      create_vllm_config,
                                      get_attention_backend)
from vllm.config import ParallelConfig, SpeculativeConfig
from vllm.v1.attention.backends.utils import CommonAttentionMetadata


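# Minimal stand-in for an attention layer module: the backend implementations
# exercised here only need the per-layer q/k/v scale attributes during
# forward(), so a stub module carrying those tensors is sufficient.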
class MockAttentionLayer(torch.nn.Module):
    _q_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
    _k_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
    _v_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x


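# Runs one forward pass through the requested attention backend: builds the
# CommonAttentionMetadata for a uniform-length batch, constructs the backend's
# metadata builder and implementation, and calls forward() against a cloned
# copy of the paged KV cache so the caller's cache is left untouched.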
def forward_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    kv_cache: torch.Tensor,
    block_table: torch.Tensor,
    slot_mapping: torch.Tensor,
    seqlen_k: int,
    backend: _Backend,
    spec_token_tree: Optional[str] = None,
    num_spec_tokens: int = 0,
) -> torch.Tensor:
    batch_size, q_len, num_heads, dim_per_head = q.shape
    num_kv_heads = k.shape[-2]
    # Initialize the query and KV sequence lengths.
    query_start_loc = q_len * torch.arange(
        batch_size + 1, device=q.device, dtype=torch.int32)
    query_lens = torch.diff(query_start_loc)
    seq_lens = torch.full(
        (batch_size, ),
        seqlen_k,
        device=q.device,
        dtype=torch.int32,
    )
    context_lens = seq_lens - query_lens
    max_query_len = q_len
    num_actual_tokens = query_start_loc[-1]

    softmax_scale = q.shape[-1]**(-0.5)
    layer = MockAttentionLayer()

    # Build common metadata.
    model_name = "meta-llama/Meta-Llama-3-8B"
    builder_cls, impl_cls = get_attention_backend(backend)
    vllm_config = create_vllm_config(model_name=model_name,
                                     max_model_len=max(seq_lens))
    if spec_token_tree is not None:
        # Create speculative config if token tree is specified.
        vllm_config.speculative_config = SpeculativeConfig(
            target_model_config=vllm_config.model_config,
            target_parallel_config=ParallelConfig(),
            model=model_name,
            method="eagle",
            num_speculative_tokens=num_spec_tokens,
            speculative_token_tree=spec_token_tree)
    kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
    builder = builder_cls(kv_cache_spec, [], vllm_config, q.device)
    common_attn_metadata = CommonAttentionMetadata(
        query_start_loc=query_start_loc,
        query_start_loc_cpu=query_start_loc.cpu(),
        seq_lens=seq_lens,
        seq_lens_cpu=seq_lens.cpu(),
        num_computed_tokens_cpu=context_lens.cpu(),
        num_reqs=batch_size,
        num_actual_tokens=num_actual_tokens,
        max_query_len=max_query_len,
        block_table_tensor=block_table,
        slot_mapping=slot_mapping,
    )

    # Build attention metadata.
    attn_metadata = builder.build(
        common_prefix_len=0,
        common_attn_metadata=common_attn_metadata,
    )

    # Initialize the backend implementation.
    instance = impl_cls(
        num_heads=num_heads,
        head_size=dim_per_head,
        scale=softmax_scale,
        num_kv_heads=num_kv_heads,
        alibi_slopes=None,
        sliding_window=None,
        kv_cache_dtype="auto",
    )

    # Run forward pass and return output.
    query = q.view(-1, num_heads, dim_per_head)
    key = k.view(-1, num_kv_heads, dim_per_head)
    value = v.view(-1, num_kv_heads, dim_per_head)
    output = torch.empty_like(query)
    return instance.forward(
        layer=layer,
        query=query,
        key=key,
        value=value,
        kv_cache=kv_cache.clone(),
        attn_metadata=attn_metadata,
        output=output,
    )


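# Correctness test: for every root-to-node path in the speculative token tree,
# the tree attention backend must produce the same output for that node as
# ordinary (chain) flash attention run over just that path.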
def test_tree_attn_correctness() -> None:
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)

    device = "cuda"
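    # Each key is a spec_token_tree string (a list of paths from the root,
    # as passed to SpeculativeConfig); each value is the expected attention
    # mask over the tree, where row i marks the ancestors of node i plus the
    # node itself and index 0 is the root token.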
    tree_attn_masks = {
        # Chain.
        "[(0,), (0, 0), (0, 0, 0)]":
        torch.tensor(
            [
                [1, 0, 0, 0],
                [1, 1, 0, 0],
                [1, 1, 1, 0],
                [1, 1, 1, 1],
            ],
            device=device,
            dtype=torch.int32,
        ),
        # Tree.
        "[(0,), (1,), (0, 0), (0, 1), (1, 0), (1, 1)]":
        torch.tensor(
            [
                [1, 0, 0, 0, 0, 0, 0],
                [1, 1, 0, 0, 0, 0, 0],
                [1, 0, 1, 0, 0, 0, 0],
                [1, 1, 0, 1, 0, 0, 0],
                [1, 1, 0, 0, 1, 0, 0],
                [1, 0, 1, 0, 0, 1, 0],
                [1, 0, 1, 0, 0, 0, 1],
            ],
            device=device,
            dtype=torch.int32,
        ),
    }

    dim_per_head = 128
    num_kv_heads = 2
    block_size = 128
    max_sequence_length = 8192
    randomize_blocks = True
    for batch_size in [1, 16, 32]:
        for num_heads in [2, 4]:
            for sequence_position in [16, 1024, 2048]:
                for spec_token_tree, tree_attn_mask in tree_attn_masks.items():
                    # Assert that the number of heads is divisible
                    # by the number of KV heads.
                    assert num_heads % num_kv_heads == 0

                    # Initialize q, k, and v.
                    tree_size_q = tree_attn_mask.shape[0]
                    seqlen_k = sequence_position + tree_size_q
                    q = torch.randn(
                        (batch_size, tree_size_q, num_heads, dim_per_head),
                        device=device,
                        dtype=torch.bfloat16,
                    )
                    k = torch.randn(
                        (batch_size, tree_size_q, num_kv_heads, dim_per_head),
                        device=device,
                        dtype=torch.bfloat16,
                    )
                    v = torch.randn(
                        (batch_size, tree_size_q, num_kv_heads, dim_per_head),
                        device=device,
                        dtype=torch.bfloat16,
                    )

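                    # The paged KV cache stacks K and V along dim 0:
                    # (2, num_blocks, block_size, num_kv_heads, head_dim).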
                    # Setup the block table and KV cache for paged KV.
                    assert max_sequence_length % block_size == 0
                    max_blocks_per_batch = max_sequence_length // block_size
                    kv_cache = torch.randn(
                        (
                            2,
                            batch_size * max_blocks_per_batch,
                            block_size,
                            num_kv_heads,
                            dim_per_head,
                        ),
                        device=q.device,
                        dtype=torch.bfloat16,
                    )
                    num_alloc_blocks_per_batch = math.ceil(seqlen_k /
                                                           block_size)
                    block_table = torch.zeros(
                        (batch_size, max_blocks_per_batch),
                        device=q.device,
                        dtype=torch.int32,
                    )
                    block_ids = torch.arange(
                        0,
                        batch_size * num_alloc_blocks_per_batch,
                        device=q.device,
                        dtype=torch.int32,
                    )
                    if randomize_blocks:
                        # Randomize the block ids.
                        block_ids = block_ids[torch.randperm(
                            block_ids.numel())]
                    block_table[:, :
                                num_alloc_blocks_per_batch] = block_ids.view(
                                    -1, num_alloc_blocks_per_batch)

                    # Setup the slot mapping for the input KVs.
                    tree_positions = sequence_position + torch.arange(
                        0,
                        tree_size_q,
                        device=q.device,
                        dtype=torch.int64,
                    ).repeat(batch_size, 1)
                    tree_slot_mapping = _gen_slot_mapping(
                        tree_positions, block_table, block_size)

                    # Compute attention for the tree.
                    tree_attn_output = forward_attention(
                        q=q,
                        k=k,
                        v=v,
                        kv_cache=kv_cache,
                        block_table=block_table,
                        slot_mapping=tree_slot_mapping,
                        seqlen_k=seqlen_k,
                        backend=_Backend.TREE_ATTN,
                        spec_token_tree=spec_token_tree,
                        num_spec_tokens=tree_size_q - 1,
                    ).view(batch_size, -1, num_heads, dim_per_head)

                    # Verify that the chain attention output for each
                    # branch of the tree (computed using FA3) matches
                    # the tree attention output.
                    for q_index in range(tree_size_q):
                        # Get the q, k, and v for the branch.
                        branch_mask = tree_attn_mask[q_index, :]
                        branch_indices = torch.nonzero(branch_mask,
                                                       as_tuple=True)[0]
                        q_len = branch_indices.shape[0]
                        q_branch = q[:, branch_indices]
                        k_branch = k[:, branch_indices]
                        v_branch = v[:, branch_indices]

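                        # The branch's tokens are written contiguously right
                        # after the cached prefix; this may reuse slots used
                        # by the tree pass, which is fine because each call
                        # to forward_attention works on its own clone of the
                        # KV cache.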
                        # Setup slot mapping for the branch.
                        branch_positions = sequence_position + torch.arange(
                            0,
                            q_len,
                            device=q.device,
                            dtype=torch.int64,
                        ).repeat(batch_size, 1)
                        branch_slot_mapping = _gen_slot_mapping(
                            branch_positions, block_table, block_size)

                        # Compute flash attention for the branch.
                        flash_attn_output = forward_attention(
                            q=q_branch,
                            k=k_branch,
                            v=v_branch,
                            kv_cache=kv_cache,
                            block_table=block_table,
                            slot_mapping=branch_slot_mapping,
                            seqlen_k=sequence_position + q_len,
                            backend=_Backend.FLASH_ATTN_VLLM_V1,
                        ).view(batch_size, -1, num_heads, dim_per_head)

                        # Compare the outputs.
                        assert torch.allclose(
                            tree_attn_output[:, branch_indices],
                            flash_attn_output,
                            atol=7.81e-3,
                        ), (f"outputs are not close for "
                            f"batch_size: {batch_size}, "
                            f"num_heads: {num_heads}, "
                            f"sequence_position: {sequence_position}, "
                            f"tree_attn_mask: {tree_attn_mask}, "
                            f"q_index: {q_index}.")


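# Converts absolute token positions into flat slot indices in the paged KV
# cache: the block table maps each position's logical block to a physical
# block id, and the slot is block_id * block_size + offset within the block.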
def _gen_slot_mapping(positions: torch.Tensor, block_table: torch.Tensor,
                      block_size: int):
    block_indices = positions // block_size
    blocks = block_table.gather(dim=1, index=block_indices)
    return (blocks * block_size + positions % block_size).view(-1)