[FEAT] [Performance] Add triton mrope to replace the torch code path (#22375)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
tests/kernels/test_mrope.py (new file, 207 lines)
@@ -0,0 +1,207 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch
from transformers import AutoConfig

from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
                       head_size: int, max_position_embeddings: int,
                       dtype: torch.dtype, device: torch.device):
    """Generate test data for given configuration."""
    # Create 2D positions (3, num_tokens) for multimodal case
    positions = torch.randint(0,
                              max_position_embeddings // 4, (3, num_tokens),
                              device=device)
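    # Rows 0/1/2 hold the temporal/height/width position ids; mrope applies
    # each row to its own section of the rotary dimension.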

    # Create query and key tensors
    query = torch.randn(num_tokens,
                        num_q_heads * head_size,
                        dtype=dtype,
                        device=device)
    key = torch.randn(num_tokens,
                      num_kv_heads * head_size,
                      dtype=dtype,
                      device=device)

    return positions, query, key


def unroll_model_tp_dict(model_tp_dict):
    return [(model_name, tp_size)
            for model_name, tp_sizes in model_tp_dict.items()
            for tp_size in tp_sizes]
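

# e.g. unroll_model_tp_dict({"m": [1, 2]}) -> [("m", 1), ("m", 2)]; the dict
# below is flattened so each (model_name, tp_size) pair is its own test case.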
model_tp_dict = {
    "Qwen/Qwen2-VL-7B-Instruct": [1, 2],
    "Qwen/Qwen2-VL-72B-Instruct": [1, 2],
    "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2]
}

# Tolerances follow the torch.testing.assert_close defaults for bfloat16;
# entries are [dtype, atol, rtol]. See:
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
dtype_atol_rtol_list = [
    [torch.bfloat16, 1e-5, 1.6e-2],
]

num_tokens_list = [11, 8192]


@pytest.mark.skipif(not current_platform.is_cuda_alike(),
                    reason="Skipping CUDA/ROCm only tests.")
@pytest.mark.parametrize("model_name, tp_size",
                         unroll_model_tp_dict(model_tp_dict))
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
@pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):

    config = AutoConfig.from_pretrained(model_name)

    # get the model config
    total_num_kv_heads = config.num_key_value_heads
    total_num_heads = config.num_attention_heads
    num_heads = total_num_heads // tp_size
    num_kv_heads = max(1, total_num_kv_heads // tp_size)
    head_dim = config.hidden_size // total_num_heads
    is_neox_style = True

    rope_theta = config.rope_theta
    max_position = config.max_position_embeddings

    mrope_helper_class = get_rope(
        head_size=head_dim,
        rotary_dim=head_dim,
        max_position=max_position,
        base=rope_theta,
        is_neox_style=is_neox_style,
        rope_scaling=config.rope_scaling,
        dtype=dtype,
    ).to(device=device)

    # create multimodal positions and q/k input tensors
    positions, query, key = generate_test_data(num_tokens, num_heads,
                                               num_kv_heads, head_dim,
                                               max_position, dtype, device)

    query_native, key_native = mrope_helper_class.forward_native(
        positions,
        query.clone(),
        key.clone(),
    )

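    # forward_cuda is expected to dispatch to the new Triton mrope kernel on
    # CUDA/ROCm, which this PR adds to replace the torch code path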
    query_cuda, key_cuda = mrope_helper_class.forward_cuda(
        positions,
        query.clone(),
        key.clone(),
    )

    torch.testing.assert_close(query_native, query_cuda, atol=atol, rtol=rtol)
    torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol)


@pytest.mark.skipif(not current_platform.is_cuda_alike(),
                    reason="Skipping CUDA/ROCm only tests.")
@pytest.mark.parametrize(
    "model_name, tp_size",
    unroll_model_tp_dict({"Qwen/Qwen2-VL-7B-Instruct": [1, 2]}))
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
@pytest.mark.parametrize("num_tokens", [4])
def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol,
                                     num_tokens):
    config = AutoConfig.from_pretrained(model_name)

    # get the model config
    total_num_kv_heads = config.num_key_value_heads
    total_num_heads = config.num_attention_heads
    num_heads = total_num_heads // tp_size
    num_kv_heads = max(1, total_num_kv_heads // tp_size)
    head_dim = config.hidden_size // total_num_heads
    is_neox_style = True
    rope_theta = config.rope_theta
    max_position = config.max_position_embeddings

    mrope_helper_class = get_rope(
        head_size=head_dim,
        rotary_dim=head_dim,
        max_position=max_position,
        base=rope_theta,
        is_neox_style=is_neox_style,
        rope_scaling=config.rope_scaling,
        dtype=dtype,
    ).to(device=device)

    # Generate test data
    positions, query, key = generate_test_data(num_tokens, num_heads,
                                               num_kv_heads, head_dim,
                                               max_position, dtype, device)

    # Create a wrapper that makes the in-place function appear functional
    def functional_forward_cuda(pos, q, k):
        """Wrapper that converts the in-place operation to functional style.

        CUDA Graph does not support in-place operations.
        This wrapper creates working copies of the
        input tensors and modifies them.
        """
        q_work = q.clone()  # Create working copies
        k_work = k.clone()
        # The in-place forward_cuda modifies q_work and k_work
        mrope_helper_class.forward_cuda(pos, q_work, k_work)
        return q_work, k_work  # Return the modified tensors

    # Get reference results
    query_native, key_native = mrope_helper_class.forward_native(
        positions,
        query.clone(),
        key.clone(),
    )

    try:
        compiled_forward_cuda = torch.compile(functional_forward_cuda,
                                              fullgraph=True,
                                              backend="inductor",
                                              mode="reduce-overhead",
                                              dynamic=False)

        # Run compiled version
        query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda(
            positions,
            query,
            key,
        )

        # Run original version for comparison
        query_cuda = query.clone()
        key_cuda = key.clone()
        mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda)

        # Verify results
        torch.testing.assert_close(query_compiled_cuda,
                                   query_cuda,
                                   atol=atol,
                                   rtol=rtol)
        torch.testing.assert_close(key_compiled_cuda,
                                   key_cuda,
                                   atol=atol,
                                   rtol=rtol)
        torch.testing.assert_close(query_compiled_cuda,
                                   query_native,
                                   atol=atol,
                                   rtol=rtol)
        torch.testing.assert_close(key_compiled_cuda,
                                   key_native,
                                   atol=atol,
                                   rtol=rtol)

        print("✓ forward_cuda successfully traced with torch.compile inductor")

    except Exception as e:
        pytest.fail(
            f"forward_cuda failed to trace with torch.compile inductor: {e}")
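These tests are skipped outside CUDA/ROCm. A typical local run, assuming a working vLLM dev install and Hugging Face access to fetch the Qwen2-VL configs, is: pytest tests/kernels/test_mrope.py -v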