[Chore] Remove unused batched RoPE op & kernel (#24789)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-13 00:08:20 -07:00
parent 99bfef841f
commit 5febdc8750
8 changed files with 16 additions and 348 deletions
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from itertools import accumulate, product
+from itertools import product
 from typing import Callable, Optional

 import pytest
@@ -111,151 +111,6 @@ def test_rotary_embedding(
            "expected returned key to be None"


-@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
-@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN)
-@pytest.mark.parametrize("batch_size", BATCH_SIZES)
-@pytest.mark.parametrize("seq_len", SEQ_LENS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_key", USE_KEY)
-@torch.inference_mode()
-def test_batched_rotary_embedding(
-    is_neox_style: bool,
-    tensor_shape_fn: Callable[[int, int, int, int], tuple[int]],
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: Optional[int],
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    use_key: bool,
-    max_position: int = 8192,
-    base: float = 10000,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
-        "rope_type": "linear",
-        "factor": (1, )
-    })
-    rope = rope.to(dtype=dtype, device=torch.get_default_device())
-
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size)
-    query = torch.randn(query_shape, dtype=dtype)
-    key = torch.randn_like(query) if use_key else None
-
-    # slice tensor if required, noop otherwise
-    query = query[..., :head_size]
-    key = key[..., :head_size] if use_key else None
-
-    # NOTE(woosuk): The reference implementation should be executed first
-    # because the custom kernel is in-place.
-    ref_query, ref_key = rope.forward_native(positions, query, key)
-    out_query, out_key = rope.forward(positions,
-                                      query,
-                                      key,
-                                      offsets=torch.zeros(batch_size * seq_len,
-                                                          dtype=torch.long,
-                                                          device=device))
-    # Compare the results.
-    torch.testing.assert_close(out_query,
-                               ref_query,
-                               atol=get_default_atol(out_query),
-                               rtol=get_default_rtol(out_query))
-    if use_key:
-        torch.testing.assert_close(out_key,
-                                   ref_key,
-                                   atol=get_default_atol(out_key),
-                                   rtol=get_default_rtol(out_key))
-    else:
-        assert ref_key is None and out_key is None, \
-            "expected returned key to be None"
-
-
-@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
-@pytest.mark.parametrize("batch_size", BATCH_SIZES)
-@pytest.mark.parametrize("seq_len", SEQ_LENS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_key", USE_KEY)
-@torch.inference_mode()
-def test_batched_rotary_embedding_multi_lora(
-    is_neox_style: bool,
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: Optional[int],
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    use_key: bool,
-    max_position: int = 8192,
-    base: float = 10000,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    scaling_factors: list[int] = [1, 2, 4]
-    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
-        "rope_type": "linear",
-        "factor": tuple(scaling_factors)
-    })
-    rope = rope.to(dtype=dtype, device=torch.get_default_device())
-
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
-    key = torch.randn_like(query) if use_key else None
-
-    offset_map = torch.tensor(
-        list(
-            accumulate([0] + [
-                max_position * scaling_factor * 2
-                for scaling_factor in scaling_factors[:-1]
-            ])))
-    query_types = torch.randint(0,
-                                len(scaling_factors), (batch_size, seq_len),
-                                device=device)
-    query_offsets = offset_map[query_types]
-
-    # NOTE(woosuk): The reference implementation should be executed first
-    # because the custom kernel is in-place.
-    ref_query, ref_key = rope.forward_native(positions, query, key,
-                                             query_offsets)
-    out_query, out_key = rope.forward(positions, query, key,
-                                      query_offsets.flatten())
-    # Compare the results.
-    torch.testing.assert_close(out_query,
-                               ref_query,
-                               atol=get_default_atol(out_query),
-                               rtol=get_default_rtol(out_query))
-    if use_key:
-        torch.testing.assert_close(out_key,
-                                   ref_key,
-                                   atol=get_default_atol(out_key),
-                                   rtol=get_default_rtol(out_key))
-    else:
-        assert ref_key is None and out_key is None, \
-            "expected returned key to be None"
-
-
 @torch.inference_mode()
 def test_rope_module_cache():
    MAX_POSITIONS = [123, 1234]