[CI/Build] Clean up LoRA tests (#15867)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-02 00:28:50 +08:00
parent e59ca942f5
commit dfa82e2a3d
5 changed files with 27 additions and 200 deletions
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -19,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
-                              LinearScalingRotaryEmbeddingWithLoRA,
                              LogitsProcessorWithLoRA, LoRAMapping,
                              MergedColumnParallelLinearWithLoRA,
                              MergedQKVParallelLinearWithLoRA,
@@ -28,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
                              RowParallelLinearWithLoRA,
                              VocabParallelEmbeddingWithLoRA)
 # yapf: enable
-from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights,
-                              PackedLoRALayerWeights)
+from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
@@ -37,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               ReplicatedLinear,
                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
 from vllm.model_executor.utils import set_random_seed
@@ -59,28 +56,16 @@ DEVICES = ([
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ] if current_platform.is_cuda_alike() else ["cpu"])

-#For GPU, we will launch different triton kernels between the prefill and decode
-# stages, so we need to verify this. prefill stage(True) or decode stage(False)
+# prefill stage(True) or decode stage(False)
 STAGES = [True, False]

-# With the inclusion of V1 tests (look at the run_with_both_engines_lora),
-# the tests in this file run twice, once with the V0 engine and then with
-# the V1 engine.
-# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
-# with the inclusion of V1 tests to maintain the CI test times.
-NUM_RANDOM_SEEDS = 5
-# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
-# 256 before. It is cut to half with the inclusion of V1 tests to maintain
-# the CI test times.
+NUM_RANDOM_SEEDS = 10
+
 VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128


@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-
+def clean_cache():
    # Release any memory we might be holding on to. CI runs OOMs otherwise.
    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                _LORA_B_PTR_DICT)
@@ -90,6 +75,24 @@ def v1(run_with_both_engines_lora):
    yield


+@pytest.fixture(autouse=True)
+def skip_cuda_with_stage_false(request):
+    """
+    On cuda-like platforms, we use the same kernels for prefill and decode 
+    stage, and 'stage' is generally ignored, so we only need to test once.
+    """
+    if current_platform.is_cuda_alike():
+        try:
+            if hasattr(request.node, "callspec") and hasattr(
+                    request.node.callspec, "params"):
+                params = request.node.callspec.params
+                if "stage" in params and params["stage"] is False:
+                    pytest.skip("Skip test when stage=False")
+        except Exception:
+            pass
+    yield
+
+
 def get_random_id_to_index(num_loras: int,
                           num_slots: int,
                           log: bool = True) -> list[Optional[int]]:
@@ -1011,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                   atol=atol)


-@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 8])
-@pytest.mark.parametrize("device", ["cuda"])
-@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0),
-                                             (6.0, 1.0)])
-@pytest.mark.parametrize("max_position", [11, 4096, 32768])
-@pytest.mark.parametrize("is_neox_style", [True, False])
-@pytest.mark.parametrize("rotary_dim", [None, 32])
-@pytest.mark.parametrize("head_size", [32, 108])
-@pytest.mark.parametrize("seq_len", [11, 1024])
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Only CUDA backends are supported")
-def test_rotary_embedding_long_context(dist_init, num_loras, device,
-                                       scaling_factors, max_position,
-                                       is_neox_style, rotary_dim, head_size,
-                                       seq_len) -> None:
-    dtype = torch.float16
-    max_loras = 8
-    seed = 0
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             long_lora_scaling_factors=scaling_factors,
-                             lora_dtype=dtype)
-
-    if rotary_dim is None:
-        rotary_dim = head_size
-    base = 10000
-    batch_size = 5 * num_loras
-    num_heads = 7
-
-    # Verify lora is equivalent to linear scaling rotary embedding.
-    rope = get_rope(
-        head_size,
-        rotary_dim,
-        max_position,
-        base,
-        is_neox_style,
-    )
-    lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
-    lora_rope.set_mapping(punica_wrapper)
-    lora_rope.create_lora_weights(max_loras, lora_config)
-    linear_rope = get_rope(head_size, rotary_dim, max_position, base,
-                           is_neox_style, {
-                               "rope_type": "linear",
-                               "factor": scaling_factors
-                           })
-    linear_rope = linear_rope.to(dtype=dtype)
-    id_to_index = get_random_id_to_index(num_loras, max_loras)
-    _, index_mapping, prompt_mapping = create_random_inputs(
-        active_lora_ids=[0],
-        num_inputs=batch_size,
-        input_size=(1, max_position),
-        input_range=(0, lora_config.lora_extra_vocab_size),
-        input_type=torch.float16,
-        device=device)
-
-    lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
-    long_lora_context = LongContextLoRAContext(list(scaling_factors),
-                                               rotary_dim)
-
-    next_expected_offset = 0
-    # Make sure the offset is correct.
-    scaling_factor_to_offset = lora_rope.scaling_factor_to_offset
-    for scaling_factor, offset in scaling_factor_to_offset.items():
-        assert offset == next_expected_offset
-        next_expected_offset += scaling_factor * max_position
-
-    for i in range(len(scaling_factors)):
-        long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get(
-            scaling_factors[i], 0)
-    punica_wrapper.update_metadata(
-        lora_mapping,
-        id_to_index,
-        max_loras,
-        512,
-        lora_config.lora_extra_vocab_size,
-        long_lora_context=long_lora_context,
-    )
-    # lora_rope.set_mapping(*mapping_info)
-
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
-    key = torch.randn_like(query)
-    ref_q, ref_k = linear_rope(positions, query, key)
-    actual_q, actual_k = lora_rope(positions, query, key)
-
-    torch.allclose(ref_q, actual_q)
-    torch.allclose(ref_k, actual_k)
-
-
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize(
    "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)))