Remove V0 Encoder-Decoder Support (#24907)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
@@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention():
     assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
 
 
-def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
-    # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-
-        from vllm.attention import Attention, AttentionType
-
-        # example from bart
-        ctx = {
-            'encoder.layers.0.self_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
-            'decoder.layers.0.encoder_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
-            'decoder.layers.0.self_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
-        }
-        kv_cache = [
-            torch.zeros((1, )),
-        ]
-        encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
-
-        bind_kv_cache(ctx, [kv_cache])
-        assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
-        assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
-        assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
 
 
 def test_bind_kv_cache_pp():
     with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
         # this test runs with 1 GPU, but we simulate 2 GPUs
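For reference, the deleted test pinned down bind_kv_cache's V0 behavior for encoder-decoder models: encoder self-attention layers allocate no KV cache and keep their original kv_cache reference, while decoder self-attention and cross-attention layers are rebound to the runner-owned cache tensors. Below is a minimal, self-contained sketch of that binding behavior; FakeAttention and bind_kv_cache_sketch are hypothetical stand-ins, not vLLM's Attention class or its actual bind_kv_cache implementation.

from typing import Dict, List

import torch


class FakeAttention:
    """Hypothetical stand-in for vllm.attention.Attention: it keeps only
    an attn_type tag and a kv_cache list holding a placeholder tensor."""

    def __init__(self, attn_type: str) -> None:
        self.attn_type = attn_type
        self.kv_cache: List[torch.Tensor] = [torch.tensor([])]


def bind_kv_cache_sketch(ctx: Dict[str, FakeAttention],
                         kv_caches: List[List[torch.Tensor]]) -> None:
    """Bind runner-owned KV cache tensors to attention layers.

    Encoder self-attention has no KV cache, so those layers are skipped
    and keep their placeholder; every decoder-side layer is bound to
    cache entry 0 (the single decoder layer in this example).
    """
    for layer in ctx.values():
        if layer.attn_type == "encoder":
            continue
        layer.kv_cache = [kv_caches[ve][0] for ve in range(len(kv_caches))]


# The same three-layer BART-like setup the removed test used.
ctx = {
    'encoder.layers.0.self_attn.attn': FakeAttention("encoder"),
    'decoder.layers.0.encoder_attn.attn': FakeAttention("encoder_decoder"),
    'decoder.layers.0.self_attn.attn': FakeAttention("decoder"),
}
kv_cache = [torch.zeros((1, ))]
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache

bind_kv_cache_sketch(ctx, [kv_cache])
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]

Running the sketch reproduces the three assertions from the removed test. The real helper maps each layer name to an index into the per-virtual-engine cache list (visible in the surviving test, where 'model.layers.28.attn' binds to kv_cache[1]); this single-layer example glosses over that mapping.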