Remove V0 Encoder-Decoder Support (#24907)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
@@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention():
     assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
 
 
-def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
-    # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-
-        from vllm.attention import Attention, AttentionType
-
-        # example from bart
-        ctx = {
-            'encoder.layers.0.self_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
-            'decoder.layers.0.encoder_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
-            'decoder.layers.0.self_attn.attn':
-            Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
-        }
-        kv_cache = [
-            torch.zeros((1, )),
-        ]
-        encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
-
-        bind_kv_cache(ctx, [kv_cache])
-        assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
-        assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
-        assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
 
 
 def test_bind_kv_cache_pp():
     with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
         # this test runs with 1 GPU, but we simulate 2 GPUs
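For reference, the deleted test pinned down bind_kv_cache's V0 behavior for encoder-decoder models: encoder self-attention layers allocate no KV cache and keep their original kv_cache reference, while decoder self-attention and cross-attention layers are rebound to the runner-owned cache tensors. Below is a minimal, self-contained sketch of that binding behavior; FakeAttention and bind_kv_cache_sketch are hypothetical stand-ins, not vLLM's Attention class or its actual bind_kv_cache implementation.

from typing import Dict, List

import torch


class FakeAttention:
    """Hypothetical stand-in for vllm.attention.Attention: it keeps only
    an attn_type tag and a kv_cache list holding a placeholder tensor."""

    def __init__(self, attn_type: str) -> None:
        self.attn_type = attn_type
        self.kv_cache: List[torch.Tensor] = [torch.tensor([])]


def bind_kv_cache_sketch(ctx: Dict[str, FakeAttention],
                         kv_caches: List[List[torch.Tensor]]) -> None:
    """Bind runner-owned KV cache tensors to attention layers.

    Encoder self-attention has no KV cache, so those layers are skipped
    and keep their placeholder; every decoder-side layer is bound to
    cache entry 0 (the single decoder layer in this example).
    """
    for layer in ctx.values():
        if layer.attn_type == "encoder":
            continue
        layer.kv_cache = [kv_caches[ve][0] for ve in range(len(kv_caches))]


# The same three-layer BART-like setup the removed test used.
ctx = {
    'encoder.layers.0.self_attn.attn': FakeAttention("encoder"),
    'decoder.layers.0.encoder_attn.attn': FakeAttention("encoder_decoder"),
    'decoder.layers.0.self_attn.attn': FakeAttention("decoder"),
}
kv_cache = [torch.zeros((1, ))]
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache

bind_kv_cache_sketch(ctx, [kv_cache])
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]

Running the sketch reproduces the three assertions from the removed test. The real helper maps each layer name to an index into the per-virtual-engine cache list (visible in the surviving test, where 'model.layers.28.attn' binds to kv_cache[1]); this single-layer example glosses over that mapping.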