[Core] Add Sliding Window Support with Flashinfer (#10462)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
This commit is contained in:
@@ -3,6 +3,7 @@ from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
from .conftest import get_text_from_llm_generator
|
||||
@@ -28,8 +29,9 @@ BLOCK_SIZE = 16
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("batch_size", [5])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
|
||||
def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
|
||||
batch_size, seed):
|
||||
batch_size, seed, backend, monkeypatch):
|
||||
"""
|
||||
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
|
||||
asks for the value of one of them (which is outside the sliding window).
|
||||
@@ -38,6 +40,8 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
|
||||
|
||||
Additionally, we compare the results of the v1 and v2 managers.
|
||||
"""
|
||||
override_backend_env_variable(monkeypatch, backend)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=1024,
|
||||
ignore_eos=True,
|
||||
@@ -84,7 +88,9 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
|
||||
@pytest.mark.parametrize("batch_size", [5])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
|
||||
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
|
||||
backend, monkeypatch):
|
||||
"""
|
||||
This is similar to test_sliding_window_retrival; however, it doesn't
|
||||
compare against the v1 block manager since v1 doesn't support
|
||||
@@ -93,6 +99,8 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
|
||||
The results with and without chunked prefill are not the same due to
|
||||
numerical instabilities.
|
||||
"""
|
||||
override_backend_env_variable(monkeypatch, backend)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=10,
|
||||
ignore_eos=True,
|
||||
|
||||
Reference in New Issue
Block a user