[ci][distributed] merge distributed test commands (#7097)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
@@ -1,46 +1,39 @@
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
-vLLM will allocate all the available memory, so we need to run the tests one
-by one. The solution is to pass arguments (model name) by environment
-variables.
 
 Run:
 ```sh
-TEST_DIST_MODEL=facebook/opt-125m pytest \
-    test_chunked_prefill_distributed.py
-TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
-    test_chunked_prefill_distributed.py
+pytest test_chunked_prefill_distributed.py
 ```
 """
-import os
 
 import pytest
 
 from vllm.utils import cuda_device_count_stateless
 
 from ..models.utils import check_outputs_equal
-
-MODELS = [
-    os.environ["TEST_DIST_MODEL"],
-]
-DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
+from ..utils import fork_new_process_for_each_test
 
 
 @pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("facebook/opt-125m", "ray"),
+    ("meta-llama/Llama-2-7b-hf", "ray"),
+    ("facebook/opt-125m", "mp"),
+    ("meta-llama/Llama-2-7b-hf", "mp"),
+])
+@fork_new_process_for_each_test
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model: str,
-    dtype: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
+    distributed_executor_backend: str,
 ) -> None:
-    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)
+    dtype = "half"
+    max_tokens = 5
+    chunked_prefill_token_size = 16
 
     # Add a chunked prefill config.
     max_num_seqs = min(chunked_prefill_token_size, 256)
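The rationale for the merge is spelled out in the removed docstring: vLLM allocates all available GPU memory, so each model previously needed its own pytest invocation, selected through the TEST_DIST_MODEL environment variable. The merged test instead parametrizes over (model, distributed_executor_backend) pairs and wraps the test in fork_new_process_for_each_test (imported from ..utils), so each case runs in a fresh process whose GPU memory is released when it exits. Below is a minimal sketch of what such a fork-per-test decorator can look like, assuming POSIX os.fork; it illustrates the pattern, not vLLM's actual helper:

```python
import functools
import os
import traceback


def fork_new_process_for_each_test(f):
    """Run the decorated test in a forked child process.

    Memory allocated by the child (including CUDA allocations) is
    reclaimed by the OS when the child exits, so parametrized cases
    are isolated without needing separate pytest invocations.
    """

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child: run the test body and exit immediately, skipping
            # any parent-process teardown.
            try:
                f(*args, **kwargs)
            except BaseException:
                traceback.print_exc()
                os._exit(1)
            os._exit(0)
        # Parent: wait for the child and fail if it exited non-zero.
        _, status = os.waitpid(pid, 0)
        assert status == 0, f"test failed in child process (status={status})"

    return wrapper
```

With this pattern, a single `pytest test_chunked_prefill_distributed.py` invocation covers all four (model, backend) combinations, which is exactly the simplified command the new docstring documents.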