[V1] Multiprocessing Tensor Parallel Support for v1 (#9856)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
committed by
GitHub
parent
bc192a2b09
commit
28b3a1c7e5
@@ -26,6 +26,14 @@ MODELS = [
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def test_vllm_gc_ed():
|
||||
"""Verify vllm instance is GC'ed when it is deleted"""
|
||||
llm = LLM("facebook/opt-125m")
|
||||
@@ -36,6 +44,7 @@ def test_vllm_gc_ed():
|
||||
assert weak_llm() is None
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@@ -118,6 +127,11 @@ def test_models_distributed(
|
||||
if attention_backend:
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
|
||||
|
||||
# Import VLLM_USE_V1 inside the test (not at module level) so a patched value is picked up
|
||||
from vllm.envs import VLLM_USE_V1
|
||||
if VLLM_USE_V1 and distributed_executor_backend != "mp":
|
||||
pytest.skip(f"Skip {distributed_executor_backend} for V1")
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
|
||||
@@ -143,6 +157,7 @@ def test_models_distributed(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
def test_model_with_failure(vllm_runner) -> None:
|
||||
try:
|
||||
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
|
||||
@@ -169,6 +184,7 @@ def test_model_with_failure(vllm_runner) -> None:
|
||||
os.remove(filename)
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
def test_failure_with_async_out_proc(vllm_runner) -> None:
|
||||
|
||||
filename = None
|
||||
|
||||
Reference in New Issue
Block a user