[V1] Multiprocessing Tensor Parallel Support for v1 (#9856)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith
2024-12-10 01:28:14 -05:00
committed by GitHub
parent bc192a2b09
commit 28b3a1c7e5
21 changed files with 732 additions and 145 deletions

View File

@@ -26,6 +26,14 @@ MODELS = [
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse shim that pulls in ``run_with_both_engines`` for each test.

    The fixture does no work of its own; requesting
    ``run_with_both_engines`` as a parameter is what makes each test run
    with both engines. It can be promoted up to ``conftest.py`` to apply
    to every test in a package.
    """
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m")
@@ -36,6 +44,7 @@ def test_vllm_gc_ed():
assert weak_llm() is None
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("dtype", ["half"])
@@ -118,6 +127,11 @@ def test_models_distributed(
if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
# Import VLLM_USE_V1 dynamically to handle patching
from vllm.envs import VLLM_USE_V1
if VLLM_USE_V1 and distributed_executor_backend != "mp":
pytest.skip(f"Skip {distributed_executor_backend} for V1")
dtype = "half"
max_tokens = 5
@@ -143,6 +157,7 @@ def test_models_distributed(
)
@pytest.mark.skip_v1
def test_model_with_failure(vllm_runner) -> None:
try:
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
@@ -169,6 +184,7 @@ def test_model_with_failure(vllm_runner) -> None:
os.remove(filename)
@pytest.mark.skip_v1
def test_failure_with_async_out_proc(vllm_runner) -> None:
filename = None