[V1] Multiprocessing Tensor Parallel Support for v1 (#9856)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-10 01:28:14 -05:00
parent bc192a2b09
commit 28b3a1c7e5
21 changed files with 732 additions and 145 deletions
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -26,6 +26,14 @@ MODELS = [
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Autouse wrapper: requesting `run_with_both_engines` here applies it to
+    # every test in this module, so each test is run with both engines.
+    # Promote this to conftest.py to cover every test in a package instead.
+    pass
+
+
 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
    llm = LLM("facebook/opt-125m")
@@ -36,6 +44,7 @@ def test_vllm_gc_ed():
    assert weak_llm() is None


+@pytest.mark.skip_v1
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("dtype", ["half"])
@@ -118,6 +127,11 @@ def test_models_distributed(
    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

+    # Import VLLM_USE_V1 at call time, not module level, to handle patching
+    from vllm.envs import VLLM_USE_V1
+    if VLLM_USE_V1 and distributed_executor_backend != "mp":
+        pytest.skip(f"Skip {distributed_executor_backend} for V1")
+
    dtype = "half"
    max_tokens = 5

@@ -143,6 +157,7 @@ def test_models_distributed(
    )


+@pytest.mark.skip_v1
 def test_model_with_failure(vllm_runner) -> None:
    try:
        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
@@ -169,6 +184,7 @@ def test_model_with_failure(vllm_runner) -> None:
        os.remove(filename)


+@pytest.mark.skip_v1
 def test_failure_with_async_out_proc(vllm_runner) -> None:

    filename = None