[V1] Multiprocessing Tensor Parallel Support for v1 (#9856)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-10 01:28:14 -05:00
parent bc192a2b09
commit 28b3a1c7e5
21 changed files with 732 additions and 145 deletions
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -5,6 +5,7 @@ from typing import Optional
 import torch
 import torch.nn as nn

+import vllm.envs as envs
 from vllm.distributed import (tensor_model_parallel_all_gather,
                              tensor_model_parallel_gather)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -42,7 +43,9 @@ class LogitsProcessor(nn.Module):
        # Soft cap the logits. Used in Gemma 2.
        self.soft_cap = soft_cap
        # Whether to use gather or all-gather to gather the logits.
-        self.use_gather = not current_platform.is_tpu()
+
+        self.use_gather = not current_platform.is_tpu(
+        ) and not envs.VLLM_USE_V1

    def forward(
        self,