Use NCCL instead of ray for control-plane communication to remove serialization overhead (#2221)

2024-01-04 03:30:22 +08:00
parent 1066cbd152
commit fd4ea8ef5c
34 changed files with 524 additions and 262 deletions
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -254,7 +254,7 @@ class GPTBigCodeForCausalLM(nn.Module):
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
-    ) -> SamplerOutput:
+    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
                                   sampling_metadata)
        return next_tokens