[Bugfix] Add custom Triton cache manager to resolve MoE MP issue (#6140)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
Author: Thomas Parnell (committed by GitHub)
Date: 2024-07-15 19:12:47 +02:00
Commit: eaec4b9153 (parent: a63a4c6341)
3 changed files with 64 additions and 0 deletions


@@ -9,6 +9,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                    ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (cuda_device_count_stateless,
                         error_on_invalid_device_count_status,
                         get_distributed_init_method, get_open_port,
@@ -42,6 +43,10 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         if "OMP_NUM_THREADS" not in os.environ:
             os.environ["OMP_NUM_THREADS"] = "1"
 
+        # workaround for https://github.com/vllm-project/vllm/issues/6103
+        if world_size > 1:
+            maybe_set_triton_cache_manager()
+
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")