[Bugfix] Add custom Triton cache manager to resolve MoE MP issue (#6140)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
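When tensor-parallel workers are spawned via multiprocessing, every process shares the same Triton compile cache directory, and concurrent MoE kernel compilations can race on the same cache files (see issue #6103 referenced in the diff below). Triton lets the cache manager be swapped out through the TRITON_CACHE_MANAGER environment variable, in "module.path:ClassName" form. A minimal sketch of what a helper like maybe_set_triton_cache_manager could do; the module path and class name below are illustrative, not the actual vLLM implementation:

import os

# Illustrative target; the real helper lives in vllm.triton_utils and
# points Triton at vLLM's own custom cache-manager class.
_CACHE_MANAGER = "my_project.triton_cache:PerProcessCacheManager"


def maybe_set_triton_cache_manager() -> None:
    # Respect an explicit user setting; otherwise ask Triton to load the
    # per-process cache manager ("module.path:ClassName" format).
    if os.environ.get("TRITON_CACHE_MANAGER") is None:
        os.environ["TRITON_CACHE_MANAGER"] = _CACHE_MANAGER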
@@ -9,6 +9,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                    ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (cuda_device_count_stateless,
                         error_on_invalid_device_count_status,
                         get_distributed_init_method, get_open_port,
@@ -42,6 +43,10 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         if "OMP_NUM_THREADS" not in os.environ:
             os.environ["OMP_NUM_THREADS"] = "1"
 
+        # workaround for https://github.com/vllm-project/vllm/issues/6103
+        if world_size > 1:
+            maybe_set_triton_cache_manager()
+
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")