[Distributed] Add enable_expert_parallel arg (#14305)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith
2025-03-06 13:54:45 -05:00
committed by GitHub
parent cd579352bf
commit cc2f9b32c8
5 changed files with 27 additions and 21 deletions

View File

@@ -114,6 +114,7 @@ class EngineArgs:
# number of P/D disaggregation (or other disaggregation) workers
pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1
# Off by default; the --enable-expert-parallel help text describes this as
# using expert parallelism instead of tensor parallelism for MoE layers.
enable_expert_parallel: bool = False
max_parallel_loading_workers: Optional[int] = None
block_size: Optional[int] = None
enable_prefix_caching: Optional[bool] = None
@@ -440,6 +441,11 @@ class EngineArgs:
type=int,
default=EngineArgs.tensor_parallel_size,
help='Number of tensor parallel replicas.')
# Opt-in boolean switch: 'store_true' leaves the value False unless the flag
# is passed on the command line. No explicit dest is given, so argparse
# derives 'enable_expert_parallel' from the option name.
parser.add_argument(
'--enable-expert-parallel',
action='store_true',
help='Use expert parallelism instead of tensor parallelism '
'for MoE layers.')
parser.add_argument(
'--max-parallel-loading-workers',
type=int,
@@ -1207,6 +1213,7 @@ class EngineArgs:
parallel_config = ParallelConfig(
pipeline_parallel_size=self.pipeline_parallel_size,
tensor_parallel_size=self.tensor_parallel_size,
enable_expert_parallel=self.enable_expert_parallel,
max_parallel_loading_workers=self.max_parallel_loading_workers,
disable_custom_all_reduce=self.disable_custom_all_reduce,
tokenizer_pool_config=TokenizerPoolConfig.create_config(