Expert Parallelism (EP) Support for DeepSeek V2 (#12583)

Jongseok Park
2025-02-24 07:33:20 -08:00
committed by GitHub
parent 7940d8a6a7
commit 781096e385
19 changed files with 527 additions and 59 deletions


@@ -677,6 +677,23 @@ class ModelConfig:
                "fallback to the eager mode.")
            self.enforce_eager = True

    def _verify_with_expert_parallelism(self) -> None:
        num_expert_names = [
            "moe_num_experts",  # Dbrx
            "num_experts",  # Jamba
            "n_routed_experts",  # DeepSeek
            "num_local_experts",  # Mixtral
        ]
        num_experts = 0
        for name in num_expert_names:
            num_experts = getattr(self.hf_text_config, name, 0)
            if num_experts > 0:
                break
        if num_experts < 1:
            raise ValueError(
                "Number of experts in the model must be greater than 0 "
                "when expert parallelism is enabled.")

    def verify_async_output_proc(self, parallel_config, speculative_config,
                                 device_config) -> None:
        if not self.use_async_output_proc:
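
The new method probes a list of per-architecture HuggingFace config attribute names and takes the first positive expert count it finds. Below is a minimal standalone sketch (not part of the diff) of that probe, using mock config objects; the helper name resolve_num_experts and the expert counts shown are illustrative, not taken from this commit.

    from types import SimpleNamespace

    # Same attribute names the diff checks, in the same priority order.
    num_expert_names = [
        "moe_num_experts",   # Dbrx
        "num_experts",       # Jamba
        "n_routed_experts",  # DeepSeek
        "num_local_experts", # Mixtral
    ]

    def resolve_num_experts(hf_text_config) -> int:
        # Probe each known attribute; the first positive hit wins,
        # mirroring the break in _verify_with_expert_parallelism.
        for name in num_expert_names:
            num_experts = getattr(hf_text_config, name, 0)
            if num_experts > 0:
                return num_experts
        return 0

    deepseek_like = SimpleNamespace(n_routed_experts=160)  # illustrative count
    dense_like = SimpleNamespace(hidden_size=4096)         # no MoE attributes

    assert resolve_num_experts(deepseek_like) == 160
    # A dense model resolves to 0, which the diff turns into a ValueError
    # when expert parallelism is enabled.
    assert resolve_num_experts(dense_like) == 0
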
@@ -730,6 +747,9 @@ class ModelConfig:
                " must be divisible by tensor parallel size "
                f"({tensor_parallel_size}).")

        if envs.VLLM_TEST_ENABLE_EP:
            self._verify_with_expert_parallelism()

        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        if pipeline_parallel_size > 1:
            architectures = getattr(self.hf_config, "architectures", [])
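
Per the second hunk, the verification only runs when the VLLM_TEST_ENABLE_EP environment variable is set, so the feature is opt-in at this stage. A hedged usage sketch follows; the model name and tensor_parallel_size are illustrative choices, not values taken from this commit.

    import os

    # Opt into the experimental expert-parallelism path before importing vLLM,
    # so envs.VLLM_TEST_ENABLE_EP is picked up during config verification.
    os.environ["VLLM_TEST_ENABLE_EP"] = "1"

    from vllm import LLM

    # With EP enabled, ModelConfig verification raises a ValueError unless the
    # model reports a positive expert count (e.g. n_routed_experts for DeepSeek).
    llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite", tensor_parallel_size=2)

Gating the check behind an environment variable keeps non-MoE deployments unaffected while the EP path is being stabilized.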