diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 16487d744..4bc12b986 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -348,6 +348,17 @@ class ParallelConfig:
                 "num_redundant_experts."
             )
 
+        # Note(hc): In the current implementation of decode context
+        # parallel(DCP), tp_size needs to be divisible by dcp_size,
+        # because the world size does not change by dcp, it simply
+        # reuses the GPUs of TP group, and split one TP group into
+        # tp_size//dcp_size DCP groups.
+        if self.tensor_parallel_size % self.decode_context_parallel_size != 0:
+            raise ValueError(
+                f"tp_size={self.tensor_parallel_size} must be divisible by "
+                f"dcp_size={self.decode_context_parallel_size}."
+            )
+
         return self
 
     @property
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 82be97ce6..30eb472ca 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1389,16 +1389,6 @@ class EngineArgs:
         # global layers in interleaved sliding window models.
         sliding_window = model_config.get_sliding_window()
 
-        # Note(hc): In the current implementation of decode context
-        # parallel(DCP), tp_size needs to be divisible by dcp_size,
-        # because the world size does not change by dcp, it simply
-        # reuses the GPUs of TP group, and split one TP group into
-        # tp_size//dcp_size DCP groups.
-        assert self.tensor_parallel_size % self.decode_context_parallel_size == 0, (
-            f"tp_size={self.tensor_parallel_size} must be divisible by"
-            f"dcp_size={self.decode_context_parallel_size}."
-        )
-
         # Resolve "auto" kv_cache_dtype to actual value from model config
         resolved_cache_dtype = resolve_kv_cache_dtype_string(
             self.kv_cache_dtype, model_config