diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index a1f31bead..ccb175d88 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -258,9 +258,10 @@ class HPUWorker(LocalOrDistributedWorkerBase): This also warms up the model, which may record CUDA graphs. """ - raise_if_cache_size_invalid(num_gpu_blocks, - self.cache_config.block_size, - self.model_config.max_model_len) + raise_if_cache_size_invalid( + num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len, + self.parallel_config.pipeline_parallel_size) self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks @@ -442,13 +443,13 @@ def init_worker_distributed_environment( parallel_config.pipeline_parallel_size) -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, - max_model_len) -> None: +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len, + pipeline_parallel_size) -> None: if num_gpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " "Try increasing `gpu_memory_utilization` when " "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks + max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size) if max_model_len > max_seq_len: raise ValueError( f"The model's max seq len ({max_model_len}) " diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 5d548bdb5..ad94a6a4d 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -288,10 +288,11 @@ class Worker(LocalOrDistributedWorkerBase): This also warms up the model, which may record CUDA graphs. """ - raise_if_cache_size_invalid(num_gpu_blocks, - self.cache_config.block_size, - self.cache_config.is_attention_free, - self.model_config.max_model_len) + raise_if_cache_size_invalid( + num_gpu_blocks, self.cache_config.block_size, + self.cache_config.is_attention_free, + self.model_config.max_model_len, + self.parallel_config.pipeline_parallel_size) self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks @@ -530,7 +531,7 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, - max_model_len) -> None: + max_model_len, pipeline_parallel_size) -> None: if is_attention_free and num_gpu_blocks != 0: raise ValueError("No memory should be allocated for the cache blocks " f"for an attention-free model, but {num_gpu_blocks} " @@ -539,7 +540,7 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, raise ValueError("No available memory for the cache blocks. " "Try increasing `gpu_memory_utilization` when " "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks + max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size) if not is_attention_free and max_model_len > max_seq_len: raise ValueError( f"The model's max seq len ({max_model_len}) "