[Bugfix] Fix TP inference for Flex attention backend (#19657)

Signed-off-by: Isotr0py <2037008807@qq.com>
Authored by Isotr0py on 2025-06-16 19:21:37 +08:00; committed by GitHub
parent 4d5424029b
commit 1173804dca
5 changed files with 54 additions and 2 deletions


@@ -84,6 +84,8 @@ class EngineCore:
         vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
         vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
+        self.collective_rpc("initialize_cache",
+                            args=(num_gpu_blocks, num_cpu_blocks))
         self.structured_output_manager = StructuredOutputManager(vllm_config)
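
For context, below is a minimal sketch of the fan-out that a call like self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) performs under tensor parallelism. The Worker and EngineCoreSketch classes are hypothetical stand-ins, not vLLM's actual executor code; the point is only that every TP rank receives the same block counts from the driver before building its KV cache.

from typing import Any, Sequence


class Worker:
    """Hypothetical stand-in for one tensor-parallel rank."""

    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
        # Each rank records the block counts it will use for its KV cache.
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks


class EngineCoreSketch:
    """Hypothetical engine that broadcasts a method call to every worker."""

    def __init__(self, workers: Sequence[Worker]) -> None:
        self.workers = workers

    def collective_rpc(self, method: str, args: tuple = ()) -> list[Any]:
        # Invoke the named method with the same arguments on every rank
        # and gather the per-rank results.
        return [getattr(worker, method)(*args) for worker in self.workers]


# Usage: all ranks see the same cache sizing decided by the driver process.
engine = EngineCoreSketch(workers=[Worker(), Worker()])
engine.collective_rpc("initialize_cache", args=(8192, 1024))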