[LoRA] Much faster startup when LoRA is enabled (#23777)

Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
Andy Lo
2025-08-30 16:37:39 +01:00
committed by GitHub
parent 68a349114f
commit 038e9be4eb
3 changed files with 33 additions and 13 deletions

View File

@@ -308,7 +308,10 @@ class Worker(WorkerBase):
 # We skip EPLB here since we don't want to record dummy metrics
 for size in sorted(warmup_sizes, reverse=True):
     logger.info("Compile and warming up model for size %d", size)
-    self.model_runner._dummy_run(size, skip_eplb=True)
+    self.model_runner._dummy_run(size,
+                                 skip_eplb=True,
+                                 remove_lora=False)
+self.model_runner.maybe_remove_all_loras(self.model_runner.lora_config)
 # Warmup and tune the kernels used during model execution before
 # cuda graph capture.