[LoRA] Much faster startup when LoRA is enabled (#23777)
Signed-off-by: Andy Lo <andy@mistral.ai> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
@@ -308,7 +308,10 @@ class Worker(WorkerBase):
|
||||
# We skip EPLB here since we don't want to record dummy metrics
|
||||
for size in sorted(warmup_sizes, reverse=True):
|
||||
logger.info("Compile and warming up model for size %d", size)
|
||||
self.model_runner._dummy_run(size, skip_eplb=True)
|
||||
self.model_runner._dummy_run(size,
|
||||
skip_eplb=True,
|
||||
remove_lora=False)
|
||||
self.model_runner.maybe_remove_all_loras(self.model_runner.lora_config)
|
||||
|
||||
# Warmup and tune the kernels used during model execution before
|
||||
# cuda graph capture.
|
||||
|
||||
Reference in New Issue
Block a user