Update Flashinfer to 0.2.14.post1 (#23537)

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
weiliang
2025-08-26 09:30:44 +08:00
committed by GitHub
parent 906e461ed6
commit ae067888d6
5 changed files with 14 additions and 7 deletions

View File

@@ -310,6 +310,10 @@ class Worker(WorkerBase):
logger.info("Compile and warming up model for size %d", size)
self.model_runner._dummy_run(size, skip_eplb=True)
# Warm up and tune the kernels used during model execution before
# CUDA graph capture.
kernel_warmup(self)
if not self.model_config.enforce_eager:
self.model_runner.capture_model()
@@ -334,9 +338,6 @@ class Worker(WorkerBase):
self.model_runner._dummy_sampler_run(
hidden_states=last_hidden_states)
# Warm up the kernels used during model execution
kernel_warmup(self)
# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
set_random_seed(self.model_config.seed)