Update Flashinfer to 0.2.14.post1 (#23537)

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-26 09:30:44 +08:00
parent 906e461ed6
commit ae067888d6
5 changed files with 14 additions and 7 deletions
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -310,6 +310,10 @@ class Worker(WorkerBase):
            logger.info("Compile and warming up model for size %d", size)
            self.model_runner._dummy_run(size, skip_eplb=True)

+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)
+
        if not self.model_config.enforce_eager:
            self.model_runner.capture_model()

@@ -334,9 +338,6 @@ class Worker(WorkerBase):
                self.model_runner._dummy_sampler_run(
                    hidden_states=last_hidden_states)

-        # Warmup kernels used during model execution
-        kernel_warmup(self)
-
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)