[Kernel] Add nvfp4 gemm flashinfer backends (#22346)

Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-15 04:03:55 +08:00
parent b8ff05361a
commit 279a5f31b3
9 changed files with 369 additions and 39 deletions
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -310,6 +310,7 @@ class Worker(WorkerBase):
        for size in sorted(warmup_sizes, reverse=True):
            logger.info("Compile and warming up model for size %d", size)
            self.model_runner._dummy_run(size, skip_eplb=True)
+
        if not self.model_config.enforce_eager:
            self.model_runner.capture_model()

@@ -340,8 +341,7 @@ class Worker(WorkerBase):
                    hidden_states=last_hidden_states)

        # Warmup kernels used during model execution
-        kernel_warmup(self.get_model(),
-                      max_tokens=self.scheduler_config.max_num_batched_tokens)
+        kernel_warmup(self)

        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.