[Kernel] Add nvfp4 gemm flashinfer backends (#22346)

Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
nvjullin
2025-08-15 04:03:55 +08:00
committed by GitHub
parent b8ff05361a
commit 279a5f31b3
9 changed files with 369 additions and 39 deletions

View File

@@ -310,6 +310,7 @@ class Worker(WorkerBase):
for size in sorted(warmup_sizes, reverse=True):
logger.info("Compile and warming up model for size %d", size)
self.model_runner._dummy_run(size, skip_eplb=True)
if not self.model_config.enforce_eager:
self.model_runner.capture_model()
@@ -340,8 +341,7 @@ class Worker(WorkerBase):
hidden_states=last_hidden_states)
# Warmup kernels used during model execution
kernel_warmup(self.get_model(),
max_tokens=self.scheduler_config.max_num_batched_tokens)
kernel_warmup(self)
# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.