[MoE Refactor] Oracle Select FP8+NVFP4 Kernels In Priority (#32414)

This commit is contained in:
Robert Shaw
2026-01-21 08:22:33 -05:00
committed by GitHub
parent e14467be43
commit 42135d6898
82 changed files with 2710 additions and 1563 deletions

View File

@@ -29,7 +29,7 @@ from vllm.model_executor.layers.batch_invariant import (
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey,
kFp8StaticTensorSym,
-kNvfp4Quant,
+kNvfp4Dynamic,
)
from vllm.platforms import current_platform
from vllm.platforms.interface import DeviceCapability
@@ -1184,7 +1184,7 @@ class FlashInferImpl(AttentionImpl):
return (
self.support_trtllm_attn
and self.kv_cache_dtype.startswith("fp8")
-and quant_key in (kFp8StaticTensorSym, kNvfp4Quant)
+and quant_key in (kFp8StaticTensorSym, kNvfp4Dynamic)
)
# FlashInfer requires attention sinks to be float32