diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 509bacfbc..fd6c365fe 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -927,6 +927,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): SUPPORTED_W_A_FP8 = [ (kFp8Static128BlockSym, kFp8Dynamic128Sym), (kFp8StaticChannelSym, kFp8DynamicTokenSym), + (kFp8StaticTensorSym, kFp8DynamicTokenSym), (kFp8StaticTensorSym, kFp8StaticTensorSym), (kFp8StaticTensorSym, kFp8DynamicTensorSym), ] diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 669a6e74b..0335339b7 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -45,6 +45,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_Sc from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8Dynamic128Sym, + kFp8DynamicTensorSym, kFp8DynamicTokenSym, kFp8Static128BlockSym, kFp8StaticChannelSym, @@ -1942,6 +1943,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): (kFp8StaticChannelSym, kFp8DynamicTokenSym), (kFp8StaticTensorSym, kFp8DynamicTokenSym), (kFp8StaticTensorSym, kFp8StaticTensorSym), + (kFp8StaticTensorSym, kFp8DynamicTensorSym), ] return (weight_key, activation_key) in SUPPORTED_W_A