[Bugfix] Enable Triton MoE for FP8 per-tensor dynamic (#33300)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit bfb9bdaf3f)
This commit is contained in:
@@ -927,6 +927,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
 SUPPORTED_W_A_FP8 = [
     (kFp8Static128BlockSym, kFp8Dynamic128Sym),
     (kFp8StaticChannelSym, kFp8DynamicTokenSym),
+    (kFp8StaticTensorSym, kFp8DynamicTokenSym),
     (kFp8StaticTensorSym, kFp8StaticTensorSym),
     (kFp8StaticTensorSym, kFp8DynamicTensorSym),
 ]
@@ -45,6 +45,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_Sc
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kFp8Dynamic128Sym,
+    kFp8DynamicTensorSym,
     kFp8DynamicTokenSym,
     kFp8Static128BlockSym,
     kFp8StaticChannelSym,
@@ -1942,6 +1943,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
     (kFp8StaticChannelSym, kFp8DynamicTokenSym),
     (kFp8StaticTensorSym, kFp8DynamicTokenSym),
     (kFp8StaticTensorSym, kFp8StaticTensorSym),
+    (kFp8StaticTensorSym, kFp8DynamicTensorSym),
 ]
 return (weight_key, activation_key) in SUPPORTED_W_A
 
Reference in New Issue
Block a user