[Kernel] Add MXFP8 to Marlin GEMM/MoE and refactor Mxfp8LinearOp (#34664)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2026-04-01 18:41:42 +02:00
committed by GitHub
parent dc0428ebb8
commit db5d0719e1
15 changed files with 481 additions and 129 deletions

View File

@@ -23,7 +23,7 @@ from tests.quantization.utils import is_quant_method_supported
from ..utils import check_logprobs_close
# A small MoE model that fits on a single GPU and has both linear + MoE layers.
-MOE_MODEL = "Qwen/Qwen3-30B-A3B"
+MOE_MODEL = "allenai/OLMoE-1B-7B-0125-Instruct"
# A small dense model (no MoE) to validate the linear-only path.
DENSE_MODEL = "Qwen/Qwen3-0.6B"