[Kernel] Add MXFP8 to Marlin GEMM/MoE and refactor Mxfp8LinearOp (#34664)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2026-04-01 18:41:42 +02:00
committed by GitHub
parent dc0428ebb8
commit db5d0719e1
15 changed files with 481 additions and 129 deletions

View File

@@ -23,7 +23,7 @@ from tests.quantization.utils import is_quant_method_supported
from ..utils import check_logprobs_close
# A small MoE model that fits on a single GPU and has both linear + MoE layers.
-MOE_MODEL = "Qwen/Qwen3-30B-A3B"
+MOE_MODEL = "allenai/OLMoE-1B-7B-0125-Instruct"
# A small dense model (no MoE) to validate the linear-only path.
DENSE_MODEL = "Qwen/Qwen3-0.6B"