[Feat] Support non-gated MoE with Marlin, NVFP4 CUTLASS, FP8, INT8, compressed-tensors (#32257)

Signed-off-by: Tomer Natan <tbarnatan@computelab-frontend-8.nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Tomer Natan <tbarnatan@computelab-frontend-8.nvidia.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Tomer Natan <tbarnatan@ipp1-1429.ipp1a1.colossus.nvidia.com>
This commit is contained in:
TomerBN-Nvidia
2026-01-16 02:15:05 +02:00
committed by GitHub
parent aca5c51487
commit c277fbdf31
17 changed files with 226 additions and 127 deletions

View File

@@ -526,7 +526,7 @@ def test_run_cutlass_moe_fp8(
c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64)
activation = lambda o, i: torch.ops._C.silu_and_mul(o, i)
activation = "silu"
a1q, a1q_scale = moe_kernel_quantize_input(
mt.a, mt.a_scale, torch.float8_e4m3fn, per_act_token
)