[Kernel] Enable fp8 support for pplx and BatchedTritonExperts. (#18864)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-07-03 17:55:40 -04:00
parent 2f2fcb31b8
commit 78fe77534b
25 changed files with 1277 additions and 663 deletions
--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -154,12 +154,13 @@ def make_modular_kernel(
                        deepep_ht_args = ht_args,
                        deepep_ll_args = ll_args)

+    num_dispatchers = pgi.world_size // dp_size
+
    if low_latency_mode:
        assert not per_act_token_quant, "not supported in ll mode"
        fused_experts = BatchedTritonExperts(
            max_num_tokens=MAX_TOKENS_PER_RANK,
-            world_size=pgi.world_size,
-            dp_size=dp_size,
+            num_dispatchers=num_dispatchers,
            use_fp8_w8a8=is_quantized,
            use_int8_w8a8=False,
            use_int8_w8a16=False,