[Kernel] Enable fp8 support for pplx and BatchedTritonExperts. (#18864)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-07-03 17:55:40 -04:00
parent 2f2fcb31b8
commit 78fe77534b
25 changed files with 1277 additions and 663 deletions
--- a/tests/kernels/moe/parallel_utils.py
+++ b/tests/kernels/moe/parallel_utils.py
@@ -137,8 +137,7 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
                            low_latency_mode=low_latency_mode,
                            num_qps_per_rank=num_qps_per_rank)
    return DeepEPHTPrepareAndFinalize(buffer=buffer,
-                                      world_size=pgi.world_size,
-                                      rank=pgi.rank,
+                                      num_dispatchers=pgi.world_size,
                                      dp_size=dp_size,
                                      rank_expert_offset=pgi.rank *
                                      ht_args.num_local_experts)
@@ -146,7 +145,6 @@ def make_deepep_ht_a2a(pg: ProcessGroup,

 def make_deepep_ll_a2a(pg: ProcessGroup,
                       pgi: ProcessGroupInfo,
-                       dp_size: int,
                       deepep_ll_args: DeepEPLLArgs,
                       q_dtype: Optional[torch.dtype] = None,
                       block_shape: Optional[list[int]] = None):
@@ -166,8 +164,7 @@ def make_deepep_ll_a2a(pg: ProcessGroup,

    return DeepEPLLPrepareAndFinalize(
        buffer=buffer,
-        world_size=pgi.world_size,
-        dp_size=dp_size,
+        num_dispatchers=pgi.world_size,
        max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
        use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
    )
@@ -186,5 +183,4 @@ def make_deepep_a2a(pg: ProcessGroup,
                                  block_shape)

    assert deepep_ll_args is not None
-    return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
-                              block_shape)
+    return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape)
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -10,7 +10,7 @@ import triton.language as tl

 from tests.kernels.moe.utils import (batched_moe,
                                     make_quantized_test_activations,
-                                     make_test_weights, triton_moe)
+                                     make_test_weights, naive_batched_moe)
 from tests.kernels.quant_utils import native_batched_masked_quant_matmul
 from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
@@ -33,12 +33,10 @@ MNK_FACTORS = [
    (45, 512, 512),
    (45, 1024, 128),
    (45, 1024, 2048),
-    (64, 128, 128),
    (64, 512, 512),
    (64, 1024, 2048),
    (222, 128, 128),
    (222, 128, 2048),
-    (222, 512, 512),
    (222, 1024, 128),
    (222, 1024, 2048),
 ]
@@ -95,11 +93,12 @@ class BatchedMMTensors:
@pytest.mark.parametrize("max_tokens_per_expert",
                         [32, 64, 128, 192, 224, 256, 512])
@pytest.mark.parametrize("K", [128, 256, 1024])
-@pytest.mark.parametrize("N", [128, 256, 512, 1024])
-@pytest.mark.parametrize("dtype",
-                         [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("block_shape", [None])
-@pytest.mark.parametrize("per_act_token_quant", [False])
+@pytest.mark.parametrize("N", [128, 256, 1024])
+@pytest.mark.parametrize(
+    "dtype",
+    [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("block_shape", [None, [128, 128]])
+@pytest.mark.parametrize("per_act_token_quant", [False, True])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
                    N: int, dtype: torch.dtype,
                    block_shape: Optional[list[int]],
@@ -134,7 +133,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        in_dtype=act_dtype,
        quant_dtype=quant_dtype,
        block_shape=block_shape,
-        per_act_token_quant=per_act_token_quant)
+        per_act_token_quant=per_act_token_quant,
+    )

    B, B_q, B_scale, _, _, _ = make_test_weights(
        num_experts,
@@ -143,6 +143,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        in_dtype=act_dtype,
        quant_dtype=quant_dtype,
        block_shape=block_shape,
+        per_act_token_quant=per_act_token_quant,
    )

    out_shape = (num_experts, max_tokens_per_expert, N)
@@ -177,6 +178,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
            "BLOCK_SIZE_N": 16,
            "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
        },
+        per_act_token_quant=per_act_token_quant,
        block_shape=block_shape,
    )

@@ -185,15 +187,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        B,
        ref_output,
        num_expert_tokens,
-        None,
-        None,
-        None,
    )

    q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
                                                      num_expert_tokens,
                                                      A_scale, B_scale,
-                                                      block_shape)
+                                                      block_shape,
+                                                      per_act_token_quant)

    rtol, atol = {
        torch.float16: (6e-2, 6e-2),
@@ -201,16 +201,17 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        torch.float32: (1e-2, 1e-2),
    }[test_output.dtype]

-    torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
    torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)


@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.parametrize("per_act_token_quant", [False])
-@pytest.mark.parametrize("block_shape", [None])
+@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
+@pytest.mark.parametrize("per_act_token_quant", [False, True])
+@pytest.mark.parametrize("block_shape", [None, [128, 128]])
+@pytest.mark.parametrize("input_scales", [False])
 def test_fused_moe_batched_experts(
    m: int,
    n: int,
@@ -220,15 +221,19 @@ def test_fused_moe_batched_experts(
    dtype: torch.dtype,
    per_act_token_quant: bool,
    block_shape: Optional[list[int]],
+    input_scales: bool,
 ):
    current_platform.seed_everything(7)

    use_fp8_w8a8 = dtype == torch.float8_e4m3fn

+    if topk > e:
+        pytest.skip("topk > e")
+
    if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
        pytest.skip("Skip quantization test for non-quantized type")

-    if per_act_token_quant and block_shape is not None or topk > e:
+    if per_act_token_quant and block_shape is not None:
        pytest.skip("Skip illegal quantization test.")

    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
@@ -241,27 +246,26 @@ def test_fused_moe_batched_experts(
        act_dtype = dtype
        quant_dtype = None

-    _, w1, w1_s, _, w2, w2_s = make_test_weights(e,
-                                                 n,
-                                                 k,
-                                                 block_shape=block_shape,
-                                                 in_dtype=act_dtype,
-                                                 quant_dtype=quant_dtype)
+    w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights(
+        e,
+        n,
+        k,
+        block_shape=block_shape,
+        in_dtype=act_dtype,
+        quant_dtype=quant_dtype,
+        per_act_token_quant=per_act_token_quant,
+    )
+
+    if input_scales and quant_dtype is not None:
+        a1_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
+        a2_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
+    else:
+        a1_scale = None
+        a2_scale = None

    with set_current_vllm_config(vllm_config):
        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        batched_output = batched_moe(
-            a,
-            w1,
-            w2,
-            topk_weight,
-            topk_ids,
-            w1_scale=w1_s,
-            w2_scale=w2_s,
-            quant_dtype=quant_dtype,
-            per_act_token_quant=per_act_token_quant,
-            block_shape=block_shape,
-        )
+
        baseline_output = torch_experts(
            a,
            w1,
@@ -270,11 +274,14 @@ def test_fused_moe_batched_experts(
            topk_ids,
            w1_scale=w1_s,
            w2_scale=w2_s,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
            quant_dtype=quant_dtype,
            per_act_token_quant=per_act_token_quant,
-            block_shape=block_shape)
+            block_shape=block_shape,
+        )

-        triton_output = triton_moe(
+        batched_output = naive_batched_moe(
            a,
            w1,
            w2,
@@ -282,14 +289,31 @@ def test_fused_moe_batched_experts(
            topk_ids,
            w1_scale=w1_s,
            w2_scale=w2_s,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
            quant_dtype=quant_dtype,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
        )

-    torch.testing.assert_close(triton_output,
+        triton_output = batched_moe(
+            a,
+            w1,
+            w2,
+            topk_weight,
+            topk_ids,
+            w1_scale=w1_s,
+            w2_scale=w2_s,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            quant_dtype=quant_dtype,
+            per_act_token_quant=per_act_token_quant,
+            block_shape=block_shape,
+        )
+
+    torch.testing.assert_close(batched_output,
                               baseline_output,
-                               atol=2e-2,
+                               atol=3e-2,
                               rtol=2e-2)

    torch.testing.assert_close(triton_output,
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -148,8 +148,7 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,

    fused_experts = BatchedDeepGemmExperts(
        max_num_tokens=max_tokens_per_rank,
-        world_size=pgi.world_size,
-        dp_size=dp_size,
+        num_dispatchers=pgi.world_size // dp_size,
        block_shape=test_config.block_size,
        per_act_token_quant=test_config.per_act_token_quant)
    mk = FusedMoEModularKernel(prepare_finalize=a2a,
--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -154,12 +154,13 @@ def make_modular_kernel(
                        deepep_ht_args = ht_args,
                        deepep_ll_args = ll_args)

+    num_dispatchers = pgi.world_size // dp_size
+
    if low_latency_mode:
        assert not per_act_token_quant, "not supported in ll mode"
        fused_experts = BatchedTritonExperts(
            max_num_tokens=MAX_TOKENS_PER_RANK,
-            world_size=pgi.world_size,
-            dp_size=dp_size,
+            num_dispatchers=num_dispatchers,
            use_fp8_w8a8=is_quantized,
            use_int8_w8a8=False,
            use_int8_w8a16=False,
--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
 from vllm.platforms import current_platform
+from vllm.utils import cdiv

 from .parallel_utils import ProcessGroupInfo, parallel_launch

@@ -112,18 +113,21 @@ def pplx_cutlass_moe(
    w2_scale = w2_scale.to(device)
    a1_scale = a1_scale.to(device)

+    assert num_experts % world_size == 0
+    num_local_experts = cdiv(num_experts, world_size)
+    num_dispatchers = pgi.world_size // dp_size
+
    prepare_finalize = PplxPrepareAndFinalize(
        ata,
-        max_num_tokens,
-        pgi.world_size,
-        rank,
-        dp_size,
-    )
+        max_num_tokens=max_num_tokens,
+        num_local_experts=num_local_experts,
+        num_dispatchers=num_dispatchers)

-    experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,
+    experts = CutlassExpertsFp8(num_local_experts,
                                out_dtype,
                                per_act_token,
                                per_out_ch,
+                                num_dispatchers=num_dispatchers,
                                use_batched_format=True)

    fused_cutlass_experts = FusedMoEModularKernel(
@@ -181,35 +185,40 @@ def _pplx_moe(
    per_out_ch: bool,
    use_internode: bool,
 ):
-    if use_internode:
-        uid = nvshmem_get_unique_id(
-        ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
-        torch.distributed.broadcast(uid, src=0)
-        nvshmem_init(uid, pgi.rank, pgi.world_size)
-    else:
-        group_ranks = list(range(pgi.world_size))
-        cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-        group_name = cpu_group.group_name
+    try:
+        if use_internode:
+            uid = nvshmem_get_unique_id(
+            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            torch.distributed.broadcast(uid, src=0)
+            nvshmem_init(uid, pgi.rank, pgi.world_size)
+        else:
+            group_ranks = list(range(pgi.world_size))
+            cpu_group = torch.distributed.new_group(group_ranks,
+                                                    backend="gloo")
+            group_name = cpu_group.group_name

-    with set_current_vllm_config(vllm_config):
-        torch_output = torch_experts(a_full, w1_full, w2_full, topk_weights,
-                                     topk_ids)
-        pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
-                                       w2_scale, topk_weights, topk_ids,
-                                       a1_scale, out_dtype, per_act_token,
-                                       per_out_ch, group_name)
+        with set_current_vllm_config(vllm_config):
+            torch_output = torch_experts(a_full, w1_full, w2_full,
+                                         topk_weights, topk_ids)
+            pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
+                                           w2_scale, topk_weights, topk_ids,
+                                           a1_scale, out_dtype, per_act_token,
+                                           per_out_ch, group_name)

-        torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                     pgi.world_size).to(pplx_output.device)
+            torch_output = chunk_by_rank(torch_output, pgi.rank,
+                                         pgi.world_size).to(pplx_output.device)

-    # Uncomment if more debugging is needed
-    # print("PPLX OUT:", pplx_output)
-    # print("TORCH OUT:", torch_output)
+        # Uncomment if more debugging is needed
+        # print("PPLX OUT:", pplx_output)
+        # print("TORCH OUT:", torch_output)

-    torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
-
-    if use_internode:
-        nvshmem_finalize()
+        torch.testing.assert_close(pplx_output,
+                                   torch_output,
+                                   atol=0.05,
+                                   rtol=0)
+    finally:
+        if use_internode:
+            nvshmem_finalize()


@pytest.mark.parametrize("m", [2, 224])
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -4,7 +4,10 @@

 Run `pytest tests/kernels/test_pplx_moe.py`.
 """
-from typing import Optional
+import itertools
+import textwrap
+import traceback
+from typing import Callable, Optional

 import pytest
 import torch
@@ -19,12 +22,13 @@ except ImportError:
    has_pplx = False

 from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
+from tests.kernels.quant_utils import dequant
 from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe import fused_topk, override_config
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
+    BatchedTritonExperts)
 from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
@@ -38,22 +42,22 @@ requires_pplx = pytest.mark.skipif(
    reason="Requires PPLX kernels",
 )

-PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
-                       (222, 2048, 1024)]
-
-PPLX_MOE_COMBOS = [
-    (1, 128, 128),
+PPLX_COMBOS = [
+    # TODO: figure out why this fails, seems to be test problem
+    #(1, 128, 128),
    (2, 128, 512),
    (3, 1024, 2048),
-    (32, 128, 1024),
+    (4, 128, 128),
+    (32, 1024, 512),
    (45, 512, 2048),
-    (64, 1024, 1024),
-    (222, 1024, 2048),
+    (64, 1024, 512),
+    (222, 2048, 1024),
+    (256, 1408, 2048),
 ]

 NUM_EXPERTS = [8, 64]
-EP_SIZE = [1, 4]
 TOP_KS = [1, 2, 6]
+DTYPES = [torch.float8_e4m3fn, torch.bfloat16]

 vllm_config = VllmConfig()
 vllm_config.scheduler_config.max_num_seqs = 128
@@ -169,9 +173,11 @@ def test_fused_moe_batched_experts(

    with set_current_vllm_config(vllm_config):
        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
+        baseline_output = torch_experts(a, w1, w2, topk_weight,
+                                        topk_ids)  # only for baseline
        torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
-        batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids)
+        batched_output = naive_batched_moe(
+            a, w1, w2, topk_weight, topk_ids)  # pick torch_experts or this

    torch.testing.assert_close(baseline_output,
                               torch_output,
@@ -183,6 +189,63 @@ def test_fused_moe_batched_experts(
                               rtol=0)


+def create_pplx_prepare_finalize(
+    num_tokens: int,
+    hidden_dim: int,
+    topk: int,
+    num_experts: int,
+    rank: int,
+    dp_size: int,
+    world_size: int,
+    in_dtype: torch.dtype,
+    quant_dtype: Optional[torch.dtype],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
+    group_name: Optional[str],
+):
+    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
+        PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
+
+    max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
+    num_local_experts = rank_chunk(num_experts, 0, world_size)
+
+    hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
+        max_num_tokens,
+        hidden_dim,
+        in_dtype,
+        quant_dtype,
+        per_act_token_quant=per_act_token_quant,
+        block_shape=block_shape,
+    )
+
+    args = dict(
+        max_num_tokens=max_num_tokens,
+        num_experts=num_experts,
+        experts_per_token=topk,
+        rank=rank,
+        world_size=world_size,
+        dp_size=dp_size,
+        hidden_dim=hidden_dim,
+        hidden_dim_bytes=hidden_dim_bytes,
+        hidden_dim_scale_bytes=scale_bytes,
+    )
+
+    if group_name is None:
+        ata = AllToAll.internode(**args)
+    else:
+        args["group_name"] = group_name
+        ata = AllToAll.intranode(**args)
+
+    prepare_finalize = PplxPrepareAndFinalize(
+        ata,
+        max_num_tokens=max_num_tokens,
+        num_local_experts=num_local_experts,
+        num_dispatchers=world_size // dp_size,
+    )
+
+    return prepare_finalize, ata
+
+
 def rank_chunk(num: int, r: int, w: int) -> int:
    rem = num % w
    return (num // w) + (1 if r < rem else 0)
@@ -193,6 +256,35 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
    return t[(r * chunk):(r + 1) * chunk]


+def maybe_chunk_by_rank(t: Optional[torch.Tensor], r: int,
+                        w: int) -> Optional[torch.Tensor]:
+    if t is not None:
+        return chunk_by_rank(t, r, w)
+    else:
+        return t
+
+
+def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int,
+                         w: int) -> Optional[torch.Tensor]:
+    if t is not None and t.numel() > 1:
+        chunk = rank_chunk(t.shape[0], r, w)
+        return t[(r * chunk):(r + 1) * chunk]
+    else:
+        return t
+
+
+def chunk_scales(t: Optional[torch.Tensor], start: int,
+                 end: int) -> Optional[torch.Tensor]:
+    if t is not None and t.numel() > 1:
+        return t[start:end]
+    else:
+        return t
+
+
+def dummy_work(a: torch.Tensor) -> torch.Tensor:
+    return a * 1.1
+
+
 def pplx_prepare_finalize(
    pgi: ProcessGroupInfo,
    dp_size: int,
@@ -200,11 +292,11 @@ def pplx_prepare_finalize(
    topk_weight: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
+    quant_dtype: Optional[torch.dtype],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
    group_name: Optional[str],
 ) -> torch.Tensor:
-    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize)
-
    assert torch.cuda.current_device() == pgi.local_rank

    topk = topk_ids.shape[1]
@@ -212,60 +304,66 @@ def pplx_prepare_finalize(
    device = pgi.device
    rank = pgi.rank
    world_size = pgi.world_size
-    max_num_tokens = rank_chunk(num_tokens, 0, world_size)
-
-    args = dict(
-        max_num_tokens=max_num_tokens,
-        num_experts=num_experts,
-        experts_per_token=topk,
-        rank=rank,
-        world_size=world_size,
-        dp_size=dp_size,
-        hidden_dim=hidden_dim,
-        hidden_dim_bytes=hidden_dim * a.dtype.itemsize,
-        hidden_dim_scale_bytes=0,
-    )
-
-    if group_name is None:
-        ata = AllToAll.internode(**args)
-    else:
-        args["group_name"] = group_name
-        ata = AllToAll.intranode(**args)

    topk_ids = topk_ids.to(dtype=torch.uint32)

-    prepare_finalize = PplxPrepareAndFinalize(
-        ata,
-        max_num_tokens,
-        world_size,
+    prepare_finalize, ata = create_pplx_prepare_finalize(
+        num_tokens,
+        hidden_dim,
+        topk,
+        num_experts,
        rank,
        dp_size,
+        world_size,
+        a.dtype,
+        quant_dtype,
+        block_shape,
+        per_act_token_quant,
+        group_name,
    )

+    assert a.shape[0] == topk_ids.shape[0]
+
    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)

+    assert a_chunk.shape[0] == chunk_topk_ids.shape[0]
+
+    out = torch.full(
+        a_chunk.shape,
+        torch.nan,
+        dtype=a.dtype,
+        device=device,
+    )
+
+    if (quant_dtype is not None and not per_act_token_quant
+            and block_shape is None):
+        a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+        a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    else:
+        a1_scale = None
+        a2_scale = None
+
    b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
        a_chunk,
-        None,
-        None,
+        a1_scale,
+        a2_scale,
        chunk_topk_weight,
        chunk_topk_ids,
        num_experts,
        None,
        False,
-        FusedMoEQuantConfig(),
+        FusedMoEQuantConfig(
+            quant_dtype,
+            per_act_token_quant,
+            False,
+            block_shape,
+        ),
    )

-    b_a = b_a * 1.5
-
-    out = torch.full(
-        (max_num_tokens, hidden_dim),
-        torch.nan,
-        dtype=a.dtype,
-        device=device,
-    )
+    b_a = dummy_work(
+        dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))

    prepare_finalize.finalize(
        out,
@@ -291,70 +389,96 @@ def _pplx_prepare_finalize(
    score: torch.Tensor,
    topk: torch.Tensor,
    num_experts: int,
+    quant_dtype: Optional[torch.dtype],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
    use_internode: bool,
 ):
-    if use_internode:
-        uid = nvshmem_get_unique_id(
-        ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
-        torch.distributed.broadcast(uid, src=0)
-        nvshmem_init(uid, pgi.rank, pgi.world_size)
-        group_name = None
-    else:
-        group_ranks = list(range(pgi.world_size))
-        cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-        group_name = cpu_group.group_name
+    try:
+        if use_internode:
+            uid = nvshmem_get_unique_id(
+            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            torch.distributed.broadcast(uid, src=0)
+            nvshmem_init(uid, pgi.rank, pgi.world_size)
+            group_name = None
+        else:
+            group_ranks = list(range(pgi.world_size))
+            cpu_group = torch.distributed.new_group(group_ranks,
+                                                    backend="gloo")
+            group_name = cpu_group.group_name

-    device = pgi.device
+        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
+        m, k = a.shape

-    topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-    k = a.shape[1]
+        a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)

-    a_rep = torch.repeat_interleave(a, topk, dim=0).to(device)
+        torch_output = (a_rep.view(m, topk, k) *
+                        topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(
+                            dim=1)

-    torch_output = (a_rep.view(-1, topk, k) * 1.5 *
-                    topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to(
-                        a.dtype)
+        pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight,
+                                            topk_ids, num_experts, quant_dtype,
+                                            block_shape, per_act_token_quant,
+                                            group_name)

-    pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids,
-                                        num_experts, group_name)
+        torch_output = chunk_by_rank(torch_output, pgi.rank,
+                                     pgi.world_size).to(pgi.device)

-    torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                 pgi.world_size).to(pplx_output.device)
-
-    torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
-
-    if use_internode:
-        nvshmem_finalize()
+        torch.testing.assert_close(pplx_output,
+                                   torch_output,
+                                   atol=3e-2,
+                                   rtol=3e-2)
+    finally:
+        if use_internode:
+            nvshmem_finalize()


-# TODO (bnell): this test point does not work for odd M due to how the test is
-# written, not due to limitations of the pplx kernels.  The pplx_moe
-# test below is able to deal with odd M.
-# TODO (bnell) add fp8 tests
-@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS)
+@pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
+@pytest.mark.parametrize("per_act_token_quant", [False, True])
+@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False])
+@pytest.mark.optional
@requires_pplx
-def test_pplx_prepare_finalize(
+def test_pplx_prepare_finalize_slow(
    mnk: tuple[int, int, int],
    e: int,
    topk: int,
    dtype: torch.dtype,
    world_dp_size: tuple[int, int],
+    per_act_token_quant: bool,
+    block_shape: Optional[list[int]],
    use_internode: bool,
 ):
+    if dtype == torch.float8_e4m3fn:
+        use_fp8_w8a8 = True
+        act_dtype = torch.bfloat16
+        quant_dtype = dtype
+    else:
+        use_fp8_w8a8 = False
+        act_dtype = dtype
+        quant_dtype = None
+
+    if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
+        pytest.skip("Skip quantization test for non-quantized type")
+
+    if per_act_token_quant and block_shape is not None:
+        pytest.skip("Skip illegal quantization combination")
+
    current_platform.seed_everything(7)
    m, n, k = mnk
    world_size, dp_size = world_dp_size
    device = "cuda"
-    a = torch.randn((m, k), device=device, dtype=dtype) / 10
-    score = torch.randn((m, e), device=device, dtype=dtype)
+
+    a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
+    score = torch.randn((m, e), device=device, dtype=act_dtype)

    parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
-                    topk, e, use_internode)
+                    topk, e, quant_dtype, block_shape, per_act_token_quant,
+                    use_internode)


 def pplx_moe(
@@ -369,84 +493,62 @@ def pplx_moe(
    topk_ids: torch.Tensor,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
-    qtype: Optional[torch.dtype] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    quant_dtype: Optional[torch.dtype] = None,
    per_act_token_quant=False,
    block_shape: Optional[list[int]] = None,
    use_compile: bool = False,
    use_cudagraphs: bool = True,
 ) -> torch.Tensor:
-    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)

-    device = torch.device("cuda", rank)
-    hidden_dim = a.shape[1]
+    num_tokens, hidden_dim = a.shape
    num_experts = w1.shape[0]
    topk = topk_ids.shape[1]
-    max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64)
+    max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16)

-    hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
-        max_num_tokens,
+    prepare_finalize, ata = create_pplx_prepare_finalize(
+        num_tokens,
        hidden_dim,
+        topk,
+        num_experts,
+        rank,
+        dp_size,
+        world_size,
        a.dtype,
-        qtype,
-        per_act_token_quant=per_act_token_quant,
-        block_shape=block_shape,
+        quant_dtype,
+        block_shape,
+        per_act_token_quant,
+        group_name,
    )

-    args = dict(
-        max_num_tokens=max_num_tokens,
-        num_experts=num_experts,
-        experts_per_token=topk,
-        rank=rank,
-        world_size=world_size,
-        dp_size=dp_size,
-        hidden_dim=hidden_dim,
-        hidden_dim_bytes=hidden_dim_bytes,
-        hidden_dim_scale_bytes=scale_bytes,
-    )
-
-    if group_name is None:
-        ata = AllToAll.internode(**args)
-    else:
-        args["group_name"] = group_name
-        ata = AllToAll.intranode(**args)
-
    topk_ids = topk_ids.to(dtype=torch.uint32)

-    prepare_finalize = PplxPrepareAndFinalize(
-        ata,
-        max_num_tokens,
-        world_size,
-        rank,
-        dp_size,
+    experts = BatchedTritonExperts(
+        max_num_tokens=max_num_tokens,
+        num_dispatchers=prepare_finalize.num_dispatchers(),
+        use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
+        block_shape=block_shape,
+        per_act_token_quant=per_act_token_quant,
    )

-    experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
-                                   world_size=world_size,
-                                   dp_size=dp_size,
-                                   use_fp8_w8a8=qtype == torch.float8_e4m3fn,
-                                   block_shape=block_shape)
-
    fused_experts = FusedMoEModularKernel(
        prepare_finalize,
        experts,
    )

    # Note: workers with the same dp_rank must use the exact same inputs.
-    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
-    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
-    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
+    a_chunk = chunk_by_rank(a, rank, world_size)
+    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size)
+    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size)

    # Chunking weights like this only works for batched format
-    w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
-    w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
-
-    if w1_scale is not None:
-        w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
-        w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
-    else:
-        w1_scale_chunk = None
-        w2_scale_chunk = None
+    w1_chunk = chunk_by_rank(w1, rank, world_size)
+    w2_chunk = chunk_by_rank(w2, rank, world_size)
+    w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size)
+    w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size)
+    a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size)
+    a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size)

    # Note: for now use_compile will error out if the problem size is
    # large enough to trigger chunking. I'm leaving the flag and
@@ -468,6 +570,8 @@ def pplx_moe(
                         chunk_topk_ids,
                         w1_scale=w1_scale_chunk,
                         w2_scale=w2_scale_chunk,
+                         a1_scale=a1_scale_chunk,
+                         a2_scale=a2_scale_chunk,
                         global_num_experts=num_experts)

    if use_cudagraphs:
@@ -482,6 +586,8 @@ def pplx_moe(
                                 chunk_topk_ids,
                                 w1_scale=w1_scale_chunk,
                                 w2_scale=w2_scale_chunk,
+                                 a1_scale=a1_scale_chunk,
+                                 a2_scale=a2_scale_chunk,
                                 global_num_experts=num_experts)

        torch.cuda.synchronize()
@@ -494,48 +600,6 @@ def pplx_moe(
    return out


-def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
-    assert torch.cuda.current_device() == pgi.local_rank
-
-    num_experts = w1.shape[0]
-    device = pgi.device
-    rank = pgi.rank
-    world_size = pgi.world_size
-    max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
-
-    prepare_finalize = BatchedPrepareAndFinalize(
-        max_num_tokens=max_num_tokens,
-        world_size=world_size,
-        dp_size=dp_size,
-        rank=rank,
-    )
-
-    experts = NaiveBatchedExperts(max_num_tokens=a.shape[0],
-                                  world_size=1,
-                                  dp_size=1)
-
-    fused_experts = FusedMoEModularKernel(
-        prepare_finalize,
-        experts,
-    )
-
-    # Note: workers with the same dp_rank must use the exact same inputs.
-    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
-    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
-    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
-
-    out = fused_experts(
-        a_chunk,
-        # Chunking weights like this only works for batched format
-        chunk_by_rank(w1, rank, world_size).to(device),
-        chunk_by_rank(w2, rank, world_size).to(device),
-        chunk_topk_weight,
-        chunk_topk_ids,
-        global_num_experts=num_experts)
-
-    return out
-
-
 def _pplx_moe(
    pgi: ProcessGroupInfo,
    dp_size: int,
@@ -544,75 +608,130 @@ def _pplx_moe(
    w2: torch.Tensor,
    score: torch.Tensor,
    topk: int,
+    num_experts: int,
    w1_s: Optional[torch.Tensor] = None,
    w2_s: Optional[torch.Tensor] = None,
-    qtype: Optional[torch.dtype] = None,
+    quant_dtype: Optional[torch.dtype] = None,
    per_act_token_quant: bool = False,
    block_shape: Optional[list[int]] = None,
    use_internode: bool = False,
 ):
-    if use_internode:
-        uid = nvshmem_get_unique_id(
-        ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
-        torch.distributed.broadcast(uid, src=0)
-        nvshmem_init(uid, pgi.rank, pgi.world_size)
-        group_name = None
-    else:
-        group_ranks = list(range(pgi.world_size))
-        cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-        group_name = cpu_group.group_name
+    try:
+        if use_internode:
+            uid = nvshmem_get_unique_id(
+            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            torch.distributed.broadcast(uid, src=0)
+            nvshmem_init(uid, pgi.rank, pgi.world_size)
+            group_name = None
+        else:
+            group_ranks = list(range(pgi.world_size))
+            cpu_group = torch.distributed.new_group(group_ranks,
+                                                    backend="gloo")
+            group_name = cpu_group.group_name

-    m, k = a.shape
-    e, _, n = w2.shape
+        m, k = a.shape
+        e, _, n = w2.shape

-    moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
+        moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)

-    device = torch.device("cuda", pgi.rank)
-    a = a.to(device)
-    w1 = w1.to(device)
-    w2 = w2.to(device)
-    w1_s = w1_s.to(device) if w1_s is not None else None
-    w2_s = w2_s.to(device) if w2_s is not None else None
+        device = torch.device("cuda", pgi.rank)
+        rank = pgi.rank
+        world_size = pgi.world_size

-    with set_current_vllm_config(vllm_config), override_config(moe_config):
-        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        torch_output = torch_experts(a,
-                                     w1,
-                                     w2,
-                                     topk_weight,
-                                     topk_ids,
-                                     w1_scale=w1_s,
-                                     w2_scale=w2_s,
-                                     quant_dtype=qtype,
-                                     per_act_token_quant=per_act_token_quant,
-                                     block_shape=block_shape)
-        pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
-                               a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
-                               qtype, per_act_token_quant, block_shape)
-        # TODO (bnell): fix + re-enable
-        #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
-        #                              topk_ids)
+        a = a.to(device)
+        w1 = w1.to(device)
+        w2 = w2.to(device)
+        w1_s = w1_s.to(device) if w1_s is not None else None
+        w2_s = w2_s.to(device) if w2_s is not None else None

-    torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                 pgi.world_size).to(pplx_output.device)
+        if (quant_dtype is not None and not per_act_token_quant
+                and block_shape is None):
+            a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+            a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+        else:
+            a1_scale = None
+            a2_scale = None

-    torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
-    #torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0)
+        with set_current_vllm_config(vllm_config), override_config(moe_config):
+            topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)

-    if use_internode:
-        nvshmem_finalize()
+            torch_output = torch_experts(
+                a,
+                w1,
+                w2,
+                topk_weight,
+                topk_ids,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                quant_dtype=quant_dtype,
+                per_act_token_quant=per_act_token_quant,
+                block_shape=block_shape,
+            )
+
+            batched_output = naive_batched_moe(
+                a,
+                w1,
+                w2,
+                topk_weight,
+                topk_ids,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                quant_dtype=quant_dtype,
+                per_act_token_quant=per_act_token_quant,
+                block_shape=block_shape,
+            )
+
+            pplx_output = pplx_moe(
+                group_name,
+                rank,
+                world_size,
+                dp_size,
+                a,
+                w1,
+                w2,
+                topk_weight,
+                topk_ids,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                quant_dtype=quant_dtype,
+                per_act_token_quant=per_act_token_quant,
+                block_shape=block_shape,
+            )
+
+        chunked_batch_output = chunk_by_rank(
+            batched_output, pgi.rank, pgi.world_size).to(pplx_output.device)
+
+        torch.testing.assert_close(batched_output,
+                                   torch_output,
+                                   atol=3e-2,
+                                   rtol=3e-2)
+
+        torch.testing.assert_close(pplx_output,
+                                   chunked_batch_output,
+                                   atol=3e-2,
+                                   rtol=3e-2)
+    finally:
+        if use_internode:
+            nvshmem_finalize()


-@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS)
+@pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False])
+@pytest.mark.optional
@requires_pplx
-def test_pplx_moe(
+def test_pplx_moe_slow(
    mnk: tuple[int, int, int],
    e: int,
    topk: int,
@@ -633,18 +752,143 @@ def test_pplx_moe(
        use_fp8_w8a8 = False
        quant_dtype = None

-    if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
+    if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
        pytest.skip("Skip quantization test for non-quantized type")

+    if per_act_token_quant and block_shape is not None:
+        pytest.skip("Skip illegal quantization combination")
+
    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
    score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)

-    _, w1, w1_s, _, w2, w2_s = make_test_weights(e,
-                                                 n,
-                                                 k,
-                                                 quant_dtype=quant_dtype,
-                                                 block_shape=block_shape)
+    _, w1, w1_s, _, w2, w2_s = make_test_weights(
+        e,
+        n,
+        k,
+        quant_dtype=quant_dtype,
+        block_shape=block_shape,
+        per_act_token_quant=per_act_token_quant,
+    )

-    parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk,
+    parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e,
                    w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
                    use_internode)
+
+
+def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
+                    make_weights: bool, test_fn: Callable):
+
+    def format_result(msg, ex=None):
+        if ex is not None:
+            x = str(ex)
+            newx = x.strip(" \n\t")[:16]
+            if len(newx) < len(x):
+                newx = newx + " ..."
+
+            prefix = "E\t"
+            print(f"{textwrap.indent(traceback.format_exc(), prefix)}")
+            print(f"FAILED {msg} - {newx}\n")
+        else:
+            print(f"PASSED {msg}")
+
+    current_platform.seed_everything(7)
+    combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES,
+                               [False, True], [None, [128, 128]])
+    exceptions = []
+    count = 0
+    for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
+        count = count + 1
+        m, n, k = mnk
+
+        if dtype == torch.float8_e4m3fn:
+            use_fp8_w8a8 = True
+            quant_dtype = dtype
+        else:
+            use_fp8_w8a8 = False
+            quant_dtype = None
+
+        test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
+                     f"dtype={dtype}, per_act_token={per_act_token_quant}, "
+                     f"block_shape={block_shape}")
+
+        if not use_fp8_w8a8 and (per_act_token_quant
+                                 or block_shape is not None):
+            print(
+                f"{test_desc} - Skip quantization test for non-quantized type."
+            )
+            continue
+
+        if per_act_token_quant and block_shape is not None:
+            print(f"{test_desc} - Skip illegal quantization combination.")
+            continue
+
+        a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
+        score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
+
+        args = dict()
+        if make_weights:
+            _, w1, w1_s, _, w2, w2_s = make_test_weights(
+                e,
+                n,
+                k,
+                quant_dtype=quant_dtype,
+                block_shape=block_shape,
+                per_act_token_quant=per_act_token_quant,
+            )
+            args["w1"] = w1
+            args["w2"] = w2
+            args["w1_s"] = w1_s
+            args["w2_s"] = w2_s
+
+        try:
+            test_fn(
+                pgi=pgi,
+                dp_size=dp_size,
+                a=a,
+                score=score,
+                topk=topk,
+                num_experts=e,
+                quant_dtype=quant_dtype,
+                per_act_token_quant=per_act_token_quant,
+                block_shape=block_shape,
+                use_internode=use_internode,
+                **args,
+            )
+            format_result(test_desc)
+        except Exception as ex:
+            format_result(test_desc, ex)
+            exceptions.append(ex)
+
+    if len(exceptions) > 0:
+        raise RuntimeError(
+            f"{len(exceptions)} of {count} tests failed in child process, "
+            f"rank={pgi.rank}.")
+    else:
+        print(f"{count} of {count} tests passed in child process, "
+              f"rank={pgi.rank}.")
+
+
+@pytest.mark.parametrize("world_dp_size", [[2, 1]])
+@pytest.mark.parametrize("use_internode", [False])
+@requires_pplx
+def test_pplx_prepare_finalize(
+    world_dp_size: tuple[int, int],
+    use_internode: bool,
+):
+    current_platform.seed_everything(7)
+    world_size, dp_size = world_dp_size
+    parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
+                    use_internode, False, _pplx_prepare_finalize)
+
+
+@pytest.mark.parametrize("world_dp_size", [[2, 1]])
+@pytest.mark.parametrize("use_internode", [False])
+@requires_pplx
+def test_pplx_moe(
+    world_dp_size: tuple[int, int],
+    use_internode: bool,
+):
+    current_platform.seed_everything(7)
+    world_size, dp_size = world_dp_size
+    parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True,
+                    _pplx_moe)
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -63,13 +63,12 @@ def batched_moe(

    fused_experts = FusedMoEModularKernel(
        BatchedPrepareAndFinalize(max_num_tokens,
-                                  world_size=1,
-                                  dp_size=1,
+                                  num_dispatchers=1,
+                                  num_local_experts=w1.shape[0],
                                  rank=0),
        BatchedTritonExperts(
            max_num_tokens=max_num_tokens,
-            world_size=1,
-            dp_size=1,
+            num_dispatchers=1,
            use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
@@ -105,13 +104,12 @@ def naive_batched_moe(

    fused_experts = FusedMoEModularKernel(
        BatchedPrepareAndFinalize(max_num_tokens,
-                                  world_size=1,
-                                  dp_size=1,
+                                  num_dispatchers=1,
+                                  num_local_experts=w1.shape[0],
                                  rank=0),
        NaiveBatchedExperts(
            max_num_tokens=max_num_tokens,
-            dp_size=1,
-            world_size=1,
+            num_dispatchers=1,
            use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,