Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions
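
Almost every hunk below is the same mechanical re-wrap: yapf aligned continuation arguments under the opening parenthesis and isort kept multi-name imports in aligned parenthesised groups, while ruff's formatter (black-style wrapping) either collapses a call onto a single line when it fits the line-length limit or wraps it with one argument per line and a trailing comma, and ruff's isort-compatible import rules re-wrap grouped imports one name per line. A minimal sketch of the pattern, using a call taken from the first hunk; the surrounding argparse setup is added here only to make the snippet self-contained:

    import argparse

    # The same call, formatted the two ways that appear throughout this diff.

    before = argparse.ArgumentParser()
    # yapf: continuation arguments aligned under the opening parenthesis.
    before.add_argument("--topk",
                        nargs="+",
                        type=int,
                        default=[4, 1],
                        help="num topk")

    after = argparse.ArgumentParser()
    # ruff format (black-style wrapping): collapsed onto one line because it
    # fits within the line-length limit; calls that do not fit are instead
    # wrapped with their arguments indented and a trailing comma.
    after.add_argument("--topk", nargs="+", type=int, default=[4, 1], help="num topk")
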

View File

@@ -9,18 +9,19 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from .common import Config
from .mk_objects import (MK_ALL_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES,
MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES)
from .mk_objects import (
MK_ALL_PREPARE_FINALIZE_TYPES,
MK_FUSED_EXPERT_TYPES,
MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES,
)
def make_config_arg_parser(description: str):
def to_pf_class_type(s: str) -> mk.FusedMoEPrepareAndFinalize:
for pf in MK_ALL_PREPARE_FINALIZE_TYPES:
if pf.__name__ == s:
return pf
raise ValueError(
f"Cannot find a PrepareFinalize type that matches {s}")
raise ValueError(f"Cannot find a PrepareFinalize type that matches {s}")
def to_experts_class_type(s: str) -> mk.FusedMoEPermuteExpertsUnpermute:
for fe in MK_FUSED_EXPERT_TYPES:
@@ -45,15 +46,18 @@ def make_config_arg_parser(description: str):
"--pf-type",
type=to_pf_class_type,
required=True,
help=("Choose a PrepareFinalize Type : "
f"{[x.__name__ for x in MK_ALL_PREPARE_FINALIZE_TYPES]}"),
help=(
"Choose a PrepareFinalize Type : "
f"{[x.__name__ for x in MK_ALL_PREPARE_FINALIZE_TYPES]}"
),
)
parser.add_argument(
"--experts-type",
type=to_experts_class_type,
required=True,
help=(f"Choose a FusedExpert type : "
f"{[x.__name__ for x in MK_FUSED_EXPERT_TYPES]}"),
help=(
f"Choose a FusedExpert type : {[x.__name__ for x in MK_FUSED_EXPERT_TYPES]}"
),
)
parser.add_argument(
"-m",
@@ -74,66 +78,65 @@ def make_config_arg_parser(description: str):
default=1024,
help="N dimension of the first fused-moe matmul",
)
parser.add_argument("--num-experts",
type=int,
default=32,
help="Global num experts")
parser.add_argument("--topk",
nargs="+",
type=int,
default=[4, 1],
help="num topk")
parser.add_argument(
"--num-experts", type=int, default=32, help="Global num experts"
)
parser.add_argument("--topk", nargs="+", type=int, default=[4, 1], help="num topk")
parser.add_argument(
"--fused-moe-chunk-size",
type=int,
help="Fused moe chunk size used for the non-batched fused experts impl."
help="Fused moe chunk size used for the non-batched fused experts impl.",
)
# Quant args
parser.add_argument("--quant-dtype",
type=to_quant_torch_dtype,
help="Quant datatype")
parser.add_argument("--per-token-quantized-activations",
action='store_true',
help=("The input activations must be per-token "
"quantized"))
parser.add_argument("--per-channel-quantized-weights",
action="store_true",
help="The weights must be per-channel quantized.")
parser.add_argument("--block-shape",
nargs="+",
type=int,
help="Quantization block shape")
parser.add_argument(
"--quant-dtype", type=to_quant_torch_dtype, help="Quant datatype"
)
parser.add_argument(
"--per-token-quantized-activations",
action="store_true",
help=("The input activations must be per-token quantized"),
)
parser.add_argument(
"--per-channel-quantized-weights",
action="store_true",
help="The weights must be per-channel quantized.",
)
parser.add_argument(
"--block-shape", nargs="+", type=int, help="Quantization block shape"
)
# Torch trace profile generation args
parser.add_argument("--torch-trace-dir-path",
type=str,
default=None,
help="Get torch trace for single execution")
parser.add_argument(
"--torch-trace-dir-path",
type=str,
default=None,
help="Get torch trace for single execution",
)
return parser
def _validate_args(args: argparse.Namespace):
if args.quant_dtype is not None:
assert args.quant_dtype == torch.float8_e4m3fn
if args.block_shape is not None:
assert len(args.block_shape) == 2, (
f"block shape must have 2 elements. got {args.block_shape}")
f"block shape must have 2 elements. got {args.block_shape}"
)
if args.experts_type in MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES:
assert args.world_size == 1, (
"Single GPU objects need world size set to 1")
assert args.world_size == 1, "Single GPU objects need world size set to 1"
if args.torch_trace_dir_path is not None:
from pathlib import Path
assert Path(args.torch_trace_dir_path).is_dir(), (
f"Please create {args.torch_trace_dir_path}")
f"Please create {args.torch_trace_dir_path}"
)
def make_config(args: argparse.Namespace) -> Config:
_validate_args(args)
quant_config = None
@@ -142,7 +145,8 @@ def make_config(args: argparse.Namespace) -> Config:
quant_dtype=args.quant_dtype,
per_act_token_quant=args.per_token_quantized_activations,
per_out_ch_quant=args.per_channel_quantized_weights,
block_shape=args.block_shape)
block_shape=args.block_shape,
)
return Config(
Ms=args.m,
@@ -156,4 +160,5 @@ def make_config(args: argparse.Namespace) -> Config:
fused_experts_type=args.experts_type,
fused_moe_chunk_size=args.fused_moe_chunk_size,
world_size=args.world_size,
torch_trace_dir_path=args.torch_trace_dir_path)
torch_trace_dir_path=args.torch_trace_dir_path,
)

View File

@@ -8,20 +8,30 @@ import torch
import vllm._custom_ops as ops
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_test_weights, per_token_cast_to_fp8
from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
FLOAT8_E4M3_MAX,
dequantize_nvfp4_to_dtype)
from tests.kernels.quantization.nvfp4_utils import (
FLOAT4_E2M1_MAX,
FLOAT8_E4M3_MAX,
dequantize_nvfp4_to_dtype,
)
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig
from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEParallelConfig, FusedMoEQuantConfig)
FusedMoEConfig,
FusedMoEParallelConfig,
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
from .mk_objects import (TestMoEQuantConfig, expert_info, make_fused_experts,
make_prepare_finalize, prepare_finalize_info)
from .mk_objects import (
TestMoEQuantConfig,
expert_info,
make_fused_experts,
make_prepare_finalize,
prepare_finalize_info,
)
from .parallel_utils import ProcessGroupInfo
@@ -94,8 +104,7 @@ class Config:
@property
def is_per_tensor_act_quant(self) -> bool:
return (not self.is_per_act_token_quant
and self.quant_block_shape is None)
return not self.is_per_act_token_quant and self.quant_block_shape is None
@property
def is_per_out_ch_quant(self) -> bool:
@@ -134,23 +143,24 @@ class Config:
if self.fused_moe_chunk_size is not None:
env_dict.update(
{"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)})
{"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)}
)
return vllm_config, env_dict
def is_fp8_block_quantized(self):
return (self.quant_dtype == torch.float8_e4m3fn
and self.quant_block_shape is not None)
return (
self.quant_dtype == torch.float8_e4m3fn
and self.quant_block_shape is not None
)
def is_batched_prepare_finalize(self):
info = prepare_finalize_info(self.prepare_finalize_type)
return (mk.FusedMoEActivationFormat.BatchedExperts ==
info.activation_format)
return mk.FusedMoEActivationFormat.BatchedExperts == info.activation_format
def is_batched_fused_experts(self):
info = expert_info(self.fused_experts_type)
return (mk.FusedMoEActivationFormat.BatchedExperts ==
info.activation_format)
return mk.FusedMoEActivationFormat.BatchedExperts == info.activation_format
def is_standard_fused_experts(self):
info = expert_info(self.fused_experts_type)
@@ -190,8 +200,10 @@ class Config:
def needs_deep_ep(self):
info = prepare_finalize_info(self.prepare_finalize_type)
return (info.backend == "deepep_high_throughput"
or info.backend == "deepep_low_latency")
return (
info.backend == "deepep_high_throughput"
or info.backend == "deepep_low_latency"
)
def all2all_backend(self):
info = prepare_finalize_info(self.prepare_finalize_type)
@@ -211,20 +223,26 @@ class Config:
return False
# Check quantization sanity
if (int(self.is_per_act_token_quant) +
int(self.is_per_tensor_act_quant) +
int(self.quant_block_shape is not None)) > 1:
if (
int(self.is_per_act_token_quant)
+ int(self.is_per_tensor_act_quant)
+ int(self.quant_block_shape is not None)
) > 1:
# invalid quant config
return False
# check type support
if self.quant_dtype is None:
if (self.dtype not in self.pf_supported_types()
or self.dtype not in self.fe_supported_types()):
if (
self.dtype not in self.pf_supported_types()
or self.dtype not in self.fe_supported_types()
):
return False
else:
if (self.quant_dtype not in self.pf_supported_types()
or self.quant_dtype not in self.fe_supported_types()):
if (
self.quant_dtype not in self.pf_supported_types()
or self.quant_dtype not in self.fe_supported_types()
):
return False
# Check block quanization support
@@ -261,18 +279,21 @@ class WeightTensors:
def describe(self):
s = ""
s += "== Weight Tensors: \n"
s += f' - {_describe_tensor(self.w1, "w1")} \n'
s += f' - {_describe_tensor(self.w2, "w2")} \n'
s += f' - {_describe_tensor(self.w1_scale, "w1_scale")} \n'
s += f' - {_describe_tensor(self.w2_scale, "w2_scale")} \n'
s += f' - {_describe_tensor(self.w1_gs, "w1_gs")} \n'
s += f' - {_describe_tensor(self.w2_gs, "w2_gs")} \n'
s += f" - {_describe_tensor(self.w1, 'w1')} \n"
s += f" - {_describe_tensor(self.w2, 'w2')} \n"
s += f" - {_describe_tensor(self.w1_scale, 'w1_scale')} \n"
s += f" - {_describe_tensor(self.w2_scale, 'w2_scale')} \n"
s += f" - {_describe_tensor(self.w1_gs, 'w1_gs')} \n"
s += f" - {_describe_tensor(self.w2_gs, 'w2_gs')} \n"
return s
def is_quantized(self) -> bool:
# or w1_scale is not None?
return (self.w1.dtype == torch.float8_e4m3fn
or self.w1.dtype == torch.uint8 or self.w1.dtype == torch.int8)
return (
self.w1.dtype == torch.float8_e4m3fn
or self.w1.dtype == torch.uint8
or self.w1.dtype == torch.int8
)
def to_current_device(self):
device = torch.cuda.current_device()
@@ -289,16 +310,13 @@ class WeightTensors:
if self.w2_gs is not None:
self.w2_gs = self.w2_gs.to(device=device)
def slice_weights(self, rank: int,
num_local_experts: int) -> "WeightTensors":
def slice_weights(self, rank: int, num_local_experts: int) -> "WeightTensors":
s = rank * num_local_experts
e = s + num_local_experts
w1 = self.w1[s:e, :, :]
w2 = self.w2[s:e, :, :]
w1_scale = self.w1_scale[
s:e, :, :] if self.w1_scale is not None else None
w2_scale = self.w2_scale[
s:e, :, :] if self.w2_scale is not None else None
w1_scale = self.w1_scale[s:e, :, :] if self.w1_scale is not None else None
w2_scale = self.w2_scale[s:e, :, :] if self.w2_scale is not None else None
w1_gs = self.w1_gs[s:e] if self.w1_gs is not None else None
w2_gs = self.w2_gs[s:e] if self.w2_gs is not None else None
@@ -313,15 +331,11 @@ class WeightTensors:
in_dtype=config.dtype,
quant_dtype=config.quant_dtype,
block_shape=config.quant_block_shape,
per_out_ch_quant=config.
is_per_act_token_quant, # or config.is_per_out_ch_quant
per_out_ch_quant=config.is_per_act_token_quant, # or config.is_per_out_ch_quant
)
return WeightTensors(
w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs
)
return WeightTensors(w1=w1,
w2=w2,
w1_scale=w1_scale,
w2_scale=w2_scale,
w1_gs=w1_gs,
w2_gs=w2_gs)
@dataclass
@@ -336,22 +350,22 @@ class RankTensors:
def describe(self):
s = ""
s += "== Rank Tensors: \n"
s += f' - {_describe_tensor(self.hidden_states, "HS")} \n'
s += f' - {_describe_tensor(self.hidden_states_scale, "HS_scale")} \n'
s += f' - {_describe_tensor(self.topk_weights, "topk_weights")} \n'
s += f' - {_describe_tensor(self.topk_ids, "topk_ids")} \n'
s += f' - {_describe_tensor(self.expert_map, "expert_map")} \n'
s += f" - {_describe_tensor(self.hidden_states, 'HS')} \n"
s += f" - {_describe_tensor(self.hidden_states_scale, 'HS_scale')} \n"
s += f" - {_describe_tensor(self.topk_weights, 'topk_weights')} \n"
s += f" - {_describe_tensor(self.topk_ids, 'topk_ids')} \n"
s += f" - {_describe_tensor(self.expert_map, 'expert_map')} \n"
return s
@staticmethod
def make_hidden_states(
config: Config) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
config: Config,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""
Return hidden_states
"""
m, k, dtype = (config.M, config.K, config.dtype)
a = (torch.randn(
(m, k), device=torch.cuda.current_device(), dtype=dtype) / 15.0)
a = torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 15.0
if config.quant_dtype is None:
return a, None
@@ -362,36 +376,29 @@ class RankTensors:
# first - so further quantize and dequantize will yield the same
# values.
if config.is_per_tensor_act_quant:
a_q, a_scales = ops.scaled_fp8_quant(
a, use_per_token_if_dynamic=False)
a_q, a_scales = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=False)
return a_q.float().mul(a_scales).to(dtype), a_scales
if config.is_per_act_token_quant:
a_q, a_scales = ops.scaled_fp8_quant(a,
use_per_token_if_dynamic=True)
a_q, a_scales = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=True)
return a_q.float().mul(a_scales).to(dtype), None
assert config.quant_block_shape is not None
block_k = config.quant_block_shape[1]
a_q, a_scales = per_token_cast_to_fp8(a, block_size=block_k)
return a_q.float().view(
(-1, block_k)).mul(a_scales.view(-1, 1)).view(m, k).to(dtype), None
return a_q.float().view((-1, block_k)).mul(a_scales.view(-1, 1)).view(m, k).to(
dtype
), None
@staticmethod
def make(config: Config, pgi: ProcessGroupInfo):
dtype = config.dtype
topk, m, _ = (config.topk, config.M, config.K)
hidden_states, hidden_states_scale = RankTensors.make_hidden_states(
config)
hidden_states, hidden_states_scale = RankTensors.make_hidden_states(config)
num_local_experts, global_num_experts = (config.num_local_experts,
config.E)
score = torch.randn((m, global_num_experts),
device="cuda",
dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk,
False)
num_local_experts, global_num_experts = (config.num_local_experts, config.E)
score = torch.randn((m, global_num_experts), device="cuda", dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk, False)
# distribute topk_ids evenly
for mi in range(m):
@@ -400,14 +407,15 @@ class RankTensors:
expert_map = None
if config.world_size > 1 and config.supports_expert_map():
expert_map = torch.full((global_num_experts, ),
fill_value=-1,
dtype=torch.int32)
expert_map = torch.full(
(global_num_experts,), fill_value=-1, dtype=torch.int32
)
s = pgi.rank * num_local_experts
e = s + num_local_experts
expert_map[s:e] = torch.tensor(list(range(num_local_experts)))
expert_map = expert_map.to(device=torch.cuda.current_device(),
dtype=torch.int32)
expert_map = expert_map.to(
device=torch.cuda.current_device(), dtype=torch.int32
)
return RankTensors(
hidden_states=hidden_states,
@@ -418,9 +426,9 @@ class RankTensors:
)
def reference_moe_impl(config: Config, weights: WeightTensors,
rank_tensors: RankTensors) -> torch.Tensor:
def reference_moe_impl(
config: Config, weights: WeightTensors, rank_tensors: RankTensors
) -> torch.Tensor:
if config.quant_dtype == "nvfp4":
quant_blocksize = 16
dtype = config.dtype
@@ -433,8 +441,10 @@ def reference_moe_impl(config: Config, weights: WeightTensors,
w2_blockscale = weights.w2_scale
w2_gs = weights.w2_gs
a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(
rank_tensors.hidden_states.flatten(), dim=-1)).to(torch.float32)
a_global_scale = (
(FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX)
/ torch.amax(rank_tensors.hidden_states.flatten(), dim=-1)
).to(torch.float32)
assert w1_gs is not None
assert w2_gs is not None
@@ -447,14 +457,17 @@ def reference_moe_impl(config: Config, weights: WeightTensors,
assert w2_blockscale.shape[2] % 4 == 0
a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(
rank_tensors.hidden_states, a_global_scale)
rank_tensors.hidden_states, a_global_scale
)
a = dequantize_nvfp4_to_dtype(a_fp4,
a_scale_interleaved,
a_global_scale,
dtype=dtype,
device=a_fp4.device,
block_size=quant_blocksize)
a = dequantize_nvfp4_to_dtype(
a_fp4,
a_scale_interleaved,
a_global_scale,
dtype=dtype,
device=a_fp4.device,
block_size=quant_blocksize,
)
e = w1_q.shape[0]
n = w1_q.shape[1] // 2
@@ -464,18 +477,22 @@ def reference_moe_impl(config: Config, weights: WeightTensors,
w2 = torch.zeros((e, k, n), device="cuda", dtype=dtype)
for idx in range(0, e):
w1[idx] = dequantize_nvfp4_to_dtype(w1_q[idx],
w1_blockscale[idx],
w1_gs[idx],
dtype=dtype,
device=w1_q.device,
block_size=quant_blocksize)
w2[idx] = dequantize_nvfp4_to_dtype(w2_q[idx],
w2_blockscale[idx],
w2_gs[idx],
dtype=dtype,
device=w2_q.device,
block_size=quant_blocksize)
w1[idx] = dequantize_nvfp4_to_dtype(
w1_q[idx],
w1_blockscale[idx],
w1_gs[idx],
dtype=dtype,
device=w1_q.device,
block_size=quant_blocksize,
)
w2[idx] = dequantize_nvfp4_to_dtype(
w2_q[idx],
w2_blockscale[idx],
w2_gs[idx],
dtype=dtype,
device=w2_q.device,
block_size=quant_blocksize,
)
a_scale = None
w1_scale = None
w2_scale = None
@@ -493,27 +510,29 @@ def reference_moe_impl(config: Config, weights: WeightTensors,
per_act_token_quant = config.is_per_act_token_quant
block_shape = config.quant_block_shape
return torch_experts(a=a,
w1=w1,
w2=w2,
topk_weight=rank_tensors.topk_weights,
topk_ids=rank_tensors.topk_ids,
global_num_experts=config.E,
expert_map=None,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
apply_router_weights_on_input=config.topk == 1
and config.supports_apply_weight_on_input())
return torch_experts(
a=a,
w1=w1,
w2=w2,
topk_weight=rank_tensors.topk_weights,
topk_ids=rank_tensors.topk_ids,
global_num_experts=config.E,
expert_map=None,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
apply_router_weights_on_input=config.topk == 1
and config.supports_apply_weight_on_input(),
)
def _make_gscale(num_experts: int) -> torch.Tensor:
return torch.ones((num_experts, ),
device=torch.cuda.current_device(),
dtype=torch.float32)
return torch.ones(
(num_experts,), device=torch.cuda.current_device(), dtype=torch.float32
)
def make_modular_kernel(
@@ -521,12 +540,12 @@ def make_modular_kernel(
vllm_config: VllmConfig,
quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEModularKernel:
def next_power_of_2(x):
import math
if x == 0:
return 1
return 2**math.ceil(math.log2(x))
return 2 ** math.ceil(math.log2(x))
# make moe config
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
@@ -546,9 +565,9 @@ def make_modular_kernel(
)
# make modular kernel
prepare_finalize = make_prepare_finalize(config.prepare_finalize_type,
config.all2all_backend(), moe,
quant_config)
prepare_finalize = make_prepare_finalize(
config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
)
fused_experts = make_fused_experts(
config.fused_experts_type,
@@ -559,7 +578,8 @@ def make_modular_kernel(
)
modular_kernel = mk.FusedMoEModularKernel(
prepare_finalize=prepare_finalize, fused_experts=fused_experts)
prepare_finalize=prepare_finalize, fused_experts=fused_experts
)
return modular_kernel
@@ -587,10 +607,8 @@ def run_modular_kernel(
w1_scale=rank_weights.w1_scale,
w2_scale=rank_weights.w2_scale,
a1_scale=rank_tensors.hidden_states_scale,
g1_alphas=(1 / rank_weights.w1_gs)
if rank_weights.w1_gs is not None else None,
g2_alphas=(1 / rank_weights.w2_gs)
if rank_weights.w2_gs is not None else None,
g1_alphas=(1 / rank_weights.w1_gs) if rank_weights.w1_gs is not None else None,
g2_alphas=(1 / rank_weights.w2_gs) if rank_weights.w2_gs is not None else None,
a1_gscale=gscale,
a2_gscale=gscale,
block_shape=config.quant_block_shape,
@@ -603,38 +621,30 @@ def run_modular_kernel(
# impls might update the tensor in place
hidden_states = rank_tensors.hidden_states.clone()
topk_ids = rank_tensors.topk_ids.to(
mk.prepare_finalize.topk_indices_dtype())
topk_ids = rank_tensors.topk_ids.to(mk.prepare_finalize.topk_indices_dtype())
mk_kwargs = {
"hidden_states":
hidden_states,
"w1":
rank_weights.w1,
"w2":
rank_weights.w2,
"topk_weights":
rank_tensors.topk_weights,
"topk_ids":
topk_ids,
"expert_map":
rank_tensors.expert_map,
"global_num_experts":
config.E,
"apply_router_weight_on_input":
config.topk == 1 and config.supports_apply_weight_on_input(),
"hidden_states": hidden_states,
"w1": rank_weights.w1,
"w2": rank_weights.w2,
"topk_weights": rank_tensors.topk_weights,
"topk_ids": topk_ids,
"expert_map": rank_tensors.expert_map,
"global_num_experts": config.E,
"apply_router_weight_on_input": config.topk == 1
and config.supports_apply_weight_on_input(),
}
num_tokens = rank_tensors.hidden_states.shape[0]
num_tokens_across_dp = torch.tensor([num_tokens] * config.world_size,
device="cuda",
dtype=torch.int)
num_tokens_across_dp = torch.tensor(
[num_tokens] * config.world_size, device="cuda", dtype=torch.int
)
with set_forward_context(
None,
vllm_config,
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp,
None,
vllm_config,
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp,
):
out = mk.forward(**mk_kwargs)

View File

@@ -10,14 +10,21 @@ import torch
from tqdm import tqdm
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG)
from vllm.model_executor.layers.fused_moe.config import FUSED_MOE_UNQUANTIZED_CONFIG
from vllm.platforms import current_platform
from .common import (Config, RankTensors, WeightTensors, reference_moe_impl,
run_modular_kernel)
from .mk_objects import (MK_FUSED_EXPERT_TYPES,
MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_QUANT_CONFIGS)
from .common import (
Config,
RankTensors,
WeightTensors,
reference_moe_impl,
run_modular_kernel,
)
from .mk_objects import (
MK_FUSED_EXPERT_TYPES,
MK_MULTI_GPU_PREPARE_FINALIZE_TYPES,
MK_QUANT_CONFIGS,
)
from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config
@@ -38,8 +45,9 @@ def rank_worker(
# sanity check
from vllm import envs
if config.fused_moe_chunk_size is not None:
assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE)
assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE
# get weights to this device
weights.to_current_device()
@@ -60,8 +68,7 @@ def rank_worker(
rank_tensors = RankTensors.make(cfgx, pgi)
# modular kernel out
mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights,
rank_tensors)
mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, rank_tensors)
with set_current_vllm_config(vllm_config):
ref_out = reference_moe_impl(cfgx, weights, rank_tensors)
@@ -70,28 +77,27 @@ def rank_worker(
def make_feature_matrix(csv_file_path: str):
from dataclasses import asdict
import pandas as pd
def add_to_results(config: Config,
success: Result,
results_df: Optional[pd.DataFrame] = None):
def add_to_results(
config: Config, success: Result, results_df: Optional[pd.DataFrame] = None
):
config_dict = asdict(config)
config_dict['prepare_finalize_type'] = config_dict[
'prepare_finalize_type'].__name__
config_dict['fused_experts_type'] = config_dict[
'fused_experts_type'].__name__
config_dict['per_tensor_act_quant'] = config.is_per_tensor_act_quant
quant_config_dict = config_dict['quant_config']
del config_dict['quant_config']
config_dict["prepare_finalize_type"] = config_dict[
"prepare_finalize_type"
].__name__
config_dict["fused_experts_type"] = config_dict["fused_experts_type"].__name__
config_dict["per_tensor_act_quant"] = config.is_per_tensor_act_quant
quant_config_dict = config_dict["quant_config"]
del config_dict["quant_config"]
if quant_config_dict is None:
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
quant_config_dict = asdict(quant_config)
config_dict |= quant_config_dict
result_dict = config_dict | {'success': success.name}
result_dict = config_dict | {"success": success.name}
result_df = pd.DataFrame([result_dict])
if results_df is None:
@@ -112,22 +118,26 @@ def make_feature_matrix(csv_file_path: str):
Q_TYPES = MK_QUANT_CONFIGS
combinations = list(
product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES))
product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES)
)
results_df: Optional[pd.DataFrame] = None
for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
combinations): #noqa: E501
config = Config(Ms=[m],
K=k,
N=n,
E=e,
topks=topks,
dtype=dtype,
prepare_finalize_type=pf_type,
fused_experts_type=experts_type,
quant_config=quant_config,
world_size=2,
fused_moe_chunk_size=None)
combinations
): # noqa: E501
config = Config(
Ms=[m],
K=k,
N=n,
E=e,
topks=topks,
dtype=dtype,
prepare_finalize_type=pf_type,
fused_experts_type=experts_type,
quant_config=quant_config,
world_size=2,
fused_moe_chunk_size=None,
)
success = None
if config.is_valid():
@@ -135,9 +145,14 @@ def make_feature_matrix(csv_file_path: str):
try:
weights: WeightTensors = WeightTensors.make(config)
vllm_config, env_dict = config.make_env_data()
parallel_launch_with_config(config.world_size, rank_worker,
vllm_config, env_dict, config,
weights)
parallel_launch_with_config(
config.world_size,
rank_worker,
vllm_config,
env_dict,
config,
weights,
)
success = Result.PASS
except Exception as _:
success = Result.FAIL
@@ -150,25 +165,33 @@ def make_feature_matrix(csv_file_path: str):
results_df.to_csv(f"{csv_file_path}")
if __name__ == '__main__':
if __name__ == "__main__":
import argparse
from pathlib import Path
parser = argparse.ArgumentParser(description=(
"Make ModularKernel feature matrix \n"
"Example : python3 -m tests.kernels.moe.modular_kernel_tools.make_feature_matrix " #noqa: E501
"-f ./feature_matrices/feature_matrix.csv"))
parser.add_argument("-f",
"--feature-matrix-csv-file-path",
type=str,
required=True,
help="File name to Generate a .csv file")
parser = argparse.ArgumentParser(
description=(
"Make ModularKernel feature matrix \n"
"Example : python3 -m tests.kernels.moe.modular_kernel_tools.make_feature_matrix " # noqa: E501
"-f ./feature_matrices/feature_matrix.csv"
)
)
parser.add_argument(
"-f",
"--feature-matrix-csv-file-path",
type=str,
required=True,
help="File name to Generate a .csv file",
)
args = parser.parse_args()
csv_path = args.feature_matrix_csv_file_path
assert csv_path.endswith(
'csv'), f"Need a file path ending with .csv, got {csv_path}"
assert Path(csv_path).parent.is_dir(
), f"Cannot find parent directory for {Path(csv_path).parent}"
assert csv_path.endswith("csv"), (
f"Need a file path ending with .csv, got {csv_path}"
)
assert Path(csv_path).parent.is_dir(), (
f"Cannot find parent directory for {Path(csv_path).parent}"
)
make_feature_matrix(args.feature_matrix_csv_file_path)

View File

@@ -8,24 +8,33 @@ import torch
# Fused experts and PrepareFinalize imports
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts)
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501
BatchedTritonOrDeepGemmExperts)
from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig,
FusedMoEQuantConfig)
BatchedTritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedTritonExperts, NaiveBatchedExperts)
from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase,
TritonExperts)
BatchedTritonExperts,
NaiveBatchedExperts,
)
from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, TritonExperts
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP)
MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts)
TritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
cutlass_fp4_supported,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
cutlass_fp8_supported)
cutlass_fp8_supported,
)
from vllm.platforms import current_platform
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
from vllm.utils.deep_gemm import is_deep_gemm_supported
@@ -60,8 +69,7 @@ class ExpertInfo:
needs_deep_gemm: bool = False
PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize,
PrepareFinalizeInfo] = {}
PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize, PrepareFinalizeInfo] = {}
EXPERT_INFO: dict[mk.FusedMoEPermuteExpertsUnpermute, ExpertInfo] = {}
MK_ALL_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = []
MK_MULTI_GPU_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = []
@@ -71,7 +79,10 @@ MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = []
standard_format = mk.FusedMoEActivationFormat.Standard
batched_format = mk.FusedMoEActivationFormat.BatchedExperts
common_float_types: list[Union[torch.dtype, str]] = [
torch.float8_e4m3fn, torch.bfloat16, torch.float16, torch.float32
torch.float8_e4m3fn,
torch.bfloat16,
torch.float16,
torch.float32,
]
common_float_and_int_types = common_float_types + [torch.int8]
nvfp4_types = ["nvfp4"]
@@ -186,9 +197,11 @@ register_experts(
# Disable on blackwell for now
if has_deep_ep() and not current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
DeepEPHTPrepareAndFinalize)
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
DeepEPLLPrepareAndFinalize)
DeepEPLLPrepareAndFinalize,
)
register_prepare_and_finalize(
DeepEPHTPrepareAndFinalize,
@@ -208,7 +221,9 @@ if has_deep_ep() and not current_platform.has_device_capability(100):
if has_pplx():
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize)
PplxPrepareAndFinalize,
)
register_prepare_and_finalize(
PplxPrepareAndFinalize,
batched_format,
@@ -217,13 +232,14 @@ if has_pplx():
backend="pplx",
)
if (has_flashinfer_cutlass_fused_moe()
and current_platform.has_device_capability(100)):
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
FlashInferExperts)
FlashInferExperts,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
create_flashinfer_prepare_finalize)
create_flashinfer_prepare_finalize,
)
register_prepare_and_finalize(
FlashInferCutlassMoEPrepareAndFinalize,
@@ -258,16 +274,18 @@ if has_deep_gemm() and is_deep_gemm_supported():
needs_matching_quant=False,
needs_deep_gemm=True,
)
register_experts(
DeepGemmExperts,
standard_format,
fp8_types,
blocked_quantization_support=True,
supports_chunking=True,
supports_expert_map=True,
needs_matching_quant=False,
needs_deep_gemm=True,
),
(
register_experts(
DeepGemmExperts,
standard_format,
fp8_types,
blocked_quantization_support=True,
supports_chunking=True,
supports_expert_map=True,
needs_matching_quant=False,
needs_deep_gemm=True,
),
)
register_experts(
BatchedTritonOrDeepGemmExperts,
batched_format,
@@ -290,8 +308,11 @@ if has_deep_gemm() and is_deep_gemm_supported():
)
if cutlass_fp8_supported():
from vllm.model_executor.layers.fused_moe import (CutlassBatchedExpertsFp8,
CutlassExpertsFp8)
from vllm.model_executor.layers.fused_moe import (
CutlassBatchedExpertsFp8,
CutlassExpertsFp8,
)
register_experts(
CutlassExpertsFp8,
standard_format,
@@ -310,8 +331,8 @@ if cutlass_fp8_supported():
)
if cutlass_fp4_supported():
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
CutlassExpertsFp4)
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp4
register_experts(
CutlassExpertsFp4,
standard_format,
@@ -324,30 +345,40 @@ if cutlass_fp4_supported():
MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [
None,
# per-channel / per-column weights and per-tensor activations
TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=True,
per_act_token_quant=False,
block_shape=None),
TestMoEQuantConfig(
quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=True,
per_act_token_quant=False,
block_shape=None,
),
# per-channel / per-column weights and per-token activations
TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=True,
per_act_token_quant=True,
block_shape=None),
TestMoEQuantConfig(
quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=True,
per_act_token_quant=True,
block_shape=None,
),
# per-tensor weights and per-tensor activations
TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=False,
per_act_token_quant=False,
block_shape=None),
TestMoEQuantConfig(
quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=False,
per_act_token_quant=False,
block_shape=None,
),
# per-tensor weights and per-token activations
TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=False,
per_act_token_quant=True,
block_shape=None),
TestMoEQuantConfig(
quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=False,
per_act_token_quant=True,
block_shape=None,
),
# block-quantized weights and 128 block per-token activations
TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=False,
per_act_token_quant=False,
block_shape=[128, 128]),
TestMoEQuantConfig(
quant_dtype=torch.float8_e4m3fn,
per_out_ch_quant=False,
per_act_token_quant=False,
block_shape=[128, 128],
),
# TODO (varun) : Should we test the following combinations ?
# block-quantized weights and per-token activations
# block-quantized weights and per-tensor activations
@@ -355,10 +386,12 @@ MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [
if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
MK_QUANT_CONFIGS += [
TestMoEQuantConfig(quant_dtype="nvfp4",
per_out_ch_quant=False,
per_act_token_quant=False,
block_shape=None),
TestMoEQuantConfig(
quant_dtype="nvfp4",
per_out_ch_quant=False,
per_act_token_quant=False,
block_shape=None,
),
]
@@ -370,12 +403,14 @@ def make_prepare_finalize(
) -> mk.FusedMoEPrepareAndFinalize:
if backend != "naive" and backend is not None:
prepare_finalize = FusedMoEMethodBase._maybe_make_prepare_finalize(
moe, quant_config)
moe, quant_config
)
assert prepare_finalize is not None
return prepare_finalize
elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
return create_flashinfer_prepare_finalize(
use_dp=moe.moe_parallel_config.dp_size > 1)
use_dp=moe.moe_parallel_config.dp_size > 1
)
else:
return MoEPrepareAndFinalizeNoEP()
@@ -391,10 +426,10 @@ def make_cutlass_strides(
n: int,
k: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
ab_strides1 = torch.full((e,), k, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((e,), n, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64)
return ab_strides1, ab_strides2, c_strides1, c_strides2
@@ -405,7 +440,6 @@ def make_fused_experts(
num_dispatchers: int,
N: int,
) -> mk.FusedMoEPermuteExpertsUnpermute:
batch_kwargs = {
"max_num_tokens": moe.max_num_tokens,
"num_dispatchers": num_dispatchers,

View File

@@ -6,13 +6,11 @@ import traceback
from typing import Any, Callable, Optional
import torch
from torch.multiprocessing import (
spawn) # pyright: ignore[reportPrivateImportUsage]
from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import (init_distributed_environment,
initialize_model_parallel)
from vllm.distributed import init_distributed_environment, initialize_model_parallel
from vllm.utils import get_open_port
## Parallel Processes Utils
@@ -30,10 +28,11 @@ class ProcessGroupInfo:
device: torch.device
def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int,
local_rank: int):
def _set_vllm_config(
vllm_config: VllmConfig, world_size: int, rank: int, local_rank: int
):
import tempfile
temp_file = tempfile.mkstemp()[1]
with set_current_vllm_config(vllm_config):
@@ -46,13 +45,10 @@ def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int,
)
initialize_model_parallel(
tensor_model_parallel_size=vllm_config.parallel_config.
tensor_parallel_size,
pipeline_model_parallel_size=vllm_config.parallel_config.
pipeline_parallel_size,
tensor_model_parallel_size=vllm_config.parallel_config.tensor_parallel_size,
pipeline_model_parallel_size=vllm_config.parallel_config.pipeline_parallel_size,
)
cpu_group = torch.distributed.new_group(list(range(world_size)),
backend="gloo")
cpu_group = torch.distributed.new_group(list(range(world_size)), backend="gloo")
return cpu_group
@@ -62,8 +58,7 @@ def _worker_parallel_launch(
world_local_size: int,
node_rank: int,
init_method: str,
worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any,
P], None],
worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, P], None],
vllm_config: Optional[VllmConfig],
env_dict: Optional[dict],
*args: P.args,
@@ -131,7 +126,8 @@ def parallel_launch_with_config(
worker,
vllm_config,
env_dict,
) + args,
)
+ args,
nprocs=world_size,
join=True,
)

View File

@@ -14,28 +14,31 @@ from .common import Config, RankTensors, WeightTensors, make_modular_kernel
from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config
def do_profile(fn: Callable,
fn_kwargs: dict[Any, Any],
pgi: ProcessGroupInfo,
config: Config,
num_warmups: int = 5):
def do_profile(
fn: Callable,
fn_kwargs: dict[Any, Any],
pgi: ProcessGroupInfo,
config: Config,
num_warmups: int = 5,
):
for _ in range(num_warmups):
fn(**fn_kwargs)
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
with_stack=True,
record_shapes=True,
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
with_stack=True,
record_shapes=True,
) as tprof:
fn(**fn_kwargs)
torch.cuda.synchronize(torch.cuda.current_device())
# TODO (varun): Add a descriptive trace file name
tprof.export_chrome_trace(
f"{config.torch_trace_dir_path}/m{config.M}_{pgi.rank}_trace.json")
f"{config.torch_trace_dir_path}/m{config.M}_{pgi.rank}_trace.json"
)
def profile_modular_kernel(
@@ -82,6 +85,7 @@ def rank_worker(
# sanity check
from vllm import envs
if config.fused_moe_chunk_size is not None:
assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE
@@ -108,20 +112,25 @@ def rank_worker(
def run(config: Config):
weights: WeightTensors = WeightTensors.make(config)
vllm_config, env_dict = config.make_env_data()
parallel_launch_with_config(config.world_size, rank_worker, vllm_config,
env_dict, config, weights)
parallel_launch_with_config(
config.world_size, rank_worker, vllm_config, env_dict, config, weights
)
if __name__ == '__main__':
if __name__ == "__main__":
from .cli_args import make_config, make_config_arg_parser
parser = make_config_arg_parser(description=(
"Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel " #noqa: E501
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
))
parser = make_config_arg_parser(
description=(
"Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel " # noqa: E501
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
)
)
args = parser.parse_args()
assert args.torch_trace_dir_path is not None, (
"Please pass in a directory to store torch traces")
"Please pass in a directory to store torch traces"
)
config = make_config(args)
run(config)