[Kernels] Modular kernel refactor (#24812)
Signed-off-by: Bill Nell <bnell@redhat.com>
@@ -209,18 +209,18 @@ class Config:
         info = prepare_finalize_info(self.prepare_finalize_type)
         return info.backend
 
-    def is_valid(self):
+    def is_valid(self) -> tuple[bool, Optional[str]]:
         # Check prepare-finalize and fused-experts compatibility
         if self.is_batched_prepare_finalize():
             if not self.is_batched_fused_experts():
-                return False
+                return False, "Mismatched format."
         else:
             if not self.is_standard_fused_experts():
-                return False
+                return False, "Mismatched format."
 
         use_chunking = self.fused_moe_chunk_size is not None
         if use_chunking and not self.is_fe_supports_chunking():
-            return False
+            return False, "Chunking not supported."
 
         # Check quantization sanity
         if (
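In this hunk the bare `False` returns become `(False, reason)` pairs. A minimal sketch of the format-compatibility rule being enforced; the enum and free function below are hypothetical stand-ins for the real `Config` methods:

from enum import Enum
from typing import Optional


class Format(Enum):
    # Hypothetical stand-in for the prepare/finalize and fused-experts formats.
    STANDARD = "standard"
    BATCHED = "batched"


def check_formats(pf: Format, fe: Format) -> tuple[bool, Optional[str]]:
    # Batched prepare/finalize needs batched experts; standard needs standard.
    if pf != fe:
        return False, "Mismatched format."
    return True, None


assert check_formats(Format.BATCHED, Format.STANDARD) == (False, "Mismatched format.")
assert check_formats(Format.STANDARD, Format.STANDARD) == (True, None)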
@@ -229,7 +229,7 @@ class Config:
             + int(self.quant_block_shape is not None)
         ) > 1:
             # invalid quant config
-            return False
+            return False, f"Bad quant_config {self.quant_config}."
 
         # check type support
         if self.quant_dtype is None:
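The sanity check sums indicator ints so that at most one quantization descriptor may be set. Only the last summand is visible in the hunk; the sketch below reconstructs the shape of the check, and the parameter names `per_act_token_quant` and `per_out_ch_quant` are assumptions, not taken from the diff:

from typing import Optional


def quant_config_ok(
    per_act_token_quant: bool,
    per_out_ch_quant: bool,
    quant_block_shape: Optional[list[int]],
) -> bool:
    # At most one quantization granularity may be selected; a sum > 1
    # means the config mixes incompatible descriptors.
    return (
        int(per_act_token_quant)
        + int(per_out_ch_quant)
        + int(quant_block_shape is not None)
    ) <= 1


assert quant_config_ok(True, False, None)
assert not quant_config_ok(True, False, [128, 128])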
@@ -237,34 +237,43 @@ class Config:
                 self.dtype not in self.pf_supported_types()
                 or self.dtype not in self.fe_supported_types()
             ):
-                return False
+                return False, (
+                    f"Unsupported type {self.dtype} not in "
+                    f"{self.pf_supported_types()} and "
+                    f"{self.fe_supported_types()}."
+                )
         else:
             if (
                 self.quant_dtype not in self.pf_supported_types()
                 or self.quant_dtype not in self.fe_supported_types()
             ):
-                return False
+                return False, (
+                    f"Unsupported quant type {self.quant_dtype} "
+                    f"not in {self.pf_supported_types()} and "
+                    f"{self.fe_supported_types()}."
+                )
 
         # Check block quantization support
         is_block_quantized = self.quant_block_shape is not None
         if is_block_quantized and self.quant_dtype is None:
-            return False
+            return False, "No block quantization support."
 
         if is_block_quantized and not self.is_block_quant_supported():
-            return False
+            return False, "Mismatched block quantization support."
 
         # deep_gemm only works with block-quantized
         if self.needs_deep_gemm() and not is_block_quantized:
-            return False
+            return False, "Needs DeepGEMM but not block quantized."
 
         # Check dependencies (turn into asserts?)
         if self.needs_deep_ep() and not has_deep_ep():
-            return False
+            return False, "Needs DeepEP, but DeepEP not available."
         if self.needs_deep_gemm() and not has_deep_gemm():
-            return False
+            return False, "Needs DeepGEMM, but DeepGEMM not available."
         if self.needs_pplx() and not has_pplx():  # noqa: SIM103
-            return False
+            return False, "Needs PPLX, but PPLX not available."
 
-        return True
+        return True, None
 
 
 @dataclass
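Taken together, the three hunks above change the contract of `Config.is_valid()` from a bare bool to an `(ok, reason)` pair. A minimal sketch of the new calling convention; the stub class below is illustrative, not the real `Config`:

from typing import Optional


class StubConfig:
    # Hypothetical stand-in for Config, reduced to the chunking check.
    fused_moe_chunk_size: Optional[int] = 128

    def is_fe_supports_chunking(self) -> bool:
        return False

    def is_valid(self) -> tuple[bool, Optional[str]]:
        use_chunking = self.fused_moe_chunk_size is not None
        if use_chunking and not self.is_fe_supports_chunking():
            return False, "Chunking not supported."
        return True, None


ok, reason = StubConfig().is_valid()
if not ok:
    print(f"Skipping config: {reason}")  # Skipping config: Chunking not supported.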
@@ -140,7 +140,7 @@ def make_feature_matrix(csv_file_path: str):
     )
 
     success = None
-    if config.is_valid():
+    if config.is_valid()[0]:
         print(f"Running config : {config.describe()} ...")
         try:
             weights: WeightTensors = WeightTensors.make(config)
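This call site only needs the boolean, so it indexes element 0 and drops the reason. An equivalent unpacking (the free function below is a hypothetical stand-in for `config.is_valid()`) would keep the reason available for the failure report:

from typing import Optional


def is_valid() -> tuple[bool, Optional[str]]:
    # Hypothetical stand-in for config.is_valid().
    return False, "Chunking not supported."


# What the hunk does: keep element 0, drop the reason.
if is_valid()[0]:
    print("running ...")

# Equivalent unpacking that keeps the reason for logging.
valid, reason = is_valid()
if not valid:
    print(f"skipped: {reason}")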
@@ -244,7 +244,7 @@ if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability
     register_prepare_and_finalize(
         FlashInferCutlassMoEPrepareAndFinalize,
         standard_format,
-        nvfp4_types,
+        nvfp4_types + fp8_types,
         blocked_quantization_support=True,
         backend=None,
         force_multigpu=True,
@@ -254,7 +254,7 @@ if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability
     register_experts(
         FlashInferExperts,
         standard_format,
-        nvfp4_types,
+        nvfp4_types + fp8_types,
         blocked_quantization_support=True,
         supports_chunking=True,
         # Note: this is a hack to get it to run for now
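Both FlashInfer hunks make the same change: the supported-types argument widens from `nvfp4_types` alone to `nvfp4_types + fp8_types`, so fp8 configs now match this backend as well. A sketch under the assumption that the `*_types` names are plain lists of torch dtypes; the element values below are illustrative, not taken from the source:

import torch

# Assumed contents; the real lists are defined elsewhere in the test module.
fp8_types = [torch.float8_e4m3fn]
nvfp4_types = [torch.uint8]  # packed fp4 payloads are commonly carried as uint8

# List concatenation, as in the hunk: either family now matches.
supported = nvfp4_types + fp8_types
assert torch.float8_e4m3fn in supported and torch.uint8 in supported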
@@ -274,17 +274,15 @@ if has_deep_gemm() and is_deep_gemm_supported():
         needs_matching_quant=False,
         needs_deep_gemm=True,
     )
-    (
-        register_experts(
-            DeepGemmExperts,
-            standard_format,
-            fp8_types,
-            blocked_quantization_support=True,
-            supports_chunking=True,
-            supports_expert_map=True,
-            needs_matching_quant=False,
-            needs_deep_gemm=True,
-        ),
-    )
+    register_experts(
+        DeepGemmExperts,
+        standard_format,
+        fp8_types,
+        blocked_quantization_support=True,
+        supports_chunking=True,
+        supports_expert_map=True,
+        needs_matching_quant=False,
+        needs_deep_gemm=True,
+    )
     register_experts(
         BatchedTritonOrDeepGemmExperts,
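The removed lines wrapped the `register_experts(DeepGemmExperts, ...)` call in a one-element tuple that was built and immediately discarded; the call still ran, so this was a formatting accident rather than a behavior change. A small demonstration with a hypothetical stand-in:

def register(name: str) -> str:
    # Hypothetical stand-in for register_experts; prints so the call is visible.
    print(f"registered {name}")
    return name


# Old shape: legal Python, but the result is wrapped in a throwaway tuple.
(
    register("DeepGemmExperts"),
)

# New shape: a plain call with identical runtime behavior and clearer intent.
register("DeepGemmExperts")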
@@ -464,7 +462,7 @@ def make_fused_experts(
         print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...")
         experts = BatchedTritonOrDeepGemmExperts(**kwargs)
     elif fused_experts_type == DeepGemmExperts:
-        print("Making DeepGemmExperts {quant_config} ...")
+        print(f"Making DeepGemmExperts {quant_config} ...")
         experts = DeepGemmExperts(quant_config)
     elif fused_experts_type == TritonExperts:
         kwargs = quant_kwargs
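The final hunk is a one-character fix: without the `f` prefix, the braces print literally instead of interpolating `quant_config`:

quant_config = {"quant_dtype": "fp8"}  # placeholder value for illustration

print("Making DeepGemmExperts {quant_config} ...")
# -> Making DeepGemmExperts {quant_config} ...

print(f"Making DeepGemmExperts {quant_config} ...")
# -> Making DeepGemmExperts {'quant_dtype': 'fp8'} ...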