[Refactor] Remove unused cutlass moe problem size function (#32047)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-01-18 15:46:59 -05:00
parent 16de822c71
commit eebc58df0c
5 changed files with 0 additions and 101 deletions
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data(
    const int64_t num_experts, const int64_t n, const int64_t k,
    const std::optional<torch::Tensor>& blockscale_offsets);

-void get_cutlass_moe_mm_problem_sizes(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt);
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
    const torch::Tensor& expert_first_token_offset,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
 }
 }  // namespace

-void get_cutlass_moe_mm_problem_sizes_caller(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt) {
-  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
-  auto options_int32 =
-      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
-  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
-
-  // Swap-AB should be disabled for FP4 path
-  bool may_swap_ab =
-      force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
-                             (topk_ids.numel() <= SWAP_AB_THRESHOLD));
-
-  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
-                               atomic_buffer, num_experts, n, k, stream,
-                               may_swap_ab);
-}
-
 template <bool SWAP_AB>
 __global__ void compute_problem_sizes_from_expert_offsets(
    const int64_t* __restrict__ expert_first_token_offset,
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller(
    const int64_t num_experts, const int64_t n, const int64_t k,
    const std::optional<torch::Tensor>& blockscale_offsets);

-void get_cutlass_moe_mm_problem_sizes_caller(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt);
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
    const torch::Tensor& expert_first_token_offset,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data(
      version_num, ". Required capability: 90, 100, or 120");
 }

-void get_cutlass_moe_mm_problem_sizes(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt) {
-  int32_t version_num = get_sm_version_num();
-#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
-    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
-    (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
-  get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
-                                          problem_sizes2, num_experts, n, k,
-                                          blockscale_offsets, force_swap_ab);
-  return;
-#endif
-  TORCH_CHECK_NOT_IMPLEMENTED(
-      false,
-      "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
-      "kernel for CUDA device capability: ",
-      version_num, ". Required capability: 90, 100, or 120");
-}
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
    const torch::Tensor& expert_first_token_offset,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "()");
  ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);

-  // A function that computes problem sizes for each expert's multiplication
-  // used by the two mms called from fused MoE operation. It takes topk_ids as
-  // an input, and computes problem_sizes1 and problem_sizes2 only.
-  ops.def(
-      "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, "
-      "                                 Tensor! problem_sizes1, "
-      "                                 Tensor! problem_sizes2, "
-      "                                 int num_experts, int n, int k, "
-      "                                 Tensor? blockscale_offsets, "
-      "                                 bool? force_swap_ab) -> ()");
-  ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
-           &get_cutlass_moe_mm_problem_sizes);
-
  // compute per-expert problem sizes from expert_first_token_offset
  // produced by vLLM's moe_permute kernel
  ops.def(
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1044,41 +1044,6 @@ def get_cutlass_moe_mm_data(
    )


-def get_cutlass_moe_mm_problem_sizes(
-    topk_ids: torch.Tensor,
-    problem_sizes1: torch.Tensor,
-    problem_sizes2: torch.Tensor,
-    num_experts: int,
-    n: int,
-    k: int,
-    blockscale_offsets: torch.Tensor | None = None,
-    force_swap_ab: bool | None = None,
-):
-    """
-    Compute only the per-expert problem sizes needed by the two grouped matrix
-    multiplications used in CUTLASS-based fused MoE.
-
-    The function takes in topk_ids (token→expert mapping) and computes:
-    - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's
-                                    multiplication for the two grouped MMs
-                                    used in the fused MoE operation.
-    Optional:
-    - force_swap_ab: If set to True or False, explicitly enable or disable the
-                     A/B input swap optimization. If None (default), the swap
-                     is selected automatically based on tensor sizes.
-    """
-    return torch.ops._C.get_cutlass_moe_mm_problem_sizes(
-        topk_ids,
-        problem_sizes1,
-        problem_sizes2,
-        num_experts,
-        n,
-        k,
-        blockscale_offsets,
-        force_swap_ab,
-    )
-
-
 def get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
    expert_first_token_offset: torch.Tensor,
    problem_sizes1: torch.Tensor,