diff --git a/csrc/ops.h b/csrc/ops.h index d394d99a7..c899535bd 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); -void get_cutlass_moe_mm_problem_sizes( - const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, - torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets, - std::optional force_swap_ab = std::nullopt); - void get_cutlass_moe_mm_problem_sizes_from_expert_offsets( const torch::Tensor& expert_first_token_offset, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu index 28af2e7d4..eae500cb6 100644 --- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu +++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu @@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids, } } // namespace -void get_cutlass_moe_mm_problem_sizes_caller( - const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, - torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets, - std::optional force_swap_ab = std::nullopt) { - auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); - auto options_int32 = - torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); - torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); - - // Swap-AB should be disabled for FP4 path - bool may_swap_ab = - force_swap_ab.value_or((!blockscale_offsets.has_value()) && - (topk_ids.numel() <= SWAP_AB_THRESHOLD)); - - launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, - atomic_buffer, num_experts, n, k, stream, - may_swap_ab); -} - template __global__ void compute_problem_sizes_from_expert_offsets( const int64_t* __restrict__ expert_first_token_offset, diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu index 077966a1d..82ccc1960 100644 --- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu +++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu @@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); -void get_cutlass_moe_mm_problem_sizes_caller( - const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, - torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets, - std::optional force_swap_ab = std::nullopt); - void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller( const torch::Tensor& expert_first_token_offset, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data( version_num, ". Required capability: 90, 100, or 120"); } -void get_cutlass_moe_mm_problem_sizes( - const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, - torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets, - std::optional force_swap_ab = std::nullopt) { - int32_t version_num = get_sm_version_num(); -#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ - (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \ - (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120) - get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, - problem_sizes2, num_experts, n, k, - blockscale_offsets, force_swap_ab); - return; -#endif - TORCH_CHECK_NOT_IMPLEMENTED( - false, - "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " - "kernel for CUDA device capability: ", - version_num, ". Required capability: 90, 100, or 120"); -} - void get_cutlass_moe_mm_problem_sizes_from_expert_offsets( const torch::Tensor& expert_first_token_offset, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 864be7a26..cf12682cb 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "()"); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); - // A function that computes problem sizes for each expert's multiplication - // used by the two mms called from fused MoE operation. It takes topk_ids as - // an input, and computes problem_sizes1 and problem_sizes2 only. - ops.def( - "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, " - " Tensor! problem_sizes1, " - " Tensor! problem_sizes2, " - " int num_experts, int n, int k, " - " Tensor? blockscale_offsets, " - " bool? force_swap_ab) -> ()"); - ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, - &get_cutlass_moe_mm_problem_sizes); - // compute per-expert problem sizes from expert_first_token_offset // produced by vLLM's moe_permute kernel ops.def( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 5c777221b..26931ba29 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1044,41 +1044,6 @@ def get_cutlass_moe_mm_data( ) -def get_cutlass_moe_mm_problem_sizes( - topk_ids: torch.Tensor, - problem_sizes1: torch.Tensor, - problem_sizes2: torch.Tensor, - num_experts: int, - n: int, - k: int, - blockscale_offsets: torch.Tensor | None = None, - force_swap_ab: bool | None = None, -): - """ - Compute only the per-expert problem sizes needed by the two grouped matrix - multiplications used in CUTLASS-based fused MoE. - - The function takes in topk_ids (token→expert mapping) and computes: - - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's - multiplication for the two grouped MMs - used in the fused MoE operation. - Optional: - - force_swap_ab: If set to True or False, explicitly enable or disable the - A/B input swap optimization. If None (default), the swap - is selected automatically based on tensor sizes. - """ - return torch.ops._C.get_cutlass_moe_mm_problem_sizes( - topk_ids, - problem_sizes1, - problem_sizes2, - num_experts, - n, - k, - blockscale_offsets, - force_swap_ab, - ) - - def get_cutlass_moe_mm_problem_sizes_from_expert_offsets( expert_first_token_offset: torch.Tensor, problem_sizes1: torch.Tensor,