From 3aab680e3e261e04c188b3015611f2947465d33b Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Wed, 1 Apr 2026 23:30:11 -0500 Subject: [PATCH] [ROCm][Bugfix] Fix ROCm runtime failure due to missing symbol (#38750) Signed-off-by: Gregory Shtrasberg Signed-off-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: tjtanaavllm --- csrc/ops.h | 2 ++ csrc/torch_bindings.cpp | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 283e8a885..20351a3e4 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -142,11 +142,13 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input, std::optional<torch::Tensor> residual, int64_t group_size, bool is_scale_transposed); +#ifndef USE_ROCM void silu_and_mul_per_block_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scales, int64_t group_size, std::optional<torch::Tensor> scale_ub, bool is_scale_transposed); +#endif void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, std::optional<torch::Tensor> key, int64_t head_size, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 80d83d4c3..0354df666 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -109,18 +109,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()"); ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant); - // Fused SiLU+Mul + per-block quantization - ops.def( - "silu_and_mul_per_block_quant(" - "Tensor! out, " - "Tensor input, " - "Tensor! scales, " - "int group_size, " - "Tensor? scale_ub=None, " - "bool is_scale_transposed=False) -> ()"); - ops.impl("silu_and_mul_per_block_quant", torch::kCUDA, - &silu_and_mul_per_block_quant); - ops.def("mul_and_silu(Tensor! 
out, Tensor input) -> ()"); ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu); @@ -244,6 +232,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization ops #ifndef USE_ROCM + // Fused SiLU+Mul + per-block quantization + ops.def( + "silu_and_mul_per_block_quant(" + "Tensor! out, " + "Tensor input, " + "Tensor! scales, " + "int group_size, " + "Tensor? scale_ub=None, " + "bool is_scale_transposed=False) -> ()"); + ops.impl("silu_and_mul_per_block_quant", torch::kCUDA, + &silu_and_mul_per_block_quant); // DeepSeek V3 fused A GEMM (SM 9.0+, bf16 only, 1-16 tokens). ops.def( "dsv3_fused_a_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");