[fix] CPUDNNLGEMMHandler pointer baked into inductor artifact (#32913)

Signed-off-by: dolpm <34420038+dolpm@users.noreply.github.com>

Author: dolpm
Date: 2026-01-26 13:59:44 -08:00
Committed by: GitHub
Parent: 6ee7f18f33
Commit: 58a05b0ca1
4 changed files with 38 additions and 23 deletions
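
In outline (inferred from the title and the hunks below): the oneDNN handler created by create_onednn_mm_handler() is a raw heap pointer returned to Python as a plain int. With the op schemas typing that argument as `int`, torch.compile/inductor may treat the scalar as a constant and bake the process-specific address into the cached artifact; a reloaded artifact would then dereference a stale pointer. The fix threads the address through a one-element int64 CPU tensor instead, so it stays a runtime input to the op, and the kernels recover it with handler_tensor.item<int64_t>().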


@@ -360,13 +360,14 @@ void onednn_scaled_mm(
     const std::optional<torch::Tensor>& azp,      // [M] or [1]
     const std::optional<torch::Tensor>& azp_adj,  // [M] or [1]
     const std::optional<torch::Tensor>& bias,     // [N]
-    int64_t handler) {
+    const torch::Tensor& handler_tensor) {
   CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
   TORCH_CHECK(a.dim() == 2);
   TORCH_CHECK(a.is_contiguous());
   TORCH_CHECK(c.is_contiguous());
   W8A8MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
+      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(
+          handler_tensor.item<int64_t>());
   const int32_t* azp_ptr = nullptr;
   if (azp.has_value()) {
     azp_ptr = azp->data_ptr<int32_t>();
@@ -519,13 +520,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
 void onednn_mm(torch::Tensor& c,          // [M, OC], row-major
                const torch::Tensor& a,    // [M, IC], row-major
-               const std::optional<torch::Tensor>& bias, int64_t handler) {
+               const std::optional<torch::Tensor>& bias,
+               const torch::Tensor& handler_tensor) {
   CPU_KERNEL_GUARD_IN(onednn_mm)
   TORCH_CHECK(a.dim() == 2);
   TORCH_CHECK(a.stride(-1) == 1);
   TORCH_CHECK(c.stride(-1) == 1);
   MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<MatMulPrimitiveHandler*>(handler);
+      reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>());
   // ACL matmuls expect contiguous source tensors
 #ifdef VLLM_USE_ACL
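
A minimal standalone sketch of the pointer round-trip these kernels now use; PrimitiveHandler, create_handler, and kernel are illustrative stand-ins, not vLLM's actual types:

#include <cstdint>
#include <torch/torch.h>

struct PrimitiveHandler {  // stand-in for MatMulPrimitiveHandler
  int cached_state = 0;
};

// The extension allocates the handler and hands its address back as an
// int64, mirroring create_onednn_mm_handler() above.
int64_t create_handler() {
  return reinterpret_cast<int64_t>(new PrimitiveHandler());
}

// Kernel side: the address is read out of the tensor on every call, so
// nothing about the pointer can be frozen into a compiled artifact.
void kernel(const torch::Tensor& handler_tensor) {
  auto* ptr =
      reinterpret_cast<PrimitiveHandler*>(handler_tensor.item<int64_t>());
  ptr->cached_state += 1;
}

int main() {
  // Caller side: wrap the address in a one-element int64 CPU tensor.
  auto handler_tensor =
      torch::tensor(create_handler(), torch::dtype(torch::kInt64));
  kernel(handler_tensor);  // handler intentionally leaked in this sketch
}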


@@ -19,13 +19,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                       const std::optional<torch::Tensor>& azp,
                       const std::optional<torch::Tensor>& azp_adj,
                       const std::optional<torch::Tensor>& bias,
-                      int64_t handler);
+                      const torch::Tensor& handler_tensor);

 int64_t create_onednn_mm_handler(const torch::Tensor& b,
                                  int64_t primitive_cache_size);

 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
-               const std::optional<torch::Tensor>& bias, int64_t handler);
+               const std::optional<torch::Tensor>& bias,
+               const torch::Tensor& handler_tensor);

 bool is_onednn_acl_supported();
@@ -196,7 +197,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // oneDNN GEMM
   ops.def(
       "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
-      "int handler) -> ()");
+      "Tensor handler_tensor) -> ()");
   ops.impl("onednn_mm", torch::kCPU, &onednn_mm);

   // Check if oneDNN was built with ACL backend
@@ -212,7 +213,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
   ops.def(
       "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
-      "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
+      "Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()");
   ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);

   // Compute int8 quantized tensor for given scaling factor.
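
For completeness, a hedged, self-contained sketch of the same registration pattern, using a made-up `demo` namespace and op rather than vLLM's bindings; declaring the argument as `Tensor` in the schema (instead of `int`) keeps the handle a runtime input of the traced graph:

#include <torch/library.h>
#include <torch/torch.h>

void demo_mm(torch::Tensor& c, const torch::Tensor& a,
             const torch::Tensor& handler_tensor) {
  // The handle arrives as tensor data rather than as a schema-level int
  // that a compiler could specialize on.
  auto handle = handler_tensor.item<int64_t>();
  (void)handle;  // a real kernel would reinterpret_cast and dispatch here
  c.copy_(a);    // placeholder computation
}

TORCH_LIBRARY(demo, ops) {
  ops.def("demo_mm(Tensor! c, Tensor a, Tensor handler_tensor) -> ()");
  ops.impl("demo_mm", torch::kCPU, &demo_mm);
}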