[fix] CPUDNNLGEMMHandler pointer baked into inductor artifact (#32913)
Signed-off-by: dolpm <34420038+dolpm@users.noreply.github.com>
This commit is contained in:
@@ -360,13 +360,14 @@ void onednn_scaled_mm(
|
||||
const std::optional<torch::Tensor>& azp, // [M] or [1]
|
||||
const std::optional<torch::Tensor>& azp_adj, // [M] or [1]
|
||||
const std::optional<torch::Tensor>& bias, // [N]
|
||||
int64_t handler) {
|
||||
const torch::Tensor& handler_tensor) {
|
||||
CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
|
||||
TORCH_CHECK(a.dim() == 2);
|
||||
TORCH_CHECK(a.is_contiguous());
|
||||
TORCH_CHECK(c.is_contiguous());
|
||||
W8A8MatMulPrimitiveHandler* ptr =
|
||||
reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
|
||||
reinterpret_cast<W8A8MatMulPrimitiveHandler*>(
|
||||
handler_tensor.item<int64_t>());
|
||||
const int32_t* azp_ptr = nullptr;
|
||||
if (azp.has_value()) {
|
||||
azp_ptr = azp->data_ptr<int32_t>();
|
||||
@@ -519,13 +520,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
|
||||
|
||||
void onednn_mm(torch::Tensor& c, // [M, OC], row-major
|
||||
const torch::Tensor& a, // [M, IC], row-major
|
||||
const std::optional<torch::Tensor>& bias, int64_t handler) {
|
||||
const std::optional<torch::Tensor>& bias,
|
||||
const torch::Tensor& handler_tensor) {
|
||||
CPU_KERNEL_GUARD_IN(onednn_mm)
|
||||
TORCH_CHECK(a.dim() == 2);
|
||||
TORCH_CHECK(a.stride(-1) == 1);
|
||||
TORCH_CHECK(c.stride(-1) == 1);
|
||||
MatMulPrimitiveHandler* ptr =
|
||||
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
|
||||
reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>());
|
||||
|
||||
// ACL matmuls expect contiguous source tensors
|
||||
#ifdef VLLM_USE_ACL
|
||||
|
||||
@@ -19,13 +19,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||
const std::optional<torch::Tensor>& azp,
|
||||
const std::optional<torch::Tensor>& azp_adj,
|
||||
const std::optional<torch::Tensor>& bias,
|
||||
int64_t handler);
|
||||
const torch::Tensor& handler_tensor);
|
||||
|
||||
int64_t create_onednn_mm_handler(const torch::Tensor& b,
|
||||
int64_t primitive_cache_size);
|
||||
|
||||
void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||
const std::optional<torch::Tensor>& bias, int64_t handler);
|
||||
const std::optional<torch::Tensor>& bias,
|
||||
const torch::Tensor& handler_tensor);
|
||||
|
||||
bool is_onednn_acl_supported();
|
||||
|
||||
@@ -196,7 +197,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// oneDNN GEMM
|
||||
ops.def(
|
||||
"onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
|
||||
"int handler) -> ()");
|
||||
"Tensor handler_tensor) -> ()");
|
||||
ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
|
||||
|
||||
// Check if oneDNN was built with ACL backend
|
||||
@@ -212,7 +213,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
|
||||
ops.def(
|
||||
"onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
|
||||
"Tensor? azp_adj, Tensor? bias, int handler) -> ()");
|
||||
"Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()");
|
||||
ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);
|
||||
|
||||
// Compute int8 quantized tensor for given scaling factor.
|
||||
|
||||
Reference in New Issue
Block a user