[Performance][ROCm] Add skinny gemms for unquantized linear on ROCm (#15830)

Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
commit 188b7f9b8c (parent b9b4746950)
Author: Charlie Fu
Date:   2025-04-21 22:46:22 -05:00

12 changed files with 1955 additions and 93 deletions

@@ -6,7 +6,6 @@ from typing import Any, Literal, Optional, Union
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from torch.nn.parameter import Parameter, UninitializedParameter
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
@@ -17,6 +16,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank,
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
 # yapf: disable
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            BlockQuantScaleParameter,
@@ -188,7 +188,7 @@ class UnquantizedLinearMethod(LinearMethodBase):
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return F.linear(x, layer.weight, bias)
+        return dispatch_unquantized_gemm()(x, layer.weight, bias)
 
 
 class LinearBase(torch.nn.Module):
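
To see the shape of the change: dispatch_unquantized_gemm() resolves a GEMM
backend once, and UnquantizedLinearMethod.apply simply calls whatever it
returns. The sketch below illustrates that pattern under stated assumptions:
the helper names default_unquantized_gemm / rocm_unquantized_gemm, the m <= 4
skinny threshold, and the torch.ops.vllm.skinny_gemm binding are illustrative
stand-ins, not the commit's actual kernels or cutoffs.

# Illustrative sketch of the dispatch pattern; not the real helper in
# vllm/model_executor/layers/utils.py.
from typing import Callable, Optional

import torch
import torch.nn.functional as F


def default_unquantized_gemm(
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Portable fallback: plain F.linear on every platform.
    return F.linear(x, weight, bias)


def rocm_unquantized_gemm(
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # A GEMM is "skinny" when x has very few rows relative to the weight,
    # e.g. during single-sequence decode; a dedicated kernel keeps the GPU
    # busy where a general-purpose GEMM would be underutilized.
    m = x.reshape(-1, x.shape[-1]).shape[0]
    if bias is None and m <= 4:  # threshold chosen for illustration only
        # Hypothetical custom op standing in for the ROCm skinny-GEMM kernels.
        return torch.ops.vllm.skinny_gemm(x, weight)
    return default_unquantized_gemm(x, weight, bias)


def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
    # Resolve the backend once; ROCm builds of PyTorch report a HIP version,
    # so this distinguishes ROCm from CUDA/CPU builds.
    if torch.version.hip is not None:
        return rocm_unquantized_gemm
    return default_unquantized_gemm

With this split, the per-platform choice lives in one place and the layer code
stays backend-agnostic, which is why the apply() hunk above shrinks to a single
dispatch call.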