[Bugfix] Allow fallback to AWQ from AWQMarlin at per-layer granularity (#13119)

This commit is contained in:
Michael Goin
2025-02-12 12:19:53 -05:00
committed by GitHub
parent 36a08630e8
commit 09972e716c
4 changed files with 61 additions and 32 deletions

View File

@@ -6,6 +6,7 @@ import numpy
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.linear import LinearBase
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
@@ -135,6 +136,20 @@ def check_marlin_supports_shape(output_size_per_partition: int,
return True, None
def check_marlin_supports_layer(layer: LinearBase, group_size: int) \
        -> bool:
    """Return True if the Marlin kernel supports this linear layer's shapes.

    Uses the per-partition (sharded) sizes when the layer exposes them;
    a missing/None attribute falls back to the layer's full sizes.
    """
    out_size = (getattr(layer, "output_size_per_partition", None)
                or layer.output_size)
    in_size = (getattr(layer, "input_size_per_partition", None)
               or layer.input_size)
    # check_marlin_supports_shape returns (supported, reason); only the
    # boolean verdict matters here.
    supported, _ = check_marlin_supports_shape(
        output_size_per_partition=out_size,
        input_size_per_partition=in_size,
        input_size=layer.input_size,
        group_size=group_size)
    return supported
def marlin_make_workspace(output_size_per_partition: int,
device: torch.device) -> torch.Tensor:
max_workspace_size = (output_size_per_partition //