[Bugfix] Allow fallback to AWQ from AWQMarlin at per-layer granularity (#13119)
This commit is contained in:
@@ -6,6 +6,7 @@ import numpy
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.linear import LinearBase
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
|
||||
@@ -135,6 +136,20 @@ def check_marlin_supports_shape(output_size_per_partition: int,
|
||||
return True, None
|
||||
|
||||
|
||||
def check_marlin_supports_layer(layer: LinearBase, group_size: int) \
        -> bool:
    """Return True iff the Marlin kernels can handle this linear layer.

    Uses the layer's per-partition feature sizes when present (set under
    tensor parallelism); otherwise falls back to the full layer sizes.
    Delegates the actual shape check to ``check_marlin_supports_shape``
    and discards its reason string.
    """
    # NOTE: `or` (not a plain None-check) mirrors the original behavior —
    # a falsy per-partition size (e.g. 0) also falls back to the full size.
    out_features = getattr(layer, "output_size_per_partition", None)
    if not out_features:
        out_features = layer.output_size
    in_features = getattr(layer, "input_size_per_partition", None)
    if not in_features:
        in_features = layer.input_size

    supported, _reason = check_marlin_supports_shape(
        output_size_per_partition=out_features,
        input_size_per_partition=in_features,
        input_size=layer.input_size,
        group_size=group_size)
    return supported
|
||||
|
||||
|
||||
def marlin_make_workspace(output_size_per_partition: int,
|
||||
device: torch.device) -> torch.Tensor:
|
||||
max_workspace_size = (output_size_per_partition //
|
||||
|
||||
Reference in New Issue
Block a user