[Core] Support tensor parallelism for GGUF quantization (#7520)

Isotr0py
2024-08-20 05:30:14 +08:00
committed by GitHub
parent 47b65a5508
commit 7601cb044d
3 changed files with 39 additions and 15 deletions


@@ -5,7 +5,6 @@ import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
 
 from vllm import _custom_ops as ops
-from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
@@ -39,9 +38,6 @@ class GGUFConfig(QuantizationConfig):
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "GGUFConfig":
-        if get_tensor_model_parallel_world_size() > 1:
-            raise ValueError(
-                "GGUF quantization hasn't supported tensor parallelism yet.")
         return cls()
 
     def get_quant_method(self, layer: torch.nn.Module,
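
With the world-size guard removed, GGUF-quantized checkpoints can now be loaded with tensor_parallel_size > 1. A minimal usage sketch with vLLM's offline LLM API follows; the GGUF file path is a hypothetical placeholder, not part of this commit:

from vllm import LLM, SamplingParams

# Hypothetical local GGUF checkpoint; substitute a real file path.
llm = LLM(
    model="/models/llama-2-7b-chat.Q4_K_M.gguf",
    tensor_parallel_size=2,  # previously rejected in GGUFConfig.from_config
)

params = SamplingParams(temperature=0.0, max_tokens=32)
outputs = llm.generate(["What is GGUF quantization?"], params)
print(outputs[0].outputs[0].text)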