[Core] Support tensor parallelism for GGUF quantization (#7520)

Isotr0py
2024-08-20 05:30:14 +08:00
committed by GitHub
parent 47b65a5508
commit 7601cb044d
3 changed files with 39 additions and 15 deletions


@@ -5,7 +5,6 @@ import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
 
 from vllm import _custom_ops as ops
-from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
@@ -39,9 +38,6 @@ class GGUFConfig(QuantizationConfig):
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "GGUFConfig":
-        if get_tensor_model_parallel_world_size() > 1:
-            raise ValueError(
-                "GGUF quantization hasn't supported tensor parallelism yet.")
         return cls()
 
     def get_quant_method(self, layer: torch.nn.Module,
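
With the world-size guard removed, GGUF-quantized checkpoints can now be loaded with tensor_parallel_size > 1. A minimal usage sketch with vLLM's offline LLM API follows; the GGUF file path is a hypothetical placeholder, not part of this commit:

from vllm import LLM, SamplingParams

# Hypothetical local GGUF checkpoint; substitute a real file path.
llm = LLM(
    model="/models/llama-2-7b-chat.Q4_K_M.gguf",
    tensor_parallel_size=2,  # previously rejected in GGUFConfig.from_config
)

params = SamplingParams(temperature=0.0, max_tokens=32)
outputs = llm.generate(["What is GGUF quantization?"], params)
print(outputs[0].outputs[0].text)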