Refactor the tensor parallelism, quantization, and weight-loading code. Summary of the new features enabled by this PR: - **All models** can be quantized with AWQ and SqueezeLLM, and [soon GPTQ](https://github.com/vllm-project/vllm/pull/1580). - Model loading code became much simpler. - Support model parallelism for all MQA/GQA models when the number of key/value heads is smaller than the tensor parallel size.
23 lines · 686 B · Python
from typing import Type
|
|
|
|
from vllm.model_executor.layers.quantization.awq import AWQConfig
|
|
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
|
|
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
|
|
|
# Registry mapping a quantization method name (e.g. the value passed on the
# command line or found in a model's config) to its QuantizationConfig class.
_QUANTIZATION_CONFIG_REGISTRY = dict(
    awq=AWQConfig,
    squeezellm=SqueezeLLMConfig,
)
|
|
|
|
|
|
def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    """Return the QuantizationConfig class registered for *quantization*.

    Args:
        quantization: The quantization method name, e.g. ``"awq"`` or
            ``"squeezellm"``.

    Returns:
        The ``QuantizationConfig`` subclass for the requested method.

    Raises:
        ValueError: If *quantization* is not a registered method. The message
            lists the supported methods so the caller can correct the input.
    """
    # EAFP: a single dict lookup instead of a membership test plus a lookup.
    try:
        return _QUANTIZATION_CONFIG_REGISTRY[quantization]
    except KeyError:
        # `from None` suppresses the uninteresting KeyError context.
        raise ValueError(
            f"Invalid quantization method: {quantization}. "
            f"Supported methods: {list(_QUANTIZATION_CONFIG_REGISTRY)}"
        ) from None
|
|
|
|
|
|
# Public API of this package: the abstract config base class and the
# registry-lookup helper.
__all__ = ["QuantizationConfig", "get_quantization_config"]
|