Expand MLA to support most types of quantization (#13181)
@@ -991,37 +991,7 @@ class ModelConfig:
 
     @property
     def use_mla(self) -> bool:
-        if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE:
-            return False
-
-        if self.quantization is not None and self.quantization not in [
-                "fp8", "compressed-tensors"]:
-            logger.warning(
-                "MLA is not supported with %s quantization. "
-                "Disabling MLA.", self.quantization)
-            return False
-
-        # If using a "compressed-tensors" checkpoint, check that all groups
-        # have fp8 for both weights and activations.
-        if self.quantization == "compressed-tensors":
-            quant_config = self._parse_quant_hf_config()
-            for group_name, cfg in quant_config.get("config_groups", {
-                    "": {}
-            }).items():
-                act_cfg = cfg.get("input_activations", {})
-                act_type = None if act_cfg is None else act_cfg.get("type", "")
-                w_cfg = cfg.get("weights", {})
-                w_type = None if w_cfg is None else w_cfg.get("type", "")
-                if act_type != "fp8" or w_type != "fp8":
-                    logger.warning(
-                        "compressed-tensors MLA support requires fp8 "
-                        "activations and weights in group '%s', but got "
-                        "activations type '%s' and weights type '%s'.\n "
-                        "Full config: %s", group_name, act_type, w_type,
-                        quant_config)
-                    return False
-
-        return True
+        return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE
 
     @property
     def supported_runner_types(self) -> Set[RunnerType]:
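For readers comparing the old and new behavior, here is a minimal standalone sketch (not vLLM code) that replicates the gate implemented by the removed lines. The function name old_style_mla_gate and the sample_config dict are hypothetical, shaped the way the removed config_groups loop expects its input:

from typing import Optional

def old_style_mla_gate(quantization: Optional[str], quant_config: dict) -> bool:
    """Replicates the pre-#13181 quantization checks for enabling MLA."""
    # Old gate 1: only the "fp8" and "compressed-tensors" methods were allowed.
    if quantization is not None and quantization not in ("fp8", "compressed-tensors"):
        return False
    # Old gate 2: every compressed-tensors config group needed fp8 for
    # both weights and input activations.
    if quantization == "compressed-tensors":
        for cfg in quant_config.get("config_groups", {"": {}}).values():
            act_cfg = cfg.get("input_activations") or {}
            w_cfg = cfg.get("weights") or {}
            if act_cfg.get("type", "") != "fp8" or w_cfg.get("type", "") != "fp8":
                return False
    return True

# Hypothetical config group with int8 weights, which the old gate rejects:
sample_config = {
    "config_groups": {
        "group_0": {
            "input_activations": {"type": "fp8"},
            "weights": {"type": "int8"},
        }
    }
}

print(old_style_mla_gate("compressed-tensors", sample_config))  # False
print(old_style_mla_gate("awq", {}))                            # False

After this commit, neither case disables MLA: use_mla depends only on is_deepseek_mla and the VLLM_MLA_DISABLE environment variable, so the quantization-specific restrictions sketched above are gone.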