[AMD] [Quantization] Add override flag for attention dtype instead of using kv_cache_dtype trigger (#17331)

Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Author: rasmith
Date: 2025-06-11 14:53:28 -05:00
Committed by: GitHub
parent 29fa5cac1c
commit c7ea0b56cd
3 changed files with 21 additions and 1 deletion

@@ -417,6 +417,8 @@ class ModelConfig:
     available.\n
     - "vllm" will use the vLLM model implementation.\n
     - "transformers" will use the Transformers model implementation."""
+    override_attention_dtype: Optional[str] = None
+    """Override dtype for attention"""
 
     def compute_hash(self) -> str:
         """
@@ -517,6 +519,12 @@ class ModelConfig:
         from vllm.platforms import current_platform
 
+        if (self.override_attention_dtype is not None
+                and not current_platform.is_rocm()):
+            warnings.warn(
+                "override-attention-dtype is set but not using ROCm platform",
+                stacklevel=2)
+
         if (self.enable_sleep_mode
                 and not current_platform.is_sleep_mode_available()):
             raise ValueError(
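The validation added above only warns, rather than raising, when the override is set on a non-ROCm platform. A self-contained sketch of that check, with is_rocm passed in explicitly instead of being queried from vllm.platforms.current_platform as the real code does:

import warnings

def check_override_attention_dtype(override_attention_dtype, is_rocm):
    # Mirrors the check added in this commit: the override is only meaningful
    # on ROCm, so setting it elsewhere emits a warning instead of an error.
    if override_attention_dtype is not None and not is_rocm:
        warnings.warn(
            "override-attention-dtype is set but not using ROCm platform",
            stacklevel=2)

check_override_attention_dtype("fp8", is_rocm=False)  # warns
check_override_attention_dtype("fp8", is_rocm=True)   # silent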