[AMD] [Quantization] Add override flag for attention dtype instead of using kv_cache_dtype trigger (#17331)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
@@ -417,6 +417,8 @@ class ModelConfig:
     available.\n
     - "vllm" will use the vLLM model implementation.\n
     - "transformers" will use the Transformers model implementation."""
+    override_attention_dtype: Optional[str] = None
+    """Override dtype for attention"""
 
     def compute_hash(self) -> str:
         """
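The new field sits alongside the other ModelConfig options. A minimal usage sketch, assuming the field is plumbed through vLLM's engine arguments the way other ModelConfig fields are (the model name and the "fp8" value are illustrative placeholders, not confirmed by this diff):

    # Hypothetical usage sketch: assumes `override_attention_dtype` is exposed
    # through EngineArgs/LLM like other ModelConfig fields.
    from vllm import LLM

    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
        override_attention_dtype="fp8",  # request the attention dtype directly
    )                                    # instead of via the kv_cache_dtype trigger

Per the commit title, the attention dtype was previously inferred from kv_cache_dtype; the dedicated flag decouples the two settings.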
@@ -517,6 +519,12 @@ class ModelConfig:
 
         from vllm.platforms import current_platform
 
+        if (self.override_attention_dtype is not None
+                and not current_platform.is_rocm()):
+            warnings.warn(
+                "override-attention-dtype is set but not using ROCm platform",
+                stacklevel=2)
+
         if (self.enable_sleep_mode
                 and not current_platform.is_sleep_mode_available()):
             raise ValueError(
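Note that the new platform check is a soft warning rather than a hard error (unlike the sleep-mode check below it), so a non-ROCm configuration still loads; the warning only signals that the override targets ROCm. A self-contained sketch of the same pattern (the function name and values here are illustrative):

    # Standalone sketch of the check added above: warn, don't raise, when a
    # ROCm-oriented option is set on a non-ROCm platform.
    import warnings

    def warn_if_not_rocm(override_attention_dtype, is_rocm):
        if override_attention_dtype is not None and not is_rocm:
            warnings.warn(
                "override-attention-dtype is set but not using ROCm platform",
                stacklevel=2)  # attribute the warning to the caller

    warn_if_not_rocm("fp8", is_rocm=False)  # emits a UserWarning

stacklevel=2 makes Python report the warning at the call site that set the config, rather than inside the check itself.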