[AMD] [Quantization] Add override flag for attention dtype instead of using kv_cache_dtype trigger (#17331)

Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Author: rasmith
Date: 2025-06-11 14:53:28 -05:00
Committed by: GitHub
parent 29fa5cac1c
commit c7ea0b56cd
3 changed files with 21 additions and 1 deletion

@@ -417,6 +417,8 @@ class ModelConfig:
     available.\n
     - "vllm" will use the vLLM model implementation.\n
     - "transformers" will use the Transformers model implementation."""
+    override_attention_dtype: Optional[str] = None
+    """Override dtype for attention"""
 
     def compute_hash(self) -> str:
         """
@@ -517,6 +519,12 @@ class ModelConfig:
         from vllm.platforms import current_platform
 
+        if (self.override_attention_dtype is not None
+                and not current_platform.is_rocm()):
+            warnings.warn(
+                "override-attention-dtype is set but not using ROCm platform",
+                stacklevel=2)
+
         if (self.enable_sleep_mode
                 and not current_platform.is_sleep_mode_available()):
             raise ValueError(
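The validation added above only warns, rather than raising, when the override is set on a non-ROCm platform. A self-contained sketch of that check, with is_rocm passed in explicitly instead of being queried from vllm.platforms.current_platform as the real code does:

import warnings

def check_override_attention_dtype(override_attention_dtype, is_rocm):
    # Mirrors the check added in this commit: the override is only meaningful
    # on ROCm, so setting it elsewhere emits a warning instead of an error.
    if override_attention_dtype is not None and not is_rocm:
        warnings.warn(
            "override-attention-dtype is set but not using ROCm platform",
            stacklevel=2)

check_override_attention_dtype("fp8", is_rocm=False)  # warns
check_override_attention_dtype("fp8", is_rocm=True)   # silent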