[Bugfix][CI/Build][Hardware][AMD] Fix AMD tests, add HF cache, update CK FA, add partially supported model notes (#6543)

2024-07-20 11:39:07 -05:00
parent 683e3cb9c4
commit 06d6c5fe9f
12 changed files with 116 additions and 39 deletions
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -87,13 +87,24 @@ _ROCM_UNSUPPORTED_MODELS: List[str] = []

 # Models partially supported by ROCm.
 # Architecture -> Reason.
+_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
+                    "Triton flash attention. For half-precision SWA support, "
+                    "please use CK flash attention by setting "
+                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
 _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
    "Qwen2ForCausalLM":
-    "Sliding window attention is not yet supported in ROCm's flash attention",
+    _ROCM_SWA_REASON,
    "MistralForCausalLM":
-    "Sliding window attention is not yet supported in ROCm's flash attention",
+    _ROCM_SWA_REASON,
    "MixtralForCausalLM":
-    "Sliding window attention is not yet supported in ROCm's flash attention",
+    _ROCM_SWA_REASON,
+    "PaliGemmaForConditionalGeneration":
+    ("ROCm flash attention does not yet "
+     "fully support 32-bit precision on PaliGemma"),
+    "Phi3VForCausalLM":
+    ("ROCm Triton flash attention may run into compilation errors due to "
+     "excessive use of shared memory. If this happens, disable Triton FA "
+     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
 }