[BugFix][V1][ROCm] Triton MLA uses V0 backend on V1 engine (#19067)

Signed-off-by: Tianyuan Wu <Tianyuan.Wu@amd.com>
2025-07-01 16:12:19 +08:00
parent b1c1fe35a5
commit 96453cfa83
5 changed files with 78 additions and 10 deletions
--- a/tests/kernels/attention/test_rocm_attention_selector.py
+++ b/tests/kernels/attention/test_rocm_attention_selector.py
@@ -35,7 +35,8 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
        m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA")
        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
                                   False, True)
-        assert backend.get_name() == "TRITON_MLA"
+        assert (backend.get_name() == "TRITON_MLA"
+                or backend.get_name() == "TRITON_MLA_VLLM_V1")

        # If attention backend is None
        # If use_mla is true
@@ -43,7 +44,8 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
        m.setenv(STR_BACKEND_ENV_VAR, None)
        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
                                   False, True)
-        assert backend.get_name() == "TRITON_MLA"
+        assert (backend.get_name() == "TRITON_MLA"
+                or backend.get_name() == "TRITON_MLA_VLLM_V1")

        # change the attention backend to AITER MLA
        m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA")