diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 8e45c0e41..509bacfbc 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -913,7 +913,14 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         activation_key: QuantKey | None,
     ) -> bool:
         p = current_platform
-        device_supports_fp8 = (p.is_rocm() and p.rocm.on_gfx9()) or (
+        if p.is_rocm():
+            from vllm.platforms.rocm import on_gfx9
+
+            is_rocm_on_gfx9 = on_gfx9()
+        else:
+            is_rocm_on_gfx9 = False
+
+        device_supports_fp8 = is_rocm_on_gfx9 or (
             p.is_cuda() and p.has_device_capability((8, 9))
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 7e7d59fb9..669a6e74b 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1922,7 +1922,14 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         activation_key: QuantKey | None,
     ) -> bool:
         p = current_platform
-        device_supports_fp8 = (p.is_rocm() and p.rocm.on_gfx9()) or (
+        if p.is_rocm():
+            from vllm.platforms.rocm import on_gfx9
+
+            is_rocm_on_gfx9 = on_gfx9()
+        else:
+            is_rocm_on_gfx9 = False
+
+        device_supports_fp8 = is_rocm_on_gfx9 or (
             p.is_cuda() and p.has_device_capability((8, 9))
         )
 