feat: Add support for GPTQ-quantized MoE on ROCm for vllm serve (#21733)
@@ -761,8 +761,8 @@ def get_moe_wna16_block_config(config: dict[str,
 
 def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int,
                               num_experts: int, bit: int):
-    return bit == 4 and group_size in [32, 64, 128] and \
-        num_valid_tokens / num_experts <= 6
+    return current_platform.is_cuda() and bit == 4 and \
+        group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6
 
 
 def get_default_config(
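For context, here is a minimal sketch of the helper as it reads after this change, assuming `current_platform` is imported from `vllm.platforms` as elsewhere in vLLM. The added platform check keeps the CUDA-only moe_wna16 kernel off ROCm, so GPTQ-quantized MoE models take the generic fallback path (presumably the Triton-based MoE kernels in this file) there instead:

from vllm.platforms import current_platform


def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int,
                              num_experts: int, bit: int):
    # Use the fused moe_wna16 CUDA kernel only on CUDA, for 4-bit weights
    # with a supported group size, and when the average number of valid
    # tokens per expert is small. After this change the predicate is False
    # on ROCm, which routes those models to the fallback kernels.
    return current_platform.is_cuda() and bit == 4 and \
        group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6


# Hypothetical example: 32 valid tokens over 8 experts gives 32 / 8 = 4 <= 6,
# so this is True on CUDA and, after this change, False on ROCm.
print(should_moe_wna16_use_cuda(num_valid_tokens=32, group_size=128,
                                num_experts=8, bit=4))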