feat: Add support for GPTQ-quantized MoE on ROCm for vllm serve (#21733)

This commit is contained in:
JartX
2025-08-02 03:12:19 +02:00
committed by GitHub
parent eefbf4a68b
commit 3654847db5
2 changed files with 21 additions and 5 deletions

View File

@@ -761,8 +761,8 @@ def get_moe_wna16_block_config(config: dict[str,
def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int,
                              num_experts: int, bit: int):
    """Return True when the CUDA moe_wna16 kernel should be used.

    The CUDA kernel is only selected when all of the following hold:
    platform is CUDA (the kernel is not built for ROCm — see #21733),
    weights are 4-bit, the quantization group size is one of the
    supported values (32/64/128), and the per-expert token load is small
    (<= 6 valid tokens per expert), where the specialized kernel wins
    over the generic Triton path.

    Args:
        num_valid_tokens: number of tokens routed in this batch.
        group_size: quantization group size of the weights.
        num_experts: number of experts in the MoE layer.
        bit: weight bit-width (only 4-bit is supported by the kernel).
    """
    # NOTE: the stale pre-patch `return` (which lacked the platform check
    # and therefore re-enabled the CUDA kernel on ROCm) has been removed;
    # only the platform-gated condition remains.
    return current_platform.is_cuda() and bit == 4 and \
        group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6
def get_default_config(