diff --git a/tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-Fp8-ModelOpt-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-Fp8-ModelOpt-fi-trtllm.yaml
new file mode 100644
index 000000000..570569def
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-Fp8-ModelOpt-fi-trtllm.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
+accuracy_threshold: 0.29
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "latency"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-NvFp4-ModelOpt-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-NvFp4-ModelOpt-fi-cutlass.yaml
new file mode 100644
index 000000000..d802ac3f3
--- /dev/null
+++ b/tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-NvFp4-ModelOpt-fi-cutlass.yaml
@@ -0,0 +1,8 @@
+model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4"
+accuracy_threshold: 0.29
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
index a7c55a6ef..8249d2914 100644
--- a/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
+++ b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
@@ -13,3 +13,5 @@ Llama-4-Scout-BF16-fi-cutlass.yaml
 Llama-4-Scout-BF16-triton.yaml
 Mixtral-8x7B-BF16-fi-cutlass.yaml
 Mixtral-8x7B-BF16-triton.yaml
+Nemotron-Nano-30B-Fp8-ModelOpt-fi-trtllm.yaml
+Nemotron-Nano-30B-NvFp4-ModelOpt-fi-cutlass.yaml
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index ea84406ba..d61303923 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -122,6 +122,8 @@ def is_supported_config_trtllm(
         return False, _make_reason("routing method")
     elif activation_format != mk.FusedMoEActivationFormat.Standard:
         return False, _make_reason("activation format")
+    elif moe_config.hidden_dim % 512 != 0:
+        return False, _make_reason("hidden_dim must be divisible by 512")
     return True, None
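
For context, a minimal standalone sketch of the new guard added to `is_supported_config_trtllm`: the TRT-LLM path now rejects MoE configs whose `hidden_dim` is not a multiple of 512, returning a human-readable reason via `_make_reason`. The `MoEConfig` dataclass and the `_make_reason` body below are simplified stand-ins (assumptions), not copies of vLLM's real types; only the divisibility check itself comes from the diff.

```python
# Standalone sketch of the support-check pattern used in
# vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py.
# MoEConfig and _make_reason are simplified stand-ins, assumed for
# illustration; only the % 512 guard mirrors the diff.
from dataclasses import dataclass


@dataclass
class MoEConfig:
    hidden_dim: int


def _make_reason(detail: str) -> str:
    # Wraps the failing condition in a reason string, as the diff's
    # helper appears to do.
    return f"TRT-LLM FP4 MoE unsupported: {detail}"


def is_supported_config_trtllm(moe_config: MoEConfig) -> tuple[bool, str | None]:
    # New guard from the diff: the TRT-LLM kernel requires hidden_dim
    # to be a multiple of 512.
    if moe_config.hidden_dim % 512 != 0:
        return False, _make_reason("hidden_dim must be divisible by 512")
    return True, None


if __name__ == "__main__":
    print(is_supported_config_trtllm(MoEConfig(hidden_dim=5120)))  # (True, None)
    print(is_supported_config_trtllm(MoEConfig(hidden_dim=2688)))  # rejected with reason
```

Returning `(False, reason)` rather than raising presumably lets the caller fall back to another MoE backend with a logged explanation instead of failing inside the kernel.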