[Bugfix] Fix NVFP4 TRTLLM MoE non-gated support; add gsm8k for Nemotron-3-Nano FP8+NVFP4 (#34725)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -0,0 +1,8 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
accuracy_threshold: 0.29
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env:
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "latency"
||||
@@ -0,0 +1,8 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4"
accuracy_threshold: 0.29
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
||||
@@ -13,3 +13,5 @@ Llama-4-Scout-BF16-fi-cutlass.yaml
Llama-4-Scout-BF16-triton.yaml
Mixtral-8x7B-BF16-fi-cutlass.yaml
Mixtral-8x7B-BF16-triton.yaml
Nemotron-Nano-30B-Fp8-ModelOpt-fi-trtllm.yaml
Nemotron-Nano-30B-NvFp4-ModelOpt-fi-cutlass.yaml
||||
@@ -122,6 +122,8 @@ def is_supported_config_trtllm(
        return False, _make_reason("routing method")
    elif activation_format != mk.FusedMoEActivationFormat.Standard:
        return False, _make_reason("activation format")
    elif moe_config.hidden_dim % 512 != 0:
        return False, _make_reason("hidden_dim must be divisible by 512")

    return True, None
||||
Reference in New Issue
Block a user