Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
This commit is contained in:
@@ -45,6 +45,22 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
|
||||
- label: LM Eval Qwen3.5 Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/qwen3_5.py
|
||||
- vllm/model_executor/models/qwen3_5_mtp.py
|
||||
- vllm/transformers_utils/configs/qwen3_5.py
|
||||
- vllm/transformers_utils/configs/qwen3_5_moe.py
|
||||
- vllm/model_executor/models/qwen3_next.py
|
||||
- vllm/model_executor/models/qwen3_next_mtp.py
|
||||
- vllm/model_executor/layers/fla/ops/
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
|
||||
|
||||
- label: LM Eval Large Models (H200)
|
||||
timeout_in_minutes: 60
|
||||
device: h200
|
||||
|
||||
8
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml
Normal file
8
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
model_name: "Qwen/Qwen3.5-35B-A3B"
|
||||
accuracy_threshold: 0.86
|
||||
num_questions: 1319
|
||||
num_fewshot: 5
|
||||
server_args: >-
|
||||
--max-model-len 4096
|
||||
--data-parallel-size 2
|
||||
--enable-expert-parallel
|
||||
9
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml
Normal file
9
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
model_name: "Qwen/Qwen3.5-35B-A3B-FP8"
|
||||
accuracy_threshold: 0.86
|
||||
num_questions: 1319
|
||||
num_fewshot: 5
|
||||
server_args: >-
|
||||
--max-model-len 4096
|
||||
--data-parallel-size 2
|
||||
--enable-expert-parallel
|
||||
--kv-cache-dtype fp8
|
||||
1
tests/evals/gsm8k/configs/models-qwen35-blackwell.txt
Normal file
1
tests/evals/gsm8k/configs/models-qwen35-blackwell.txt
Normal file
@@ -0,0 +1 @@
|
||||
Qwen3.5-35B-A3B-DEP2.yaml
|
||||
@@ -269,9 +269,16 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
|
||||
weight_key: QuantKey | None,
|
||||
activation_key: QuantKey | None,
|
||||
) -> bool:
|
||||
"""Monolithic kernels need to express router support."""
|
||||
"""Monolithic kernels need to express router support.
|
||||
Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
|
||||
internal routing for these methods produces output uncorrelated
|
||||
with the modular kernel's output and with the Triton kernel's output
|
||||
for Qwen3.5-35B-A3B-FP8.
|
||||
See: https://github.com/vllm-project/vllm/issues/37591
|
||||
"""
|
||||
# NOTE(dbari): TopK routing could also be enabled, but models need to be validated first
|
||||
# NOTE(dbari): Default is not implemented and should not be enabled until it is
|
||||
|
||||
if (weight_key, activation_key) in [
|
||||
(kFp8Static128BlockSym, kFp8Dynamic128Sym),
|
||||
(kMxfp8Static, kMxfp8Dynamic),
|
||||
@@ -279,16 +286,12 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
|
||||
# NOTE(rob): potentially allow others here. This is a conservative list.
|
||||
return routing_method in [
|
||||
RoutingMethodType.DeepSeekV3,
|
||||
RoutingMethodType.Renormalize,
|
||||
RoutingMethodType.RenormalizeNaive,
|
||||
]
|
||||
elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
|
||||
# NOTE(dbari): as above, potentially allow others here.
|
||||
return routing_method in [
|
||||
RoutingMethodType.DeepSeekV3,
|
||||
RoutingMethodType.Llama4,
|
||||
RoutingMethodType.Renormalize,
|
||||
RoutingMethodType.RenormalizeNaive,
|
||||
]
|
||||
else:
|
||||
raise ValueError("Unsupported quantization scheme.")
|
||||
|
||||
Reference in New Issue
Block a user