@@ -1,5 +1,6 @@
|
||||
model_name: "Qwen/Qwen3.5-35B-A3B"
|
||||
accuracy_threshold: 0.86
|
||||
accuracy_threshold: 0.84
|
||||
tolerance: 0.03
|
||||
num_questions: 1319
|
||||
num_fewshot: 5
|
||||
server_args: >-
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
model_name: "Qwen/Qwen3.5-35B-A3B-FP8"
|
||||
accuracy_threshold: 0.86
|
||||
accuracy_threshold: 0.79
|
||||
tolerance: 0.03
|
||||
num_questions: 1319
|
||||
num_fewshot: 5
|
||||
server_args: >-
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
model_name: "nvidia/Qwen3.5-397B-A17B-NVFP4"
|
||||
accuracy_threshold: 0.88
|
||||
tolerance: 0.03
|
||||
num_questions: 1319
|
||||
num_fewshot: 5
|
||||
server_args: >-
|
||||
--max-model-len 4096
|
||||
--data-parallel-size 2
|
||||
--enable-expert-parallel
|
||||
@@ -1,2 +1,3 @@
|
||||
Qwen3.5-35B-A3B-DEP2.yaml
|
||||
Qwen3.5-35B-A3B-FP8-DEP2.yaml
|
||||
Qwen3.5-397B-A17B-NVFP4-DEP2.yaml
|
||||
@@ -19,8 +19,6 @@ from vllm.platforms import current_platform
|
||||
|
||||
from .gsm8k_eval import evaluate_gsm8k
|
||||
|
||||
TOL = 0.08 # Absolute tolerance for accuracy comparison
|
||||
|
||||
|
||||
def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
|
||||
"""Run GSM8K evaluation using our isolated script."""
|
||||
@@ -99,20 +97,20 @@ def test_gsm8k_correctness(config_filename):
|
||||
|
||||
measured_metric = results["accuracy"]
|
||||
expected_metric = eval_config["accuracy_threshold"]
|
||||
tol = eval_config.get("tolerance", 0.08)
|
||||
|
||||
print(f"GSM8K Results for {eval_config['model_name']}:")
|
||||
print(f" Measured metric: {measured_metric:.4f}")
|
||||
print(f" Expected metric: {expected_metric:.4f}")
|
||||
print(f" Tolerance: {TOL:.4f}")
|
||||
print(f" Tolerance: {tol:.4f}")
|
||||
print(f" Questions: {results['num_questions']}")
|
||||
print(f" Invalid rate: {results['invalid_rate']:.3f}")
|
||||
print(f" Latency: {results['latency']:.1f}s")
|
||||
print(f" QPS: {results['questions_per_second']:.1f}")
|
||||
|
||||
# Verify metric is within tolerance
|
||||
assert measured_metric >= expected_metric - TOL, (
|
||||
assert measured_metric >= expected_metric - tol, (
|
||||
f"GSM8K metric too low: {measured_metric:.4f} < "
|
||||
f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
|
||||
f"{expected_metric:.4f} - {tol:.4f} = {expected_metric - tol:.4f}"
|
||||
)
|
||||
|
||||
print(f"✅ GSM8K test passed for {eval_config['model_name']}")
|
||||
|
||||
Reference in New Issue
Block a user