Pipeline test: use max_num_tokens=8192 to match the vLLM config

2026-05-17 23:04:44 +00:00
parent c45364b3a8
commit bedcfc4dab
1 changed file with 2 additions and 1 deletion
--- a/tests/test_pipeline_real_weights.py
+++ b/tests/test_pipeline_real_weights.py
@@ -14,6 +14,7 @@ NUM_TOKENS = 8
 TOP_K = 6
 SWIGLU_LIMIT = 10.0
 DEVICE = "cuda"
+MAX_NUM_TOKENS = 8192  # match vLLM config


 def load_layer_tensors(model_dir, layer_idx):
@@ -126,7 +127,7 @@ def main():
    
    runner = CuTeDSLMoERunner(
        num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE,
-        intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=NUM_TOKENS,
+        intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=MAX_NUM_TOKENS,
        top_k=TOP_K, device=DEVICE,
    )
    runner.l1_fp4 = l1_fp4; runner.l1_sf = l1_sf; runner.l1_gs = l1_gs