From bedcfc4dab985696760970be2358b2041e73dacf Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 17 May 2026 23:04:44 +0000 Subject: [PATCH] Pipeline test: use max_num_tokens=8192 matching vLLM --- tests/test_pipeline_real_weights.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_pipeline_real_weights.py b/tests/test_pipeline_real_weights.py index 30687b8d..cf2ce97c 100644 --- a/tests/test_pipeline_real_weights.py +++ b/tests/test_pipeline_real_weights.py @@ -14,6 +14,7 @@ NUM_TOKENS = 8 TOP_K = 6 SWIGLU_LIMIT = 10.0 DEVICE = "cuda" +MAX_NUM_TOKENS = 8192 # match vLLM config def load_layer_tensors(model_dir, layer_idx): @@ -126,7 +127,7 @@ def main(): runner = CuTeDSLMoERunner( num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE, - intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=NUM_TOKENS, + intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=MAX_NUM_TOKENS, top_k=TOP_K, device=DEVICE, ) runner.l1_fp4 = l1_fp4; runner.l1_sf = l1_sf; runner.l1_gs = l1_gs