From bedcfc4dab985696760970be2358b2041e73dacf Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sun, 17 May 2026 23:04:44 +0000
Subject: [PATCH] Pipeline test: use max_num_tokens=8192 matching vLLM

---
 tests/test_pipeline_real_weights.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_pipeline_real_weights.py b/tests/test_pipeline_real_weights.py
index 30687b8d..cf2ce97c 100644
--- a/tests/test_pipeline_real_weights.py
+++ b/tests/test_pipeline_real_weights.py
@@ -14,6 +14,7 @@ NUM_TOKENS = 8
 TOP_K = 6
 SWIGLU_LIMIT = 10.0
 DEVICE = "cuda"
+MAX_NUM_TOKENS = 8192  # runner's max_num_tokens, set to match the vLLM config (separate from NUM_TOKENS)
 
 
 def load_layer_tensors(model_dir, layer_idx):
@@ -126,7 +127,7 @@ def main():
     
     runner = CuTeDSLMoERunner(
         num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE,
-        intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=NUM_TOKENS,
+        intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=MAX_NUM_TOKENS,
         top_k=TOP_K, device=DEVICE,
     )
     runner.l1_fp4 = l1_fp4; runner.l1_sf = l1_sf; runner.l1_gs = l1_gs