Pipeline test: use max_num_tokens=8192 matching vLLM

This commit is contained in:
2026-05-17 23:04:44 +00:00
parent c45364b3a8
commit bedcfc4dab

View File

@@ -14,6 +14,7 @@ NUM_TOKENS = 8
TOP_K = 6
SWIGLU_LIMIT = 10.0
DEVICE = "cuda"
+MAX_NUM_TOKENS = 8192 # match vLLM config
def load_layer_tensors(model_dir, layer_idx):
@@ -126,7 +127,7 @@ def main():
runner = CuTeDSLMoERunner(
num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE,
-intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=NUM_TOKENS,
+intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=MAX_NUM_TOKENS,
top_k=TOP_K, device=DEVICE,
)
runner.l1_fp4 = l1_fp4; runner.l1_sf = l1_sf; runner.l1_gs = l1_gs