Pipeline test: use max_num_tokens=8192 matching vLLM
This commit is contained in:
@@ -14,6 +14,7 @@ NUM_TOKENS = 8
|
||||
TOP_K = 6
|
||||
SWIGLU_LIMIT = 10.0
|
||||
DEVICE = "cuda"
|
||||
+ MAX_NUM_TOKENS = 8192 # match vLLM config
|
||||
|
||||
|
||||
def load_layer_tensors(model_dir, layer_idx):
|
||||
@@ -126,7 +127,7 @@ def main():
|
||||
|
||||
runner = CuTeDSLMoERunner(
|
||||
num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE,
|
||||
- intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=NUM_TOKENS,
|
||||
+ intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=MAX_NUM_TOKENS,
|
||||
top_k=TOP_K, device=DEVICE,
|
||||
)
|
||||
runner.l1_fp4 = l1_fp4; runner.l1_sf = l1_sf; runner.l1_gs = l1_gs
|
||||
|
||||
Reference in New Issue
Block a user