From 1330e2b2cf3174a269e9bacd02177df3297969ff Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 17 May 2026 08:30:41 +0000 Subject: [PATCH] cleanup: remove debug prints, ready for testing Current state: - Token indices on CPU (avoids CuTeDSL GPU memory corruption) - Scale assembly uses per-expert swizzle + scatter (matches reference) - compute_activation_global_scales warmup gets ~0.97 cosine - expert_offsets passed without leading 0 (matches pipeline) - layertest + cudagraph_test pass --- vllm/nvfp4_cutedsl.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/nvfp4_cutedsl.py b/vllm/nvfp4_cutedsl.py index 196d5781..63ef48f9 100644 --- a/vllm/nvfp4_cutedsl.py +++ b/vllm/nvfp4_cutedsl.py @@ -242,12 +242,6 @@ class CuTeDSLMoERunner: sorted_token_ids = token_indices[sort_idx.cpu()].to(device) slot_hidden = hidden_states_sample[sorted_token_ids] - # Debug: verify slot_hidden - torch.cuda.synchronize() - _slot_check = sorted_token_ids[:8].cpu().tolist() - _slot_amax = slot_hidden.abs().max().item() - print(f" Warmup: sorted_token_ids[:8]={_slot_check}, slot_hidden amax={_slot_amax:.6f}") - # L1: get exact gs from quantize_to_nvfp4 _, _, l1_gs = quantize_to_nvfp4(slot_hidden)