From 5a4e355d3a77b72c05208d070819a5da103cb0cb Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 19 May 2026 07:47:48 +0000
Subject: [PATCH] Add model forward test: reproduce vLLM empty output outside
 container

---
 tests/test_model_forward_b200.py | 329 +++++++++++++++++++++++++++++++
 1 file changed, 329 insertions(+)
 create mode 100644 tests/test_model_forward_b200.py

diff --git a/tests/test_model_forward_b200.py b/tests/test_model_forward_b200.py
new file mode 100644
index 00000000..cca55890
--- /dev/null
+++ b/tests/test_model_forward_b200.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python3
+"""
+Reproduce the vLLM empty-output bug outside the container.
+
+Runs a REDUCED forward pass (layer 0 only, not all 61 decoder layers): embedding → layer-0 attention projections → LM head.
+Uses CuTeDSL NVFP4 runners for quantized layers, BF16 matmuls for others.
+
+Compares two approaches:
+  A) Warmup gs (what vLLM does) — 1 token random sample per layer
+  B) Dynamic gs (compute per-batch) — calls compute_activation_global_scale on the real input
+
+If A produces garbage and B produces reasonable output, the warmup gs is wrong.
+
+Usage (on B200):
+  source /root/nvfp4-megamoe-kernel/tests/.venv/bin/activate
+  python3 tests/test_model_forward_b200.py
+"""
+
+import sys, os, json, torch, torch.nn.functional as F, time
+from safetensors import safe_open
+
+REPO = "/root/nvfp4-megamoe-kernel"  # repo checkout; prepended to sys.path on the next line
+sys.path.insert(0, REPO)  # presumably so the `cutedsl` imports used below resolve — confirm
+MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"  # directory holding model.safetensors.index.json and its shards
+DEV = "cuda:0"  # device every loaded tensor is moved to
+
+# Model config
+H = 7168  # hidden width: embedding rows are (NT, H); also the warmup input width for q_a/kv/compressor
+NH = 128  # with HD, shapes the placeholder attention output (NT, NH, HD) in main()
+HD = 512  # see NH
+NOPE = 448  # not referenced in this file; presumably the non-rotary part of HD (448 + 64 = 512) — confirm
+ROPE = 64  # not referenced in this file; presumably the rotary part of HD — confirm
+QL = 1536  # warmup input width for q_b
+OL = 1024  # with OG: the wo_b input is OG * OL wide
+OG = 16  # see OL
+HPG = NH // OG  # not referenced in this file
+HC = 4  # not referenced in this file; meaning not shown here
+SL = 10.0  # not referenced in this file; meaning not shown here
+EPS = 1e-6  # epsilon passed to rms()
+INTER = 3072  # not referenced in this file
+N_EXPERTS = 384  # not referenced in this file
+TOP_K = 6  # not referenced in this file
+N_LAYERS = 61  # not referenced in this file (only layer 0 is exercised)
+VOCAB = 129280  # only mentioned in a shape comment in main()
+
+E2M1 = torch.tensor([0,.5,1.,1.5,2.,3.,4.,6.,-0,-.5,-1.,-1.5,-2.,-3.,-4.,-6.], dtype=torch.float32)  # value table indexed by 4-bit code in dequant(); the integer literal -0 is stored as +0.0
+
+_cache = {}
+def P(k, wm, md):
+    """Return tensor `k` read from shard `wm[k]` under directory `md`; results are memoised in `_cache` by `k`."""
+    if k not in _cache:
+        with safe_open(os.path.join(md, wm[k]), framework="pt") as shard: loaded = shard.get_tensor(k)
+        _cache[k] = loaded
+    return _cache[k]
+
+def dequant(w, sf, gs):
+    """Unpack two 4-bit E2M1 codes per byte of `w` (low nibble first), apply block scales `sf` and `gs`, return BF16."""
+    table = E2M1.to(w.device)
+    rows, cols = w.shape[0], w.shape[1] * 2
+    # Interleave nibbles: even output columns come from the low 4 bits, odd ones from the high 4 bits.
+    vals = torch.stack((table[(w & 0xF).long()], table[((w >> 4) & 0xF).long()]), dim=-1).reshape(rows, cols)
+    block_scale = sf.float().repeat_interleave(16, dim=1)[:rows, :cols]  # one scale per 16 consecutive values
+    return (vals * block_scale * gs).to(torch.bfloat16)
+
+def rms(x, w, eps=1e-6):
+    """RMS-normalise `x` over its last dim in fp32, scale by `w`, and return the result in x's dtype."""
+    xf = x.float(); return (w.float() * (xf * torch.rsqrt(xf.square().mean(dim=-1, keepdim=True) + eps))).to(x.dtype)
+
+def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
+    """Build an initialised CuTeDSLNvfp4Linear from packed weight `w`, block scales `sf` and global scale(s) `gs_t`."""
+    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    fp4 = w.view(torch.float4_e2m1fn_x2).permute(1, 0).contiguous()
+    scales = sf.to(torch.float8_e4m3fn).permute(1, 0).contiguous()  # .to() returns sf itself when it is already FP8
+    if not (fused and gs_t.numel() == 2):
+        gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
+    else:
+        ga, gb = gs_t[0].item(), gs_t[1].item(); gs = max(ga, gb)
+        if ga != gb:  # fold each part's gs into its block scales; NOTE(review): this slices dim 0 *after* the permute — confirm that is the output axis
+            rescaled = scales.float(); split = lw[0] if lw else outf // 2
+            rescaled[:split] *= ga / gs; rescaled[split:] *= gb / gs; scales = rescaled.to(torch.float8_e4m3fn)
+    runner = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    runner.fp4 = [fp4]; runner.sf = [scales]; runner.gs = [gs]
+    runner.finalize_weights(); runner._ensure_initialized()
+    return runner
+
+def cosim(a, b):  # cosine similarity of the two tensors flattened to vectors, computed in fp32 on a's device
+    return F.cosine_similarity(a.reshape(1, -1).float(), b.reshape(1, -1).float().to(a.device)).item()
+
+
+class Layer0Runner:
+    """Layer-0 tensors plus one CuTeDSL NVFP4 runner per quantized projection."""
+    def __init__(self, wm, model_dir, use_warmup_gs=True):
+        """Load layer-0 tensors named in index `wm` from `model_dir`; if `use_warmup_gs`, calibrate each runner's gs on random data."""
+        G = lambda k: P(k, wm, model_dir).to(DEV)
+        Q = lambda n: (G(f"{n}.weight"), G(f"{n}.weight_scale"), G(f"{n}.weight_scale_2"))  # (packed weight, block scales, global scale)
+        R = lambda w, sf, gs: make_runner(w, sf, gs, w.shape[1]*2, w.shape[0])  # in_features = 2 * packed width (two FP4 values per byte)
+        p = "model.layers.0"; a = f"{p}.self_attn"
+
+        # Attention
+        self.qa_w, self.qa_sf, self.qa_gs = Q(f"{a}.q_a_proj")
+        self.qb_w, self.qb_sf, self.qb_gs = Q(f"{a}.q_b_proj")
+        self.kv_w, self.kv_sf, self.kv_gs = Q(f"{a}.kv_proj")
+        self.woa = G(f"{a}.o_a_proj.weight")
+        self.wob_w, self.wob_sf, self.wob_gs = Q(f"{a}.o_b_proj")
+        self.qn = G(f"{a}.q_a_norm.weight"); self.kvn = G(f"{a}.kv_norm.weight")
+        self.anorm = G(f"{p}.input_layernorm.weight"); self.fnorm = G(f"{p}.post_attention_layernorm.weight")
+
+        # Compressor
+        self.ckv_w, self.ckv_sf, self.ckv_gs = Q(f"{a}.compressor.kv_proj")
+        self.cg_w, self.cg_sf, self.cg_gs = Q(f"{a}.compressor.gate_proj")
+
+        # MHC — keys follow {attn_hc,ffn_hc}.{fn,base,scale}; the last one previously read "ffn.scale"
+        self.hca_fn = G(f"{p}.attn_hc.fn"); self.hcf_fn = G(f"{p}.ffn_hc.fn")
+        self.hca_b = G(f"{p}.attn_hc.base"); self.hcf_b = G(f"{p}.ffn_hc.base")
+        self.hca_s = G(f"{p}.attn_hc.scale"); self.hcf_s = G(f"{p}.ffn_hc.scale")
+
+        # Create runners
+        self.r_qa = R(self.qa_w, self.qa_sf, self.qa_gs)
+        self.r_qb = R(self.qb_w, self.qb_sf, self.qb_gs)
+        self.r_kv = R(self.kv_w, self.kv_sf, self.kv_gs)
+        self.r_wob = R(self.wob_w, self.wob_sf, self.wob_gs)
+        self.r_ckv = R(self.ckv_w, self.ckv_sf, self.ckv_gs)
+        self.r_cg = R(self.cg_w, self.cg_sf, self.cg_gs)
+
+        self.use_warmup_gs = use_warmup_gs
+        if use_warmup_gs:
+            # Warmup with 1 token (what vLLM does)
+            with torch.no_grad():
+                d = torch.randn(1, H, dtype=torch.bfloat16, device=DEV)*2.0
+                for r in (self.r_qa, self.r_kv, self.r_ckv, self.r_cg):  # these four are calibrated on the same H-wide sample
+                    r.compute_activation_global_scale(d)
+                d2 = torch.randn(1, QL, dtype=torch.bfloat16, device=DEV)*2.0
+                self.r_qb.compute_activation_global_scale(d2)
+                d3 = torch.randn(1, OG*OL, dtype=torch.bfloat16, device=DEV)*2.0
+                self.r_wob.compute_activation_global_scale(d3)
+
+    def forward_projection(self, x, runner, in_features, name):
+        """Return `runner.run(x)`; without warmup gs, first reset the runner's gs from max|x|. `in_features`/`name` are unused."""
+        if not self.use_warmup_gs:
+            # Dynamic gs: use the largest magnitude (abs), so negative outliers are covered too.
+            amax = x.abs().amax().item()
+            gs = amax / (6.0 * 448.0) if amax > 0 else 1.0 / 2688.0
+            runner._activation_global_scale = gs
+        return runner.run(x)
+
+
+def main():
+    """Run the layer-0 projection checks and an LM-head sanity check, then print how the runner consumes gs."""
+    torch.cuda.set_device(0)
+    torch.manual_seed(42)
+
+    print("=" * 70)
+    print("  Full Model Forward Test: Reproduce vLLM Empty Output")
+    print("=" * 70)
+
+    with open(os.path.join(MODEL, "model.safetensors.index.json")) as f:
+        wm = json.load(f)["weight_map"]
+    G = lambda k: P(k, wm, MODEL).to(DEV)
+
+    # ── Load embedding ────────────────────────────────────────────────
+    print("\n--- Loading embedding layer ---")
+    emb_key = "model.embed_tokens.weight"
+    emb = G(emb_key)
+    print(f"  embed_tokens: {emb.shape} dtype={emb.dtype}")
+
+    # ── Load LM head ──────────────────────────────────────────────────
+    lm_head_key = "lm_head.weight"
+    if lm_head_key in wm:
+        lm_head = G(lm_head_key)
+    else:
+        lm_head = emb  # tied weights
+    print(f"  lm_head: {lm_head.shape}")
+
+    # ── Load final norm ───────────────────────────────────────────────
+    fnorm_key = "model.norm.weight"
+    fnorm_w = G(fnorm_key)
+    print(f"  final_norm: {fnorm_w.shape}")
+
+    # ── Token IDs (intended prompt: "The capital of France is") ───────
+    # Hard-coded so no tokenizer is needed. NOTE(review): the IDs are not verified
+    # against the model's tokenizer. The exact text does not matter here — we're
+    # checking for garbage output (all NaN or all same logit), not text quality.
+    token_ids = torch.tensor([1, 450, 8403, 315, 5413, 374], dtype=torch.long, device=DEV)
+    print(f"  token_ids: {token_ids.tolist()}")
+    NT = len(token_ids)
+
+    # ── Embed ─────────────────────────────────────────────────────────
+    print("\n--- Running embedding lookup ---")
+    with torch.no_grad():
+        hidden = emb[token_ids]  # (NT, H)
+    print(f"  hidden: {hidden.shape} amax={hidden.amax():.4f} NaN={torch.isnan(hidden).any()}")
+
+    # ── Create layer 0 runner (warmup gs, like vLLM) ─────────────────
+    print("\n--- Creating layer 0 runner (warmup gs) ---")
+    layer0 = Layer0Runner(wm, MODEL, use_warmup_gs=True)
+
+    # ── Run layer 0 attention projections ─────────────────────────────
+    print("\n--- Running layer 0 attention (CuTeDSL, warmup gs) ---")
+    with torch.no_grad():
+        normed = rms(hidden, layer0.anorm, EPS)
+        print(f"  normed: amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")
+
+        qa_out = layer0.r_qa.run(normed)
+        print(f"  q_a: amax={qa_out.amax():.4f} NaN={torch.isnan(qa_out).any()}")
+
+        kv_out = layer0.r_kv.run(normed)
+        print(f"  kv: amax={kv_out.amax():.4f} NaN={torch.isnan(kv_out).any()}")
+
+        # q_a norm → q_b
+        qa_normed = rms(qa_out, layer0.qn, EPS)
+        qb_out = layer0.r_qb.run(qa_normed)
+        print(f"  q_b: amax={qb_out.amax():.4f} NaN={torch.isnan(qb_out).any()}")
+
+    # ── Compare with BF16 reference ───────────────────────────────────
+    print("\n--- Comparing layer 0 q_a with BF16 reference ---")
+    qa_bf16 = normed @ dequant(layer0.qa_w, layer0.qa_sf, layer0.qa_gs.item()).T
+    c = cosim(qa_out, qa_bf16)
+    print(f"  q_a cosine (warmup gs): {c:.6f} {'✅' if c>=0.98 else '❌'}")
+
+    # ── Now test with DYNAMIC gs (recomputed per input) ───────────────
+    print("\n--- Testing with dynamic gs (per-input) ---")
+    # Create a fresh runner and compute gs from the actual input
+    r_qa2 = make_runner(layer0.qa_w, layer0.qa_sf, layer0.qa_gs, layer0.qa_w.shape[1]*2, layer0.qa_w.shape[0])
+    with torch.no_grad():
+        r_qa2.compute_activation_global_scale(normed)
+        qa_out2 = r_qa2.run(normed)
+    c2 = cosim(qa_out2, qa_bf16)
+    print(f"  q_a cosine (dynamic gs): {c2:.6f} {'✅' if c2>=0.98 else '❌'}")
+
+    # ── Test the FULL model: layer 0 only, then check LM head ─────────
+    print("\n--- Full forward: layer 0 → LM head ---")
+
+    # Simple layer 0 forward (attention only, no MoE for speed)
+    with torch.no_grad():
+        x = hidden.clone()
+        normed = rms(x, layer0.anorm, EPS)
+
+        # Attention projections
+        qa = layer0.r_qa.run(normed)
+        kv = layer0.r_kv.run(normed)
+        qa_n = rms(qa, layer0.qn, EPS)
+        qb = layer0.r_qb.run(qa_n)
+
+        # Skip actual attention (FlashMLA not available) — just use
+        # a random attention output to test the wo_a → wo_b path
+        o = torch.randn(NT, NH, HD, dtype=torch.bfloat16, device=DEV) * 0.1
+
+        # wo_a: BF16 BMM
+        woa = layer0.woa
+        o_2d = o.reshape(NT, NH * HD)
+        z = o_2d @ woa.T  # NOTE(review): result is never used below; assumes woa is (out, NH*HD) — confirm
+        z2 = z.reshape(NT, OG, -1)
+
+        # Simpler: just check if the wo_b projection works
+        z_flat = torch.randn(NT, OG * OL, dtype=torch.bfloat16, device=DEV) * 2.0
+        wob_out = layer0.r_wob.run(z_flat)
+        print(f"  wo_b output: amax={wob_out.amax():.4f} NaN={torch.isnan(wob_out).any()}")
+
+    # ── Now run LM head on the hidden state ───────────────────────────
+    print("\n--- LM head (BF16 matmul) ---")
+    with torch.no_grad():
+        normed_final = rms(x, fnorm_w, EPS)  # NOTE: x is still the raw embedding — no layer output was written back into it
+        logits = normed_final @ lm_head.T  # (NT, VOCAB)
+        print(f"  logits: {logits.shape} amax={logits.amax():.4f} NaN={torch.isnan(logits).any()}")
+
+        # Check if logits are reasonable
+        top5 = torch.topk(logits[-1], 5)
+        print(f"  top5 token IDs: {top5.indices.tolist()}")
+        print(f"  top5 logits: {[f'{v:.2f}' for v in top5.values.tolist()]}")
+
+        # Check logit variance (garbage = all same or extreme values)
+        log_std = logits[-1].float().std().item()
+        log_range = (logits[-1].float().amax() - logits[-1].float().amin()).item()
+        print(f"  logit std: {log_std:.4f} range: {log_range:.4f}")
+        if log_std < 0.01:
+            print("  ❌ LOGITS ARE FLAT — model is producing garbage!")
+        elif log_std > 100:
+            print("  ❌ LOGITS ARE EXPLODED — model is producing garbage!")
+        else:
+            print("  ✅ Logits look reasonable for a single layer test")
+
+    # ── Key diagnostic: warmup gs vs actual gs ────────────────────────
+    print("\n" + "=" * 70)
+    print("  DIAGNOSTIC: Warmup gs vs Actual gs")
+    print("=" * 70)
+
+    # What gs did warmup compute?
+    print(f"  r_qa warmup gs: {layer0.r_qa._activation_global_scale:.8f}")
+    print(f"  r_kv warmup gs: {layer0.r_kv._activation_global_scale:.8f}")
+    print(f"  r_wob warmup gs: {layer0.r_wob._activation_global_scale:.8f}")
+
+    # What gs would the actual input produce?
+    with torch.no_grad():
+        # Use the largest magnitude (abs), not the signed max, so negative outliers count too.
+        actual_gs_qa = normed.abs().amax().item() / (6.0 * 448.0)
+    print(f"  actual gs for q_a input: {actual_gs_qa:.8f}")
+    print(f"  ratio warmup/actual for q_a: {layer0.r_qa._activation_global_scale / actual_gs_qa:.4f}" if actual_gs_qa > 0 else "  actual gs is 0!")
+
+    # The KEY question: does the runner use warmup gs at inference time,
+    # or does quantize_activation_nvfp4 recompute it?
+    print("\n--- How does CuTeDSL runner.run() use gs? ---")
+    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    import inspect
+    run_src = inspect.getsource(CuTeDSLNvfp4Linear.run)
+    # Check if it references _activation_global_scale
+    if '_activation_global_scale' in run_src:
+        print("  run() uses _activation_global_scale (FIXED from warmup)")
+    else:
+        print("  run() does NOT use _activation_global_scale")
+
+    # Check quantize_activation_nvfp4
+    from cutedsl.bridge import quantize_activation_nvfp4
+    qsrc = inspect.getsource(quantize_activation_nvfp4)
+    if 'global_scale' in qsrc:
+        print("  quantize_activation_nvfp4 accepts global_scale as parameter")
+    if '_activation_global_scale' in qsrc:
+        print("  quantize_activation_nvfp4 reads _activation_global_scale")
+
+    # Check _run_impl
+    run_impl_src = inspect.getsource(CuTeDSLNvfp4Linear._run_impl)
+    print(f"\n  _run_impl length: {len(run_impl_src)} chars")
+    # Find where gs is used
+    for i, line in enumerate(run_impl_src.split('\n')):
+        if 'global_scale' in line or '_activation' in line:
+            print(f"  _run_impl line {i}: {line.strip()}")
+
+
+if __name__ == "__main__":
+    main()