Files
nvfp4-megamoe-kernel/tests/unit/test_part_a_decode_diagnostics.py

464 lines
22 KiB
Python

#!/usr/bin/env python3
"""PART A — Decode Diagnostics: full per-layer comparison of production vs. PyTorch reference.

This test is the core diagnostic for the decode degeneration issue.
FMHA per-layer cos is 0.999993 (prefill) and 0.999976 (decode) — FMHA is NOT the bug.
The degeneration must be in some other stage of the pipeline.

Strategy:
    Phase 1: Run the full production pipeline for all prefill tokens (populates the KV caches).
             Also run the reference prefill to populate the reference KV caches.
    Phase 2: Run ONE decode step, comparing production X_{l+1} vs. reference X_{l+1}
             at each layer. Also print |X| growth, F_attn/F_ffn magnitudes,
             and compressed/SWA visible-range diagnostics.

Production values: HD=512, NOPE=448, ROPE=64, H=128, 61 layers, 8 GPUs, 384 experts.
"""
import os, sys, json, math, time
import torch
import torch.nn.functional as F
# Run-time knobs, each overridable through an environment variable of the same name.
CHECKPOINT_DIR = os.getenv("CHECKPOINT_DIR", "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4")
NUM_GPUS = int(os.getenv("NUM_GPUS", "8"))
# Device used for embeddings and for gathering tensors before comparison.
DEVICE = "cuda:0"
# Number of leading layers to build and run.
TEST_LAYERS = int(os.getenv("TEST_LAYERS", "5"))
# First layer index to test. L0-1 are hash routing, L2+ are dense/CSA/HCA.
# Set to 0 to include hash layers.
# NOTE(review): FIRST_LAYER is parsed here but not read anywhere else in this file.
FIRST_LAYER = int(os.getenv("FIRST_LAYER", "2"))
def cosine(a, b):
    """Return the cosine similarity of two tensors as a Python float.

    Both tensors are flattened to 1-D and cast to float32 before the
    comparison. If either tensor has no elements the result is NaN.
    """
    if 0 in (a.numel(), b.numel()):
        return float('nan')
    lhs = a.flatten().float()
    rhs = b.flatten().float()
    return F.cosine_similarity(lhs, rhs, dim=0).item()
def main() -> int:
    """Run the PART A decode diagnostic and return a process exit code.

    Steps, in order:
      1. Read ``config.json`` from ``CHECKPOINT_DIR`` and load all weights.
      2. Build production and reference components for the first
         ``TEST_LAYERS`` layers, placing layer ``li`` on ``cuda:{li % NUM_GPUS}``.
      3. Phase 1 / 1b: feed every prompt token through the production and the
         reference layer stacks, which fills their KV caches.
      4. Phase 2: run one decode step layer by layer, comparing production
         X_{l+1} with reference X_{l+1} and printing one table row per layer.

    Returns:
        0 if every tested layer reaches cos >= 0.99, otherwise 1. A NaN
        cosine counts as a failure.

    Raises:
        RuntimeError: if a tested layer has no mHC block or norm weight, which
            the Phase 2 per-layer breakdown requires.
    """
    torch.manual_seed(42)
    rule = "=" * 70
    cos_threshold = 0.99
    n_hc = 4

    print(rule)
    print("PART A — DECODE DIAGNOSTICS")
    print("Full per-layer comparison: production vs PyTorch reference")
    print(rule)

    with open(os.path.join(CHECKPOINT_DIR, "config.json"), encoding="utf-8") as f:
        cfg = json.load(f)
    n_layers = cfg["num_hidden_layers"]
    H = cfg["hidden_size"]
    hd = cfg["head_dim"]
    n_h = cfg["num_attention_heads"]
    rd = cfg.get("qk_rope_head_dim", 64)
    nope_dim = hd - rd
    cr = cfg.get("compress_ratios", [128] * n_layers)
    print(f"Model: {n_layers} layers, {n_h} heads, hd={hd}, rope_dim={rd}, nope_dim={nope_dim}")
    print(f"Compress ratios (first {TEST_LAYERS}): {cr[:TEST_LAYERS]}")

    def layer_ratio(idx):
        # Layers beyond the configured list fall back to a ratio of 128.
        return cr[idx] if idx < len(cr) else 128

    # Import production components.
    # forward_attention is called directly in Phase 2, so it must be imported
    # here alongside forward_layer.
    from single_shot_inference import (
        load_all_weights, make_nvfp4_linear, get_nvfp4_weight,
        rmsnorm, unweighted_rmsnorm, _apply_rope, build_rope_cache,
        KVCache, Compressor, Indexer, forward_layer, forward_attention, moe_forward,
        _load_moe_weights_stacked, _load_shared_expert_weights,
        _cache_layer_weights_no_experts,
    )
    from dsv4.layers.mhc import mHCLayer, mHCContext
    from dsv4.layers.router import Router
    from dsv4.layers.moe import Nvfp4MoE
    from dsv4.layers.shared_expert import Nvfp4SharedExpert
    from dsv4.layers.grouped_linear import Nvfp4GroupedLinear
    from dsv4.layers.linear import Nvfp4Linear
    from dsv4.ops.quantize import (
        rmsnorm_quantize_nvfp4, mhc_rmsnorm_quantize_nvfp4, dequantize_nvfp4,
        quantize_to_nvfp4,
    )
    # Import reference components
    from dsv4.reference.single_shot_PYTORCH_REFERENCE import (
        mHCBlock, Compressor as RefCompressor,
        Indexer as RefIndexer, KVCache as RefKVCache,
        build_rope_cache as ref_build_rope_cache,
        forward_attention as ref_forward_attention,
        forward_layer as ref_forward_layer,
    )

    print("Loading weights...")
    all_w = load_all_weights(CHECKPOINT_DIR)
    o_groups = cfg.get("o_groups", 16)
    o_rank = cfg.get("o_lora_rank", 1024)
    n_ih = cfg.get("index_n_heads", 64)
    ihd = cfg.get("index_head_dim", 128)
    itk = cfg.get("index_topk", 1024)
    n_experts = cfg["n_routed_experts"]
    sliding_window = cfg.get("sliding_window", 128)

    # One RoPE cache per GPU for each implementation.
    rope_caches = {g: build_rope_cache(65536, rd, f"cuda:{g}", 10000., "yarn", 16., 4096, 32, 1)
                   for g in range(NUM_GPUS)}
    ref_rope_caches = {g: ref_build_rope_cache(65536, rd, f"cuda:{g}", 10000., "yarn", 16., 4096, 32, 1)
                       for g in range(NUM_GPUS)}

    # ------------------------------------------------------------------
    # Build production components for TEST_LAYERS
    # ------------------------------------------------------------------
    prod_lins, attn_mhcs, ffn_mhcs = {}, {}, {}
    attn_norms, ffn_norms = {}, {}
    compressors, indexers, kv_caches = {}, {}, {}
    routers, moe_runners, se_runners = {}, {}, {}
    for li in range(TEST_LAYERS):
        gpu = li % NUM_GPUS
        dev = f"cuda:{gpu}"
        torch.cuda.set_device(gpu)
        pfx = f"model.layers.{li}.self_attn"
        mlp_pfx = f"model.layers.{li}.mlp"
        ratio = layer_ratio(li)

        # Attention projections.
        pl = {}
        pl['q_a'] = make_nvfp4_linear(H, 1536, dev, all_w, pfx, 'q_a_proj')
        # q_b projects the rank-1536 query latent to num_heads * head_dim.
        # H is the hidden size here, so H * hd would not be a projection width.
        pl['q_b'] = make_nvfp4_linear(1536, n_h * hd, dev, all_w, pfx, 'q_b_proj')
        pl['kv'] = make_nvfp4_linear(H, hd, dev, all_w, pfx, 'kv_proj')
        hpg = n_h // o_groups
        wo_a = Nvfp4GroupedLinear(n_local_groups=o_groups, heads_per_group=hpg,
                                  head_dim=hd, o_lora_rank=o_rank, max_num_tokens=8192, device=dev)
        oa_w, oa_ws, oa_ws2, oa_isc = get_nvfp4_weight(all_w, pfx, 'o_a_proj')
        if oa_w is not None and oa_ws is not None:
            wo_a.load_nvfp4_weight(oa_w.to(dev), oa_ws.to(dev),
                                   oa_ws2.to(dev) if oa_ws2 is not None else None,
                                   oa_isc.to(dev) if oa_isc is not None else None)
        else:
            # No NVFP4 o_a_proj tensors: fall back to the plain weight if present.
            oa_bf = all_w.get(f"{pfx}.o_a_proj.weight")
            if oa_bf is not None:
                wo_a.set_bf16_weight(oa_bf.bfloat16().to(dev))
        wo_a._use_runtime_gsa = True
        pl['o_a'] = wo_a
        pl['o_b'] = make_nvfp4_linear(o_groups * o_rank, H, dev, all_w, pfx, 'o_b_proj')
        prod_lins[li] = pl

        # mHC blocks for the attention and FFN sub-layers. The fn/base tensors
        # are split into pre / post / comb parts by row.
        for kind, blocks in (("attn", attn_mhcs), ("ffn", ffn_mhcs)):
            hc_pfx = f"model.layers.{li}.{kind}_hc"
            fn = all_w.get(f"{hc_pfx}.fn")
            base = all_w.get(f"{hc_pfx}.base")
            scale = all_w.get(f"{hc_pfx}.scale")
            if fn is None or base is None or scale is None:
                continue
            m = mHCLayer(hidden_dim=H, n_hc=n_hc, t_max_sinkhorn=20, device=dev)
            n = n_hc
            m.load_weights(
                W_pre=fn[0:n].to(dev, torch.float32), W_post=fn[n:2*n].to(dev, torch.float32),
                W_comb=fn[2*n:].to(dev, torch.float32),
                S_pre=base[0:n].reshape(1, n).to(dev, torch.float32),
                S_post=base[n:2*n].reshape(n, 1).to(dev, torch.float32),
                S_comb=base[2*n:].reshape(n, n).to(dev, torch.float32),
                alpha_pre=scale[0].item(), alpha_post=scale[1].item(), alpha_comb=scale[2].item())
            blocks[li] = m

        an_k = f"model.layers.{li}.input_layernorm.weight"
        if an_k in all_w:
            attn_norms[li] = all_w[an_k].to(dev, torch.float32)
        fn_k = f"model.layers.{li}.post_attention_layernorm.weight"
        if fn_k in all_w:
            ffn_norms[li] = all_w[fn_k].to(dev, torch.float32)

        # KV cache, plus compressor / indexer where the layer's ratio calls for them.
        max_comp = (8192 + ratio - 1) // ratio if ratio > 0 else 0
        kv_caches[li] = KVCache(hd, sliding_window, max_comp=max_comp,
                                device=dev, indexer_key_dim=ihd, compress_ratio=ratio,
                                indexer_top_k=itk, rope_dim=rd)
        if ratio > 0:
            compressors[li] = Compressor(ratio, hd, H, dev)
        if ratio == 4:
            indexers[li] = Indexer(n_ih, ihd, itk, dev)

        # Router: hash mode when the layer is within num_hash_layers and a
        # tid2eid table exists, dense mode otherwise.
        is_hash = (li < cfg.get("num_hash_layers", 3)) and (f"{mlp_pfx}.gate.tid2eid" in all_w)
        router = Router(hidden_size=H, num_experts=n_experts,
                        top_k=cfg.get("num_experts_per_tok", 6),
                        routed_scaling_factor=cfg.get("routed_scaling_factor", 2.5),
                        mode="hash" if is_hash else "dense",
                        vocab_size=cfg.get("vocab_size", 128000) if is_hash else None, device=dev)
        if is_hash:
            router.load_weights(hash_lut=all_w[f"{mlp_pfx}.gate.tid2eid"].to(dev, torch.int32))
        else:
            eb = all_w.get(f"{mlp_pfx}.gate.e_score_correction_bias")
            gate_w, gate_ws, gate_ws2, gate_isc = get_nvfp4_weight(all_w, mlp_pfx, 'gate')
            if gate_w is not None and gate_ws is not None:
                gate_lin = Nvfp4Linear(in_features=H, out_features=n_experts, device=dev)
                gate_lin.fp4 = [gate_w.to(dev).view(torch.float4_e2m1fn_x2)
                                if gate_w.dtype == torch.uint8 else gate_w.to(dev)]
                gate_lin.sf = [gate_ws.to(dev)]
                ws2_v = gate_ws2.float().item() if gate_ws2 is not None else 1.0
                isc_v = gate_isc.float().item() if gate_isc is not None else 1.0/(6.0*448.0)
                gate_lin.gs = [1.0]
                gate_lin.ws2 = [torch.tensor([ws2_v], device=dev, dtype=torch.float32)]
                gate_lin._activation_global_scale = isc_v
                gate_lin._use_runtime_gsa = True
                gate_lin.finalize_weights()
                router.load_nvfp4_gate(gate_lin)
            router.load_weights(e_bias=eb.to(dev, torch.float32))
        router.finalize_weights()
        routers[li] = router

        # Routed experts.
        moe = Nvfp4MoE(num_experts=n_experts, hidden_size=H,
                       intermediate_size=cfg.get("moe_intermediate_size", 3072),
                       top_k=cfg.get("num_experts_per_tok", 6), device=dev)
        moe.set_swiglu_limit(cfg.get("swiglu_limit", 10.0))
        moe.set_fused_swiglu(True)
        _load_moe_weights_stacked(all_w, li, mlp_pfx, dev, moe, cfg)
        moe._ensure_stacked()
        moe._use_runtime_gsa = True
        moe_runners[li] = moe

        # Shared expert.
        se = Nvfp4SharedExpert(hidden_size=H, intermediate_size=cfg.get("moe_intermediate_size", 3072),
                               device=dev, swiglu_limit=cfg.get("swiglu_limit", 10.0))
        se.set_fused_swiglu(True)
        _load_shared_expert_weights(all_w, li, mlp_pfx, dev, se, cfg)
        se._ensure_initialized()
        se._use_runtime_gsa = True
        se_runners[li] = se
        torch.cuda.empty_cache()

    for li in range(TEST_LAYERS):
        pfx = f"model.layers.{li}.self_attn.compressor"
        dev = f"cuda:{li % NUM_GPUS}"
        if li in compressors:
            compressors[li].load(all_w, pfx, dev=dev)
        if li in indexers:
            indexers[li].load(all_w, f"{pfx}.indexer", dev=dev)
    print("Production components built")

    # ------------------------------------------------------------------
    # Build reference components
    # ------------------------------------------------------------------
    ref_attn_mhcs, ref_ffn_mhcs = {}, {}
    ref_attn_norms, ref_ffn_norms = {}, {}
    ref_kv_caches = {}
    ref_compressors, ref_indexers = {}, {}
    ref_layer_w = {}
    for li in range(TEST_LAYERS):
        dev = f"cuda:{li % NUM_GPUS}"
        pfx = f"model.layers.{li}.self_attn"
        ratio = layer_ratio(li)
        for kind, blocks in (("attn", ref_attn_mhcs), ("ffn", ref_ffn_mhcs)):
            hc_pfx = f"model.layers.{li}.{kind}_hc"
            fn = all_w.get(f"{hc_pfx}.fn")
            base = all_w.get(f"{hc_pfx}.base")
            scale = all_w.get(f"{hc_pfx}.scale")
            if fn is None or base is None or scale is None:
                continue
            m = mHCBlock(hidden_dim=H, n_hc=n_hc, sinkhorn_iters=20, device=dev)
            m.load(fn.to(dev), base.to(dev), scale.to(dev))
            blocks[li] = m
        # Place the norm weights on this layer's device once, here. Converting
        # them at each call site is redundant work and risks using a device
        # variable left over from another loop.
        an_k = f"model.layers.{li}.input_layernorm.weight"
        if an_k in all_w:
            ref_attn_norms[li] = all_w[an_k].to(dev, torch.float32)
        fn_k = f"model.layers.{li}.post_attention_layernorm.weight"
        if fn_k in all_w:
            ref_ffn_norms[li] = all_w[fn_k].to(dev, torch.float32)
        ref_kv_caches[li] = RefKVCache(head_dim=hd, window_size=sliding_window, device=dev)
        if ratio > 0:
            ref_compressors[li] = RefCompressor(ratio, hd, H, dev)
            ref_compressors[li].load(all_w, f"{pfx}.compressor")
        if ratio == 4:
            ref_indexers[li] = RefIndexer(n_ih, ihd, itk, dev)
            ref_indexers[li].load(all_w, f"{pfx}.compressor.indexer")
        layer_key_prefix = f"model.layers.{li}."
        ref_layer_w[li] = {k: v for k, v in all_w.items() if k.startswith(layer_key_prefix)}
    print("Reference components built")

    # ------------------------------------------------------------------
    # Embedding + tokenizer
    # ------------------------------------------------------------------
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)
    bos = tokenizer.bos_token_id or 0
    USER_TOKEN, ASSISTANT_TOKEN, THINK_START = 128803, 128804, 128821
    input_ids = [bos, USER_TOKEN]
    input_ids += tokenizer.encode('\n\nThe capital of France is', add_special_tokens=False)
    input_ids.append(ASSISTANT_TOKEN)
    input_ids.append(THINK_START)
    n_prompt = len(input_ids)
    print(f"Input: {n_prompt} tokens")

    torch.cuda.set_device(0)
    embed_w = all_w.get("model.embed_tokens.weight")
    prod_embed = torch.nn.Embedding.from_pretrained(embed_w.bfloat16().to(DEVICE))
    ref_embed = torch.nn.Embedding.from_pretrained(embed_w.bfloat16().to(DEVICE))
    devs_list = [f"cuda:{g}" for g in range(NUM_GPUS)]
    layer_w = _cache_layer_weights_no_experts(all_w, TEST_LAYERS, devs_list)
    torch.cuda.set_device(0)

    # ------------------------------------------------------------------
    # PHASE 1: Prefill — production
    # ------------------------------------------------------------------
    print(f"\n{rule}")
    print("PHASE 1: Prefill — PRODUCTION")
    print(f"{rule}")
    for pi, tid_val in enumerate(input_ids):
        t1 = time.time()
        tid = torch.tensor([tid_val], dtype=torch.long, device=DEVICE)
        pos = torch.tensor([pi], dtype=torch.long, device=DEVICE)
        tid32 = torch.tensor([tid_val], dtype=torch.int32, device=DEVICE)
        X = mHCLayer.init_state(prod_embed(tid))
        for li in range(TEST_LAYERS):
            gpu = li % NUM_GPUS
            if X.device != torch.device(f"cuda:{gpu}"):
                X = X.to(f"cuda:{gpu}")
            torch.cuda.set_device(gpu)
            if pi == 0:
                # Report the router configuration once, on the first token.
                r = routers.get(li)
                print(f" L{li} router: mode={r.mode if r else 'None'} has_gate_lin={r._gate_lin is not None if r and hasattr(r, '_gate_lin') else 'N/A'}", flush=True)
            X = forward_layer(X, layer_w[li], li, cfg, *rope_caches[gpu],
                              attn_mhcs.get(li), ffn_mhcs.get(li), attn_norms.get(li), ffn_norms.get(li),
                              kv_caches[li], pos, tid32, compressors.get(li), indexers.get(li),
                              moe_runners.get(li), se_runners.get(li), routers.get(li),
                              prod_lin=prod_lins.get(li), _use_fused_rmsnorm_quantize=True)
        if pi % 5 == 0:
            print(f" Token {pi}/{len(input_ids)}: {time.time()-t1:.2f}s |X|={X.to(DEVICE).abs().max().item():.1f}", flush=True)

    print(f"\nProduction KV cache state after prefill ({len(input_ids)} tokens):")
    for li in range(TEST_LAYERS):
        kc = kv_caches[li]
        ratio = layer_ratio(li)
        print(f" L{li} (ratio={ratio}): n_comp={kc.n_comp} swa_len={kc.swa_len} total_KV={kc.n_comp + kc.swa_len}")

    # ------------------------------------------------------------------
    # PHASE 1b: Prefill — reference
    # ------------------------------------------------------------------
    print(f"\n{rule}")
    print("PHASE 1b: Prefill — REFERENCE")
    print(f"{rule}")
    for pi, tid_val in enumerate(input_ids):
        tid = torch.tensor([tid_val], dtype=torch.long, device=DEVICE)
        pos = torch.tensor([pi], dtype=torch.long, device=DEVICE)
        X_ref = mHCBlock.init_state(ref_embed(tid))
        for li in range(TEST_LAYERS):
            gpu = li % NUM_GPUS
            if X_ref.device != torch.device(f"cuda:{gpu}"):
                X_ref = X_ref.to(f"cuda:{gpu}")
            torch.cuda.set_device(gpu)
            # Norm weights were already placed on this layer's device at build time.
            X_ref = ref_forward_layer(X_ref, ref_layer_w[li], li, cfg,
                                      *ref_rope_caches[gpu],
                                      ref_attn_mhcs.get(li), ref_ffn_mhcs.get(li),
                                      ref_attn_norms.get(li),
                                      ref_ffn_norms.get(li),
                                      ref_kv_caches[li], pos, 0,
                                      ref_compressors.get(li), ref_indexers.get(li))
        if pi % 5 == 0:
            print(f" Token {pi}/{len(input_ids)}: |X_ref|={X_ref.to(DEVICE).abs().max().item():.1f}", flush=True)

    print(f"\nReference KV cache state after prefill:")
    for li in range(TEST_LAYERS):
        kc = ref_kv_caches[li]
        ratio = layer_ratio(li)
        print(f" L{li} (ratio={ratio}): n_comp={kc.n_comp} swa_len={kc.swa_len}")

    # ------------------------------------------------------------------
    # PHASE 2: Decode — production vs reference per-layer X comparison
    # ------------------------------------------------------------------
    print(f"\n{rule}")
    print("PHASE 2: Decode step — per-layer X comparison")
    print(f"{rule}")
    decode_pos = len(input_ids)
    decode_tid = tokenizer.encode(" the", add_special_tokens=False)
    decode_tid = decode_tid[0] if len(decode_tid) > 0 else 2
    dec_tid = torch.tensor([decode_tid], dtype=torch.long, device=DEVICE)
    dec_tid32 = torch.tensor([decode_tid], dtype=torch.int32, device=DEVICE)
    dec_pos = torch.tensor([decode_pos], dtype=torch.long, device=DEVICE)
    X_prod = mHCLayer.init_state(prod_embed(dec_tid))
    X_ref = mHCBlock.init_state(ref_embed(dec_tid))
    cos_init = cosine(X_prod.to(DEVICE), X_ref.to(DEVICE))
    print(f"\nInitial X (before any layer): cos={cos_init:.6f} "
          f"|prod|={X_prod.abs().max().item():.4f} |ref|={X_ref.abs().max().item():.4f}")
    print(f"\n {'L':>3} {'ratio':>5} {'cos(X_next)':>12} {'|X_prod|':>10} {'|X_ref|':>10} "
          f"{'|F_attn|':>10} {'|F_ffn|':>10} {'n_comp':>6} {'swa':>4} {'mode':>8} {'leak':>5}")
    print(f" {'-'*3} {'-'*5} {'-'*12} {'-'*10} {'-'*10} {'-'*10} {'-'*10} {'-'*6} {'-'*4} {'-'*8} {'-'*5}")

    all_pass = True
    for li in range(TEST_LAYERS):
        gpu = li % NUM_GPUS
        dev = f"cuda:{gpu}"
        torch.cuda.set_device(gpu)
        if X_prod.device != torch.device(dev):
            X_prod = X_prod.to(dev)
        if X_ref.device != torch.device(dev):
            X_ref = X_ref.to(dev)
        ratio = layer_ratio(li)
        kc = kv_caches[li]
        # Give both implementations the decode position on the layer's device.
        pos_dev = dec_pos.to(dev)

        # The manual production breakdown needs all four of these; fail with a
        # clear message instead of an AttributeError on None.
        attn_mhc = attn_mhcs.get(li)
        ffn_mhc = ffn_mhcs.get(li)
        attn_norm_w = attn_norms.get(li)
        ffn_norm_w = ffn_norms.get(li)
        if attn_mhc is None or ffn_mhc is None or attn_norm_w is None or ffn_norm_w is None:
            raise RuntimeError(
                f"layer {li}: missing mHC block or norm weight; cannot run the decode breakdown")

        # Production forward — capture intermediates
        A_l_a, B_l_a, C_l_a = attn_mhc._dynamic_params(X_prod)
        ctx_a = mHCContext(B_l=B_l_a, C_l=C_l_a)
        x_quant_attn = mhc_rmsnorm_quantize_nvfp4(X_prod, A_l_a, attn_norm_w)
        x_normed = dequantize_nvfp4(x_quant_attn.x_fp4, x_quant_attn.x_sf, x_quant_attn.gsa)
        F_attn, _ = forward_attention(
            x_normed, layer_w[li], li, cfg, *rope_caches[gpu],
            kc, pos_dev, compressors.get(li), indexers.get(li), prod_lins.get(li),
            x_quant=x_quant_attn)
        X_mid = attn_mhc.post_block(X_prod, F_attn, ctx_a)

        A_l_f, B_l_f, C_l_f = ffn_mhc._dynamic_params(X_mid)
        ctx_f = mHCContext(B_l=B_l_f, C_l=C_l_f)
        x_quant_ffn = mhc_rmsnorm_quantize_nvfp4(X_mid, A_l_f, ffn_norm_w)
        x_ffn = dequantize_nvfp4(x_quant_ffn.x_fp4, x_quant_ffn.x_sf, x_quant_ffn.gsa)
        F_ffn = moe_forward(x_ffn, li, moe_runners.get(li), se_runners.get(li),
                            routers.get(li), dec_tid32.to(dev))
        X_prod_next = ffn_mhc.post_block(X_mid, F_ffn, ctx_f)

        # Reference forward
        X_ref_next = ref_forward_layer(X_ref, ref_layer_w[li], li, cfg, *ref_rope_caches[gpu],
                                       ref_attn_mhcs.get(li), ref_ffn_mhcs.get(li),
                                       ref_attn_norms.get(li),
                                       ref_ffn_norms.get(li),
                                       ref_kv_caches[li], pos_dev, 0,
                                       ref_compressors.get(li), ref_indexers.get(li))

        # Compare
        cos_val = cosine(X_prod_next.to(DEVICE), X_ref_next.to(DEVICE))
        mag_prod = X_prod_next.to(DEVICE).abs().max().item()
        mag_ref = X_ref_next.to(DEVICE).abs().max().item()
        f_attn_mag = F_attn.to(DEVICE).abs().max().item()
        f_ffn_mag = F_ffn.to(DEVICE).abs().max().item()
        swa_kv, _ = kc.get_swa()
        swa_len = swa_kv.shape[0]
        n_comp = kc.n_comp
        mode = "CSA" if ratio == 4 else ("HCA" if ratio > 4 else "SWA")

        # Flag any compressed entry whose position is at or beyond the decode position.
        future_leak = False
        if ratio == 4 and n_comp > 0 and kc.comp_pos is not None and kc.comp_pos.numel() > 0:
            visible_comp_pos = kc.comp_pos[:n_comp]
            future_leak = (visible_comp_pos >= decode_pos).any().item()

        # ">=" is False for NaN, so a NaN cosine is treated as a failure.
        layer_ok = cos_val >= cos_threshold
        if not layer_ok:
            all_pass = False
        print(f" {li:>3} {ratio:>5} {cos_val:>12.6f} {mag_prod:>10.2f} {mag_ref:>10.2f} "
              f"{f_attn_mag:>10.2f} {f_ffn_mag:>10.2f} {n_comp:>6} {swa_len:>4} {mode:>8} "
              f"{'YES!' if future_leak else 'no':>5}")
        if not layer_ok:
            print(f" FAIL detail: |X_prod_in|={X_prod.to(DEVICE).abs().max().item():.2f} "
                  f"|X_ref_in|={X_ref.to(DEVICE).abs().max().item():.2f}")
            B_a = B_l_a
            print(f" B_l row_sum=[{B_a.sum(-1).min().item():.4f},{B_a.sum(-1).max().item():.4f}] "
                  f"col_sum=[{B_a.sum(-2).min().item():.4f},{B_a.sum(-2).max().item():.4f}]")
            print(f" A_l=[{A_l_a.min().item():.4f},{A_l_a.max().item():.4f}] "
                  f"C_l=[{C_l_a.min().item():.4f},{C_l_a.max().item():.4f}]")

        # Each implementation carries its own state forward to the next layer.
        X_prod = X_prod_next
        X_ref = X_ref_next

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------
    print(f"\n{rule}")
    print("PART A SUMMARY")
    print(f"{rule}")
    if all_pass:
        print("ALL LAYERS PASS (cos >= 0.99) — production matches reference at decode")
        print("The decode degeneration is likely caused by accumulated small errors across 61 layers,")
        print("or by components beyond these first layers (e.g., lm_head, hc_head).")
    else:
        print("SOME LAYERS FAIL — production diverges from reference at decode")
        print("The failing layer(s) contain the root cause of decode degeneration.")
        print()
        print("Next steps:")
        print(" 1. For each failing layer, compare intermediate values:")
        print(" - x_normed (after mHC pre + rmsnorm)")
        print(" - F_attn (after attention)")
        print(" - X_mid (after mHC post)")
        print(" - F_ffn (after MoE)")
        print(" 2. Check mHC B_l doubly-stochastic property")
        print(" 3. Check compressed/SWA visible range parity")
        print(" 4. Check indexer top-k indices validity")
    return 0 if all_pass else 1
if __name__ == "__main__":
    # Use main()'s return value (0 when all layers pass, 1 otherwise) as the exit status.
    raise SystemExit(main())