#!/usr/bin/env python3
"""Production FMHA layer comparison test — real model weights, real pipeline.

Strategy:
  1. Run the full production pipeline (single_shot_inference.py forward_layer)
     for all prefill tokens through layers 0-4.
  2. After the LAST prefill token, for each layer, run the production FMHA
     kernel and ALSO the reference FMHA (dequantize KV to BF16, run PyTorch
     SDPA) on the SAME gathered KV, using the same randomly generated query.
  3. Compare raw FMHA output (before inverse RoPE, before output projection).

This isolates the FMHA kernel's accuracy from the rest of the pipeline.

Production values: HD=512, NOPE=448, ROPE=64, H=128, 61 layers, 8 GPUs.
"""

import gc
import json
import math
import os
import sys
import time

import torch
import torch.nn.functional as F

# Directory that holds config.json, the tokenizer files and the model weights.
# Override with the CHECKPOINT_DIR environment variable.
CHECKPOINT_DIR = os.getenv(
    "CHECKPOINT_DIR",
    "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4",
)

# Number of GPUs the layers are spread over (layer i runs on cuda:(i % NUM_GPUS)).
# Override with the NUM_GPUS environment variable.
NUM_GPUS = int(os.getenv("NUM_GPUS", "8"))

# Device used for the embedding table and the per-token id/position tensors.
DEVICE = "cuda:0"


def cosine(a, b):
    """Return the cosine similarity of two tensors as a Python float.

    Both tensors are flattened to 1-D and cast to float32 before the
    comparison, so only their element order and count matter, not their
    original shape or dtype.
    """
    lhs = torch.flatten(a).to(torch.float32)
    rhs = torch.flatten(b).to(torch.float32)
    similarity = F.cosine_similarity(lhs, rhs, dim=0)
    return similarity.item()


def main():
    """Compare the production mixed-FP8 FMHA kernel with PyTorch SDPA per layer.

    Steps:
      1. Read ``config.json`` from ``CHECKPOINT_DIR``, load all weights and
         build the production components for the first ``TEST_LAYERS`` layers
         (layer ``li`` is placed on ``cuda:(li % NUM_GPUS)``).
      2. Phase 1: push a fixed prompt through ``forward_layer`` one token at a
         time so that every layer's ``KVCache`` is populated.
      3. Phase 2: for each layer, gather the cache in mixed format, run
         ``fmha_mixed_fp8_decode_raw`` with a random query, run SDPA on the
         dequantized KV with the same query, and compare the two outputs.

    Returns:
        0 when at least one layer was compared and every compared layer reached
        the cosine threshold; 1 otherwise (including kernel failures, NaN
        similarities and the case where every layer was skipped).
    """
    # Minimum flattened cosine similarity for a layer to count as passing.
    cos_threshold = 0.999

    torch.manual_seed(42)
    banner = "=" * 70
    print(banner)
    print("PRODUCTION FMHA LAYER COMPARISON TEST")
    print(banner)

    with open(os.path.join(CHECKPOINT_DIR, "config.json")) as f:
        cfg = json.load(f)
    n_layers = cfg["num_hidden_layers"]
    H = cfg["hidden_size"]
    hd = cfg["head_dim"]
    n_h = cfg["num_attention_heads"]
    rd = cfg.get("qk_rope_head_dim", 64)
    cr = cfg.get("compress_ratios", [128] * n_layers)
    print(f"Model: {n_layers} layers, {n_h} heads, hd={hd}, rope_dim={rd}")

    from single_shot_inference import (
        load_all_weights, make_nvfp4_linear, get_nvfp4_weight,
        rmsnorm, unweighted_rmsnorm, _apply_rope, build_rope_cache,
        KVCache, Compressor, Indexer, forward_layer, moe_forward,
        _load_moe_weights_stacked, _load_shared_expert_weights,
        _cache_layer_weights_no_experts,
    )
    from dsv4.layers.mhc import mHCLayer, mHCContext
    from dsv4.layers.router import Router
    from dsv4.layers.moe import Nvfp4MoE
    from dsv4.layers.shared_expert import Nvfp4SharedExpert
    from dsv4.layers.grouped_linear import Nvfp4GroupedLinear
    from dsv4.layers.linear import Nvfp4Linear
    from dsv4.ops.quantize import (
        rmsnorm_quantize_nvfp4, mhc_rmsnorm_quantize_nvfp4, dequantize_nvfp4,
        quantize_to_nvfp4,
    )

    print("Loading weights...")
    all_w = load_all_weights(CHECKPOINT_DIR)

    TEST_LAYERS = 5
    o_groups = cfg.get("o_groups", 16)
    o_rank = cfg.get("o_lora_rank", 1024)
    n_ih = cfg.get("index_n_heads", 64)
    ihd = cfg.get("index_head_dim", 128)
    itk = cfg.get("index_topk", 1024)

    # Compress ratio per tested layer; layers beyond the configured list use 128.
    ratios = [cr[li] if li < len(cr) else 128 for li in range(TEST_LAYERS)]

    rope_caches = {
        g: build_rope_cache(65536, rd, f"cuda:{g}", 10000., "yarn", 16., 4096, 32, 1)
        for g in range(NUM_GPUS)
    }

    # Build all production components (same as single_shot main())
    prod_lins, attn_mhcs, ffn_mhcs = {}, {}, {}
    attn_norms, ffn_norms = {}, {}
    compressors, indexers, kv_caches = {}, {}, {}
    routers, moe_runners, se_runners = {}, {}, {}

    for li in range(TEST_LAYERS):
        gpu = li % NUM_GPUS
        dev = f"cuda:{gpu}"
        torch.cuda.set_device(gpu)
        pfx = f"model.layers.{li}.self_attn"
        mlp_pfx = f"model.layers.{li}.mlp"
        ratio = ratios[li]

        # Attention linears
        pl = {}
        pl['q_a'] = make_nvfp4_linear(H, 1536, dev, all_w, pfx, 'q_a_proj')
        # NOTE(review): H is hidden_size here, not the head count (n_h); the
        # output width H * hd looks suspicious — confirm against
        # single_shot_inference before changing.
        pl['q_b'] = make_nvfp4_linear(1536, H * hd, dev, all_w, pfx, 'q_b_proj')
        pl['kv'] = make_nvfp4_linear(H, hd, dev, all_w, pfx, 'kv_proj')
        hpg = n_h // o_groups
        wo_a = Nvfp4GroupedLinear(n_local_groups=o_groups, heads_per_group=hpg,
                                  head_dim=hd, o_lora_rank=o_rank,
                                  max_num_tokens=8192, device=dev)
        oa_w, oa_ws, oa_ws2, oa_isc = get_nvfp4_weight(all_w, pfx, 'o_a_proj')
        if oa_w is not None and oa_ws is not None:
            wo_a.load_nvfp4_weight(
                oa_w.to(dev), oa_ws.to(dev),
                oa_ws2.to(dev) if oa_ws2 is not None else None,
                oa_isc.to(dev) if oa_isc is not None else None)
        else:
            # No NVFP4 tensors for o_a_proj: fall back to a plain weight if present.
            oa_bf = all_w.get(f"{pfx}.o_a_proj.weight")
            if oa_bf is not None:
                wo_a.set_bf16_weight(oa_bf.bfloat16().to(dev))
        pl['o_a'] = wo_a
        wo_a._use_runtime_gsa = True
        pl['o_b'] = make_nvfp4_linear(o_groups * o_rank, H, dev, all_w, pfx, 'o_b_proj')
        prod_lins[li] = pl

        # mHC
        for _tag, blocks, fn_s, base_s, scale_s in [
            ("attn", attn_mhcs, f"model.layers.{li}.attn_hc.fn",
             f"model.layers.{li}.attn_hc.base", f"model.layers.{li}.attn_hc.scale"),
            ("ffn", ffn_mhcs, f"model.layers.{li}.ffn_hc.fn",
             f"model.layers.{li}.ffn_hc.base", f"model.layers.{li}.ffn_hc.scale"),
        ]:
            fn, base, scale = all_w.get(fn_s), all_w.get(base_s), all_w.get(scale_s)
            if fn is not None and base is not None and scale is not None:
                m = mHCLayer(hidden_dim=H, n_hc=4, t_max_sinkhorn=20, device=dev)
                n = 4
                # fn/base are sliced into pre ([0:n]), post ([n:2n]) and comb ([2n:]) parts.
                m.load_weights(
                    W_pre=fn[0:n].to(dev, torch.float32),
                    W_post=fn[n:2*n].to(dev, torch.float32),
                    W_comb=fn[2*n:].to(dev, torch.float32),
                    S_pre=base[0:n].reshape(1, n).to(dev, torch.float32),
                    S_post=base[n:2*n].reshape(n, 1).to(dev, torch.float32),
                    S_comb=base[2*n:].reshape(n, n).to(dev, torch.float32),
                    alpha_pre=scale[0].item(), alpha_post=scale[1].item(),
                    alpha_comb=scale[2].item())
                blocks[li] = m

        an_k = f"model.layers.{li}.input_layernorm.weight"
        if an_k in all_w:
            attn_norms[li] = all_w[an_k].to(dev, torch.float32)
        fn_k = f"model.layers.{li}.post_attention_layernorm.weight"
        if fn_k in all_w:
            ffn_norms[li] = all_w[fn_k].to(dev, torch.float32)

        max_comp = (8192 + ratio - 1) // ratio if ratio > 0 else 0
        kv_caches[li] = KVCache(hd, cfg.get("sliding_window", 128), max_comp=max_comp,
                                device=dev, indexer_key_dim=ihd, compress_ratio=ratio,
                                indexer_top_k=itk, rope_dim=rd)
        if ratio > 0:
            compressors[li] = Compressor(ratio, hd, H, dev)
        if ratio == 4:
            indexers[li] = Indexer(n_ih, ihd, itk, dev)

        # Router
        is_hash = (li < cfg.get("num_hash_layers", 3)) and (f"{mlp_pfx}.gate.tid2eid" in all_w)
        router = Router(hidden_size=H, num_experts=cfg["n_routed_experts"],
                        top_k=cfg.get("num_experts_per_tok", 6),
                        routed_scaling_factor=cfg.get("routed_scaling_factor", 2.5),
                        mode="hash" if is_hash else "dense",
                        vocab_size=cfg.get("vocab_size", 128000) if is_hash else None,
                        device=dev)
        if is_hash:
            router.load_weights(hash_lut=all_w[f"{mlp_pfx}.gate.tid2eid"].to(dev, torch.int32))
        else:
            bias_key = f"{mlp_pfx}.gate.e_score_correction_bias"
            eb = all_w.get(bias_key)
            gate_w, gate_ws, gate_ws2, gate_isc = get_nvfp4_weight(all_w, mlp_pfx, 'gate')
            E = cfg["n_routed_experts"]
            gate_lin = None
            if gate_w is not None and gate_ws is not None:
                gate_lin = Nvfp4Linear(in_features=H, out_features=E, device=dev)
                gate_lin.fp4 = [gate_w.to(dev).view(torch.float4_e2m1fn_x2)
                                if gate_w.dtype == torch.uint8 else gate_w.to(dev)]
                gate_lin.sf = [gate_ws.to(dev)]
                ws2_v = gate_ws2.float().item() if gate_ws2 is not None else 1.0
                isc_v = gate_isc.float().item() if gate_isc is not None else 1.0/(6.0*448.0)
                gate_lin.gs = [1.0]
                gate_lin.ws2 = [torch.tensor([ws2_v], device=dev, dtype=torch.float32)]
                gate_lin._activation_global_scale = isc_v
            else:
                # BF16 gate weight — quantize to NVFP4
                gw = all_w.get(f"{mlp_pfx}.gate.weight")
                if gw is not None:
                    g_bf16 = gw if gw.shape == (E, H) else gw.T.contiguous()
                    g_bf16 = g_bf16.bfloat16().to(dev)
                    g_fp4, g_sf, g_gs = quantize_to_nvfp4(g_bf16)
                    gate_lin = Nvfp4Linear(in_features=H, out_features=E, device=dev)
                    gate_lin.fp4 = [g_fp4]
                    gate_lin.sf = [g_sf]
                    gate_lin.gs = [g_gs]
                    gate_lin.ws2 = [torch.tensor([g_gs], device=dev, dtype=torch.float32)]
                    gate_lin._activation_global_scale = 1.0 / (6.0 * 448.0)
            if gate_lin is not None:
                # Shared tail of both gate-building paths.
                gate_lin._use_runtime_gsa = True
                gate_lin.finalize_weights()
                router.load_nvfp4_gate(gate_lin)
                if eb is None:
                    # Fail with the missing key name instead of an opaque
                    # AttributeError on None.to(...).
                    raise KeyError(f"{bias_key} missing from checkpoint")
                router.load_weights(e_bias=eb.to(dev, torch.float32))
        router.finalize_weights()
        routers[li] = router

        moe = Nvfp4MoE(num_experts=cfg["n_routed_experts"], hidden_size=H,
                       intermediate_size=cfg.get("moe_intermediate_size", 3072),
                       top_k=cfg.get("num_experts_per_tok", 6), device=dev)
        moe.set_swiglu_limit(cfg.get("swiglu_limit", 10.0))
        moe.set_fused_swiglu(True)
        _load_moe_weights_stacked(all_w, li, mlp_pfx, dev, moe, cfg)
        moe._ensure_stacked()
        moe._use_runtime_gsa = True
        moe_runners[li] = moe

        se = Nvfp4SharedExpert(hidden_size=H,
                               intermediate_size=cfg.get("moe_intermediate_size", 3072),
                               device=dev, swiglu_limit=cfg.get("swiglu_limit", 10.0))
        se.set_fused_swiglu(True)
        _load_shared_expert_weights(all_w, li, mlp_pfx, dev, se, cfg)
        se._ensure_initialized()
        se._use_runtime_gsa = True
        se_runners[li] = se
        torch.cuda.empty_cache()

    for li in range(TEST_LAYERS):
        pfx = f"model.layers.{li}.self_attn.compressor"
        dev = f"cuda:{li % NUM_GPUS}"
        if li in compressors:
            compressors[li].load(all_w, pfx, dev=dev)
        if li in indexers:
            indexers[li].load(all_w, f"{pfx}.indexer", dev=dev)
    print("Components built")

    # Embedding + tokenizer
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)
    bos = tokenizer.bos_token_id or 0
    USER_TOKEN, ASSISTANT_TOKEN, THINK_START = 128803, 128804, 128821
    input_ids = [bos, USER_TOKEN]
    input_ids += tokenizer.encode('\n\nThe capital of France is', add_special_tokens=False)
    input_ids.append(ASSISTANT_TOKEN)
    input_ids.append(THINK_START)
    print(f"Input: {len(input_ids)} tokens")

    torch.cuda.set_device(0)
    embed_w = all_w.get("model.embed_tokens.weight")
    embed = torch.nn.Embedding.from_pretrained(embed_w.bfloat16().to(DEVICE))
    devs_list = [f"cuda:{g}" for g in range(NUM_GPUS)]
    layer_w = _cache_layer_weights_no_experts(all_w, TEST_LAYERS, devs_list)
    # Drop the full weight dict and release cached allocator blocks on every GPU.
    del all_w
    gc.collect()
    for g in range(NUM_GPUS):
        torch.cuda.set_device(g)
        torch.cuda.empty_cache()
    torch.cuda.set_device(0)

    # ================================================================
    # PHASE 1: Run full production pipeline to populate KV caches
    # ================================================================
    print("\nPhase 1: Populating KV caches...")
    for pi, tid_val in enumerate(input_ids):
        t1 = time.time()
        tid = torch.tensor([tid_val], dtype=torch.long, device=DEVICE)
        pos = torch.tensor([pi], dtype=torch.long, device=DEVICE)
        tid32 = torch.tensor([tid_val], dtype=torch.int32, device=DEVICE)

        X = mHCLayer.init_state(embed(tid))
        for li in range(TEST_LAYERS):
            gpu = li % NUM_GPUS
            if X.device != torch.device(f"cuda:{gpu}"):
                X = X.to(f"cuda:{gpu}")
            torch.cuda.set_device(gpu)
            X = forward_layer(
                X, layer_w[li], li, cfg, *rope_caches[gpu],
                attn_mhcs.get(li), ffn_mhcs.get(li), attn_norms.get(li), ffn_norms.get(li),
                kv_caches[li], pos, tid32, compressors.get(li), indexers.get(li),
                moe_runners.get(li), se_runners.get(li), routers.get(li),
                prod_lin=prod_lins.get(li), _use_fused_rmsnorm_quantize=True)
        if pi % 5 == 0:
            print(f" Token {pi}/{len(input_ids)}: {time.time()-t1:.2f}s", flush=True)

    # ================================================================
    # PHASE 2: For each layer, gather KV, run production FMHA, compare vs SDPA
    # ================================================================
    print("\nPhase 2: FMHA comparison per layer...")
    # Loop-invariant: import the kernel entry point and compute the softmax
    # scale once instead of once per layer.
    from dsv4.kernels.attention.fmha_mixed_fp8_op import fmha_mixed_fp8_decode_raw
    scale_val = 1.0 / math.sqrt(hd)
    results = {}

    for li in range(TEST_LAYERS):
        gpu = li % NUM_GPUS
        dev = f"cuda:{gpu}"
        torch.cuda.set_device(gpu)
        ratio = ratios[li]
        k_cache = kv_caches[li]

        # Gather KV in mixed format (same as production path): the full gather
        # is used only when compressed entries exist and ratio > 4.
        if k_cache.n_comp > 0 and ratio > 4:
            kv_nope_fp8, kv_nope_scale, kv_rope_bf16 = k_cache.gather_mixed_all()
        else:
            kv_nope_fp8, kv_nope_scale, kv_rope_bf16 = k_cache.gather_mixed_swa_only()

        seq_len = kv_nope_scale.shape[0]
        if seq_len == 0:
            print(f" L{li}: SKIPPED (seq_len=0)")
            continue

        # Generate a test Q (random, on this GPU)
        q_bf16 = torch.randn(1, n_h, 1, hd, dtype=torch.bfloat16, device=dev) * 0.5

        # 1. Run production mixed FP8 FMHA
        try:
            o_prod, _lse_prod = fmha_mixed_fp8_decode_raw(
                q_bf16, kv_nope_fp8, kv_nope_scale, kv_rope_bf16, scale_val, rope_dim=rd)
        except Exception as e:
            # A kernel failure is recorded as a failing layer, not a crash, so
            # the remaining layers are still compared.
            print(f" L{li}: PROD FMHA FAILED: {e}")
            results[li] = {'cos': -1.0, 'error': str(e)}
            continue

        # 2. Reference: dequantize KV, run SDPA
        nope_dequant = (kv_nope_fp8.view(torch.float8_e4m3fn).float()
                        * kv_nope_scale.unsqueeze(-1).float())
        kv_full = torch.cat([nope_dequant.bfloat16(), kv_rope_bf16], dim=-1)  # (N, hd)
        k_4d = kv_full.unsqueeze(0).unsqueeze(0).expand(1, 1, -1, -1)  # (1, 1, N, hd)
        v_4d = k_4d.clone()  # the reference uses the same tensor for K and V
        o_ref = F.scaled_dot_product_attention(q_bf16, k_4d, v_4d, scale=scale_val)

        # 3. Compare
        cos_val = cosine(o_prod, o_ref)
        mag_prod = o_prod.float().abs().max().item()
        mag_ref = o_ref.float().abs().max().item()

        # Per-head cosine
        o_prod_h = o_prod.float().squeeze(2)
        o_ref_h = o_ref.float().squeeze(2)
        if o_prod_h.dim() == 3:
            o_prod_h = o_prod_h.squeeze(0)
        if o_ref_h.dim() == 3:
            o_ref_h = o_ref_h.squeeze(0)
        per_head_cos = F.cosine_similarity(o_prod_h, o_ref_h, dim=-1)
        min_head = per_head_cos.min().item()
        mean_head = per_head_cos.mean().item()

        results[li] = {
            'cos': cos_val, 'mag_prod': mag_prod, 'mag_ref': mag_ref,
            'seq_len': seq_len, 'ratio': ratio,
            'min_head_cos': min_head, 'mean_head_cos': mean_head,
        }
        # NaN compares False against the threshold, so it counts as a failure.
        passed = cos_val >= cos_threshold
        status = "PASS" if passed else "FAIL"
        print(f" L{li}: {status} cos={cos_val:.6f} min_head={min_head:.6f} mean_head={mean_head:.6f} "
              f"|prod|={mag_prod:.4f} |ref|={mag_ref:.4f} seq_len={seq_len} ratio={ratio}")

        if not passed:
            worst = per_head_cos.argsort()[:5]
            worst_cos = [f'{c:.4f}' for c in per_head_cos[worst].tolist()]
            print(f" Worst heads: {worst.tolist()} cos={worst_cos}")

    # Summary
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    # An empty result set means nothing was compared; that must not pass.
    all_pass = bool(results)
    for li in sorted(results):
        r = results[li]
        c = r.get('cos', -1.0)
        # Test "meets threshold" rather than "below threshold" so that a NaN
        # similarity fails the run instead of slipping through.
        passed = c >= cos_threshold
        status = "PASS" if passed else "FAIL"
        if not passed:
            all_pass = False
        print(f" L{li}: {status} cos={c:.6f} seq={r.get('seq_len','?')} ratio={r.get('ratio','?')}")

    print()
    if not results:
        print("NO LAYERS COMPARED — every layer was skipped")
    elif all_pass:
        print(f"ALL PASSED (cos >= {cos_threshold})")
    else:
        print("SOME FAILED — see per-layer output above")
    return 0 if all_pass else 1


# Script entry point: main()'s return value (0 = pass, 1 = fail) becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())