From ba915dbd530fc67a8fbf41a293757ef2d11e1d3e Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sun, 31 May 2026 21:41:31 +0000
Subject: [PATCH] add probe_shapes script

---
 probe_shapes.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 probe_shapes.py

diff --git a/probe_shapes.py b/probe_shapes.py
new file mode 100644
index 00000000..8806d70f
--- /dev/null
+++ b/probe_shapes.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Probe actual checkpoint weight shapes for DSV4-Pro NVFP4.
+Run on B200: prints key/shape for attention, compressor, mHC, FFN, global weights.
+"""
+import json, os, sys
+from safetensors.torch import load_file
+from pathlib import Path
+
+CHECKPOINT_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
+cdir = Path(CHECKPOINT_DIR)
+
+# Load index
+with open(cdir / "model.safetensors.index.json") as f:
+    wmap = json.load(f)["weight_map"]
+
+# 1. Find actual key patterns for layer 0 and 2 (HCA and CSA)
+for li in [0, 2]:
+    print(f"\n=== Layer {li} keys ===")
+    pfx = f"model.layers.{li}."
+    keys = sorted([k for k in wmap if k.startswith(pfx) and not 'experts.' in k and not 'shared_expert' in k])
+    for k in keys:
+        print(f"  {k}")
+    # Expert/shared count
+    expert_keys = [k for k in wmap if k.startswith(pfx) and 'experts.' in k]
+    shared_keys = [k for k in wmap if k.startswith(pfx) and 'shared_expert' in k]
+    print(f"  + {len(expert_keys)} expert keys, {len(shared_keys)} shared expert keys")
+
+# 2. Global keys
+print(f"\n=== Global keys ===")
+global_keys = sorted([k for k in wmap if not k.startswith("model.layers.")])
+for k in global_keys:
+    print(f"  {k}")
+
+# 3. Load ONE shard to get actual shapes for layer 0
+# Find shards containing layer 0 attention keys
+pfx0 = f"model.layers.0."
+attn_like_keys = [k for k in wmap if k.startswith(pfx0) and ('self_attn' in k or 'attn' in k or 'compressor' in k or 'hyper' in k or 'mhc' in k or 'input_layernorm' in k or 'post_attention_layernorm' in k)]
+print(f"\n=== Layer 0 attn-like keys ===")
+for k in attn_like_keys:
+    print(f"  {k} -> shard {wmap[k]}")
+
+# Load the first shard with layer 0 data
+shards_needed = set(wmap[k] for k in attn_like_keys)
+print(f"\nShards with layer 0 attn data: {shards_needed}")
+
+for sn in sorted(shards_needed)[:2]:
+    if not (cdir / sn).exists(): continue
+    w = load_file(str(cdir / sn))
+    for k in sorted(w.keys()):
+        if k.startswith(pfx0) and 'experts.' not in k:
+            print(f"  {k}: {tuple(w[k].shape)} dtype={w[k].dtype}")
+
+# 4. Config
+print(f"\n=== Config ===")
+with open(cdir / "config.json") as f:
+    cfg = json.load(f)
+for k in ['num_hidden_layers', 'hidden_size', 'num_attention_heads', 'head_dim',
+          'qk_rope_head_dim', 'n_routed_experts', 'num_experts_per_tok',
+          'routed_scaling_factor', 'num_output_groups', 'output_group_dim',
+          'query_compression_dim', 'compress_ratios', 'rope_theta',
+          'intermediate_size', 'kv_lora_rank', 'q_lora_rank',
+          'num_key_value_heads', 'indexer_num_heads', 'indexer_head_dim',
+          'n_shared_experts', 'first_k_dense_replace']:
+    if k in cfg:
+        v = cfg[k]
+        if isinstance(v, list) and len(v) > 10:
+            print(f"  {k}: len={len(v)}, first5={v[:5]}")
+        else:
+            print(f"  {k}: {v}")