203 lines
8.7 KiB
Python
203 lines
8.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
Test A: Compare moe_pipeline output vs CuTeDSLMoERunner output.

Uses the same weights and inputs. If they differ, the runner is broken.

Runs on the B200 host (not inside Docker):
    source /root/nvfp4-megamoe-kernel/tests/.venv/bin/activate
    python3 tests/test_runner_vs_pipeline.py
"""
|
|
import os, sys, json, torch
|
|
from safetensors import safe_open
|
|
|
|
# Parent of the directory containing this file. It is placed first on
# sys.path so the imports that follow are resolved against it before any
# other location.
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, REPO_ROOT)
|
|
|
|
from cutedsl.moe_pipeline import run_nvfp4_moe
|
|
from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
|
|
from cutedsl.bridge import quantize_to_nvfp4, quantize_weight_to_nvfp4, make_b_k_major, assemble_scales_3d_side, compute_expert_offsets
|
|
|
|
# Checkpoint directory that main() passes to load_layer_tensors().
MODEL_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
# Torch device that weights and test inputs are moved to / created on.
DEVICE = "cuda"
# Index of the layer whose expert tensors are loaded and compared.
LAYER_IDX = 0
# Decode table for 4-bit codes, indexed by the code value: entries 0-7 are the
# non-negative magnitudes and entries 8-15 are the same magnitudes negated.
E2M1_LUT = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
                         -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0], dtype=torch.float32)
|
|
|
|
|
|
def find_shards(model_dir):
    """Map each tensor name in the checkpoint index to its shard file path.

    Reads ``model.safetensors.index.json`` from *model_dir* and joins every
    shard file name found in its ``"weight_map"`` entry with *model_dir*.

    Args:
        model_dir: Directory expected to contain the index file.

    Returns:
        A dict ``{tensor_name: shard_path}``. The dict is empty when the
        index file does not exist.

    Raises:
        KeyError: If the index file exists but has no ``"weight_map"`` entry.
    """
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if not os.path.exists(index_path):
        return {}
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # locale's preferred encoding.
    with open(index_path, encoding="utf-8") as f:
        index = json.load(f)
    return {
        key: os.path.join(model_dir, shard)
        for key, shard in index["weight_map"].items()
    }
|
|
|
|
|
|
def load_layer_tensors(model_dir, layer_idx):
    """Load every indexed tensor that belongs to one layer.

    Tensor names come from ``find_shards(model_dir)``. A leading ``"model."``
    is stripped from each name, and a tensor is kept when the stripped name
    starts with ``"layers.{layer_idx}."``.

    Args:
        model_dir: Checkpoint directory passed through to ``find_shards``.
        layer_idx: Layer number used to build the name prefix.

    Returns:
        A dict ``{stripped_name: tensor}`` in index order. Names listed in
        the index but absent from their shard file are skipped.
    """
    # Local import: only this function needs it.
    from contextlib import ExitStack

    layer_prefix = f"layers.{layer_idx}."
    tensors = {}
    with ExitStack() as stack:
        # shard path -> (open handle, set of names stored in that shard).
        # Each shard is opened once and stays open until all keys are read,
        # instead of being re-opened for every matching key.
        open_shards = {}
        for key, shard in find_shards(model_dir).items():
            norm_key = key.removeprefix("model.")
            if not norm_key.startswith(layer_prefix):
                continue
            if shard not in open_shards:
                handle = stack.enter_context(safe_open(shard, framework="pt"))
                open_shards[shard] = (handle, set(handle.keys()))
            handle, available = open_shards[shard]
            if key in available:
                tensors[norm_key] = handle.get_tensor(key)
    return tensors
|
|
|
|
|
|
def dequantize_nvfp4_weight(packed_uint8, scale_e4m3, global_scale):
    """Expand a packed 4-bit weight matrix into bfloat16 values.

    Each byte of *packed_uint8* holds two 4-bit codes that are decoded through
    ``E2M1_LUT``: the low nibble becomes the even-indexed output column and
    the high nibble the following odd-indexed column.

    Args:
        packed_uint8: 2-D tensor of packed bytes, indexed as (row, packed column).
        scale_e4m3: Per-block scales; each column entry is repeated over 16
            consecutive output columns of the same row.
        global_scale: Extra multiplier applied to every element.

    Returns:
        A bfloat16 tensor with the same number of rows and twice the number
        of columns of *packed_uint8*.
    """
    dev = packed_uint8.device
    table = E2M1_LUT.to(dev)

    rows = packed_uint8.shape[0]
    cols = packed_uint8.shape[1] * 2

    low_vals = table[(packed_uint8 & 0x0F).long()]
    high_vals = table[((packed_uint8 >> 4) & 0x0F).long()]
    # Pair (low, high) per byte, then flatten the pairs along the row so the
    # low nibble lands at column 2*c and the high nibble at column 2*c + 1.
    values = torch.stack((low_vals, high_vals), dim=-1).reshape(rows, cols)

    # One scale per 16 output columns; trim any overshoot past the row width.
    per_element_scale = scale_e4m3.float().repeat_interleave(16, dim=1)[:, :cols]

    dequantized = values * per_element_scale * global_scale
    return dequantized.to(torch.bfloat16)
|
|
|
|
|
|
def prepare_direct_weights(nvfp4_tensors, layer_idx, expert_indices, intermediate_size):
    """Build per-expert weight lists via the direct view-cast path (same as layertest).

    For every expert in *expert_indices* the gate and up projections are
    concatenated along dim 0, reinterpreted as ``torch.float4_e2m1fn_x2`` and
    transposed. Their block scales are concatenated and transposed the same
    way. When the two projections carry different ``weight_scale_2`` values,
    the block scales are rescaled so that the larger of the two can serve as
    the single global scale. The down projection is view-cast and transposed
    without further changes.

    Args:
        nvfp4_tensors: Mapping from tensor name to tensor, keyed as
            ``layers.{layer_idx}.mlp.experts.{e}.{proj}.{field}``.
        layer_idx: Layer number used to build the tensor names.
        expert_indices: Expert numbers to process, in output order.
        intermediate_size: Column at which the fused scale tensor switches
            from the gate part to the up part.

    Returns:
        Dict with keys ``l1_fp4``, ``l1_sf``, ``l1_gs``, ``l2_fp4``, ``l2_sf``
        and ``l2_gs``; each value is a list with one entry per expert.
    """
    def lookup(expert, proj, field):
        return nvfp4_tensors[f"layers.{layer_idx}.mlp.experts.{expert}.{proj}.{field}"]

    result = {'l1_fp4': [], 'l1_sf': [], 'l1_gs': [],
              'l2_fp4': [], 'l2_sf': [], 'l2_gs': []}

    for expert in expert_indices:
        # ── Layer 1: fused gate + up projection ──
        gate_w = lookup(expert, "gate_proj", "weight").to(DEVICE)
        up_w = lookup(expert, "up_proj", "weight").to(DEVICE)
        gate_sf = lookup(expert, "gate_proj", "weight_scale").to(DEVICE)
        up_sf = lookup(expert, "up_proj", "weight_scale").to(DEVICE)
        gate_gs = lookup(expert, "gate_proj", "weight_scale_2").item()
        up_gs = lookup(expert, "up_proj", "weight_scale_2").item()

        stacked_w = torch.cat([gate_w, up_w], dim=0)
        stacked_w_fp4 = stacked_w.view(torch.float4_e2m1fn_x2).permute(1, 0).contiguous()
        stacked_sf = torch.cat([gate_sf, up_sf], dim=0).permute(1, 0).contiguous()

        shared_gs = max(gate_gs, up_gs)
        if gate_gs != up_gs:
            # Fold each projection's own global scale into its block scales,
            # relative to the shared (larger) global scale.
            rescaled = stacked_sf.float()
            rescaled[:, :intermediate_size] *= (gate_gs / shared_gs)
            rescaled[:, intermediate_size:] *= (up_gs / shared_gs)
            stacked_sf = rescaled.to(torch.float8_e4m3fn)

        result['l1_fp4'].append(stacked_w_fp4)
        result['l1_sf'].append(stacked_sf)
        result['l1_gs'].append(shared_gs)

        # ── Layer 2: down projection ──
        down_w = lookup(expert, "down_proj", "weight").to(DEVICE)
        down_sf = lookup(expert, "down_proj", "weight_scale").to(DEVICE)
        down_gs = lookup(expert, "down_proj", "weight_scale_2").item()

        result['l2_fp4'].append(down_w.view(torch.float4_e2m1fn_x2).permute(1, 0).contiguous())
        result['l2_sf'].append(down_sf.permute(1, 0).contiguous())
        result['l2_gs'].append(down_gs)

    return result
|
|
|
|
|
|
def _build_runner(weights, num_experts, hidden_size, intermediate_size, activation_gs):
    """Create a CuTeDSLMoERunner loaded with cloned weights and one activation scale."""
    runner = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE)
    # Clone tensors / copy lists so each runner gets its own inputs and the
    # shared `weights` dict is not handed to the runner directly.
    runner.prepare_weights_direct(
        [w.clone() for w in weights['l1_fp4']],
        [w.clone() for w in weights['l1_sf']],
        list(weights['l1_gs']),
        [w.clone() for w in weights['l2_fp4']],
        [w.clone() for w in weights['l2_sf']],
        list(weights['l2_gs']),
    )
    # NOTE(review): the same scale is applied to both layers, as in the
    # original script — confirm that layer 2 should not use its own scale.
    runner._l1_activation_global_scale = activation_gs
    runner._l2_activation_global_scale = activation_gs
    return runner


def _flat_cosine(a, b):
    """Return the cosine similarity of two tensors flattened to float32 vectors."""
    return torch.nn.functional.cosine_similarity(
        a.flatten().unsqueeze(0).float(),
        b.flatten().unsqueeze(0).float(),
    ).item()


def main():
    """Compare run_nvfp4_moe against CuTeDSLMoERunner on identical inputs.

    Three outputs are produced from the same weights, hidden states and
    routing:

    1. ``run_nvfp4_moe`` (the reference).
    2. ``CuTeDSLMoERunner`` with the activation global scale taken from the
       checkpoint's ``input_scale`` tensor.
    3. ``CuTeDSLMoERunner`` with the activation global scale derived from the
       input's absolute maximum.

    The cosine similarity of (2) and (3) against (1) is printed, followed by
    a one-line verdict. Nothing is returned.
    """
    torch.manual_seed(42)
    expert_indices = [0, 1, 2]
    num_experts = len(expert_indices)
    hidden_size = 7168
    intermediate_size = 3072
    num_tokens = 4
    # Verdict thresholds on cosine similarity.
    match_threshold = 0.95
    broken_threshold = 0.5

    print("=" * 70)
    print(" Loading checkpoint")
    print("=" * 70)
    nvfp4_tensors = load_layer_tensors(MODEL_DIR, LAYER_IDX)
    print(f" {len(nvfp4_tensors)} tensors loaded")

    weights = prepare_direct_weights(nvfp4_tensors, LAYER_IDX, expert_indices, intermediate_size)

    # Every token is routed to experts 0 and 1 with fixed weights 0.6 / 0.4.
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=torch.bfloat16, device=DEVICE) * 2.0
    expert_ids = torch.tensor([[0, 1]] * num_tokens, dtype=torch.int32, device=DEVICE)
    expert_weights = torch.tensor([[0.6, 0.4]] * num_tokens, dtype=torch.float32, device=DEVICE)

    # ── Path 1: moe_pipeline (reference, uses quantize_to_nvfp4) ──
    print("\n Running moe_pipeline (dynamic gs)...")
    pipeline_out = run_nvfp4_moe(
        hidden_states.clone(), expert_ids.clone(), expert_weights.clone(),
        weights, expert_indices,
    )
    print(f" Pipeline: amax={pipeline_out.abs().max():.4f}, mean={pipeline_out.float().mean():.6f}")

    # Build topk_weights and topk_ids in the format the runner expects
    # runner.run expects topk_ids as expert indices (0-based within our expert set)
    topk_weights = expert_weights
    topk_ids = expert_ids

    # ── Path 2: CuTeDSLMoERunner with checkpoint input_scale (what vLLM uses) ──
    print("\n Running CuTeDSLMoERunner (checkpoint gs)...")
    # Set checkpoint input_scale (what vLLM does in finalize_weights)
    igs = nvfp4_tensors[f"layers.{LAYER_IDX}.mlp.experts.0.gate_proj.input_scale"].item()
    runner = _build_runner(weights, num_experts, hidden_size, intermediate_size, igs)
    print(f" Checkpoint input_scale: {igs:.10f}")

    runner_out = runner.run(hidden_states.clone(), topk_weights, topk_ids)
    print(f" Runner (ckpt gs): amax={runner_out.abs().max():.4f}, mean={runner_out.float().mean():.6f}")

    cos_ckpt = _flat_cosine(runner_out, pipeline_out)
    print(f" Cosine vs pipeline: {cos_ckpt:.6f}")

    # ── Path 3: CuTeDSLMoERunner with dynamic gs ──
    print("\n Running CuTeDSLMoERunner (dynamic gs)...")
    # We can't use quantize_to_nvfp4 in the runner (cudagraph), but we can
    # compute the gs from the input and set it before calling run
    x_igs = (hidden_states.abs().max().item()) / (6.0 * 448.0)
    runner2 = _build_runner(weights, num_experts, hidden_size, intermediate_size, x_igs)
    print(f" Dynamic gs (from input amax): {x_igs:.10f}")

    runner2_out = runner2.run(hidden_states.clone(), topk_weights, topk_ids)
    print(f" Runner (dynamic gs): amax={runner2_out.abs().max():.4f}, mean={runner2_out.float().mean():.6f}")

    cos_dyn = _flat_cosine(runner2_out, pipeline_out)
    print(f" Cosine vs pipeline: {cos_dyn:.6f}")

    # ── Summary ──
    print(f"\n{'=' * 70}")
    print(" RESULTS")
    print(f"{'=' * 70}")
    print(f" Runner with checkpoint gs vs pipeline: {cos_ckpt:.6f}")
    print(f" Runner with dynamic gs vs pipeline: {cos_dyn:.6f}")
    if cos_ckpt > match_threshold and cos_dyn > match_threshold:
        # Without this branch a fully passing run would be reported as
        # "gs is the only bug" even though nothing is wrong.
        print(" ✅ Both runner paths match the pipeline — no discrepancy found")
    elif cos_dyn > match_threshold:
        print(" ✅ Dynamic gs fixes the problem — gs is the only bug")
    elif cos_dyn < broken_threshold and cos_ckpt < broken_threshold:
        print(" ❌ Both runner paths are broken — scale assembly is also wrong")
    else:
        print(" ⚠️ Partial match — multiple issues")
|
|
|
|
|
|
# Run the comparison only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|