feat: add progress bars for expert quantization and post-load conversion

Visual feedback during the slow parts of model loading:
  NVFP4 experts [████████████████░░░░]  80% (26/32)
  NVFP4 convert [██████░░░░░░░░░░░░░░]  30% (layer 20/61)

Updates roughly every 10% (and always on the final item) so it's not spammy.
This commit is contained in:
2026-05-16 04:14:07 +00:00
parent b465579a02
commit 00b766af60

View File

@@ -428,7 +428,12 @@ class DeepseekV4MegaMoEExperts(nn.Module):
l1_fp4, l1_sf, l1_gs = [], [], []
l2_fp4, l2_sf, l2_gs = [], [], []
for e in range(self.num_local_experts):
n_exp = self.num_local_experts
for e in range(n_exp):
if e % max(1, n_exp // 10) == 0 or e == n_exp - 1:
pct = (e + 1) * 100 // n_exp
bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
print(f" NVFP4 experts [{bar}] {pct:3d}% ({e+1}/{n_exp})", flush=True)
# ── L1: gate + up (fused) ──
gate_w = self.w13_weight.data[e, :self.intermediate_size] # (intermediate, hidden//2) uint8
up_w = self.w13_weight.data[e, self.intermediate_size:] # (intermediate, hidden//2) uint8
@@ -1621,7 +1626,12 @@ class DeepseekV4Model(nn.Module):
# Build shard index once for compressor reconstruction (avoids N×M full-shard loads)
_shard_index = self._build_shard_index("/model") if os.path.isdir("/model") else None
n_layers = len(self.layers)
for layer_idx, layer in enumerate(self.layers):
if layer_idx % max(1, n_layers // 10) == 0 or layer_idx == n_layers - 1:
pct = (layer_idx + 1) * 100 // n_layers
bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
print(f" NVFP4 convert [{bar}] {pct:3d}% (layer {layer_idx+1}/{n_layers})", flush=True)
attn = layer.attn
# FP8 conversion: only wo_a