feat: add progress bars for expert quantization and post-load conversion
Visual feedback during the slow parts of model loading:

  NVFP4 experts [████████████████░░░░]  80% (26/32)
  NVFP4 convert [██████░░░░░░░░░░░░░░]  30% (layer 20/61)

Updates roughly every 10% of iterations, so it's not spammy.
This commit is contained in:
@@ -428,7 +428,12 @@ class DeepseekV4MegaMoEExperts(nn.Module):
|
||||
l1_fp4, l1_sf, l1_gs = [], [], []
|
||||
l2_fp4, l2_sf, l2_gs = [], [], []
|
||||
|
||||
for e in range(self.num_local_experts):
|
||||
n_exp = self.num_local_experts
|
||||
for e in range(n_exp):
|
||||
if e % max(1, n_exp // 10) == 0 or e == n_exp - 1:
|
||||
pct = (e + 1) * 100 // n_exp
|
||||
bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
|
||||
print(f" NVFP4 experts [{bar}] {pct:3d}% ({e+1}/{n_exp})", flush=True)
|
||||
# ── L1: gate + up (fused) ──
|
||||
gate_w = self.w13_weight.data[e, :self.intermediate_size] # (intermediate, hidden//2) uint8
|
||||
up_w = self.w13_weight.data[e, self.intermediate_size:] # (intermediate, hidden//2) uint8
|
||||
@@ -1621,7 +1626,12 @@ class DeepseekV4Model(nn.Module):
|
||||
# Build shard index once for compressor reconstruction (avoids N×M full-shard loads)
|
||||
_shard_index = self._build_shard_index("/model") if os.path.isdir("/model") else None
|
||||
|
||||
n_layers = len(self.layers)
|
||||
for layer_idx, layer in enumerate(self.layers):
|
||||
if layer_idx % max(1, n_layers // 10) == 0 or layer_idx == n_layers - 1:
|
||||
pct = (layer_idx + 1) * 100 // n_layers
|
||||
bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
|
||||
print(f" NVFP4 convert [{bar}] {pct:3d}% (layer {layer_idx+1}/{n_layers})", flush=True)
|
||||
attn = layer.attn
|
||||
|
||||
# FP8 conversion: only wo_a
|
||||
|
||||
Reference in New Issue
Block a user