feat: add progress bars for expert quantization and post-load conversion

Visual feedback during the slow parts of model loading. Example output: `NVFP4 experts [████████████████░░░░] 80% (26/32)` and `NVFP4 convert [██████░░░░░░░░░░░░░░] 30% (layer 20/61)`. The bar is printed roughly every 10% of the items, plus once for the final item, so it's not spammy.
2026-05-16 04:14:07 +00:00
parent b465579a02
commit 00b766af60
1 changed files with 11 additions and 1 deletions
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -428,7 +428,12 @@ class DeepseekV4MegaMoEExperts(nn.Module):
        l1_fp4, l1_sf, l1_gs = [], [], []
        l2_fp4, l2_sf, l2_gs = [], [], []

-        for e in range(self.num_local_experts):
+        n_exp = self.num_local_experts
+        for e in range(n_exp):
+            if e % max(1, n_exp // 10) == 0 or e == n_exp - 1:
+                pct = (e + 1) * 100 // n_exp
+                bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
+                print(f"  NVFP4 experts [{bar}] {pct:3d}% ({e+1}/{n_exp})", flush=True)
            # ── L1: gate + up (fused) ──
            gate_w = self.w13_weight.data[e, :self.intermediate_size]  # (intermediate, hidden//2) uint8
            up_w = self.w13_weight.data[e, self.intermediate_size:]    # (intermediate, hidden//2) uint8
@@ -1621,7 +1626,12 @@ class DeepseekV4Model(nn.Module):
        # Build shard index once for compressor reconstruction (avoids N×M full-shard loads)
        _shard_index = self._build_shard_index("/model") if os.path.isdir("/model") else None

+        n_layers = len(self.layers)
        for layer_idx, layer in enumerate(self.layers):
+            if layer_idx % max(1, n_layers // 10) == 0 or layer_idx == n_layers - 1:
+                pct = (layer_idx + 1) * 100 // n_layers
+                bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
+                print(f"  NVFP4 convert [{bar}] {pct:3d}% (layer {layer_idx+1}/{n_layers})", flush=True)
            attn = layer.attn
            
            # FP8 conversion: only wo_a