From 00b766af601992d3ef95b9ff79bb56a95cd52dae Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Sat, 16 May 2026 04:14:07 +0000
Subject: [PATCH] feat: add progress bars for expert quantization and post-load
 conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Visual feedback during the slow parts of model loading:
  NVFP4 experts [███████████████░░░░░]  78% (25/32)
  NVFP4 convert [██████░░░░░░░░░░░░░░]  31% (layer 19/61)

Updates roughly every 10% of iterations (plus the final one) so it's not spammy.
---
 vllm/patches/deepseek_v4.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py
index 06bac7eb..d7dd3aff 100644
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -428,7 +428,12 @@ class DeepseekV4MegaMoEExperts(nn.Module):
         l1_fp4, l1_sf, l1_gs = [], [], []
         l2_fp4, l2_sf, l2_gs = [], [], []
 
-        for e in range(self.num_local_experts):
+        n_exp = self.num_local_experts
+        for e in range(n_exp):
+            if e % max(1, n_exp // 10) == 0 or e == n_exp - 1:
+                pct = (e + 1) * 100 // n_exp
+                bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
+                print(f"  NVFP4 experts [{bar}] {pct:3d}% ({e+1}/{n_exp})", flush=True)
             # ── L1: gate + up (fused) ──
             gate_w = self.w13_weight.data[e, :self.intermediate_size]  # (intermediate, hidden//2) uint8
             up_w = self.w13_weight.data[e, self.intermediate_size:]    # (intermediate, hidden//2) uint8
@@ -1621,7 +1626,12 @@ class DeepseekV4Model(nn.Module):
         # Build shard index once for compressor reconstruction (avoids N×M full-shard loads)
         _shard_index = self._build_shard_index("/model") if os.path.isdir("/model") else None
 
+        n_layers = len(self.layers)
         for layer_idx, layer in enumerate(self.layers):
+            if layer_idx % max(1, n_layers // 10) == 0 or layer_idx == n_layers - 1:
+                pct = (layer_idx + 1) * 100 // n_layers
+                bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
+                print(f"  NVFP4 convert [{bar}] {pct:3d}% (layer {layer_idx+1}/{n_layers})", flush=True)
             attn = layer.attn
             
             # FP8 conversion: only wo_a