From 00b766af601992d3ef95b9ff79bb56a95cd52dae Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 16 May 2026 04:14:07 +0000 Subject: [PATCH] feat: add progress bars for expert quantization and post-load conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Visual feedback during the slow parts of model loading: NVFP4 experts [████████████████░░░░] 80% (26/32) NVFP4 convert [██████░░░░░░░░░░░░░░] 30% (20/61) Updates every 10% so it's not spammy. --- vllm/patches/deepseek_v4.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py index 06bac7eb..d7dd3aff 100644 --- a/vllm/patches/deepseek_v4.py +++ b/vllm/patches/deepseek_v4.py @@ -428,7 +428,12 @@ class DeepseekV4MegaMoEExperts(nn.Module): l1_fp4, l1_sf, l1_gs = [], [], [] l2_fp4, l2_sf, l2_gs = [], [], [] - for e in range(self.num_local_experts): + n_exp = self.num_local_experts + for e in range(n_exp): + if e % max(1, n_exp // 10) == 0 or e == n_exp - 1: + pct = (e + 1) * 100 // n_exp + bar = "█" * (pct // 5) + "░" * (20 - pct // 5) + print(f" NVFP4 experts [{bar}] {pct:3d}% ({e+1}/{n_exp})", flush=True) # ── L1: gate + up (fused) ── gate_w = self.w13_weight.data[e, :self.intermediate_size] # (intermediate, hidden//2) uint8 up_w = self.w13_weight.data[e, self.intermediate_size:] # (intermediate, hidden//2) uint8 @@ -1621,7 +1626,12 @@ class DeepseekV4Model(nn.Module): # Build shard index once for compressor reconstruction (avoids N×M full-shard loads) _shard_index = self._build_shard_index("/model") if os.path.isdir("/model") else None + n_layers = len(self.layers) for layer_idx, layer in enumerate(self.layers): + if layer_idx % max(1, n_layers // 10) == 0 or layer_idx == n_layers - 1: + pct = (layer_idx + 1) * 100 // n_layers + bar = "█" * (pct // 5) + "░" * (20 - pct // 5) + print(f" NVFP4 convert [{bar}] {pct:3d}% (layer {layer_idx+1}/{n_layers})", flush=True) attn = layer.attn # FP8 conversion: only wo_a