fix: descriptive tqdm labels — uint8→NVFP4 and NVFP4→FP8/BF16
Makes it crystal clear what's happening:
- Experts: direct uint8→float4 view-cast (Blackwell native, no BF16)
- Convert: NVFP4→FP8/BF16 for attention weights (non-expert path)
This commit is contained in:
@@ -429,7 +429,7 @@ class DeepseekV4MegaMoEExperts(nn.Module):
|
||||
l2_fp4, l2_sf, l2_gs = [], [], []
|
||||
|
||||
from tqdm import tqdm
|
||||
for e in tqdm(range(self.num_local_experts), desc=" NVFP4 experts", unit="exp"):
|
||||
for e in tqdm(range(self.num_local_experts), desc=" uint8→NVFP4 experts", unit="exp"):
|
||||
# ── L1: gate + up (fused) ──
|
||||
gate_w = self.w13_weight.data[e, :self.intermediate_size] # (intermediate, hidden//2) uint8
|
||||
up_w = self.w13_weight.data[e, self.intermediate_size:] # (intermediate, hidden//2) uint8
|
||||
@@ -1623,7 +1623,7 @@ class DeepseekV4Model(nn.Module):
|
||||
_shard_index = self._build_shard_index("/model") if os.path.isdir("/model") else None
|
||||
|
||||
from tqdm import tqdm
|
||||
for layer_idx, layer in tqdm(enumerate(self.layers), total=len(self.layers), desc=" NVFP4 convert", unit="layer"):
|
||||
for layer_idx, layer in tqdm(enumerate(self.layers), total=len(self.layers), desc=" NVFP4→FP8/BF16 convert", unit="layer"):
|
||||
attn = layer.attn
|
||||
|
||||
# FP8 conversion: only wo_a
|
||||
|
||||
Reference in New Issue
Block a user