diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py
index 00f01236..3955184e 100644
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -2202,6 +2202,26 @@ class DeepseekV4ForCausalLM(nn.Module):
         if os.environ.get('NVFP4_DEBUG_SYNC', '') == '1':
             torch.cuda.synchronize()
             print("[NVFP4] post-load conversion done, CUDA OK")
+
+        # Post-load NaN scale scan — find any scale tensors that are NaN
+        # after weight loading + post-load conversion
+        nan_attrs = []
+        for name, module in self.named_modules():
+            for attr in ('weight_scale', 'weight_scale_inv', 'weight_scale_2',
+                         'input_scale', 'act_scale'):
+                t = getattr(module, attr, None)
+                if torch.is_tensor(t) and torch.isnan(t.to(torch.float32)).any().item():
+                    nan_attrs.append((name, attr, tuple(t.shape), str(t.dtype)))
+        if nan_attrs:
+            print(f"[POST-LOAD] {len(nan_attrs)} NaN scale tensors after loading:")
+            for n, a, s, d in nan_attrs[:20]:
+                print(f" {n}.{a} shape={s} dtype={d}")
+            if len(nan_attrs) > 20:
+                # The listing is capped at 20 entries; say so instead of
+                # silently truncating.
+                print(f" ... and {len(nan_attrs) - 20} more")
+        else:
+            print("[POST-LOAD] No NaN scale tensors found — scales are clean")
+
         return loaded_params
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py
index c907f898..c89e3a0f 100644
--- a/vllm/patches/deepseek_v4_attention.py
+++ b/vllm/patches/deepseek_v4_attention.py
@@ -377,6 +377,40 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
                   f"inf_any={torch.isinf(zf32).any().item()} "
                   f"shape={tuple(z.shape)} dtype={z.dtype}")
 
+        # wo_b inspection — dump all tensor attributes once
+        if _debug and not hasattr(self, '_wo_b_inspected'):
+            self._wo_b_inspected = True
+            # Compare against None explicitly: a plain `or` would discard a
+            # valid layer_idx of 0 and fall through to the fallback label.
+            layer_idx = getattr(self, 'layer_idx', None)
+            if layer_idx is None:
+                layer_idx = getattr(self, 'layer_name', '?')
+            print(f"[wo_b-inspect L{layer_idx}] type={type(self.wo_b).__name__}")
+            print(f"[wo_b-inspect L{layer_idx}] z (input) nan_frac="
+                  f"{torch.isnan(z.to(torch.float32)).float().mean().item():.4f} "
+                  f"abs_max={z.to(torch.float32).abs().max().item():.4e}")
+            for attr in dir(self.wo_b):
+                if attr.startswith('_'):
+                    continue
+                try:
+                    v = getattr(self.wo_b, attr)
+                except Exception:
+                    # Best-effort debug dump: attributes that raise on
+                    # access are skipped rather than aborting the forward.
+                    continue
+                if torch.is_tensor(v):
+                    vf = v if v.dtype == torch.float32 else v.to(torch.float32)
+                    nf = torch.isnan(vf).float().mean().item()
+                    inf = torch.isinf(vf).any().item()
+                    try:
+                        vmin = vf.min().item()
+                        vmax = vf.max().item()
+                    except Exception:
+                        vmin = vmax = float('nan')
+                    print(f"[wo_b-inspect L{layer_idx}] {attr}: "
+                          f"dtype={v.dtype} shape={tuple(v.shape)} "
+                          f"nan_frac={nf:.4f} inf={inf} min={vmin:.4e} max={vmax:.4e}")
+
         result = self.wo_b(z.flatten(1))
 
         # NaN-trace: check final wo_b output