From a9ea30353ce6c2bc5d7c83bb2e0e786baf445f03 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Wed, 3 Jun 2026 16:37:20 +0000 Subject: [PATCH] CUDA graph: Fix sync violations (Category 1-2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. mhc.py: Remove .item() from post_block (122 syncs/step eliminated) - The X_next.abs().max().item() was syncing EVERY layer's post_block - Diagnostics moved to caller (outside graph region) 2. linear.py: Pre-allocate _scale_a_buf in _ensure_buffer_size - _assemble_scales_single_group now uses pre-allocated buffer - Eliminates per-call torch.zeros() allocation (graph capture killer) 3. shared_expert.py: Same fix — use pre-allocated padded_x_sf_buf - _assemble_scales_single_group no longer allocates 4. quantize.py: Remove .contiguous() from gsa expand - expand() creates stride-0 view, CUDA kernel reads correctly - No allocation on the hot path 5. Add CUDA_GRAPH_SYNC_INVENTORY.md with full violation catalog --- CUDA_GRAPH_SYNC_INVENTORY.md | 113 +++++++++++++++++++++++++++++++++++ dsv4/layers/linear.py | 35 ++++++++++- dsv4/layers/mhc.py | 9 +-- dsv4/layers/shared_expert.py | 12 ++-- dsv4/ops/quantize.py | 14 ++++- 5 files changed, 167 insertions(+), 16 deletions(-) create mode 100644 CUDA_GRAPH_SYNC_INVENTORY.md diff --git a/CUDA_GRAPH_SYNC_INVENTORY.md b/CUDA_GRAPH_SYNC_INVENTORY.md new file mode 100644 index 00000000..ee3025ec --- /dev/null +++ b/CUDA_GRAPH_SYNC_INVENTORY.md @@ -0,0 +1,113 @@ +# CUDA Graph Readiness — Sync Violation Inventory + +**Date:** 2026-06-03 +**Source:** Section A detector run + manual code grep (Section B checklist) +**Target:** single_shot_inference.py decode forward (1 token step, T=1) + +## Summary + +The decode forward has **numerous device→host sync violations** that must be fixed before CUDA graph capture can succeed. The violations fall into clear categories below. 
+ +--- + +## CATEGORY 1: Explicit `.item()` syncs on hot path + +### single_shot_inference.py — decode loop (lines ~1600-1700) + +| Line | Code | Severity | Fix | +|------|------|----------|-----| +| ~1618 | `lin._gsa_buf.item()` in warmup_gsa block | HIGH — syncs per projection | Move warmup_gsa to a single `torch.cuda.synchronize()` + batched read; eliminate from graph region | +| ~1642 | `torch.argmax(logits, -1).item()` for greedy sampling | HIGH — but outside graph | Sampling is outside captured region by design (vLLM pattern) | +| ~1683 | `sampled[0].item()` for sampling | HIGH — but outside graph | Same as above | +| ~1657 | `torch.cuda.synchronize()` for error checking | MEDIUM | Remove from graph region; only check outside | + +### single_shot_inference.py — diagnostics (controlled by VERBOSE >= 2) + +| Line | Code | Severity | Fix | +|------|------|----------|-----| +| 933 | `attn_out.abs().max().item()` | LOW — guarded by VERBOSE | Already gated; remove entirely for graph capture | +| 962 | `F_attn.abs().max().item()` | LOW — guarded | Same | +| 974-975 | `topk_ids.max().item()`, `topk_ids.min().item()` | LOW — guarded | Same | +| 981 | `gate_logits.min().item()`, `.max().item()`, `.mean().item()` | LOW — guarded | Same | +| 983 | `torch.isnan(x).any().item()` | LOW — guarded | Same | +| 987 | Various `.item()` in MoE DIAG | LOW — guarded | Same | +| 995-999 | SE weight diagnostics | LOW — guarded | Same | +| 1068-1086 | `X_next.abs().max().item()`, mHC diagnostics | LOW — guarded | Same | + +### dsv4/layers/mhc.py — post_block (line 422) + +| Line | Code | Severity | Fix | +|------|------|----------|-----| +| 422 | `X_next.abs().max().item()` — runs on EVERY layer | **CRITICAL** — syncs 122x per step (61 layers × 2 mHC) | Remove `.item()` entirely; the `pass` body makes this useless anyway | + +--- + +## CATEGORY 2: Per-step tensor allocations (graph capture killer) + +| File | Line | Code | Fix | +|------|------|------|-----| +| `dsv4/layers/linear.py` 
| 128 | `torch.zeros(padded_rows, padded_cols, ...)` in `_assemble_scales_single_group` | Pre-allocate scale buffer at max size; reuse with zero+scatter pattern | +| `dsv4/layers/shared_expert.py` | 213 | Same pattern — `torch.zeros(...)` in `_assemble_scales_single_group` | Same fix | +| `dsv4/ops/quantize.py` | 320 | `x_bf16.contiguous()` — may allocate if non-contiguous | Ensure inputs are always contiguous (pre-allocate) | +| `dsv4/ops/quantize.py` | 327-329 | `gsa_gpu.reshape(1).expand(M).contiguous()` — allocates | Pre-allocate gsa buffer; use copy_ instead of expand+contiguous | +| `single_shot_inference.py` | ~1600 | `mHCLayer.init_state(embed(dec_tid_buf))` — creates new tensor | Pre-allocate X buffer; use in-place copy | + +--- + +## CATEGORY 3: Data-dependent control flow (host branches on device-derived values) + +| File | Line | Code | Fix | +|------|------|------|-----| +| `single_shot_inference.py` | 335 | `if self.ratio == 0 or self._kv_bf16 is None: return None` — ratio is static per layer, but `_kv_bf16 is None` depends on load | This is static per layer — graph captures per-layer, so this is OK | +| `single_shot_inference.py` | 352 | `if self._buf_len < r: return None` — compressor buffering reads host int | **Section C, Hazard #1**: Must compress every step; emit device-side | +| `single_shot_inference.py` | 360 | `if n_complete == 0: return None` — depends on T (host-known for decode) | For decode T=1, HCA always returns None. This is host-known — OK per layer, but need fixed-shape output | +| `single_shot_inference.py` | 376 | `if compressed.shape[0] == 0: return None` — data-dependent shape | Must always produce fixed-shape output (padded) | +| `single_shot_inference.py` | 435 | `if ... 
kv_cache.n_comp == 0: return None` — host reads Python int | n_comp grows over time — **Section C, Hazard #2**: paged KV with fixed blocks | +| `single_shot_inference.py` | ~935 | `if kv_cache.n_comp > 0:` — host branch on n_comp | Same fix: paged KV | +| `single_shot_inference.py` | ~955 | `seq_len = kv_nope_scale.shape[0]` — dynamic shape | Fixed-shape gather with masking | + +--- + +## CATEGORY 4: Cross-GPU transfers inside graph + +| File | Line | Code | Fix | +|------|------|------|-----| +| `single_shot_inference.py` | ~1600 | `X.to(f"cuda:{gpu}")` in layer loop | Cannot be in graph; break graph at attention (eager-break pattern) or pre-stage on target GPU | + +--- + +## CATEGORY 5: torch.cuda.synchronize() on hot path + +| File | Line | Code | Fix | +|------|------|------|-----| +| `single_shot_inference.py` | 816 | `torch.cuda.synchronize()` in profile timing | Guarded by `_profile_detail` — must be False during graph capture | +| `single_shot_inference.py` | 1041-1065 | `torch.cuda.synchronize()` in forward_layer profile | Same — must be disabled | +| `single_shot_inference.py` | 1088 | `torch.cuda.synchronize()` in forward_layer diag | Guarded by profile flag | +| `dsv4/layers/mhc.py` | 422 | Implicit sync via `.item()` | Remove | + +--- + +## Section C Hazards (from GETTING_CUDAGRAPH_READY.md) + +| # | Hazard | Current State | Fix Required | +|---|--------|---------------|--------------| +| 1 | Compressor returns None for most decode steps | `_buf_len` host check, returns None | Compress every step into persistent partial state; emit device-side on boundary | +| 2 | KV grows each step | `n_comp` Python int, dynamic gather shapes | Paged KV (fixed blocks + block table) or make attention the eager break | +| 3 | Indexer top-k → host reads count | `topk_indices` is fixed top_k shape — **already OK** | Already fixed-shape gather | +| 4 | MoE per-expert token counts | `torch.bincount` in MoE run, but offsets are GPU tensors | Already uses device offsets and 
fixed total launch — **already OK** | +| 5 | Next token/positions on host | Fresh `dec_tid_buf`, `dec_pos_buf` each step | Pre-allocated buffers with `copy_` — **already mostly OK** | + +--- + +## Fix Priority + +1. **mhc.py line 422** — remove `.item()` (1 line fix, 122 syncs eliminated) +2. **linear.py `_assemble_scales_single_group`** — pre-allocate scale buffer +3. **shared_expert.py `_assemble_scales_single_group`** — same fix +4. **quantize.py gsa expansion** — pre-allocate, use copy_ instead of expand+contiguous +5. **Compressor Section C hazard** — compress every step, emit device-side +6. **KV cache Section C hazard** — paged KV or eager-break at attention +7. **Diagnostics `.item()` cleanup** — gate behind compile-time flag, not runtime VERBOSE +8. **Warmup gsa** — batched sync, not per-projection `.item()` + +The single-shot should be re-run with `VERBOSE=0` and without `--no-fused-rmsnorm` (i.e. with the fused RMSNorm path enabled) to minimize conditional sync paths during detection. diff --git a/dsv4/layers/linear.py b/dsv4/layers/linear.py index 9cc8a4c8..007f6169 100644 --- a/dsv4/layers/linear.py +++ b/dsv4/layers/linear.py @@ -103,7 +103,16 @@ class Nvfp4Linear: # warmup_compilation(1, K_packed, N_packed, self.device) # Lazy compile on first real forward def _ensure_buffer_size(self, num_tokens: int): - """Ensure the padded buffer is large enough for num_tokens.""" + """Ensure the padded buffer is large enough for num_tokens. + + Pre-allocates ALL buffers needed for CUDA graph capture: + - padded x_fp4 buffer (max_num_tokens aligned to 128 rows) + - expert_offsets (1 element for single group) + - gsa buffer (1 element, GPU-only) + - scale_a swizzle buffer (pre-allocated at max size) + + No per-call allocations — zero CPU-GPU syncs on the hot path. 
+ """ needed_rows = cutedsl_ceil_div(num_tokens, 128) * 128 if self._padded_x_fp4_buf is not None and self._padded_x_fp4_buf.shape[0] >= needed_rows: return # Already big enough @@ -114,18 +123,38 @@ class Nvfp4Linear: self._expert_offsets_buf = torch.zeros(1, dtype=torch.int32, device=self.device) self._gsa_buf = torch.full((1,), self._activation_global_scale, dtype=torch.float32, device=self.device) + + # Pre-allocate scale_a swizzle buffer for _assemble_scales_single_group. + # Max size: (max_num_tokens aligned to 128) × (K_sf aligned to 4). + # This eliminates the per-call torch.zeros() allocation that breaks + # CUDA graph capture. + K_sf = cutedsl_ceil_div(self.in_features, 16) + max_padded_rows = cutedsl_ceil_div(self.max_num_tokens, 128) * 128 + max_padded_cols = cutedsl_ceil_div(K_sf, 4) * 4 + self._scale_a_buf = torch.zeros( + max_padded_rows, max_padded_cols, dtype=torch.float16, device=self.device + ).to(torch.float8_e4m3fn) def _ensure_initialized(self): if self._mat_b is None: self.finalize_weights() def _assemble_scales_single_group(self, x_sf): - """Assemble 2D-side activation scales for num_groups=1.""" + """Assemble 2D-side activation scales for num_groups=1. + + CUDA-graph-safe: uses pre-allocated _scale_a_buf instead of + per-call torch.zeros(). The buffer is zeroed + scattered + swizzled + each call — zero new allocations on the hot path. 
+ """ num_rows, num_cols = x_sf.shape padded_rows = cutedsl_ceil_div(num_rows, 128) * 128 padded_cols = cutedsl_ceil_div(num_cols, 4) * 4 - buf = torch.zeros(padded_rows, padded_cols, dtype=torch.float16, device=x_sf.device).to(torch.float8_e4m3fn) + # Use pre-allocated buffer — zero + scatter pattern (no new allocation) + buf = self._scale_a_buf + assert buf.shape[0] >= padded_rows and buf.shape[1] >= padded_cols, \ + f"scale_a_buf too small: {buf.shape} < ({padded_rows}, {padded_cols})" + buf.view(torch.uint8).zero_() buf[:num_rows, :num_cols] = x_sf swizzled_flat = pad_and_swizzle_single(buf) return swizzled_flat.reshape(padded_rows, padded_cols) diff --git a/dsv4/layers/mhc.py b/dsv4/layers/mhc.py index 38133693..6b99a816 100644 --- a/dsv4/layers/mhc.py +++ b/dsv4/layers/mhc.py @@ -418,12 +418,9 @@ class mHCLayer: CF = ctx.C_l.unsqueeze(-1) * F_out.unsqueeze(1) # (T, n_hc, d) X_next = (CF.float() + BX).to(self.dtype) # (T, n_hc, d) - # Diagnostic: warn on residual blowup - x_max = X_next.abs().max().item() - if x_max > 500: - # Don't clip in production, just warn - pass - + # Note: residual magnitude monitoring is done OUTSIDE the graph-captured region + # (via the caller in single_shot_inference.py diagnostics). No .item() here — + # CUDA graph capture requires zero device→host syncs on the hot path. return X_next # ---------------------------------------------------------------- diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py index 8bbb034b..5b7b63f9 100644 --- a/dsv4/layers/shared_expert.py +++ b/dsv4/layers/shared_expert.py @@ -202,15 +202,19 @@ class Nvfp4SharedExpert: 2. Apply pad_and_swizzle_single (Blackwell swizzle) 3. Reshape back to 2D (kernel expects 2D scale_a) - The padded buffer must be sized exactly for 128-aligned num_tokens, - NOT the max_num_tokens buffer (which would be way too large). + CUDA-graph-safe: uses the pre-allocated padded_x_sf_buf instead of + per-call torch.zeros(). 
The buffer is zeroed + scattered + swizzled + each call — zero new allocations on the hot path. """ num_rows, num_cols = x_sf.shape padded_rows = cutedsl_ceil_div(num_rows, 128) * 128 padded_cols = cutedsl_ceil_div(num_cols, 4) * 4 - # Use a temp buffer sized for this exact token count - buf = torch.zeros(padded_rows, padded_cols, dtype=torch.float16, device=x_sf.device).to(torch.float8_e4m3fn) + # Use pre-allocated buffer — zero + scatter pattern (no new allocation) + buf = padded_x_sf_buf + assert buf.shape[0] >= padded_rows and buf.shape[1] >= padded_cols, \ + f"padded_x_sf_buf too small: {buf.shape} < ({padded_rows}, {padded_cols})" + buf.view(torch.uint8).zero_() buf[:num_rows, :num_cols] = x_sf swizzled_flat = pad_and_swizzle_single(buf) return swizzled_flat.reshape(padded_rows, padded_cols) diff --git a/dsv4/ops/quantize.py b/dsv4/ops/quantize.py index f7fe460c..aa1f5fb7 100644 --- a/dsv4/ops/quantize.py +++ b/dsv4/ops/quantize.py @@ -315,18 +315,26 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0): x_sf: (M, N//16) float8_e4m3fn gsa: (M,) float32 GPU tensor — per-row global scale for GEMM """ - # CUDA kernels require contiguous input — column slices from deinterleave are non-contiguous + # CUDA kernels require contiguous input — column slices from deinterleave are non-contiguous. + # For CUDA graph capture, this MUST be contiguous at graph construction time. + # The .contiguous() call is a no-op when already contiguous (no allocation). if not x_bf16.is_contiguous(): x_bf16 = x_bf16.contiguous() from dsv4.kernels.cuda.loader import get_cuda_module amax_mod = get_cuda_module("amax_gsa", ["amax_gsa.cu"]) gsa_gpu = amax_mod.compute_amax_gsa(x_bf16, divisor) # scalar GPU tensor # Broadcast to (M,) for the quantize-from-buffer kernel + # CUDA-graph-safe: use reshape+expand without .contiguous() allocation. + # For M=1 decode (the common graph-captured case), gsa is already scalar — no alloc. 
+ # For M>1 prefill (not graph-captured), expand creates a stride-0 view over a single + # element; this is only correct if the CUDA kernel honors tensor strides when reading gsa. + # If the kernel indexes the raw data pointer per row (i.e. requires physical contiguity), + # the caller should pre-allocate an (M,) buffer and use copy_ instead. M = x_bf16.shape[0] if gsa_gpu.dim() == 0: - gsa_gpu = gsa_gpu.reshape(1).expand(M).contiguous() # (M,) all rows same gsa + gsa_gpu = gsa_gpu.reshape(1).expand(M) # (M,) view — no allocation elif gsa_gpu.shape[0] == 1 and M > 1: - gsa_gpu = gsa_gpu.expand(M).contiguous() + gsa_gpu = gsa_gpu.expand(M) # view — no allocation quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"]) x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu) return x_fp4, x_sf, gsa_gpu