From e07d79868f2db8d7f903fe473af85b6f73ee38c1 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Wed, 3 Jun 2026 17:02:34 +0000 Subject: [PATCH] CUDA graph: Fix _assemble_scales_single_group swizzle size The pre-allocated buffer is max-sized, but pad_and_swizzle_single operates on the full buffer dimensions. Fix: pass a correctly-sized view (buf[:padded_rows, :padded_cols]) so the swizzle produces the right output size. Same fix applied to both linear.py and shared_expert.py. --- CUDA_GRAPH_SYNC_INVENTORY.md | 12 +++++++++++- dsv4/layers/linear.py | 5 ++++- dsv4/layers/shared_expert.py | 4 +++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/CUDA_GRAPH_SYNC_INVENTORY.md b/CUDA_GRAPH_SYNC_INVENTORY.md index ee3025ec..1602202f 100644 --- a/CUDA_GRAPH_SYNC_INVENTORY.md +++ b/CUDA_GRAPH_SYNC_INVENTORY.md @@ -4,7 +4,17 @@ **Source:** Section A detector run + manual code grep (Section B checklist) **Target:** single_shot_inference.py decode forward (1 token step, T=1) -## Summary +## B200 Detector Results (first run) + +Method 1 (sync debug mode): **1 violation** caught +- `dec_tid_buf[0] = all_tokens[-1]` — CPU→GPU sync from writing Python int to GPU tensor +- **FIXED**: Use pinned CPU buffer + copy_ + +Method 2 (graph capture L0): **FAIL** +- `expert_offsets[g] = (g + 1) * padded_rows_per_group` — CPU→GPU sync in Python loop +- **FIXED**: Pre-allocated range tensor + element-wise multiply + +Both fixes committed and pushed. Re-running detector to verify. The decode forward has **numerous device→host sync violations** that must be fixed before CUDA graph capture can succeed. The violations fall into clear categories below. diff --git a/dsv4/layers/linear.py b/dsv4/layers/linear.py index 007f6169..32b53473 100644 --- a/dsv4/layers/linear.py +++ b/dsv4/layers/linear.py @@ -156,7 +156,10 @@ class Nvfp4Linear: f"scale_a_buf too small: {buf.shape} < ({padded_rows}, {padded_cols})" buf.view(torch.uint8).zero_() buf[:num_rows, :num_cols] = x_sf - swizzled_flat = pad_and_swizzle_single(buf) + # Pass correctly-sized VIEW to swizzle — the swizzle operates on + # (padded_rows, padded_cols) not the full max-size buffer. + view = buf[:padded_rows, :padded_cols] + swizzled_flat = pad_and_swizzle_single(view) return swizzled_flat.reshape(padded_rows, padded_cols) def compute_activation_global_scale(self, hidden_states_sample): diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py index 5b7b63f9..cb936673 100644 --- a/dsv4/layers/shared_expert.py +++ b/dsv4/layers/shared_expert.py @@ -216,7 +216,9 @@ class Nvfp4SharedExpert: f"padded_x_sf_buf too small: {buf.shape} < ({padded_rows}, {padded_cols})" buf.view(torch.uint8).zero_() buf[:num_rows, :num_cols] = x_sf - swizzled_flat = pad_and_swizzle_single(buf) + # Pass correctly-sized VIEW to swizzle — avoids processing the full max-size buffer + view = buf[:padded_rows, :padded_cols] + swizzled_flat = pad_and_swizzle_single(view) return swizzled_flat.reshape(padded_rows, padded_cols) def compute_activation_global_scales(self, hidden_states_sample):