fix: synchronize CUDA before profiling timestamps + replace 129K-token paris_tids vocab scan with fixed token-ID checks

2026-06-01 23:55:26 +00:00
parent 73ae9393da
commit 4017ef2f16
1 changed files with 11 additions and 5 deletions
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -993,6 +993,7 @@ def main():
    prof_embed_layers = 0.0
    prof_lm_head = 0.0
    prof_sample = 0.0
+    prof_sample_start = 0.0
    for step in range(MAX_NEW_TOKENS):
        t1 = time.time()
        dec_tid_buf[0] = all_tokens[-1]
@@ -1058,10 +1059,13 @@ def main():
            ls = logits.float()
            for tid, name in [(THINK_START, 'think_start'), (THINK_END, 'think_end'), (USER_TOKEN, 'user'), (ASSISTANT_TOKEN, 'assistant')]:
                print(f"  {name}({tid}) logit={ls[0, tid].item():.2f}", flush=True)
-            # Also check "Paris" token
-            paris_tids = [t for t in range(min(129280, ls.shape[-1])) if 'Paris' in tokenizer.decode([t])]
-            if paris_tids:
-                print(f"  Paris tokens: {[(t, ls[0,t].item()) for t in paris_tids[:5]]}", flush=True)
+            # Paris token check — only check known token IDs, no 129K iteration
+            for t in [11111, 51119, 60107]:
+                if t < ls.shape[-1]:
+                    print(f"  Paris-candidate({t}) logit={ls[0, t].item():.2f}", flush=True)
+        # Profiling only: sync so the sampling timer starts after queued GPU work has finished
+        if profile: torch.cuda.synchronize()
+        t_sample_start = time.perf_counter()
        # Only sync + validate on first 3 steps and every 20th step (reduces pipeline stalls)
        if step < 3 or (step + 1) % 20 == 0:
            torch.cuda.synchronize()  # catch CUDA errors at source
@@ -1096,6 +1100,7 @@ def main():
        all_tokens.append(next_id)
        dt = time.time() - t1

+        if profile: torch.cuda.synchronize()
        t_s = time.perf_counter()
        # Track thinking state
        if next_id == THINK_START: in_thinking = True
@@ -1104,7 +1109,8 @@ def main():
        if profile:
            prof_embed_layers += (t_layers - t_e)
            prof_lm_head += (t_lm - t_layers)
-            prof_sample += (t_s - t_lm)
+            prof_sample_start = t_sample_start
+            prof_sample += (t_s - t_sample_start)

        # Diagnostics — reduce CPU syncs, only top-5 every 5 steps
        if step % 5 == 0 or step < 5: