diff --git a/single_shot_inference.py b/single_shot_inference.py
index 232379a8..88912f8d 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -993,6 +993,7 @@ def main():
     prof_embed_layers = 0.0
     prof_lm_head = 0.0
     prof_sample = 0.0
+    prof_sample_start = 0.0
     for step in range(MAX_NEW_TOKENS):
         t1 = time.time()
         dec_tid_buf[0] = all_tokens[-1]
@@ -1058,10 +1059,13 @@ def main():
             ls = logits.float()
             for tid, name in [(THINK_START, 'think_start'), (THINK_END, 'think_end'), (USER_TOKEN, 'user'), (ASSISTANT_TOKEN, 'assistant')]:
                 print(f"  {name}({tid}) logit={ls[0, tid].item():.2f}", flush=True)
-            # Also check "Paris" token
-            paris_tids = [t for t in range(min(129280, ls.shape[-1])) if 'Paris' in tokenizer.decode([t])]
-            if paris_tids:
-                print(f"  Paris tokens: {[(t, ls[0,t].item()) for t in paris_tids[:5]]}", flush=True)
+            # Paris token check — only check known token IDs, no 129K iteration
+            for t in [11111, 51119, 60107]:
+                if t < ls.shape[-1]:
+                    print(f"  Paris-candidate({t}) logit={ls[0, t].item():.2f}", flush=True)
+        # When profiling, wait for queued GPU work so t_sample_start marks a true phase boundary
+        if profile: torch.cuda.synchronize()
+        t_sample_start = time.perf_counter()
         # Only sync + validate on first 3 steps and every 20th step (reduces pipeline stalls)
         if step < 3 or (step + 1) % 20 == 0:
             torch.cuda.synchronize()  # catch CUDA errors at source
@@ -1096,6 +1100,7 @@ def main():
         all_tokens.append(next_id)
         dt = time.time() - t1
 
+        if profile: torch.cuda.synchronize()
         t_s = time.perf_counter()
         # Track thinking state
         if next_id == THINK_START: in_thinking = True
@@ -1104,7 +1109,8 @@ def main():
         if profile:
             prof_embed_layers += (t_layers - t_e)
             prof_lm_head += (t_lm - t_layers)
-            prof_sample += (t_s - t_lm)
+            prof_sample_start = t_sample_start
+            prof_sample += (t_s - t_sample_start)
 
         # Diagnostics — reduce CPU syncs, only top-5 every 5 steps
         if step % 5 == 0 or step < 5: