diff --git a/single_shot_inference.py b/single_shot_inference.py index 232379a8..88912f8d 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -993,6 +993,7 @@ def main(): prof_embed_layers = 0.0 prof_lm_head = 0.0 prof_sample = 0.0 + prof_sample_start = 0.0 for step in range(MAX_NEW_TOKENS): t1 = time.time() dec_tid_buf[0] = all_tokens[-1] @@ -1058,10 +1059,13 @@ def main(): ls = logits.float() for tid, name in [(THINK_START, 'think_start'), (THINK_END, 'think_end'), (USER_TOKEN, 'user'), (ASSISTANT_TOKEN, 'assistant')]: print(f" {name}({tid}) logit={ls[0, tid].item():.2f}", flush=True) - # Also check "Paris" token - paris_tids = [t for t in range(min(129280, ls.shape[-1])) if 'Paris' in tokenizer.decode([t])] - if paris_tids: - print(f" Paris tokens: {[(t, ls[0,t].item()) for t in paris_tids[:5]]}", flush=True) + # Paris token check — only check known token IDs, no 129K iteration + for t in [11111, 51119, 60107]: + if t < ls.shape[-1]: + print(f" Paris-candidate({t}) logit={ls[0, t].item():.2f}", flush=True) + # Sync for profiling and error check + if profile: torch.cuda.synchronize() + t_sample_start = time.perf_counter() # Only sync + validate on first 3 steps and every 20th step (reduces pipeline stalls) if step < 3 or (step + 1) % 20 == 0: torch.cuda.synchronize() # catch CUDA errors at source @@ -1096,6 +1100,7 @@ def main(): all_tokens.append(next_id) dt = time.time() - t1 + if profile: torch.cuda.synchronize() t_s = time.perf_counter() # Track thinking state if next_id == THINK_START: in_thinking = True @@ -1104,7 +1109,8 @@ def main(): if profile: prof_embed_layers += (t_layers - t_e) prof_lm_head += (t_lm - t_layers) - prof_sample += (t_s - t_lm) + prof_sample_start = t_sample_start + prof_sample += (t_s - t_sample_start) # Diagnostics — reduce CPU syncs, only top-5 every 5 steps if step % 5 == 0 or step < 5: