fix: synchronize CUDA before profiling timestamps for accurate timings; remove the 129K-token paris_tids vocabulary scan

This commit is contained in:
2026-06-01 23:55:26 +00:00
parent 73ae9393da
commit 4017ef2f16

View File

@@ -993,6 +993,7 @@ def main():
prof_embed_layers = 0.0
prof_lm_head = 0.0
prof_sample = 0.0
prof_sample_start = 0.0
for step in range(MAX_NEW_TOKENS):
t1 = time.time()
dec_tid_buf[0] = all_tokens[-1]
@@ -1058,10 +1059,13 @@ def main():
ls = logits.float()
for tid, name in [(THINK_START, 'think_start'), (THINK_END, 'think_end'), (USER_TOKEN, 'user'), (ASSISTANT_TOKEN, 'assistant')]:
print(f" {name}({tid}) logit={ls[0, tid].item():.2f}", flush=True)
# Also check "Paris" token
paris_tids = [t for t in range(min(129280, ls.shape[-1])) if 'Paris' in tokenizer.decode([t])]
if paris_tids:
print(f" Paris tokens: {[(t, ls[0,t].item()) for t in paris_tids[:5]]}", flush=True)
# Paris token check — only check known token IDs, no 129K iteration
for t in [11111, 51119, 60107]:
if t < ls.shape[-1]:
print(f" Paris-candidate({t}) logit={ls[0, t].item():.2f}", flush=True)
# Sync for profiling and error check
if profile: torch.cuda.synchronize()
t_sample_start = time.perf_counter()
# Only sync + validate on first 3 steps and every 20th step (reduces pipeline stalls)
if step < 3 or (step + 1) % 20 == 0:
torch.cuda.synchronize() # catch CUDA errors at source
@@ -1096,6 +1100,7 @@ def main():
all_tokens.append(next_id)
dt = time.time() - t1
if profile: torch.cuda.synchronize()
t_s = time.perf_counter()
# Track thinking state
if next_id == THINK_START: in_thinking = True
@@ -1104,7 +1109,8 @@ def main():
if profile:
prof_embed_layers += (t_layers - t_e)
prof_lm_head += (t_lm - t_layers)
prof_sample += (t_s - t_lm)
prof_sample_start = t_sample_start
prof_sample += (t_s - t_sample_start)
# Diagnostics — reduce CPU syncs, only top-5 every 5 steps
if step % 5 == 0 or step < 5: