Fix prompt encoding: remove \n\n before content per official DSV4 spec; add --chat-mode

This commit is contained in:
2026-06-03 08:19:33 +00:00
parent 2a42686e8e
commit 1e77dfcaa0

View File

@@ -24,6 +24,7 @@ def parse_args():
p.add_argument('--top-k', type=int, default=50, help='Top-k filtering (0=disabled)')
p.add_argument('--top-p', type=float, default=0.95, help='Top-p (nucleus) filtering (1.0=disabled)')
p.add_argument('--prompt', type=str, default=None)
p.add_argument('--chat-mode', action='store_true', help='Chat mode: close thinking block after Assistant token (no reasoning)')
p.add_argument('--seed', type=int, default=42)
p.add_argument('--verbose', type=int, default=1)
p.add_argument('--prefill-only', action='store_true')
@@ -1460,16 +1461,20 @@ def main():
if _args.prefill_tokens:
generated = [int(x) for x in _args.prefill_tokens.split(',')]
else:
# Official DeepSeek V4 encoding (from encoding/encoding_dsv4.py):
# <BOS><User>{msg}<Assistant>ately (thinking mode)
# <BOS><User>{msg}<Assistant>heroically (chat mode — closes thinking immediately)
# NOTE: No \n\n between User token and content — the official spec has no separator.
# The \n\n was wrong and made the prompt out-of-distribution.
input_ids = [bos, USER_TOKEN]
input_ids += tokenizer.encode('\n\n' + PROMPT, add_special_tokens=False)
input_ids += tokenizer.encode(PROMPT, add_special_tokens=False)
input_ids.append(ASSISTANT_TOKEN)
# DSV4 reasoning model: must prime with ◇ (think_start) after Assistant token.
# Without this, the model is out-of-distribution — it expects to be inside a
# thinking block but never received the think-start sentinel.
# Symptom: degenerate output from step 0 (e.g. "France" instead of "Paris",
# looping on newlines/repeated tokens). With ◇, the model generates thinking
# content, emits ◇ (think_end), then produces the actual answer.
input_ids.append(THINK_START)
if _args.chat_mode:
# Chat mode: close thinking block immediately → model generates content directly
input_ids.append(THINK_END)
else:
# Thinking mode: open thinking block → model reasons first, then answers
input_ids.append(THINK_START)
generated = input_ids
all_tokens = generated.copy()
print(f"Input: {len(generated)} tokens")