fix: pass int32 token_ids to hash router (was int64)

2026-06-01 01:08:03 +00:00
parent 62efde5c9f
commit 03c45d4bfb
2 changed files with 8 additions and 4 deletions
--- a/single_shot_PYTORCH_REFERENCE.py
+++ b/single_shot_PYTORCH_REFERENCE.py
@@ -3,7 +3,10 @@

 THIS is a pure-PyTorch reference reimplementation that bypasses every kernel in the production stack.

-IT IS ONLY TO BE USED FOR REFERENCE FOR THE ACTUAL PRODUCTION KERNEL SINGLE SHOT
+IT IS ONLY TO BE USED FOR REFERENCE FOR THE CONSTRUCTION OF THE ACTUAL PRODUCTION KERNEL SINGLE SHOT
+
+THIS FILE WAS MADE BY AN LLM THAT WAS ASKED TO IMPLEMENT THE PRODUCTION KERNEL AND INSTEAD IT JUST REDID IT IN PYTORCH.
+THE FACT THIS FILE EXISTS PISSES ME OFF. IT DEMONSTRATES THAT AI IS FAR FROM INTELLIGENT, IT CANNOT FOLLOW SIMPLE INSTRUCTIONS OR TRULY REASON, AND TRIES TO DO EVERYTHING SHITTY AND FAST.

 Architecture (paper §2, verified against HuggingFace modeling_deepseek_v4.py):
  X_l → mHC.pre_block → RMSNorm → Attention → F_attn → mHC.post_block → X_mid
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -669,9 +669,10 @@ def main():
    print(f"Prefilling {len(generated)} tokens...")
    for pi, tid_val in enumerate(generated):
        t1 = time.time()
-        tid = torch.tensor([tid_val], dtype=torch.long, device='cuda:0')
+        tid_int64 = torch.tensor([tid_val], dtype=torch.long, device='cuda:0')
+        tid = tid_int64.to(torch.int32)  # hash router needs int32
        pos = torch.tensor([pi], dtype=torch.long, device='cuda:0')
-        X = mHCLayer.init_state(embed(tid))
+        X = mHCLayer.init_state(embed(tid_int64))
        for li in range(n_layers):
            gpu = li % NUM_GPUS
            if X.device != torch.device(f"cuda:{gpu}"): X = X.to(f"cuda:{gpu}")
@@ -695,7 +696,7 @@ def main():
        t1 = time.time()
        tid = torch.tensor([all_tokens[-1]], dtype=torch.long, device='cuda:0')
        dec_pos = torch.tensor([len(all_tokens)-1], dtype=torch.long, device='cuda:0')
-        X = mHCLayer.init_state(embed(tid))
+        X = mHCLayer.init_state(embed(tid_int64))
        for li in range(n_layers):
            gpu = li % NUM_GPUS
            if X.device != torch.device(f"cuda:{gpu}"): X = X.to(f"cuda:{gpu}")