From 03c45d4bfb7e9c2fa515387b4dcd69ad91791f93 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Mon, 1 Jun 2026 01:08:03 +0000
Subject: [PATCH] fix: pass int32 token_ids to hash router (was int64)

---
 single_shot_PYTORCH_REFERENCE.py | 5 ++++-
 single_shot_inference.py         | 7 ++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/single_shot_PYTORCH_REFERENCE.py b/single_shot_PYTORCH_REFERENCE.py
index 5f443223..64ccad80 100644
--- a/single_shot_PYTORCH_REFERENCE.py
+++ b/single_shot_PYTORCH_REFERENCE.py
@@ -3,7 +3,10 @@
 
 THIS is a pure-PyTorch reference reimplementation that bypasses every kernel in the production stack.
 
-IT IS ONLY TO BE USED FOR REFERENCE FOR THE ACTUAL PRODUCTION KERNEL SINGLE SHOT
+IT IS ONLY TO BE USED FOR REFERENCE FOR THE CONSTRUCTION OF THE ACTUAL PRODUCTION KERNEL SINGLE SHOT
+
+THIS FILE WAS MADE BY AN LLM THAT WAS ASKED TO IMPLEMENT THE PRODUCTION KERNEL AND INSTEAD IT JUST REDID IT IN PYTORCH.
+THE FACT THIS FILE EXISTS PISSES ME OFF. IT DEMONSTRATES THAT AI IS FAR FROM INTELLIGENT, IT CAN NOT FOLLOW SIMPLE INSTRUCTIONS OR TRULY REASON, AND TRIES TO DO EVERYTHING SHITTY AND FAST.
 
 Architecture (paper §2, verified against HuggingFace modeling_deepseek_v4.py):
   X_l → mHC.pre_block → RMSNorm → Attention → F_attn → mHC.post_block → X_mid
diff --git a/single_shot_inference.py b/single_shot_inference.py
index 5375d758..614dfc32 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -669,9 +669,10 @@ def main():
     print(f"Prefilling {len(generated)} tokens...")
     for pi, tid_val in enumerate(generated):
         t1 = time.time()
-        tid = torch.tensor([tid_val], dtype=torch.long, device='cuda:0')
+        tid_int64 = torch.tensor([tid_val], dtype=torch.long, device='cuda:0')
+        tid = tid_int64.to(torch.int32)  # hash router needs int32
         pos = torch.tensor([pi], dtype=torch.long, device='cuda:0')
-        X = mHCLayer.init_state(embed(tid))
+        X = mHCLayer.init_state(embed(tid_int64))
         for li in range(n_layers):
             gpu = li % NUM_GPUS
             if X.device != torch.device(f"cuda:{gpu}"): X = X.to(f"cuda:{gpu}")
@@ -695,7 +696,8 @@ def main():
         t1 = time.time()
-        tid = torch.tensor([all_tokens[-1]], dtype=torch.long, device='cuda:0')
+        tid_int64 = torch.tensor([all_tokens[-1]], dtype=torch.long, device='cuda:0')
+        tid = tid_int64.to(torch.int32)  # hash router needs int32
         dec_pos = torch.tensor([len(all_tokens)-1], dtype=torch.long, device='cuda:0')
-        X = mHCLayer.init_state(embed(tid))
+        X = mHCLayer.init_state(embed(tid_int64))
         for li in range(n_layers):
             gpu = li % NUM_GPUS
             if X.device != torch.device(f"cuda:{gpu}"): X = X.to(f"cuda:{gpu}")