diff --git a/single_shot_PYTORCH_REFERENCE.py b/single_shot_PYTORCH_REFERENCE.py
index 5f443223..64ccad80 100644
--- a/single_shot_PYTORCH_REFERENCE.py
+++ b/single_shot_PYTORCH_REFERENCE.py
@@ -3,7 +3,10 @@
 
 THIS is a pure-PyTorch reference reimplementation that bypasses every kernel in the production stack.
 
-IT IS ONLY TO BE USED FOR REFERENCE FOR THE ACTUAL PRODUCTION KERNEL SINGLE SHOT
+IT IS ONLY TO BE USED AS A REFERENCE FOR THE CONSTRUCTION OF THE ACTUAL PRODUCTION KERNEL SINGLE SHOT
+
+THIS FILE WAS MADE BY AN LLM THAT WAS ASKED TO IMPLEMENT THE PRODUCTION KERNEL AND INSTEAD IT JUST REDID IT IN PYTORCH.
+THE FACT THAT THIS FILE EXISTS PISSES ME OFF. IT DEMONSTRATES THAT AI IS FAR FROM INTELLIGENT: IT CANNOT FOLLOW SIMPLE INSTRUCTIONS OR TRULY REASON, AND IT TRIES TO DO EVERYTHING SHITTY AND FAST.
 
 Architecture (paper §2, verified against HuggingFace modeling_deepseek_v4.py):
   X_l → mHC.pre_block → RMSNorm → Attention → F_attn → mHC.post_block → X_mid
diff --git a/single_shot_inference.py b/single_shot_inference.py
index 5375d758..614dfc32 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -669,9 +669,10 @@ def main():
     print(f"Prefilling {len(generated)} tokens...")
     for pi, tid_val in enumerate(generated):
         t1 = time.time()
-        tid = torch.tensor([tid_val], dtype=torch.long, device='cuda:0')
+        tid_int64 = torch.tensor([tid_val], dtype=torch.long, device='cuda:0')
+        tid = tid_int64.to(torch.int32)  # hash router needs int32
         pos = torch.tensor([pi], dtype=torch.long, device='cuda:0')
-        X = mHCLayer.init_state(embed(tid))
+        X = mHCLayer.init_state(embed(tid_int64))
         for li in range(n_layers):
             gpu = li % NUM_GPUS
             if X.device != torch.device(f"cuda:{gpu}"): X = X.to(f"cuda:{gpu}")
@@ -695,7 +696,7 @@ def main():
         t1 = time.time()
         tid = torch.tensor([all_tokens[-1]], dtype=torch.long, device='cuda:0')
         dec_pos = torch.tensor([len(all_tokens)-1], dtype=torch.long, device='cuda:0')
-        X = mHCLayer.init_state(embed(tid))
+        X = mHCLayer.init_state(embed(tid_int64))
         for li in range(n_layers):
             gpu = li % NUM_GPUS
             if X.device != torch.device(f"cuda:{gpu}"): X = X.to(f"cuda:{gpu}")