From 1121cd7b47f9488ed4a4e2c0a52170b87fdf840a Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Wed, 3 Jun 2026 14:48:51 +0000
Subject: [PATCH] Add CUDA_LAUNCH_BLOCKING=1 to catch async errors

---
 single_shot_inference.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/single_shot_inference.py b/single_shot_inference.py
index 9bf945ca..9d10121e 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -9,6 +9,7 @@ NO PyTorch SDPA fallback. NO dequant+matmul for production projections.
 This is the ground truth for vLLM / SGLang integration.
 """
 import os, sys, time, json, math, argparse, logging
+os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Debug aid: set before importing torch so CUDA errors surface at the failing call
 import torch
 import torch.nn.functional as F
 from pathlib import Path