From 1121cd7b47f9488ed4a4e2c0a52170b87fdf840a Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Wed, 3 Jun 2026 14:48:51 +0000
Subject: [PATCH] Add CUDA_LAUNCH_BLOCKING=1 to catch async errors

---
 single_shot_inference.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/single_shot_inference.py b/single_shot_inference.py
index 9bf945ca..9d10121e 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -9,6 +9,7 @@ NO PyTorch SDPA fallback. NO dequant+matmul for production projections.
 This is the ground truth for vLLM / SGLang integration.
 """
 import os, sys, time, json, math, argparse, logging
+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' # Catch async CUDA errors immediately
 import torch
 import torch.nn.functional as F
 from pathlib import Path