diff --git a/single_shot_inference.py b/single_shot_inference.py index 9bf945ca..9d10121e 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -9,6 +9,7 @@ NO PyTorch SDPA fallback. NO dequant+matmul for production projections. This is the ground truth for vLLM / SGLang integration. """ import os, sys, time, json, math, argparse, logging +os.environ['CUDA_LAUNCH_BLOCKING'] = '1' # Catch async CUDA errors immediately import torch import torch.nn.functional as F from pathlib import Path