From 307574bc9116208e40bcd56ac7bde785c8e2eed1 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 12 May 2026 15:14:39 +0000 Subject: [PATCH] test: signal alarm timeout for kernel hang --- patches/test_nvfp4_mega_moe.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/patches/test_nvfp4_mega_moe.py b/patches/test_nvfp4_mega_moe.py index 778b957..5a0a1d4 100644 --- a/patches/test_nvfp4_mega_moe.py +++ b/patches/test_nvfp4_mega_moe.py @@ -96,23 +96,24 @@ def test_nvfp4_mega_moe(): # --- Run kernel --- y = torch.zeros(num_tokens, hidden, dtype=torch.bfloat16, device=device) - print("Calling fp8_nvfp4_mega_moe...") + print("Calling fp8_nvfp4_mega_moe...", flush=True) + import signal + timed_out = False + def handler(signum, frame): + nonlocal timed_out + timed_out = True + raise TimeoutError("Kernel timeout") + signal.signal(signal.SIGALRM, handler) + signal.alarm(15) # 15 second timeout try: fp8_nvfp4_mega_moe(y, l1_weights, l2_weights, symm_buffer) - # Use a sync with a manual timeout - done = torch.cuda.Event() - done.record() - import time - start = time.time() - while not done.query(): - if time.time() - start > 10: - print("TIMEOUT: kernel did not complete in 10s") - break - time.sleep(0.1) - else: - torch.cuda.synchronize() - print(f"SUCCESS! y stats: min={y.min().item():.4f} max={y.max().item():.4f} mean={y.mean().item():.4f} nonzero={torch.count_nonzero(y).item()}") + torch.cuda.synchronize() + signal.alarm(0) + print(f"SUCCESS! y stats: min={y.min().item():.4f} max={y.max().item():.4f} mean={y.mean().item():.4f} nonzero={torch.count_nonzero(y).item()}") + except TimeoutError: + print("TIMEOUT: kernel did not complete in 15s (GPU hang?)") except Exception as e: + signal.alarm(0) print(f"FAILED: {e}") raise