From 553275d81025e1832349e336007e0419c485ae2c Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 2 Jun 2026 08:25:52 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20P1=20=E2=80=94=20add=20eager=20warmup?= =?UTF-8?q?=5Ffused=5Fswiglu=5Fcompilation=20for=20SharedExpert=20(1-group?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- single_shot_inference.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/single_shot_inference.py b/single_shot_inference.py index d0c936e5..82afad7f 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -1042,6 +1042,15 @@ def main(): se.set_fused_swiglu(True) # EAGERLY process shared expert weights se._ensure_initialized() + # P1: Eagerly warmup fused SwiGLU compilation for SE (1-group) + if se._fused_swiglu: + from dsv4.ops.gemm_runner import warmup_fused_swiglu_compilation + K_packed = H // 2 + N_packed_l1 = (2 * cfg.get("moe_intermediate_size", 3072)) // 2 # gate+up + warmup_fused_swiglu_compilation( + 1, K_packed, N_packed_l1, dev, + swiglu_limit=cfg.get("swiglu_limit", 10.0), + ) # Fix activation global scales — _ensure_initialized sets gsa from l1_gs (which is 1.0) # FIX: Same runtime gsa for SharedExpert se._use_runtime_gsa = True