From 3a30f35c68ce3bb552cfe9d790bb6e3bacf0fd38 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 2 Jun 2026 08:12:55 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20cute.math.fmin/fmax=20=E2=86=92=20cute.a?= =?UTF-8?q?rch.fmin/fmax=20in=20fused=20SwiGLU=20kernel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cute.math has no fmin/fmax. cute.arch does (register-level ops). README constraint #4: use cute.arch.fmax inside plain range(), not vectorize=True. --- dsv4/kernels/gemm/fused_swiglu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dsv4/kernels/gemm/fused_swiglu.py b/dsv4/kernels/gemm/fused_swiglu.py index 61fcffeb..5d610f18 100644 --- a/dsv4/kernels/gemm/fused_swiglu.py +++ b/dsv4/kernels/gemm/fused_swiglu.py @@ -2199,7 +2199,7 @@ class FusedSwiGLUScaledGroupedGemmKernel: silu_result = acc_vec * sigmoid # Paper §4.2.3: gate component capped at swiglu_limit if cutlass.const_expr(self.swiglu_limit > 0.0): - silu_result = cute.math.fmin(silu_result, cutlass.Float32(self.swiglu_limit)) + silu_result = cute.arch.fmin(silu_result, cutlass.Float32(self.swiglu_limit)) silu_result = silu_result.to(self.c_dtype) silu_gate_buf.store(silu_result) # Keep acc_vec in BF16 (same type as the up branch) @@ -2207,7 +2207,7 @@ class FusedSwiGLUScaledGroupedGemmKernel: if is_up: # Paper §4.2.3: linear component clamped to [-swiglu_limit, swiglu_limit] if cutlass.const_expr(self.swiglu_limit > 0.0): - acc_vec = cute.math.fmin(cute.math.fmax(acc_vec, cutlass.Float32(-self.swiglu_limit)), cutlass.Float32(self.swiglu_limit)) + acc_vec = cute.arch.fmin(cute.arch.fmax(acc_vec, cutlass.Float32(-self.swiglu_limit)), cutlass.Float32(self.swiglu_limit)) # SwiGLU: silu(gate) * up gate_vals = silu_gate_buf.load() swiglu_result = (gate_vals * acc_vec.to(self.c_dtype))