fix: default activation global_scale so runner works without finalize_weights
This commit is contained in:
@@ -60,8 +60,10 @@ class CuTeDSLMoERunner:
|
||||
self._l1_gsb = None
|
||||
self._l2_gsb = None
|
||||
|
||||
self._l1_activation_global_scale = None # set from checkpoint input_scale
|
||||
self._l2_activation_global_scale = None
|
||||
# Default: 1 / (6 * 448) = 1/2688 ≈ 0.000372 (amax=1 → gs=1/2688)
|
||||
# Overridden in finalize_weights with checkpoint input_scale or warmup value
|
||||
self._l1_activation_global_scale = 1.0 / (6.0 * 448.0)
|
||||
self._l2_activation_global_scale = 1.0 / (6.0 * 448.0)
|
||||
|
||||
# Pre-allocated cudagraph buffers (set in _allocate_buffers)
|
||||
self._token_indices = None
|
||||
|
||||
Reference in New Issue
Block a user