""" NVFP4 weight transformation and SF layout utilities. Port of deep_gemm.mega.transform_nvfp4_weights_for_mega_moe """ from math import ceil_div fn fold_global_scale_into_block_scales( weight_scale: Tensor[float8_e4m3fn], # (N, K//16) UE4M3 block scales weight_scale_2: Tensor[float32], # (num_logical,) or scalar global scale logical_widths: List[int], # per-logical-weight row counts ) -> Tensor[float32]: """Fold global scale into block scales: UE4M3 * FP32 -> FP32""" # Convert UE4M3 to float32, multiply by global scale # For MergedColumnParallelLinear, expand per-logical global scale ... fn pack_ue4m3_to_int32(sf: Tensor[float8_e4m3fn]) -> Tensor[int32]: """Pack 4 UE4M3 values (4 bytes) into one int32 for DeepGEMM TMA""" # View as uint8, pack 4 consecutive bytes into int32 ... fn transform_sf_into_required_layout( sf_mn: Tensor[int32], # MN-major packed SF N: int, K: int, recipe: Tuple[int, int], # (gran_mn, gran_k) num_groups: int, ) -> Tensor[int32]: """Transform SF into TMA-aligned UTCCP layout for DeepGEMM""" # Call into DeepGEMM's C++ layout transform ...