[Kernel] Enable fp8 support for pplx and BatchedTritonExperts. (#18864)

Signed-off-by: Bill Nell <bnell@redhat.com>
Author: bnellnm
Date: 2025-07-03 17:55:40 -04:00
Committed by: GitHub
parent 2f2fcb31b8
commit 78fe77534b
25 changed files with 1277 additions and 663 deletions


@@ -277,6 +277,24 @@ def dequant(
    return t.to(out_dtype)


def batched_dequant(
    t: torch.Tensor,
    scale: Optional[torch.Tensor],
    block_shape: Optional[list[int]],
    per_act_token_quant: bool,
    out_dtype: Optional[torch.dtype] = torch.float32,
) -> torch.Tensor:
    if scale is not None:
        assert t.shape[0] == scale.shape[0]
        out = torch.empty_like(t, dtype=out_dtype)
        for e in range(t.shape[0]):
            out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant,
                             out_dtype)
        return out
    return t.to(out_dtype)


def native_batched_masked_quant_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
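
Below is a minimal, self-contained sketch of how batched_dequant might be exercised for the per-tensor fp8 case this commit enables. The simplified dequant stub stands in for the real helper above (it ignores block_shape and per_act_token_quant), and the tensor shapes and scale computation are illustrative assumptions, not code from this commit.

    from typing import Optional

    import torch


    def dequant(t: torch.Tensor,
                scale: Optional[torch.Tensor],
                block_shape: Optional[list[int]],
                per_act_token_quant: bool,
                out_dtype: Optional[torch.dtype] = torch.float32) -> torch.Tensor:
        # Stand-in for the real dequant above: per-tensor scaling only;
        # block_shape and per_act_token_quant are ignored in this sketch.
        if scale is not None:
            return (t.to(torch.float32) * scale).to(out_dtype)
        return t.to(out_dtype)


    def batched_dequant(t, scale, block_shape, per_act_token_quant,
                        out_dtype=torch.float32):
        # Same logic as the function added in this commit: dequantize
        # each slice along dim 0 with its own scale.
        if scale is not None:
            assert t.shape[0] == scale.shape[0]
            out = torch.empty_like(t, dtype=out_dtype)
            for e in range(t.shape[0]):
                out[e] = dequant(t[e], scale[e], block_shape,
                                 per_act_token_quant, out_dtype)
            return out
        return t.to(out_dtype)


    # Illustrative shapes: 4 experts, 8 tokens each, hidden size 16.
    num_experts, tokens, hidden = 4, 8, 16
    x = torch.randn(num_experts, tokens, hidden)

    # One fp8 scale per expert (per-tensor quantization, hypothetical setup).
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = x.abs().amax(dim=(1, 2), keepdim=True) / fp8_max
    x_fp8 = (x / scale).to(torch.float8_e4m3fn)

    x_deq = batched_dequant(x_fp8, scale, None, False, torch.float32)
    print(torch.allclose(x, x_deq, atol=1e-2, rtol=0.1))  # approximately recovers x

The per-expert loop in batched_dequant suggests that dimension 0 indexes the expert, matching the batched layout that pplx and BatchedTritonExperts operate on, so each expert's tensor is dequantized with its own scale.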