[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)
Signed-off-by: Bill Nell <bnell@redhat.com>
This commit is contained in:
@@ -411,6 +411,7 @@ def per_token_group_quant_fp8(
|
||||
x_s = torch.empty(shape, device=x.device, dtype=torch.float32)
|
||||
|
||||
# prefer CUDA kernel if available
|
||||
# TODO(bnell): this causes some fp8 moe tests to fail.
|
||||
if current_platform.is_cuda() and x.is_contiguous():
|
||||
torch.ops._C.per_token_group_fp8_quant(x, x_q, x_s, group_size, eps,
|
||||
fp8_min, fp8_max, use_ue8m0)
|
||||
|
||||
Reference in New Issue
Block a user