[Kernel] Fuse FP8 output quantization into merge_attn_states (#36518)
Signed-off-by: Carl You <4531192+carlyou@users.noreply.github.com>
This commit is contained in:
@@ -73,7 +73,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" Tensor prefix_lse,"
|
||||
" Tensor suffix_output,"
|
||||
" Tensor suffix_lse,"
|
||||
" int!? prefill_tokens_with_context) -> ()");
|
||||
" int!? prefill_tokens_with_context,"
|
||||
" Tensor? output_scale=None) -> ()");
|
||||
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
|
||||
#ifndef USE_ROCM
|
||||
ops.def(
|
||||
|
||||
Reference in New Issue
Block a user