[FP8] Extend per-token-group quantization support to QuantFP8 (#24342)
Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
@@ -32,9 +32,11 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceNoOP)
 from vllm.model_executor.layers.fused_moe.utils import (
-    _resize_cache, moe_kernel_quantize_input, per_token_group_quant_fp8)
+    _resize_cache, moe_kernel_quantize_input)
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
     calculate_tile_tokens_dim)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8)
 from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
     dequant_mxfp4)
 from vllm.platforms import current_platform
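For context, per-token-group quantization splits each token's hidden dimension into fixed-size groups and computes one FP8 scale per group, so local outliers do not wash out the precision of the whole token. The sketch below is illustrative only, not vLLM's implementation: it assumes a recent PyTorch with float8_e4m3fn support, and the helper name per_token_group_quant_fp8_ref, the group_size argument, and the E4M3 max of 448.0 are assumptions made for the example.

import torch

FP8_E4M3_MAX = 448.0  # largest finite float8_e4m3fn value (assumption for this sketch)

def per_token_group_quant_fp8_ref(x: torch.Tensor, group_size: int):
    """Toy reference: quantize groups of `group_size` elements along the last dim."""
    assert x.shape[-1] % group_size == 0
    # [..., num_groups, group_size]
    groups = x.float().reshape(*x.shape[:-1], -1, group_size)
    amax = groups.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scales = amax / FP8_E4M3_MAX                     # one scale per group
    q = (groups / scales).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX)
    q = q.to(torch.float8_e4m3fn).reshape(x.shape)
    return q, scales.squeeze(-1)                     # scales shape: [..., num_groups]

x = torch.randn(4, 256, dtype=torch.float16)
x_q, x_s = per_token_group_quant_fp8_ref(x, group_size=128)
# Dequantize to sanity-check the round-trip error.
deq = x_q.float().reshape(4, -1, 128) * x_s.unsqueeze(-1)
print((deq.reshape(4, 256) - x.float()).abs().max())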