Support FP8 block quant for CompressedTensorsW8A16Fp8 (#33280)

Signed-off-by: mgoin <mgoin64@gmail.com>
2026-01-30 11:15:20 -05:00
parent f857a03f6b
commit fd0e377244
4 changed files with 74 additions and 64 deletions
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -400,7 +400,6 @@ class Fp8LinearMethod(LinearMethodBase):
                None,
                weight_loader,
            )
-            set_weight_attrs(scale, {"scale_type": "weight_scale"})
            layer.register_parameter("weight_scale", scale)
        else:
            assert not self.act_q_static
@@ -412,7 +411,6 @@ class Fp8LinearMethod(LinearMethodBase):
                self.weight_block_size,
                weight_loader,
            )
-            set_weight_attrs(scale, {"scale_type": "weight_scale"})
            # The weight_scale_inv name is intentional for deepseekv3
            layer.register_parameter("weight_scale_inv", scale)