[ROCm] [Feature] [Doc] [Dockerfile] [BugFix] Support Per-Token-Activation Per-Channel-Weight FP8 Quantization Inferencing (#12501)

2025-02-08 00:13:43 +08:00
parent 0630d4537a
commit eaa92d4437
8 changed files with 295 additions and 32 deletions
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -11,6 +11,7 @@ QUANTIZATION_METHODS: List[str] = [
    "deepspeedfp",
    "tpu_int8",
    "fp8",
+    "ptpc_fp8",
    "fbgemm_fp8",
    "modelopt",
    # The order of gptq methods is important for config.py iteration over
@@ -99,6 +100,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    from .modelopt import ModelOptFp8Config
    from .moe_wna16 import MoeWNA16Config
    from .neuron_quant import NeuronQuantConfig
+    from .ptpc_fp8 import PTPCFp8Config
    from .qqq import QQQConfig
    from .tpu_int8 import Int8TpuConfig

@@ -120,6 +122,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
        "gptq": GPTQConfig,
        "compressed-tensors": CompressedTensorsConfig,
        "bitsandbytes": BitsAndBytesConfig,
+        "ptpc_fp8": PTPCFp8Config,
        "qqq": QQQConfig,
        "hqq": HQQMarlinConfig,
        "experts_int8": ExpertsInt8Config,