Support FP8 Quantization and Inference Run on Intel Gaudi (HPU) using INC (Intel Neural Compressor) (#12010)

Signed-off-by: Nir David <ndavid@habana.ai>
Signed-off-by: Uri Livne <ulivne@habana.ai>
Co-authored-by: Uri Livne <ulivne@habana.ai>
Author: Nir David
Date: 2025-07-16 22:33:41 +03:00
Committed by: GitHub
Parent: ac2bf41e53
Commit: 01513a334a
11 changed files with 168 additions and 25 deletions


@@ -139,6 +139,10 @@ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
     return type_hints
 
 
+def is_online_quantization(quantization: Any) -> bool:
+    return quantization in ["inc"]
+
+
 @functools.lru_cache(maxsize=30)
 def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
     cls_docs = get_attr_docs(cls)
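
The new is_online_quantization helper marks quantization backends that quantize weights on the fly at load time; today that is only "inc". A minimal sketch of the predicate's behavior (the import path is an assumption based on where get_attr_docs and _compute_kwargs live, likely vllm/config.py):

    from vllm.config import is_online_quantization  # assumed location

    assert is_online_quantization("inc")      # online: quantized at load time
    assert not is_online_quantization("fp8")  # offline: checkpoint already quantized
    assert not is_online_quantization(None)   # no quantization requested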
@@ -960,6 +964,8 @@ class EngineArgs:
         return LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
+            device="cpu"
+            if is_online_quantization(self.quantization) else None,
             model_loader_extra_config=self.model_loader_extra_config,
             ignore_patterns=self.ignore_patterns,
             use_tqdm_on_load=self.use_tqdm_on_load,
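
Because INC quantizes online, the weights are first materialized on CPU (device="cpu" in LoadConfig) instead of being loaded straight onto the accelerator; the quantizer then converts them to FP8 and moves them to the HPU. A hedged end-to-end sketch of running INC FP8 inference on Gaudi, following the vLLM HPU documentation (the model name and config path are placeholders):

    import os
    from vllm import LLM, SamplingParams

    # INC reads its measurement/quantization settings from a JSON file
    # pointed to by QUANT_CONFIG (placeholder path).
    os.environ["QUANT_CONFIG"] = "/path/to/inc_quant_config.json"

    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # example model
        quantization="inc",        # take the online-quantization load path above
        kv_cache_dtype="fp8_inc",  # FP8 KV cache via INC on Gaudi
    )
    out = llm.generate(["Hello from Gaudi!"], SamplingParams(max_tokens=32))
    print(out[0].outputs[0].text)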
@@ -1359,7 +1365,9 @@ class EngineArgs:
             supported = False
             if current_platform.is_rocm() or (
                     current_platform.is_cuda()
-                    and current_platform.is_device_capability(100)):
+                    and current_platform.is_device_capability(100)) or (
+                        current_platform.device_name
+                        == "hpu"):  # handle hpu also for OOT platform
                 supported = True
             elif fp8_attention and will_use_fa:
                 from vllm.attention.utils.fa_utils import (
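
The FP8 KV-cache support check now also passes on HPU; comparing against device_name as a string (rather than calling an is_hpu() platform method) keeps the check working when Gaudi is registered as an out-of-tree (OOT) platform. A simplified standalone restatement of the predicate (fp8_supported and its argument are illustrative names, not vLLM API):

    def fp8_supported(current_platform) -> bool:
        # Accept ROCm, CUDA devices with compute capability exactly 10.0
        # (is_device_capability(100)), or any platform reporting "hpu".
        return bool(
            current_platform.is_rocm()
            or (current_platform.is_cuda()
                and current_platform.is_device_capability(100))
            or current_platform.device_name == "hpu")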