Support FP8 Quantization and Inference Run on Intel Gaudi (HPU) using INC (Intel Neural Compressor) (#12010)

Signed-off-by: Nir David <ndavid@habana.ai>
Signed-off-by: Uri Livne <ulivne@habana.ai>
Co-authored-by: Uri Livne <ulivne@habana.ai>
Author: Nir David
Date: 2025-07-16 22:33:41 +03:00
Committed by: GitHub
Parent: ac2bf41e53
Commit: 01513a334a
11 changed files with 168 additions and 25 deletions


@@ -139,6 +139,10 @@ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
     return type_hints
 
 
+def is_online_quantization(quantization: Any) -> bool:
+    return quantization in ["inc"]
+
+
 @functools.lru_cache(maxsize=30)
 def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
     cls_docs = get_attr_docs(cls)
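
The new is_online_quantization helper marks quantization backends that quantize weights on the fly at load time; today that is only "inc". A minimal sketch of the predicate's behavior (the import path is an assumption based on where get_attr_docs and _compute_kwargs live, likely vllm/config.py):

    from vllm.config import is_online_quantization  # assumed location

    assert is_online_quantization("inc")      # online: quantized at load time
    assert not is_online_quantization("fp8")  # offline: checkpoint already quantized
    assert not is_online_quantization(None)   # no quantization requested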
@@ -960,6 +964,8 @@ class EngineArgs:
         return LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
+            device="cpu"
+            if is_online_quantization(self.quantization) else None,
             model_loader_extra_config=self.model_loader_extra_config,
             ignore_patterns=self.ignore_patterns,
             use_tqdm_on_load=self.use_tqdm_on_load,
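
Because INC quantizes online, the weights are first materialized on CPU (device="cpu" in LoadConfig) instead of being loaded straight onto the accelerator; the quantizer then converts them to FP8 and moves them to the HPU. A hedged end-to-end sketch of running INC FP8 inference on Gaudi, following the vLLM HPU documentation (the model name and config path are placeholders):

    import os
    from vllm import LLM, SamplingParams

    # INC reads its measurement/quantization settings from a JSON file
    # pointed to by QUANT_CONFIG (placeholder path).
    os.environ["QUANT_CONFIG"] = "/path/to/inc_quant_config.json"

    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # example model
        quantization="inc",        # take the online-quantization load path above
        kv_cache_dtype="fp8_inc",  # FP8 KV cache via INC on Gaudi
    )
    out = llm.generate(["Hello from Gaudi!"], SamplingParams(max_tokens=32))
    print(out[0].outputs[0].text)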
@@ -1359,7 +1365,9 @@ class EngineArgs:
             supported = False
             if current_platform.is_rocm() or (
                     current_platform.is_cuda()
-                    and current_platform.is_device_capability(100)):
+                    and current_platform.is_device_capability(100)) or (
+                        current_platform.device_name
+                        == "hpu"):  # handle hpu also for OOT platform
                 supported = True
             elif fp8_attention and will_use_fa:
                 from vllm.attention.utils.fa_utils import (
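
The FP8 KV-cache support check now also passes on HPU; comparing against device_name as a string (rather than calling an is_hpu() platform method) keeps the check working when Gaudi is registered as an out-of-tree (OOT) platform. A simplified standalone restatement of the predicate (fp8_supported and its argument are illustrative names, not vLLM API):

    def fp8_supported(current_platform) -> bool:
        # Accept ROCm, CUDA devices with compute capability exactly 10.0
        # (is_device_capability(100)), or any platform reporting "hpu".
        return bool(
            current_platform.is_rocm()
            or (current_platform.is_cuda()
                and current_platform.is_device_capability(100))
            or current_platform.device_name == "hpu")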