Support FP8 Quantization and Inference Run on Intel Gaudi (HPU) using INC (Intel Neural Compressor) (#12010)
Signed-off-by: Nir David <ndavid@habana.ai>
Signed-off-by: Uri Livne <ulivne@habana.ai>
Co-authored-by: Uri Livne <ulivne@habana.ai>
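For context, a minimal usage sketch of what this commit enables, not part of the diff itself: passing quantization="inc" routes weight loading through the new online-quantization path below. The model name and prompt are illustrative assumptions.

    from vllm import LLM, SamplingParams

    # Enable load-time FP8 quantization via INC; "inc" is the value the
    # new is_online_quantization() helper checks for.
    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # assumed example model
        quantization="inc",
    )
    outputs = llm.generate(["Hello, Gaudi!"], SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)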
@@ -139,6 +139,10 @@ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
     return type_hints
 
 
+def is_online_quantization(quantization: Any) -> bool:
+    return quantization in ["inc"]
+
+
 @functools.lru_cache(maxsize=30)
 def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
     cls_docs = get_attr_docs(cls)
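A quick behavior sketch of the helper added above; the standalone re-definition is only for illustration, since the file header for this hunk was lost in this view:

    from typing import Any

    def is_online_quantization(quantization: Any) -> bool:
        # Same body as the hunk above: only "inc" quantizes weights online
        # (at load time) instead of reading a pre-quantized checkpoint.
        return quantization in ["inc"]

    assert is_online_quantization("inc") is True
    assert is_online_quantization("fp8") is False  # pre-quantized path
    assert is_online_quantization(None) is False   # no quantization requested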
@@ -960,6 +964,8 @@ class EngineArgs:
         return LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
+            device="cpu"
+            if is_online_quantization(self.quantization) else None,
             model_loader_extra_config=self.model_loader_extra_config,
             ignore_patterns=self.ignore_patterns,
             use_tqdm_on_load=self.use_tqdm_on_load,
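The device="cpu" override above stages weights on the CPU when quantization happens online, so INC can convert them before they move to the HPU; None leaves vLLM's default placement untouched. A reduced sketch of that selection (LoadConfigSketch is a stand-in, not the real vLLM LoadConfig):

    from dataclasses import dataclass
    from typing import Any, Optional

    @dataclass
    class LoadConfigSketch:
        device: Optional[str] = None  # None -> follow the target device

    def make_load_config(quantization: Any) -> LoadConfigSketch:
        # Online quantization ("inc") loads weights to CPU first; everything
        # else keeps the default device placement.
        return LoadConfigSketch(
            device="cpu" if quantization in ["inc"] else None)

    assert make_load_config("inc").device == "cpu"
    assert make_load_config("fp8").device is None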
@@ -1359,7 +1365,9 @@ class EngineArgs:
             supported = False
             if current_platform.is_rocm() or (
                     current_platform.is_cuda()
-                    and current_platform.is_device_capability(100)):
+                    and current_platform.is_device_capability(100)) or (
+                        current_platform.device_name
+                        == "hpu"):  # handle hpu also for OOT platform
                 supported = True
             elif fp8_attention and will_use_fa:
                 from vllm.attention.utils.fa_utils import (
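The last hunk widens the FP8 support check from ROCm and CUDA compute capability 10.0 to HPU, matching on device_name so out-of-tree (OOT) platform plugins that register as "hpu" also pass. A condensed sketch of the resulting predicate, with current_platform stubbed out since the real object lives in vllm.platforms:

    class PlatformStub:
        # Minimal stand-in for vLLM's current_platform, illustration only.
        def __init__(self, rocm=False, cuda=False, cap100=False,
                     device_name=""):
            self._rocm, self._cuda, self._cap100 = rocm, cuda, cap100
            self.device_name = device_name

        def is_rocm(self):
            return self._rocm

        def is_cuda(self):
            return self._cuda

        def is_device_capability(self, cap):
            return cap == 100 and self._cap100

    def fp8_supported(p):
        # Mirrors the condition in the hunk above.
        return (p.is_rocm()
                or (p.is_cuda() and p.is_device_capability(100))
                or p.device_name == "hpu")  # OOT platforms report "hpu"

    assert fp8_supported(PlatformStub(device_name="hpu"))
    assert fp8_supported(PlatformStub(cuda=True, cap100=True))
    assert not fp8_supported(PlatformStub(cuda=True))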