[Hardware][Intel-Gaudi] Add Intel Gaudi (HPU) inference backend (#6143)

Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Signed-off-by: Bob Zhu <bob.zhu@intel.com>
Signed-off-by: zehao-intel <zehao.huang@intel.com>
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Sanju C Sudhakaran <scsudhakaran@habana.ai>
Co-authored-by: Michal Adamczyk <madamczyk@habana.ai>
Co-authored-by: Marceli Fylcek <mfylcek@habana.ai>
Co-authored-by: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com>
Co-authored-by: Vivek Goel <vgoel@habana.ai>
Co-authored-by: yuwenzho <yuwen.zhou@intel.com>
Co-authored-by: Dominika Olszewska <dolszewska@habana.ai>
Co-authored-by: barak goldberg <149692267+bgoldberg-habana@users.noreply.github.com>
Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com>
Co-authored-by: Jan Kaniecki <jkaniecki@habana.ai>
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyniewicz-habana@users.noreply.github.com>
Co-authored-by: Krzysztof Wisniewski <kwisniewski@habana.ai>
Co-authored-by: Dudi Lester <160421192+dudilester@users.noreply.github.com>
Co-authored-by: Ilia Taraban <tarabanil@gmail.com>
Co-authored-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Michał Kuligowski <mkuligowski@habana.ai>
Co-authored-by: Jakub Maksymczuk <jmaksymczuk@habana.ai>
Co-authored-by: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com>
Co-authored-by: Sun Choi <schoi@habana.ai>
Co-authored-by: Iryna Boiko <iboiko@habana.ai>
Co-authored-by: Bob Zhu <41610754+czhu15@users.noreply.github.com>
Co-authored-by: hlin99 <73271530+hlin99@users.noreply.github.com>
Co-authored-by: Zehao Huang <zehao.huang@intel.com>
Co-authored-by: Andrzej Kotłowski <Andrzej.Kotlowski@intel.com>
Co-authored-by: Yan Tomsinsky <73292515+Yantom1@users.noreply.github.com>
Co-authored-by: Nir David <ndavid@habana.ai>
Co-authored-by: Yu-Zhou <yu.zhou@intel.com>
Co-authored-by: Ruheena Suhani Shaik <rsshaik@habana.ai>
Co-authored-by: Karol Damaszke <kdamaszke@habana.ai>
Co-authored-by: Marcin Swiniarski <mswiniarski@habana.ai>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Jacek Czaja <jacek.czaja@intel.com>
Co-authored-by: Jacek Czaja <jczaja@habana.ai>
Co-authored-by: Yuan <yuan.zhou@outlook.com>
Author: Konrad Zawora
Date: 2024-11-06 10:09:10 +01:00 (committed by GitHub)
parent a5fda50a10
commit a02a50e6e5
31 changed files with 4279 additions and 20 deletions

vllm/config.py

@@ -466,9 +466,10 @@ class ModelConfig:
 
         # Reminder: Please update docs/source/serving/compatibility_matrix.rst
         # If the feature combo becomes valid
-        if device_config.device_type not in ("cuda", "tpu", "xpu"):
+        if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"):
             logger.warning(
-                "Async output processing is only supported for CUDA, TPU, XPU. "
+                "Async output processing is only supported for CUDA, TPU, XPU "
+                "and HPU. "
                 "Disabling it for other platforms.")
             self.use_async_output_proc = False
             return
@@ -860,7 +861,6 @@ class LoadConfig:
         ignore_patterns: The list of patterns to ignore when loading the model.
             Defaults to "original/**/*" to avoid repeated loading of llama's
             checkpoints.
-
     """
 
     load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
@@ -964,6 +964,13 @@ class ParallelConfig:
                 raise ValueError(
                     "TPU backend only supports Ray for distributed inference.")
 
+        if current_platform.is_hpu() and self.world_size > 1:
+            if self.distributed_executor_backend is None:
+                self.distributed_executor_backend = "ray"
+            if self.distributed_executor_backend != "ray":
+                raise ValueError(
+                    "HPU backend only supports Ray for distributed inference.")
+
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
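
The executor-backend resolution this hunk introduces behaves like the following sketch (resolve_backend is a hypothetical helper that mirrors the in-tree logic, not a vLLM API):

    from typing import Optional

    def resolve_backend(is_hpu: bool, world_size: int,
                        backend: Optional[str]) -> Optional[str]:
        # Multi-card HPU runs must use Ray; default to it when unset.
        if is_hpu and world_size > 1:
            if backend is None:
                backend = "ray"
            if backend != "ray":
                raise ValueError(
                    "HPU backend only supports Ray for distributed inference.")
        return backend

    assert resolve_backend(True, 8, None) == "ray"   # defaulted to Ray
    assert resolve_backend(True, 1, None) is None    # single card: unconstrained
    assert resolve_backend(False, 8, "mp") == "mp"   # non-HPU: untouched

In practice, a multi-card Gaudi run such as vllm serve <model> --tensor-parallel-size 8 picks the Ray executor automatically, while explicitly requesting the multiprocessing executor fails fast with the error above.
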
@@ -1166,6 +1173,8 @@ class DeviceConfig:
                 self.device_type = "cuda"
             elif current_platform.is_neuron():
                 self.device_type = "neuron"
+            elif current_platform.is_hpu():
+                self.device_type = "hpu"
             elif current_platform.is_openvino():
                 self.device_type = "openvino"
             elif current_platform.is_tpu():
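
Assuming the DeviceConfig constructor signature at this commit, the new branch means device auto-detection now resolves to "hpu" on a Gaudi host:

    from vllm.config import DeviceConfig

    # On a machine where current_platform.is_hpu() is true, "auto" now
    # resolves to "hpu"; other platforms keep their existing branches.
    cfg = DeviceConfig(device="auto")
    print(cfg.device_type)  # "hpu" on Gaudi
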
@@ -1745,6 +1754,13 @@ def _get_and_verify_dtype(
                 torch_dtype = torch.float16
             else:
                 torch_dtype = config_dtype
+
+                if current_platform.is_hpu() and config_dtype == torch.float16:
+                    logger.info(
+                        "For HPU, we cast models to bfloat16 instead of "
+                        "using float16 by default. Please specify `dtype` if you "
+                        "want to use float16.")
+                    torch_dtype = torch.bfloat16
         else:
             if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
                 raise ValueError(f"Unknown dtype: {dtype}")