[Hardware][Intel-Gaudi] Add Intel Gaudi (HPU) inference backend (#6143)
Signed-off-by: yuwenzho <yuwen.zhou@intel.com> Signed-off-by: Chendi.Xue <chendi.xue@intel.com> Signed-off-by: Bob Zhu <bob.zhu@intel.com> Signed-off-by: zehao-intel <zehao.huang@intel.com> Signed-off-by: Konrad Zawora <kzawora@habana.ai> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Sanju C Sudhakaran <scsudhakaran@habana.ai> Co-authored-by: Michal Adamczyk <madamczyk@habana.ai> Co-authored-by: Marceli Fylcek <mfylcek@habana.ai> Co-authored-by: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Co-authored-by: Vivek Goel <vgoel@habana.ai> Co-authored-by: yuwenzho <yuwen.zhou@intel.com> Co-authored-by: Dominika Olszewska <dolszewska@habana.ai> Co-authored-by: barak goldberg <149692267+bgoldberg-habana@users.noreply.github.com> Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com> Co-authored-by: Jan Kaniecki <jkaniecki@habana.ai> Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyniewicz-habana@users.noreply.github.com> Co-authored-by: Krzysztof Wisniewski <kwisniewski@habana.ai> Co-authored-by: Dudi Lester <160421192+dudilester@users.noreply.github.com> Co-authored-by: Ilia Taraban <tarabanil@gmail.com> Co-authored-by: Chendi.Xue <chendi.xue@intel.com> Co-authored-by: Michał Kuligowski <mkuligowski@habana.ai> Co-authored-by: Jakub Maksymczuk <jmaksymczuk@habana.ai> Co-authored-by: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Co-authored-by: Sun Choi <schoi@habana.ai> Co-authored-by: Iryna Boiko <iboiko@habana.ai> Co-authored-by: Bob Zhu <41610754+czhu15@users.noreply.github.com> Co-authored-by: hlin99 <73271530+hlin99@users.noreply.github.com> Co-authored-by: Zehao Huang <zehao.huang@intel.com> Co-authored-by: Andrzej Kotłowski <Andrzej.Kotlowski@intel.com> Co-authored-by: Yan Tomsinsky <73292515+Yantom1@users.noreply.github.com> Co-authored-by: Nir David <ndavid@habana.ai> Co-authored-by: Yu-Zhou <yu.zhou@intel.com> Co-authored-by: Ruheena Suhani Shaik <rsshaik@habana.ai> Co-authored-by: Karol Damaszke <kdamaszke@habana.ai> Co-authored-by: Marcin Swiniarski <mswiniarski@habana.ai> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Jacek Czaja <jacek.czaja@intel.com> Co-authored-by: Jacek Czaja <jczaja@habana.ai> Co-authored-by: Yuan <yuan.zhou@outlook.com>
This commit is contained in:
@@ -17,6 +17,7 @@ from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig,
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import (
|
||||
maybe_register_config_serialize_by_value)
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
@@ -37,6 +38,7 @@ DEVICE_OPTIONS = [
|
||||
"openvino",
|
||||
"tpu",
|
||||
"xpu",
|
||||
"hpu",
|
||||
]
|
||||
|
||||
|
||||
@@ -110,7 +112,9 @@ class EngineArgs:
|
||||
pipeline_parallel_size: int = 1
|
||||
tensor_parallel_size: int = 1
|
||||
max_parallel_loading_workers: Optional[int] = None
|
||||
block_size: int = 16
|
||||
# NOTE(kzawora): default block size for Gaudi should be 128
|
||||
# smaller sizes still work, but very inefficiently
|
||||
block_size: int = 16 if not current_platform.is_hpu() else 128
|
||||
enable_prefix_caching: bool = False
|
||||
disable_sliding_window: bool = False
|
||||
use_v2_block_manager: bool = True
|
||||
@@ -397,7 +401,7 @@ class EngineArgs:
|
||||
parser.add_argument('--block-size',
|
||||
type=int,
|
||||
default=EngineArgs.block_size,
|
||||
choices=[8, 16, 32],
|
||||
choices=[8, 16, 32, 64, 128],
|
||||
help='Token block size for contiguous chunks of '
|
||||
'tokens. This is ignored on neuron devices and '
|
||||
'set to max-model-len')
|
||||
@@ -1132,8 +1136,7 @@ class EngineArgs:
|
||||
multi_step_stream_outputs=self.multi_step_stream_outputs,
|
||||
send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
and parallel_config.use_ray),
|
||||
policy=self.scheduling_policy,
|
||||
)
|
||||
policy=self.scheduling_policy)
|
||||
lora_config = LoRAConfig(
|
||||
max_lora_rank=self.max_lora_rank,
|
||||
max_loras=self.max_loras,
|
||||
|
||||
Reference in New Issue
Block a user