[Models] Add remaining model PP support (#7168)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Committed by: GitHub
Parent: 303d44790a
Commit: 0f6d7a9a34
@@ -31,28 +31,7 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
-_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 4096
-
-_PP_SUPPORTED_MODELS = [
-    "AquilaForCausalLM",
-    "AquilaModel",
-    "DeepseekV2ForCausalLM",
-    "GPT2LMHeadModel",
-    "InternLM2ForCausalLM",
-    "InternLMForCausalLM",
-    "InternVLChatModel",
-    "JAISLMHeadModel",
-    "LlamaForCausalLM",
-    "LLaMAForCausalLM",
-    "MistralForCausalLM",
-    "MixtralForCausalLM",
-    "NemotronForCausalLM",
-    "Phi3ForCausalLM",
-    "Qwen2ForCausalLM",
-    "Qwen2MoeForCausalLM",
-    "QWenLMHeadModel",
-    "Qwen2VLForConditionalGeneration",
-]
+_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
 
 
 class ModelConfig:
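Note: the hard-coded _PP_SUPPORTED_MODELS allowlist removed above is replaced by a capability query on ModelRegistry (see the last hunk of this diff). The following is a minimal illustrative sketch of that pattern, not vLLM's implementation: the class name, the register_model method, the supports_pp attribute, and the all-architectures semantics are assumptions made for the example.

# Hypothetical sketch only: a registry that answers "does every declared
# architecture support pipeline parallelism?" instead of consulting a
# module-level allowlist. Not vLLM's actual ModelRegistry.
from typing import Dict, List, Type


class HypotheticalModelRegistry:
    """Maps architecture names to model classes and answers capability queries."""

    def __init__(self) -> None:
        self._models: Dict[str, Type] = {}

    def register_model(self, arch: str, model_cls: Type) -> None:
        self._models[arch] = model_cls

    def is_pp_supported_model(self, architectures: List[str]) -> bool:
        # Treat a model as PP-capable only if every declared architecture
        # opts in, mirroring the all(...) semantics of the removed allowlist check.
        return all(
            getattr(self._models.get(arch), "supports_pp", False)
            for arch in architectures)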
@@ -228,16 +207,14 @@ class ModelConfig:
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
     ) -> Optional["MultiModalConfig"]:
         architectures = getattr(self.hf_config, "architectures", [])
-        if any(
-                ModelRegistry.is_multimodal_model(arch)
-                for arch in architectures):
+        if ModelRegistry.is_multimodal_model(architectures):
             return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
-        else:
-            if limit_mm_per_prompt:
-                raise ValueError(
-                    "limit_mm_per_prompt is only supported for multimodal "
-                    "models.")
-            return None
+
+        if limit_mm_per_prompt:
+            raise ValueError("`limit_mm_per_prompt` is only supported for "
+                             "multimodal models.")
+
+        return None
 
     def _verify_tokenizer_mode(self) -> None:
         tokenizer_mode = self.tokenizer_mode.lower()
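Note: ModelRegistry.is_multimodal_model is now called once with the full architectures list instead of once per architecture inside an any(...) loop. A minimal sketch of a list-accepting check with the same call shape follows; it assumes any-architecture-matches semantics, and the set contents are illustrative, not vLLM's registry data.

# Hypothetical sketch: a registry-style check that accepts the whole
# `hf_config.architectures` list, as the call in the hunk above does.
from typing import List

ASSUMED_MULTIMODAL_ARCHS = {
    "LlavaForConditionalGeneration",
    "Qwen2VLForConditionalGeneration",
}


def is_multimodal_model(architectures: List[str]) -> bool:
    # True if any declared architecture is registered as multimodal;
    # callers no longer need to loop over the list themselves.
    return any(arch in ASSUMED_MULTIMODAL_ARCHS for arch in architectures)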
@@ -249,8 +226,7 @@ class ModelConfig:
 
     def _verify_embedding_mode(self) -> None:
         architectures = getattr(self.hf_config, "architectures", [])
-        self.embedding_mode = any(
-            ModelRegistry.is_embedding_model(arch) for arch in architectures)
+        self.embedding_mode = ModelRegistry.is_embedding_model(architectures)
 
     def _parse_quant_hf_config(self):
         quant_cfg = getattr(self.hf_config, "quantization_config", None)
@@ -417,17 +393,17 @@ class ModelConfig:
                 f"({tensor_parallel_size}).")
 
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
-        architectures = getattr(self.hf_config, "architectures", [])
-        if not all(arch in _PP_SUPPORTED_MODELS
-                   for arch in architectures) and pipeline_parallel_size > 1:
-            raise NotImplementedError(
-                "Pipeline parallelism is only supported for the following "
-                f" architectures: {_PP_SUPPORTED_MODELS}.")
+        if pipeline_parallel_size > 1:
+            architectures = getattr(self.hf_config, "architectures", [])
+            if not ModelRegistry.is_pp_supported_model(architectures):
+                raise NotImplementedError(
+                    "Pipeline parallelism is not supported for this model. "
+                    "Supported models implement the `SupportsPP` interface.")
 
-        if pipeline_parallel_size > 1 and self.use_async_output_proc:
-            logger.warning("Async output processor is not supported with "
-                           "pipeline parallelism currently. Disabling it.")
-            self.use_async_output_proc = False
+            if self.use_async_output_proc:
+                logger.warning("Async output processor is not supported with "
+                               "pipeline parallelism currently. Disabling it.")
+                self.use_async_output_proc = False
 
     def get_hf_config_sliding_window(self) -> Optional[int]:
         """Get the sliding window size, or None if disabled."""
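Note: the sketch below renders the new verification logic from the hunk above as a standalone function so its behaviour can be seen in isolation. It is hypothetical: the function name and the `registry` parameter are assumptions; the error and warning text follow the diff.

# Hypothetical, self-contained rendering of the check added above.
# `registry` is any object exposing `is_pp_supported_model(architectures)`.
import logging
from typing import List

logger = logging.getLogger(__name__)


def verify_pp_settings(registry, architectures: List[str],
                       pipeline_parallel_size: int,
                       use_async_output_proc: bool) -> bool:
    """Return the (possibly downgraded) async-output-proc flag; raise if PP is
    requested for a model that does not implement the SupportsPP interface."""
    if pipeline_parallel_size > 1:
        if not registry.is_pp_supported_model(architectures):
            raise NotImplementedError(
                "Pipeline parallelism is not supported for this model. "
                "Supported models implement the `SupportsPP` interface.")
        if use_async_output_proc:
            logger.warning("Async output processor is not supported with "
                           "pipeline parallelism currently. Disabling it.")
            use_async_output_proc = False
    return use_async_output_proc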