[Model] Support NVLM-D and fix QK Norm in InternViT (#9045)
Co-authored-by: Roger Wang <ywang@roblox.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -16,6 +16,7 @@ from .interfaces_base import is_embedding_model, is_text_generation_model
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# yapf: disable
|
||||
_TEXT_GENERATION_MODELS = {
|
||||
# [Decoder-only]
|
||||
"AquilaModel": ("llama", "LlamaForCausalLM"),
|
||||
@@ -68,8 +69,6 @@ _TEXT_GENERATION_MODELS = {
|
||||
"PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
|
||||
"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
|
||||
"Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
|
||||
"Qwen2VLForConditionalGeneration":
|
||||
("qwen2_vl", "Qwen2VLForConditionalGeneration"),
|
||||
"RWForCausalLM": ("falcon", "FalconForCausalLM"),
|
||||
"StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
|
||||
"StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
|
||||
@@ -88,32 +87,25 @@ _EMBEDDING_MODELS = {
|
||||
}
|
||||
|
||||
_MULTIMODAL_MODELS = {
|
||||
"Blip2ForConditionalGeneration":
|
||||
("blip2", "Blip2ForConditionalGeneration"),
|
||||
"ChameleonForConditionalGeneration":
|
||||
("chameleon", "ChameleonForConditionalGeneration"),
|
||||
# [Decoder-only]
|
||||
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
|
||||
"ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501
|
||||
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
|
||||
"InternVLChatModel": ("internvl", "InternVLChatModel"),
|
||||
"LlavaForConditionalGeneration": ("llava",
|
||||
"LlavaForConditionalGeneration"),
|
||||
"LlavaNextForConditionalGeneration": ("llava_next",
|
||||
"LlavaNextForConditionalGeneration"),
|
||||
"LlavaNextVideoForConditionalGeneration":
|
||||
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
|
||||
"LlavaOnevisionForConditionalGeneration":
|
||||
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
|
||||
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
|
||||
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
|
||||
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
|
||||
"LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501
|
||||
"MiniCPMV": ("minicpmv", "MiniCPMV"),
|
||||
"PaliGemmaForConditionalGeneration": ("paligemma",
|
||||
"PaliGemmaForConditionalGeneration"),
|
||||
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
|
||||
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501
|
||||
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
||||
"PixtralForConditionalGeneration": ("pixtral",
|
||||
"PixtralForConditionalGeneration"),
|
||||
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
|
||||
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
|
||||
"Qwen2VLForConditionalGeneration": ("qwen2_vl",
|
||||
"Qwen2VLForConditionalGeneration"),
|
||||
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
|
||||
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
||||
"MllamaForConditionalGeneration": ("mllama",
|
||||
"MllamaForConditionalGeneration"),
|
||||
# [Encoder-decoder]
|
||||
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501
|
||||
}
|
||||
|
||||
_SPECULATIVE_DECODING_MODELS = {
|
||||
@@ -121,6 +113,7 @@ _SPECULATIVE_DECODING_MODELS = {
|
||||
"MedusaModel": ("medusa", "Medusa"),
|
||||
"MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
_MODELS = {
|
||||
**_TEXT_GENERATION_MODELS,
|
||||
|
||||
Reference in New Issue
Block a user