Revert "Add GlmOcrConfig for GLM-OCR model type recognition" (#35512)

This commit is contained in:
Harry Mellor
2026-02-27 14:13:27 +00:00
committed by GitHub
parent 66c1751d13
commit fbe3f0120a
3 changed files with 0 additions and 96 deletions

@@ -82,7 +82,6 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     deepseek_v32="DeepseekV3Config",
     flex_olmo="FlexOlmoConfig",
     funaudiochat="FunAudioChatConfig",
-    glm_ocr="GlmOcrConfig",
     hunyuan_vl="HunYuanVLConfig",
     isaac="IsaacConfig",
     kimi_linear="KimiLinearConfig",

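For orientation, _CONFIG_REGISTRY maps the model_type string a checkpoint declares in its config.json to the name of the config class that should parse it, which is why the revert drops the "glm_ocr" key here. A minimal sketch of that style of lookup, with a hypothetical helper name and a trimmed-down table (not vLLM's actual LazyConfigDict code):

_MODEL_TYPE_TO_CONFIG = {
    "hunyuan_vl": "HunYuanVLConfig",
    "isaac": "IsaacConfig",
}

def config_class_name_for(model_type: str) -> str:
    # After this revert, "glm_ocr" is absent from the table, so GLM-OCR
    # checkpoints no longer get a vLLM-side config override through this path.
    if model_type not in _MODEL_TYPE_TO_CONFIG:
        raise KeyError(f"no vLLM config override for model_type {model_type!r}")
    return _MODEL_TYPE_TO_CONFIG[model_type]
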

@@ -28,8 +28,6 @@ _CLASS_TO_MODULE: dict[str, str] = {
     "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
     "FunAudioChatConfig": "vllm.transformers_utils.configs.funaudiochat",
     "FunAudioChatAudioEncoderConfig": "vllm.transformers_utils.configs.funaudiochat",
-    "GlmOcrConfig": "vllm.transformers_utils.configs.glm_ocr",
-    "GlmOcrVisionConfig": "vllm.transformers_utils.configs.glm_ocr",
     "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
     "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
@@ -85,8 +83,6 @@ __all__ = [
     "FlexOlmoConfig",
     "FunAudioChatConfig",
     "FunAudioChatAudioEncoderConfig",
-    "GlmOcrConfig",
-    "GlmOcrVisionConfig",
     "HunYuanVLConfig",
     "HunYuanVLTextConfig",
     "HunYuanVLVisionConfig",

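The two tables in this file work together: _CLASS_TO_MODULE records which submodule owns each class name so its import can be deferred, and __all__ re-exports the name from the package, so both entries have to go. A sketch of how such a table can drive a lazy attribute lookup, with names chosen for illustration (not necessarily how vllm.transformers_utils.configs implements it):

import importlib

_CLASS_TO_MODULE = {
    "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
}

def __getattr__(name: str):
    # Module-level __getattr__ (PEP 562): import the owning submodule only
    # when the class is first requested, then hand back the attribute.
    if name in _CLASS_TO_MODULE:
        module = importlib.import_module(_CLASS_TO_MODULE[name])
        return getattr(module, name)
    raise AttributeError(name)
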

@@ -1,91 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
from typing import Any
from transformers.configuration_utils import PretrainedConfig
class GlmOcrVisionConfig(PretrainedConfig):
model_type = "glm_ocr_vision"
def __init__(
self,
hidden_size: int = 1024,
depth: int = 24,
num_heads: int = 16,
attention_bias: bool = True,
intermediate_size: int = 4096,
hidden_act: str = "silu",
hidden_dropout_prob: float = 0.0,
initializer_range: float = 0.02,
image_size: int = 336,
in_channels: int = 3,
patch_size: int = 14,
out_hidden_size: int = 1536,
rms_norm_eps: float = 1e-5,
spatial_merge_size: int = 2,
temporal_patch_size: int = 2,
**kwargs: Any,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.depth = depth
self.num_heads = num_heads
self.attention_bias = attention_bias
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.initializer_range = initializer_range
self.image_size = image_size
self.in_channels = in_channels
self.patch_size = patch_size
self.out_hidden_size = out_hidden_size
self.rms_norm_eps = rms_norm_eps
self.spatial_merge_size = spatial_merge_size
self.temporal_patch_size = temporal_patch_size
class GlmOcrConfig(PretrainedConfig):
model_type = "glm_ocr"
def __init__(
self,
text_config: dict | None = None,
vision_config: dict | None = None,
image_start_token_id: int = 59256,
image_end_token_id: int = 59257,
video_start_token_id: int = 59258,
video_end_token_id: int = 59259,
image_token_id: int = 59280,
video_token_id: int = 59281,
**kwargs: Any,
):
super().__init__(**kwargs)
self.image_start_token_id = image_start_token_id
self.image_end_token_id = image_end_token_id
self.video_start_token_id = video_start_token_id
self.video_end_token_id = video_end_token_id
self.image_token_id = image_token_id
self.video_token_id = video_token_id
self.vision_config = GlmOcrVisionConfig(**(vision_config or {}))
if isinstance(text_config, dict):
from transformers import AutoConfig
model_type = text_config.get("model_type", "chatglm")
self.text_config = AutoConfig.for_model(model_type, **text_config)
elif text_config is None:
from transformers import AutoConfig
self.text_config = AutoConfig.for_model("chatglm")
else:
self.text_config = text_config
def get_text_config(self) -> PretrainedConfig:
return self.text_config
def save_pretrained(self, save_directory, **kwargs):
self._auto_class = None
super().save_pretrained(save_directory, **kwargs)
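
The deleted module above is self-contained, so the class could be exercised directly. A small usage sketch, runnable only against the pre-revert tree since GlmOcrConfig no longer exists after this commit; "llama" stands in for the text backbone here purely because it is a model_type that transformers' AutoConfig recognizes:

from vllm.transformers_utils.configs.glm_ocr import GlmOcrConfig  # pre-revert import path

config = GlmOcrConfig(
    vision_config={"hidden_size": 1024, "patch_size": 14},
    text_config={"model_type": "llama", "hidden_size": 2048},
)
assert config.vision_config.patch_size == 14        # nested GlmOcrVisionConfig
assert config.image_token_id == 59280                # default special-token id
assert config.get_text_config().hidden_size == 2048  # AutoConfig-built backbone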