[Model] Upstream Deepseek-OCR model (#27247)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
Isotr0py
2025-10-22 22:59:15 +08:00
committed by GitHub
parent 3ae082c373
commit 675aa2ec64
10 changed files with 1821 additions and 40 deletions

View File

@@ -101,9 +101,10 @@ class MlpProjector(nn.Module):
super().__init__()
self.cfg = cfg
self.projector_type = cfg.projector_type
assert not cfg.token_pooling, "Token pooling is not supported currently."
if cfg.projector_type == "downsample_mlp_gelu":
if self.projector_type == "downsample_mlp_gelu":
mlp_depth = cfg.depth
mlp_ratio = cfg.mlp_ratio
modules = [
@@ -120,7 +121,8 @@ class MlpProjector(nn.Module):
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
modules = nn.Sequential(*modules)
elif self.projector_type == "linear":
modules = nn.Linear(cfg.input_dim, cfg.n_embed)
else:
raise NotImplementedError(
f"Unsupported projector type: {cfg.projector_type}"
@@ -130,24 +132,25 @@ class MlpProjector(nn.Module):
def forward(self, x):
bs, hw, input_dim = x.shape
h = w = int((hw) ** 0.5)
"""compute padding"""
if h % self.cfg.downsample_ratio:
pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
else:
pad = 0
x = x.reshape(bs, h, w, input_dim)
if pad > 0:
x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
"""4 to 1 concat"""
x = x.permute(0, 3, 1, 2) # B, C, H, W
x = F.unfold(
x,
kernel_size=self.cfg.downsample_ratio,
stride=self.cfg.downsample_ratio,
padding=0,
) # B, C*4, HW // 4
x = x.permute(0, 2, 1)
if self.projector_type == "downsample_mlp_gelu":
h = w = int((hw) ** 0.5)
"""compute padding"""
if h % self.cfg.downsample_ratio:
pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
else:
pad = 0
x = x.reshape(bs, h, w, input_dim)
if pad > 0:
x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
"""4 to 1 concat"""
x = x.permute(0, 3, 1, 2) # B, C, H, W
x = F.unfold(
x,
kernel_size=self.cfg.downsample_ratio,
stride=self.cfg.downsample_ratio,
padding=0,
) # B, C*4, HW // 4
x = x.permute(0, 2, 1)
return self.layers(x)