[Bugfix] Fix "RuntimeError: Already borrowed" that degrades VLM serving throughput under concurrent load. (#36557)

Signed-off-by: hallerite <hallerite@users.noreply.github.com>
Signed-off-by: hallerite <git@hallerite.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
hallerite
2026-03-09 22:30:51 -07:00
committed by GitHub
parent 195c997203
commit d0cd736caa

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import copy
import time
from abc import ABC, abstractmethod
from collections.abc import Mapping, Sequence
@@ -90,10 +91,17 @@ class BaseRenderer(ABC, Generic[_T]):
mm_processor_cache = mm_registry.processor_cache_from_config(config)
# Deep-copy the tokenizer so the multimodal processor gets its
# own Rust tokenizer backend. Without this, concurrent access
# from AsyncMicrobatchTokenizer and call_hf_processor causes
# "RuntimeError: Already borrowed" from the Rust RefCell.
# See: https://github.com/huggingface/tokenizers/issues/537
mm_tokenizer = copy.deepcopy(tokenizer)
with set_default_torch_num_threads():
self.mm_processor = mm_registry.create_processor(
config.model_config,
tokenizer=tokenizer,
tokenizer=mm_tokenizer,
cache=mm_processor_cache,
)