From 16abe6b85a3fbf6312afe3514797ec741c10a178 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 12 Jan 2026 10:28:16 -0800 Subject: [PATCH] [Misc] Set default torch num threads for input processing (#31879) Signed-off-by: Roger Wang --- vllm/model_executor/models/internvl.py | 16 +--------------- vllm/v1/engine/input_processor.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 15f7d4f41..048bc49ea 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -7,7 +7,6 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -import os from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence from typing import Annotated, Any, Literal, TypeAlias, TypeVar @@ -52,7 +51,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.tokenizers import TokenizerLike from vllm.utils.tensor_schema import TensorSchema, TensorShape -from vllm.utils.torch_utils import set_default_torch_num_threads from .interfaces import ( MultiModalEmbeddings, @@ -143,19 +141,7 @@ def build_transform(input_size: int): T.Normalize(mean=MEAN, std=STD), ] ) - # Image transformation operations (which include tensor computations - # on the CPU) can occupy a substantial number of CPU cores, introducing - # overhead due to CPU contention. This issue becomes particularly - # noticeable when deploying multiple vLLM instances on a single machine. - # Therefore, it is necessary to limit the number of threads allocated to - # image transformation tasks.
- num_threads = int(os.environ.get("OMP_NUM_THREADS", "1")) - - def apply(img): - with set_default_torch_num_threads(num_threads): - return transform(img) - - return apply + return transform # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 7b0b06baa..573047e20 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os import time from collections.abc import Mapping from typing import Any, Literal, cast @@ -23,6 +24,7 @@ from vllm.sampling_params import _SAMPLING_EPS, SamplingParams from vllm.tokenizers import TokenizerLike from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid +from vllm.utils.torch_utils import set_default_torch_num_threads from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats from vllm.v1.structured_output.backend_guidance import ( @@ -493,7 +495,15 @@ class InputProcessor: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. - with set_request_id(request_id): + num_threads = int(os.environ.get("OMP_NUM_THREADS", "1")) + if "OMP_NUM_THREADS" not in os.environ: + logger.debug_once( + "OMP_NUM_THREADS is not set; defaulting Torch threads to %d for " + "input preprocessing.", + num_threads, + ) + + with set_request_id(request_id), set_default_torch_num_threads(num_threads): processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, tokenization_kwargs=tokenization_kwargs,