[Misc] Set default torch num threads for input processing (#31879)
Signed-off-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
@@ -7,7 +7,6 @@
|
||||
# Copyright (c) 2023 OpenGVLab
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Annotated, Any, Literal, TypeAlias, TypeVar
|
||||
@@ -52,7 +51,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
|
||||
from .interfaces import (
|
||||
MultiModalEmbeddings,
|
||||
@@ -143,19 +141,7 @@ def build_transform(input_size: int):
|
||||
T.Normalize(mean=MEAN, std=STD),
|
||||
]
|
||||
)
|
||||
# Image transformation operations (which include tensor computations
|
||||
# on the CPU) can occupy a substantial number of CPU cores, introducing
|
||||
# overhead due to CPU contention. This issue becomes particularly
|
||||
# noticeable when deploying multiple vLLM instances on a single machine.
|
||||
# Therefore, it is necessary to limit the number of threads allocated to
|
||||
# image transformation tasks.
|
||||
num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
|
||||
|
||||
def apply(img):
|
||||
with set_default_torch_num_threads(num_threads):
|
||||
return transform(img)
|
||||
|
||||
return apply
|
||||
return transform
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Mapping
|
||||
from typing import Any, Literal, cast
|
||||
@@ -23,6 +24,7 @@ from vllm.sampling_params import _SAMPLING_EPS, SamplingParams
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.metrics.stats import MultiModalCacheStats
|
||||
from vllm.v1.structured_output.backend_guidance import (
|
||||
@@ -493,7 +495,15 @@ class InputProcessor:
|
||||
# 1. Tokenize text prompt, with LoRA request if one exists.
|
||||
# 2. For multimodal models with a merged preprocessor, preprocess
|
||||
# multimodal data and expand prompt token ids accordingly.
|
||||
with set_request_id(request_id):
|
||||
num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
|
||||
if "OMP_NUM_THREADS" not in os.environ:
|
||||
logger.debug_once(
|
||||
"OMP_NUM_THREADS is not set; defaulting Torch threads to %d for "
|
||||
"input preprocessing.",
|
||||
num_threads,
|
||||
)
|
||||
|
||||
with set_request_id(request_id), set_default_torch_num_threads(num_threads):
|
||||
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
||||
prompt,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
|
||||
Reference in New Issue
Block a user