[CPU] Replace OMP initialization (#36487)
Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
This commit is contained in:
@@ -23,22 +23,22 @@ if [ "$failed_req" -ne 0 ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "--- DP+TP"
|
#echo "--- DP+TP"
|
||||||
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
|
#vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
|
||||||
server_pid=$!
|
#server_pid=$!
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
#timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||||
vllm bench serve \
|
#vllm bench serve \
|
||||||
--backend vllm \
|
# --backend vllm \
|
||||||
--dataset-name random \
|
# --dataset-name random \
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
# --model meta-llama/Llama-3.2-3B-Instruct \
|
||||||
--num-prompts 20 \
|
# --num-prompts 20 \
|
||||||
--result-dir ./test_results \
|
# --result-dir ./test_results \
|
||||||
--result-filename dp_pp.json \
|
# --result-filename dp_pp.json \
|
||||||
--save-result \
|
# --save-result \
|
||||||
--endpoint /v1/completions
|
# --endpoint /v1/completions
|
||||||
kill -s SIGTERM $server_pid; wait $server_pid || true
|
#kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||||
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
|
#failed_req=$(jq '.failed' ./test_results/dp_pp.json)
|
||||||
if [ "$failed_req" -ne 0 ]; then
|
#if [ "$failed_req" -ne 0 ]; then
|
||||||
echo "Some requests were failed!"
|
# echo "Some requests were failed!"
|
||||||
exit 1
|
# exit 1
|
||||||
fi
|
#fi
|
||||||
|
|||||||
@@ -8,8 +8,6 @@
|
|||||||
// libraries use different ISAs.
|
// libraries use different ISAs.
|
||||||
#define TORCH_EXTENSION_NAME _C
|
#define TORCH_EXTENSION_NAME _C
|
||||||
|
|
||||||
std::string init_cpu_threads_env(const std::string& cpu_ids);
|
|
||||||
|
|
||||||
void release_dnnl_matmul_handler(int64_t handler);
|
void release_dnnl_matmul_handler(int64_t handler);
|
||||||
|
|
||||||
int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b,
|
int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b,
|
||||||
@@ -354,7 +352,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
"str act, str isa) -> ()");
|
"str act, str isa) -> ()");
|
||||||
ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe);
|
ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe);
|
||||||
#endif
|
#endif
|
||||||
ops.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
|
|
||||||
ops.def(
|
ops.def(
|
||||||
"mla_decode_kvcache("
|
"mla_decode_kvcache("
|
||||||
" Tensor! out, Tensor query, Tensor kv_cache,"
|
" Tensor! out, Tensor query, Tensor kv_cache,"
|
||||||
|
|||||||
@@ -21,150 +21,6 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef VLLM_NUMA_DISABLED
|
|
||||||
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|
||||||
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
|
|
||||||
TORCH_CHECK(omp_cpu_mask != nullptr,
|
|
||||||
"Failed to parse CPU string: " + cpu_ids);
|
|
||||||
TORCH_CHECK(omp_cpu_mask->size > 0);
|
|
||||||
std::vector<int> omp_cpu_ids;
|
|
||||||
omp_cpu_ids.reserve(omp_cpu_mask->size);
|
|
||||||
|
|
||||||
constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp);
|
|
||||||
|
|
||||||
for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) {
|
|
||||||
unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size];
|
|
||||||
int i = 0;
|
|
||||||
while (group_mask) {
|
|
||||||
if (group_mask & 1) {
|
|
||||||
omp_cpu_ids.emplace_back(offset + i);
|
|
||||||
}
|
|
||||||
++i;
|
|
||||||
group_mask >>= 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Memory node binding
|
|
||||||
if (numa_available() != -1) {
|
|
||||||
std::set<int> node_ids;
|
|
||||||
for (const auto& cpu_id : omp_cpu_ids) {
|
|
||||||
int node_id = numa_node_of_cpu(cpu_id);
|
|
||||||
if (node_id != -1) {
|
|
||||||
node_ids.insert(node_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Concatenate all node_ids into a single comma-separated string
|
|
||||||
if (!node_ids.empty()) {
|
|
||||||
std::string node_ids_str;
|
|
||||||
for (const int node_id : node_ids) {
|
|
||||||
if (!node_ids_str.empty()) {
|
|
||||||
node_ids_str += ",";
|
|
||||||
}
|
|
||||||
node_ids_str += std::to_string(node_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
|
|
||||||
bitmask* src_mask = numa_get_mems_allowed();
|
|
||||||
|
|
||||||
int pid = getpid();
|
|
||||||
|
|
||||||
if (mask && src_mask) {
|
|
||||||
// move all existing pages to the specified numa node.
|
|
||||||
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
|
|
||||||
int page_num = numa_migrate_pages(pid, src_mask, mask);
|
|
||||||
if (page_num == -1) {
|
|
||||||
TORCH_WARN("numa_migrate_pages failed. errno: " +
|
|
||||||
std::to_string(errno));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Restrict memory allocation to the selected NUMA node(s).
|
|
||||||
// Enhances memory locality for the threads bound to those NUMA CPUs.
|
|
||||||
if (node_ids.size() > 1) {
|
|
||||||
errno = 0;
|
|
||||||
numa_set_interleave_mask(mask);
|
|
||||||
if (errno != 0) {
|
|
||||||
TORCH_WARN("numa_set_interleave_mask failed. errno: " +
|
|
||||||
std::to_string(errno));
|
|
||||||
} else {
|
|
||||||
TORCH_WARN(
|
|
||||||
"NUMA binding: Using INTERLEAVE policy for memory "
|
|
||||||
"allocation across multiple NUMA nodes (nodes: " +
|
|
||||||
node_ids_str +
|
|
||||||
"). Memory allocations will be "
|
|
||||||
"interleaved across the specified NUMA nodes.");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
errno = 0;
|
|
||||||
numa_set_membind(mask);
|
|
||||||
if (errno != 0) {
|
|
||||||
TORCH_WARN("numa_set_membind failed. errno: " +
|
|
||||||
std::to_string(errno));
|
|
||||||
} else {
|
|
||||||
TORCH_WARN(
|
|
||||||
"NUMA binding: Using MEMBIND policy for memory "
|
|
||||||
"allocation on the NUMA nodes (" +
|
|
||||||
node_ids_str +
|
|
||||||
"). Memory allocations will be "
|
|
||||||
"strictly bound to these NUMA nodes.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
numa_set_strict(1);
|
|
||||||
|
|
||||||
numa_free_nodemask(mask);
|
|
||||||
numa_free_nodemask(src_mask);
|
|
||||||
} else {
|
|
||||||
TORCH_WARN(
|
|
||||||
"numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
|
|
||||||
std::to_string(errno));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// OMP threads binding
|
|
||||||
omp_set_num_threads((int)omp_cpu_ids.size());
|
|
||||||
torch::set_num_threads((int)omp_cpu_ids.size());
|
|
||||||
TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
|
|
||||||
TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
|
|
||||||
|
|
||||||
std::vector<std::pair<int, int>> thread_core_mapping;
|
|
||||||
thread_core_mapping.reserve(omp_cpu_ids.size());
|
|
||||||
omp_lock_t writelock;
|
|
||||||
omp_init_lock(&writelock);
|
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static, 1)
|
|
||||||
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
|
|
||||||
cpu_set_t mask;
|
|
||||||
CPU_ZERO(&mask);
|
|
||||||
CPU_SET(omp_cpu_ids[i], &mask);
|
|
||||||
int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
|
|
||||||
if (ret == -1) {
|
|
||||||
TORCH_CHECK(false,
|
|
||||||
"sched_setaffinity failed. errno: " + std::to_string(errno));
|
|
||||||
}
|
|
||||||
|
|
||||||
omp_set_lock(&writelock);
|
|
||||||
thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
|
|
||||||
omp_unset_lock(&writelock);
|
|
||||||
}
|
|
||||||
|
|
||||||
omp_destroy_lock(&writelock);
|
|
||||||
|
|
||||||
numa_free_nodemask(omp_cpu_mask);
|
|
||||||
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << "OMP threads binding of Process " << getpid() << ":\n";
|
|
||||||
std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
|
|
||||||
[](auto&& a, auto&& b) { return a.second < b.second; });
|
|
||||||
for (auto&& item : thread_core_mapping) {
|
|
||||||
ss << "\t"
|
|
||||||
<< "OMP tid: " << item.first << ", core " << item.second << "\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
return ss.str();
|
|
||||||
}
|
|
||||||
#endif // VLLM_NUMA_DISABLED
|
|
||||||
|
|
||||||
namespace cpu_utils {
|
namespace cpu_utils {
|
||||||
ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
|
ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
|
||||||
this->realloc(allocation_unit * 128);
|
this->realloc(allocation_unit * 128);
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import glob
|
import glob
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import subprocess
|
import subprocess
|
||||||
@@ -11,11 +10,11 @@ from dataclasses import dataclass
|
|||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
import regex as re
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import envs
|
from vllm import envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.utils.ompmultiprocessing import OMPProcessManager
|
||||||
from vllm.utils.torch_utils import is_quantized_kv_cache
|
from vllm.utils.torch_utils import is_quantized_kv_cache
|
||||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||||
|
|
||||||
@@ -76,6 +75,10 @@ class CpuPlatform(Platform):
|
|||||||
dispatch_key: str = "CPU"
|
dispatch_key: str = "CPU"
|
||||||
dist_backend: str = "gloo"
|
dist_backend: str = "gloo"
|
||||||
device_control_env_var = "CPU_VISIBLE_MEMORY_NODES"
|
device_control_env_var = "CPU_VISIBLE_MEMORY_NODES"
|
||||||
|
omp_process_manager = None
|
||||||
|
smt = 1 # SMT level for OMP - 4 threads on PowerPC, 1 on others
|
||||||
|
global_cpu_mask = None
|
||||||
|
simulate_numa = int(os.environ.get("_SIM_MULTI_NUMA", 0))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def supported_dtypes(self) -> list[torch.dtype]:
|
def supported_dtypes(self) -> list[torch.dtype]:
|
||||||
@@ -191,26 +194,10 @@ class CpuPlatform(Platform):
|
|||||||
|
|
||||||
cache_config.cpu_kvcache_space_bytes = CpuPlatform.get_device_total_memory()
|
cache_config.cpu_kvcache_space_bytes = CpuPlatform.get_device_total_memory()
|
||||||
|
|
||||||
# reserve at least one core for nixl_connector under p/d case
|
|
||||||
if vllm_config.kv_transfer_config and (
|
|
||||||
envs.VLLM_CPU_NUM_OF_RESERVED_CPU == 0
|
|
||||||
or envs.VLLM_CPU_NUM_OF_RESERVED_CPU is None
|
|
||||||
):
|
|
||||||
os.environ["VLLM_CPU_NUM_OF_RESERVED_CPU"] = "1"
|
|
||||||
|
|
||||||
parallel_config = vllm_config.parallel_config
|
parallel_config = vllm_config.parallel_config
|
||||||
if (
|
# OMP requires the MP executor to function correctly, UniProc is not
|
||||||
parallel_config.world_size > 1
|
# supported as it is not possible to set the OMP environment correctly
|
||||||
and parallel_config.distributed_executor_backend is not None
|
if parallel_config.distributed_executor_backend == "uni":
|
||||||
and parallel_config.distributed_executor_backend != "mp"
|
|
||||||
):
|
|
||||||
logger.warning(
|
|
||||||
(
|
|
||||||
"%s is not supported on CPU, fallback to mp "
|
|
||||||
"distributed executor backend."
|
|
||||||
),
|
|
||||||
parallel_config.distributed_executor_backend,
|
|
||||||
)
|
|
||||||
parallel_config.distributed_executor_backend = "mp"
|
parallel_config.distributed_executor_backend = "mp"
|
||||||
if parallel_config.worker_cls == "auto":
|
if parallel_config.worker_cls == "auto":
|
||||||
parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker"
|
parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker"
|
||||||
@@ -267,14 +254,6 @@ class CpuPlatform(Platform):
|
|||||||
# variable "NUMEXPR_MAX_THREADS" (64)'.
|
# variable "NUMEXPR_MAX_THREADS" (64)'.
|
||||||
os.environ["NUMEXPR_MAX_THREADS"] = str(get_max_threads())
|
os.environ["NUMEXPR_MAX_THREADS"] = str(get_max_threads())
|
||||||
|
|
||||||
if envs.VLLM_CPU_OMP_THREADS_BIND != "nobind":
|
|
||||||
# Set default threads num for OpenMP parallel
|
|
||||||
os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
|
|
||||||
else:
|
|
||||||
# In this case, setting the OpenMP configuration via
|
|
||||||
# OMP_NUM_THREADS is up to the user.
|
|
||||||
logger.info("Disabling binding processes to CPU cores...")
|
|
||||||
|
|
||||||
# Disable torch async compiling which won't work with daemonic processes
|
# Disable torch async compiling which won't work with daemonic processes
|
||||||
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
|
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
|
||||||
|
|
||||||
@@ -286,8 +265,8 @@ class CpuPlatform(Platform):
|
|||||||
|
|
||||||
ld_preload_str = os.getenv("LD_PRELOAD", "")
|
ld_preload_str = os.getenv("LD_PRELOAD", "")
|
||||||
|
|
||||||
# Intel OpenMP setting
|
# Intel and CLANG OpenMP setting
|
||||||
if "libiomp5.so" in ld_preload_str:
|
if "libiomp5.so" in ld_preload_str or "libomp5" in ld_preload_str:
|
||||||
# The time(milliseconds) that a thread should wait after
|
# The time(milliseconds) that a thread should wait after
|
||||||
# completing the execution of a parallel region, before sleeping.
|
# completing the execution of a parallel region, before sleeping.
|
||||||
os.environ["KMP_BLOCKTIME"] = "1"
|
os.environ["KMP_BLOCKTIME"] = "1"
|
||||||
@@ -324,37 +303,6 @@ class CpuPlatform(Platform):
|
|||||||
ld_preload_str = tcmalloc_so
|
ld_preload_str = tcmalloc_so
|
||||||
os.environ["LD_PRELOAD"] = ld_preload_str
|
os.environ["LD_PRELOAD"] = ld_preload_str
|
||||||
|
|
||||||
if (
|
|
||||||
platform.system() == "Linux"
|
|
||||||
and cpu_architecture in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
|
|
||||||
and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
|
|
||||||
):
|
|
||||||
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
|
|
||||||
# one core will be properly utilized when we thread-bind
|
|
||||||
# See: https://github.com/vllm-project/vllm/issues/27369
|
|
||||||
# TODO: Remove once:
|
|
||||||
# https://github.com/pytorch/pytorch/issues/166087 is fixed
|
|
||||||
|
|
||||||
# We need to find the location of PyTorch's libgomp
|
|
||||||
torch_pkg = os.path.dirname(torch.__file__)
|
|
||||||
site_root = os.path.dirname(torch_pkg)
|
|
||||||
# Search both torch.libs and torch/lib - See: https://github.com/vllm-project/vllm/issues/30470
|
|
||||||
torch_libs_paths = [
|
|
||||||
os.path.join(site_root, "torch.libs"),
|
|
||||||
os.path.join(torch_pkg, "lib"),
|
|
||||||
]
|
|
||||||
pytorch_libgomp_so_candidates = []
|
|
||||||
for torch_libs in torch_libs_paths:
|
|
||||||
pytorch_libgomp_so_candidates.extend(
|
|
||||||
glob.glob(os.path.join(torch_libs, "libgomp*.so*"))
|
|
||||||
)
|
|
||||||
if pytorch_libgomp_so_candidates:
|
|
||||||
pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
|
|
||||||
if ld_preload_str:
|
|
||||||
ld_preload_str += ":"
|
|
||||||
ld_preload_str += pytorch_libgomp_so
|
|
||||||
os.environ["LD_PRELOAD"] = ld_preload_str
|
|
||||||
|
|
||||||
os.environ["LOCAL_WORLD_SIZE"] = str(
|
os.environ["LOCAL_WORLD_SIZE"] = str(
|
||||||
vllm_config.parallel_config.tensor_parallel_size
|
vllm_config.parallel_config.tensor_parallel_size
|
||||||
)
|
)
|
||||||
@@ -369,6 +317,13 @@ class CpuPlatform(Platform):
|
|||||||
vllm_config.model_config.max_model_len,
|
vllm_config.model_config.max_model_len,
|
||||||
vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
||||||
)
|
)
|
||||||
|
# CI specific "quick" NUMA simulation - split all available CPUs
|
||||||
|
# into a fake NUMA topology
|
||||||
|
if os.environ.get("VLLM_CPU_SIM_MULTI_NUMA", None) is not None:
|
||||||
|
os.environ["_SIM_MULTI_NUMA"] = str(
|
||||||
|
vllm_config.parallel_config.world_size
|
||||||
|
* vllm_config.parallel_config._api_process_count
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
|
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
|
||||||
@@ -377,46 +332,71 @@ class CpuPlatform(Platform):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_allowed_cpu_core_node_list(cls) -> tuple[list[int], list[LogicalCPUInfo]]:
|
def get_omp_manager(cls) -> OMPProcessManager:
|
||||||
assert platform.system() == "Linux"
|
# initialise the OMP resource management if need be and return the manager
|
||||||
|
if cls.omp_process_manager is None:
|
||||||
|
if cls.get_cpu_architecture() == CpuArchEnum.POWERPC:
|
||||||
|
cls.smt = 4
|
||||||
|
cls.omp_process_manager = OMPProcessManager(
|
||||||
|
affinity=cls.get_global_cpu_mask(), smt=cls.smt
|
||||||
|
)
|
||||||
|
# we need to fix up the topology returned by the OMP Manager for
|
||||||
|
# simulated NUMA environments in CI
|
||||||
|
if cls.simulate_numa > 0:
|
||||||
|
logger.info(
|
||||||
|
"Adjusting numa topology to resemble at least %d nodes",
|
||||||
|
int(cls.simulate_numa),
|
||||||
|
)
|
||||||
|
om = cls.omp_process_manager
|
||||||
|
while len(om.omp_places) < cls.simulate_numa:
|
||||||
|
new_omp_places = []
|
||||||
|
touched = False
|
||||||
|
for omp_place in om.omp_places:
|
||||||
|
if len(omp_place["mask"]) > 1:
|
||||||
|
touched = True
|
||||||
|
cpu_list = sorted(list(omp_place["mask"]))
|
||||||
|
new_omp_places.append(
|
||||||
|
{
|
||||||
|
"mask": set(cpu_list[0 : int(len(cpu_list) / 2)]),
|
||||||
|
"available": True,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
new_omp_places.append(
|
||||||
|
{
|
||||||
|
"mask": set(cpu_list[int(len(cpu_list) / 2) :]),
|
||||||
|
"available": True,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if touched:
|
||||||
|
om.omp_places = new_omp_places
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot split the existing NUMA topology to match "
|
||||||
|
"simulation requirements"
|
||||||
|
)
|
||||||
|
|
||||||
# Init LogicalCPUInfo from lscpu
|
return cls.omp_process_manager
|
||||||
lscpu_output = subprocess.check_output(
|
|
||||||
"lscpu -J -e=CPU,CORE,NODE", shell=True, text=True
|
@classmethod
|
||||||
|
def get_global_cpu_mask(cls) -> set[int]:
|
||||||
|
# get global cpu mask
|
||||||
|
if cls.global_cpu_mask is None:
|
||||||
|
cls.global_cpu_mask = os.sched_getaffinity(0)
|
||||||
|
return cls.global_cpu_mask
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def reserve_cpus(cls, reserve: set[int]) -> bool:
|
||||||
|
# remove CPUs from global mask, for now there is no "release" mechanism
|
||||||
|
if cls.omp_process_manager is not None:
|
||||||
|
for place in cls.omp_process_manager.omp_places:
|
||||||
|
if not place["available"]:
|
||||||
|
return False
|
||||||
|
cls.global_cpu_mask = cls.get_global_cpu_mask() - reserve
|
||||||
|
# reinitialize OMP resource management
|
||||||
|
cls.omp_process_manager = OMPProcessManager(
|
||||||
|
affinity=cls.global_cpu_mask, smt=cls.smt
|
||||||
)
|
)
|
||||||
lscpu_output = re.sub(r'"node":\s*-\s*(,|\n)', r'"node": 0\1', lscpu_output)
|
return True
|
||||||
logical_cpu_list: list[LogicalCPUInfo] = json.loads(
|
|
||||||
lscpu_output, object_hook=LogicalCPUInfo.json_decoder
|
|
||||||
)["cpus"]
|
|
||||||
|
|
||||||
# Filter CPUs with invalid attributes
|
|
||||||
logical_cpu_list = [
|
|
||||||
x
|
|
||||||
for x in logical_cpu_list
|
|
||||||
if -1 not in (x.id, x.physical_core, x.numa_node)
|
|
||||||
]
|
|
||||||
|
|
||||||
# Filter allowed CPUs
|
|
||||||
if hasattr(os, "sched_getaffinity"):
|
|
||||||
allowed_cpu_id_list = os.sched_getaffinity(0)
|
|
||||||
else:
|
|
||||||
raise NotImplementedError("Unsupported OS")
|
|
||||||
logical_cpu_list = [x for x in logical_cpu_list if x.id in allowed_cpu_id_list]
|
|
||||||
|
|
||||||
# Get allowed NUMA nodes
|
|
||||||
allowed_numa_nodes = set()
|
|
||||||
for x in logical_cpu_list:
|
|
||||||
allowed_numa_nodes.add(x.numa_node) # type: ignore
|
|
||||||
allowed_numa_nodes_list = sorted(allowed_numa_nodes)
|
|
||||||
|
|
||||||
env_key = CpuPlatform.device_control_env_var
|
|
||||||
if env_key in os.environ and os.environ[env_key] != "":
|
|
||||||
visible_nodes = [int(s) for s in os.environ[env_key].split(",")]
|
|
||||||
allowed_numa_nodes_list = [
|
|
||||||
x for x in sorted(list(set(visible_nodes))) if x in allowed_numa_nodes
|
|
||||||
]
|
|
||||||
|
|
||||||
return allowed_numa_nodes_list, logical_cpu_list
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def discover_numa_topology(cls) -> list[list[int]]:
|
def discover_numa_topology(cls) -> list[list[int]]:
|
||||||
|
|||||||
174
vllm/utils/ompmultiprocessing.py
Normal file
174
vllm/utils/ompmultiprocessing.py
Normal file
@@ -0,0 +1,174 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""OMP Aware Multiprocessing manager for running multiprocessing.Process()
|
||||||
|
Copyright (c) 2026 Red Hat Inc
|
||||||
|
Copyright (c) 2026 Cambridge Greys Ltd
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def _int(arg):
|
||||||
|
"""Relaxed parsing of ints which handles a - instead of a number.
|
||||||
|
The lscpu json may contain that for nodes in some cases. If that
|
||||||
|
is the case we parse it to zero
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if int(arg) >= 0:
|
||||||
|
return int(arg)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def parse_mask(mask):
    """Expand a comma separated ``X-Y,Z`` CPU list into a set of ints.

    Args:
        mask: String such as ``"0-3,8"``.

    Returns:
        Set containing every integer named by the list; ranges are inclusive.

    Raises:
        IndexError: A range has its start greater than its end.
        ValueError: A token is neither an integer nor an ``X-Y`` range.
    """
    cpus = set()
    for item in mask.split(","):
        try:
            # Unpacking fails with ValueError for anything that is not
            # exactly "<low>-<high>", which routes plain numbers to the
            # single-value branch below.
            low, high = item.split("-")
            first = int(low)
            last = int(high)
            if first > last:
                raise IndexError("Invalid Indexes for cpu ranges")
            cpus.update(range(first, last + 1))
        except ValueError:
            cpus.add(int(item))
    return cpus
|
||||||
|
|
||||||
|
|
||||||
|
def enumerate_resources(resource_map, mask=None, allowed=None):
    """Group the usable logical CPUs from lscpu output by cpu, core and node.

    Args:
        resource_map: Decoded lscpu JSON document.  Only its ``"cpus"`` list
            is read; every entry must provide ``cpu``, ``core`` and ``node``.
        mask: Optional set of CPU ids.  When given, only CPUs present in both
            ``mask`` and ``allowed`` are kept.
        allowed: Set of CPU ids the process may use.  Defaults to the
            scheduler affinity of the current process.

    Returns:
        A dict with the keys ``"cpus"``, ``"cores"`` and ``"nodes"``; each
        maps an integer id to the list of lscpu entries belonging to it.

    The ``CPU_VISIBLE_MEMORY_NODES`` environment variable, when set to a
    non-empty ``X-Y,Z`` list, restricts the result to CPUs on those nodes.
    """
    if allowed is None:
        allowed = os.sched_getaffinity(0)
    if mask is not None:
        allowed = allowed & mask

    # An unset *or empty* variable means "no node restriction".  Passing an
    # empty string on to parse_mask() would raise ValueError.
    visible_nodes = os.environ.get("CPU_VISIBLE_MEMORY_NODES", "")
    allowed_nodes = parse_mask(visible_nodes) if visible_nodes.strip() else None

    lscpu: dict[str, dict] = {"cpus": {}, "cores": {}, "nodes": {}}
    for cpu in resource_map["cpus"]:
        cpunum = int(cpu["cpu"])
        if cpunum < 0 or cpunum not in allowed:
            continue
        node = _int(cpu["node"])
        if allowed_nodes is not None and node not in allowed_nodes:
            continue
        lscpu["cpus"][cpunum] = [cpu]
        lscpu["cores"].setdefault(_int(cpu["core"]), []).append(cpu)
        lscpu["nodes"].setdefault(node, []).append(cpu)
    return lscpu
|
||||||
|
|
||||||
|
|
||||||
|
def produce_cpu_list(cpus, smt=1):
    """Produce a CPU list with/without SMT siblings - main cpu list case.

    Args:
        cpus: Mapping of CPU id to a list whose first element is the lscpu
            entry for that CPU (the entry must provide ``"core"``).
        smt: Maximum number of logical CPUs to keep per physical core.

    Returns:
        ``{"mask": <set of selected CPU ids>, "available": True}``.
    """
    # Number of CPUs already selected for each core id.  Counting per core
    # (rather than comparing selected CPU ids with a core id) is what makes
    # the ``smt`` limit effective.
    selected_per_core = {}
    mask: list[int] = []
    for key, value in cpus.items():
        core = value[0]["core"]
        taken = selected_per_core.get(core, 0)
        if taken < smt:
            selected_per_core[core] = taken + 1
            mask.append(int(key))
    return {"mask": set(mask), "available": True}
|
||||||
|
|
||||||
|
|
||||||
|
def produce_cpu_sublist(scpus, smt=1):
    """Produce a CPU list with/without SMT siblings - resource leaf case.

    Args:
        scpus: Iterable of lscpu entries (dicts providing ``"cpu"`` and
            ``"core"``) that belong to one core or one node.
        smt: Maximum number of logical CPUs to keep per physical core.

    Returns:
        ``{"mask": <set of selected CPU ids>, "available": True}``.
    """
    # Number of CPUs already selected for each core id; every sibling is
    # counted so that ``smt`` values above 1 are honoured.
    selected_per_core = {}
    mask = set()
    for entry in scpus:
        try:
            core = int(entry["core"])
        except (TypeError, ValueError):
            # Non-numeric core id: fall back to the raw value as the key.
            core = entry["core"]
        taken = selected_per_core.get(core, 0)
        if taken < smt:
            selected_per_core[core] = taken + 1
            mask.add(int(entry["cpu"]))

    return {"mask": mask, "available": True}
|
||||||
|
|
||||||
|
|
||||||
|
def create_omp_places(resources, strategy, smt=True):
    """Turn enumerated CPU resources into a list of OMP place descriptors.

    Args:
        resources: Dict with ``"cpus"``, ``"cores"`` and ``"nodes"`` entries.
        strategy: ``"all"`` for one place spanning every CPU, ``"cores"`` for
            one place per core, ``"nodes"`` for one place per node.
        smt: SMT limit forwarded to the list builders.

    Returns:
        List of ``{"mask": ..., "available": ...}`` dicts.

    Raises:
        NotImplementedError: ``strategy`` is not one of the values above.
    """
    if strategy == "all":
        return [produce_cpu_list(resources["cpus"], smt)]
    if strategy == "cores":
        return [produce_cpu_sublist(group, smt) for group in resources["cores"].values()]
    if strategy == "nodes":
        return [produce_cpu_sublist(group, smt) for group in resources["nodes"].values()]
    raise NotImplementedError("Unknown strategy")
|
||||||
|
|
||||||
|
|
||||||
|
# pylint: disable=too-few-public-methods
|
||||||
|
class OMPProcessManager:
    """OMP aware wrapper to run mp Process().

    On construction the CPU topology is read from ``lscpu`` (or from a mock
    JSON file) and split into places according to ``strategy``.  Each call to
    :meth:`run` consumes the next free place and exports ``OMP_PLACES``,
    ``OMP_NUM_THREADS`` and ``OMP_PROC_BIND`` before invoking the callable.

    ``VLLM_CPU_OMP_THREADS_BIND=nobind`` disables the setup entirely.  When
    the variable is unset or ``auto`` the whole affinity mask is used; any
    other value is parsed as ``|``-separated CPU lists, one per mask.
    """

    def __init__(self, strategy="nodes", smt=1, mock=None, affinity=None):
        """Discover the topology and build the ordered list of places.

        Args:
            strategy: Place granularity passed to ``create_omp_places``.
            smt: Maximum number of logical CPUs used per physical core.
            mock: Optional path of a file holding lscpu JSON output, read
                instead of running ``lscpu``.
            affinity: Optional set of CPU ids to restrict the topology to.
        """
        self.strategy = strategy
        self.smt = smt
        self.omp_places = []
        vllm_mask = os.environ.get("VLLM_CPU_OMP_THREADS_BIND", None)
        self.setup_omp = vllm_mask != "nobind"
        if not self.setup_omp:
            return

        # "auto" is not a CPU list; handing it to parse_mask() would raise
        # ValueError, so it is treated exactly like an unset variable.
        if vllm_mask is None or vllm_mask.strip() == "auto":
            masks = [None]
        else:
            masks = [parse_mask(spec) for spec in vllm_mask.split("|")]

        if mock is None:
            data = subprocess.run(
                ["lscpu", "-Je"], check=True, capture_output=True
            ).stdout
        else:
            with open(mock, mode="rb") as jf:
                data = jf.read()
        lscpu = json.loads(data)

        omp_places = []
        for mask in masks:
            resources = enumerate_resources(lscpu, mask, affinity)
            omp_places.extend(create_omp_places(resources, strategy, smt))

        # Largest place first, ties broken by the highest CPU id.  A numeric
        # tuple orders correctly for any count and tolerates an empty mask.
        self.omp_places = sorted(
            omp_places,
            key=lambda p: (len(p["mask"]), max(p["mask"], default=-1)),
            reverse=True,
        )

    def run(self, what, *args, **kwargs):
        """Run ``what`` with the OMP environment of the next free place.

        Args:
            what: Callable to invoke.
            *args: Positional arguments forwarded to ``what``.
            **kwargs: Keyword arguments forwarded to ``what``.

        Returns:
            Whatever ``what`` returns.

        Raises:
            IndexError: OMP setup is enabled and every place is already used.
        """
        if not self.setup_omp:
            return what(*args, **kwargs)

        for place in self.omp_places:
            if not place["available"]:
                continue
            reserve = int(os.environ.get("VLLM_CPU_NUM_OF_RESERVED_CPU", 0) or 0)
            place["available"] = False
            # pylint: disable=consider-using-f-string
            os.environ["OMP_PLACES"] = "{}".format(place["mask"])
            # Never export zero or a negative thread count, even when the
            # reservation is as large as the place itself.
            os.environ["OMP_NUM_THREADS"] = "{}".format(
                max(1, len(place["mask"]) - reserve)
            )
            os.environ["OMP_PROC_BIND"] = "TRUE"
            return what(*args, **kwargs)
        raise IndexError("Out of OMP places")
|
||||||
@@ -119,7 +119,6 @@ class MultiprocExecutor(Executor):
|
|||||||
f"_parallel_size ({pcp_size}). "
|
f"_parallel_size ({pcp_size}). "
|
||||||
)
|
)
|
||||||
|
|
||||||
# Set multiprocessing envs
|
|
||||||
set_multiprocessing_worker_envs()
|
set_multiprocessing_worker_envs()
|
||||||
|
|
||||||
# use the loopback address get_loopback_ip() for communication.
|
# use the loopback address get_loopback_ip() for communication.
|
||||||
@@ -172,16 +171,31 @@ class MultiprocExecutor(Executor):
|
|||||||
for local_rank in range(self.local_world_size):
|
for local_rank in range(self.local_world_size):
|
||||||
global_rank = global_start_rank + local_rank
|
global_rank = global_start_rank + local_rank
|
||||||
is_driver_worker = self._is_driver_worker(global_rank)
|
is_driver_worker = self._is_driver_worker(global_rank)
|
||||||
unready_worker_handle = WorkerProc.make_worker_process(
|
if current_platform.is_cpu():
|
||||||
vllm_config=self.vllm_config,
|
om = current_platform.get_omp_manager()
|
||||||
local_rank=local_rank,
|
logger.info("Configured OMP PLACES %s", str(om.omp_places))
|
||||||
rank=global_rank,
|
unready_worker_handle = om.run(
|
||||||
distributed_init_method=distributed_init_method,
|
WorkerProc.make_worker_process,
|
||||||
input_shm_handle=scheduler_output_handle,
|
vllm_config=self.vllm_config,
|
||||||
shared_worker_lock=shared_worker_lock,
|
local_rank=local_rank,
|
||||||
is_driver_worker=is_driver_worker,
|
rank=global_rank,
|
||||||
inherited_fds=inherited_fds,
|
distributed_init_method=distributed_init_method,
|
||||||
)
|
input_shm_handle=scheduler_output_handle,
|
||||||
|
shared_worker_lock=shared_worker_lock,
|
||||||
|
is_driver_worker=is_driver_worker,
|
||||||
|
inherited_fds=inherited_fds,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
unready_worker_handle = WorkerProc.make_worker_process(
|
||||||
|
vllm_config=self.vllm_config,
|
||||||
|
local_rank=local_rank,
|
||||||
|
rank=global_rank,
|
||||||
|
distributed_init_method=distributed_init_method,
|
||||||
|
input_shm_handle=scheduler_output_handle,
|
||||||
|
shared_worker_lock=shared_worker_lock,
|
||||||
|
is_driver_worker=is_driver_worker,
|
||||||
|
inherited_fds=inherited_fds,
|
||||||
|
)
|
||||||
unready_workers.append(unready_worker_handle)
|
unready_workers.append(unready_worker_handle)
|
||||||
if inherited_fds is not None:
|
if inherited_fds is not None:
|
||||||
inherited_fds.append(unready_worker_handle.death_writer.fileno())
|
inherited_fds.append(unready_worker_handle.death_writer.fileno())
|
||||||
@@ -1000,24 +1014,26 @@ def set_multiprocessing_worker_envs():
|
|||||||
|
|
||||||
_maybe_force_spawn()
|
_maybe_force_spawn()
|
||||||
|
|
||||||
# Configure thread parallelism if OMP_NUM_THREADS isn't set
|
if not current_platform.is_cpu():
|
||||||
#
|
# Configure thread parallelism if OMP_NUM_THREADS isn't set
|
||||||
# Helps to avoid CPU contention. The default of spawning a thread per
|
#
|
||||||
# core combined with multiprocessing for each GPU can have a negative
|
# Helps to avoid CPU contention. The default of spawning a thread per
|
||||||
# impact on performance. The contention is amplified when running in a
|
# core combined with multiprocessing for each GPU can have a negative
|
||||||
# container where CPU limits can cause throttling.
|
# impact on performance. The contention is amplified when running in a
|
||||||
default_omp_num_threads = 1
|
# container where CPU limits can cause throttling.
|
||||||
if (
|
default_omp_num_threads = 1
|
||||||
"OMP_NUM_THREADS" not in os.environ
|
if (
|
||||||
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
|
"OMP_NUM_THREADS" not in os.environ
|
||||||
):
|
and (current_parallelism := torch.get_num_threads())
|
||||||
logger.warning_once(
|
> default_omp_num_threads
|
||||||
"Reducing Torch parallelism from %d threads to %d to avoid "
|
):
|
||||||
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
|
logger.warning_once(
|
||||||
"external environment to tune this value as needed.",
|
"Reducing Torch parallelism from %d threads to %d to avoid "
|
||||||
current_parallelism,
|
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
|
||||||
default_omp_num_threads,
|
"external environment to tune this value as needed.",
|
||||||
scope="local",
|
current_parallelism,
|
||||||
)
|
default_omp_num_threads,
|
||||||
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
scope="local",
|
||||||
torch.set_num_threads(default_omp_num_threads)
|
)
|
||||||
|
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
||||||
|
torch.set_num_threads(default_omp_num_threads)
|
||||||
|
|||||||
@@ -1,18 +1,14 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import os
|
import os
|
||||||
import platform
|
|
||||||
import sys
|
import sys
|
||||||
from collections.abc import Callable
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import envs
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.platforms import CpuArchEnum, current_platform
|
from vllm.platforms import CpuArchEnum, current_platform
|
||||||
from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
|
|
||||||
from vllm.profiler.wrapper import TorchProfilerWrapper
|
from vllm.profiler.wrapper import TorchProfilerWrapper
|
||||||
from vllm.utils.torch_utils import set_random_seed
|
from vllm.utils.torch_utils import set_random_seed
|
||||||
from vllm.v1.worker.cpu_model_runner import CPUModelRunner
|
from vllm.v1.worker.cpu_model_runner import CPUModelRunner
|
||||||
@@ -71,44 +67,6 @@ class CPUWorker(Worker):
|
|||||||
if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
|
if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
|
||||||
check_preloaded_libs("libiomp")
|
check_preloaded_libs("libiomp")
|
||||||
|
|
||||||
# Setup OpenMP threads affinity.
|
|
||||||
omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
|
|
||||||
# Under numa binding some cores reserved for kv transfer in nixl_connector.py
|
|
||||||
if omp_cpuids == "auto" and platform.system() == "Linux":
|
|
||||||
cpu_arch = current_platform.get_cpu_architecture()
|
|
||||||
if cpu_arch in (CpuArchEnum.POWERPC, CpuArchEnum.S390X):
|
|
||||||
# For S390X/POWERPC SMT-8/4/2
|
|
||||||
self.local_omp_cpuid = self._get_autobind_cpu_ids(
|
|
||||||
lambda cpus: [cpu for cpu in cpus if cpu.id % 8 < 4]
|
|
||||||
)
|
|
||||||
elif cpu_arch == CpuArchEnum.X86:
|
|
||||||
# For x86 SMT-2, use 1 CPU per core
|
|
||||||
self.local_omp_cpuid = self._get_autobind_cpu_ids(
|
|
||||||
lambda cpus: cpus[-1:]
|
|
||||||
)
|
|
||||||
elif cpu_arch == CpuArchEnum.ARM:
|
|
||||||
# For AArch64, no SMT
|
|
||||||
self.local_omp_cpuid = self._get_autobind_cpu_ids(lambda cpus: cpus)
|
|
||||||
else:
|
|
||||||
self.local_omp_cpuid = "nobind"
|
|
||||||
elif omp_cpuids == "nobind":
|
|
||||||
self.local_omp_cpuid = "nobind"
|
|
||||||
else:
|
|
||||||
local_dp_rank = self.parallel_config.data_parallel_rank_local
|
|
||||||
omp_cpuids_list = omp_cpuids.split("|")
|
|
||||||
if local_dp_rank is not None:
|
|
||||||
world_size = self.parallel_config.world_size
|
|
||||||
omp_cpuids_list = omp_cpuids_list[
|
|
||||||
local_dp_rank * world_size : (local_dp_rank + 1) * world_size
|
|
||||||
]
|
|
||||||
self.local_omp_cpuid = omp_cpuids_list[self.rank]
|
|
||||||
|
|
||||||
if self.local_omp_cpuid != "nobind":
|
|
||||||
ret = torch.ops._C.init_cpu_threads_env(self.local_omp_cpuid)
|
|
||||||
if ret:
|
|
||||||
logger.info(ret)
|
|
||||||
|
|
||||||
# After the thread binding, changing thread num is not allowed
|
|
||||||
def skip_set_num_threads(x: int):
|
def skip_set_num_threads(x: int):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"CPU backend doesn't allow to use "
|
"CPU backend doesn't allow to use "
|
||||||
@@ -153,92 +111,6 @@ class CPUWorker(Worker):
|
|||||||
self.model_runner.warming_up_model()
|
self.model_runner.warming_up_model()
|
||||||
return self.compilation_config.compilation_time
|
return self.compilation_config.compilation_time
|
||||||
|
|
||||||
def _get_autobind_cpu_ids(
|
|
||||||
self, cpu_selector: Callable[[list[LogicalCPUInfo]], list[LogicalCPUInfo]]
|
|
||||||
) -> str:
|
|
||||||
"""
|
|
||||||
Return CPU ids to bind based on NUMA nodes.
|
|
||||||
Currently for rank N, only CPU ids on the N-th node in available NUMA
|
|
||||||
node list will be selected.
|
|
||||||
Args:
|
|
||||||
cpu_selector: a callable object to select CPUs from a CPU list
|
|
||||||
of a physical core. The input is a LogicalCPUInfo list, sorted by
|
|
||||||
the LogicalCPUInfo.id. A selected LogicalCPUInfo list should be
|
|
||||||
returned.
|
|
||||||
"""
|
|
||||||
# simulate multiple numa nodes, for testing
|
|
||||||
sim_multi_numa_nodes = os.environ.get("VLLM_CPU_SIM_MULTI_NUMA", "0") != "0"
|
|
||||||
|
|
||||||
allowed_numa_nodes, logical_cpu_list = (
|
|
||||||
CpuPlatform.get_allowed_cpu_core_node_list()
|
|
||||||
)
|
|
||||||
local_world_size = self.parallel_config.local_world_size
|
|
||||||
assert len(allowed_numa_nodes) >= local_world_size or sim_multi_numa_nodes, (
|
|
||||||
f"Not enough allowed NUMA nodes to bind threads of "
|
|
||||||
f"{local_world_size} local CPUWorkers. "
|
|
||||||
f"Allowed NUMA nodes are {allowed_numa_nodes}. "
|
|
||||||
"Please try to bind threads manually."
|
|
||||||
)
|
|
||||||
|
|
||||||
if not sim_multi_numa_nodes:
|
|
||||||
# Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
|
|
||||||
selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore
|
|
||||||
logical_cpu_list = [
|
|
||||||
x for x in logical_cpu_list if x.numa_node == selected_numa_node
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
# This is a bit tricky because the internal DP size
|
|
||||||
# is always 1 for non-MoE models
|
|
||||||
world_size_across_dp = (
|
|
||||||
self.parallel_config.world_size
|
|
||||||
* self.parallel_config._api_process_count
|
|
||||||
)
|
|
||||||
assert len(logical_cpu_list) >= world_size_across_dp
|
|
||||||
logical_cpu_list = sorted(logical_cpu_list, key=lambda x: x.numa_node)
|
|
||||||
sim_cpu_num_per_node = len(logical_cpu_list) // world_size_across_dp
|
|
||||||
assert self.parallel_config.data_parallel_rank_local is not None
|
|
||||||
start_idx = (
|
|
||||||
self.local_rank
|
|
||||||
+ self.parallel_config.world_size
|
|
||||||
* self.parallel_config.data_parallel_rank_local
|
|
||||||
) * sim_cpu_num_per_node
|
|
||||||
logical_cpu_list = logical_cpu_list[
|
|
||||||
start_idx : (start_idx + sim_cpu_num_per_node)
|
|
||||||
]
|
|
||||||
|
|
||||||
# Select CPUs from each physical core via cpu_selector
|
|
||||||
core_to_cpus: dict[int, list[LogicalCPUInfo]] = {}
|
|
||||||
for cpu_info in logical_cpu_list:
|
|
||||||
if cpu_info.physical_core not in core_to_cpus:
|
|
||||||
core_to_cpus[cpu_info.physical_core] = []
|
|
||||||
core_to_cpus[cpu_info.physical_core].append(cpu_info)
|
|
||||||
logical_cpu_list = []
|
|
||||||
for cpu_list in core_to_cpus.values():
|
|
||||||
cpu_list = sorted(cpu_list, key=lambda x: x.id)
|
|
||||||
logical_cpu_list.extend(cpu_selector(cpu_list))
|
|
||||||
logical_cpu_list = sorted(logical_cpu_list, key=lambda x: x.id)
|
|
||||||
|
|
||||||
# Reserve CPUs for other processes
|
|
||||||
reserve_cpu_num = envs.VLLM_CPU_NUM_OF_RESERVED_CPU
|
|
||||||
if reserve_cpu_num is None:
|
|
||||||
need_reserve = (
|
|
||||||
self.parallel_config.world_size > 1
|
|
||||||
or self.parallel_config.data_parallel_size_local > 1
|
|
||||||
)
|
|
||||||
reserve_cpu_num = 1 if need_reserve else 0
|
|
||||||
assert len(logical_cpu_list) > reserve_cpu_num, (
|
|
||||||
f"VLLM_CPU_NUM_OF_RESERVED_CPU ({reserve_cpu_num}) "
|
|
||||||
f"should less than {len(logical_cpu_list)}."
|
|
||||||
)
|
|
||||||
if reserve_cpu_num != 0:
|
|
||||||
logical_cpu_list = logical_cpu_list[:-reserve_cpu_num]
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"auto thread-binding list (id, physical core): %s",
|
|
||||||
[(x.id, x.physical_core) for x in logical_cpu_list],
|
|
||||||
)
|
|
||||||
return ",".join([str(x.id) for x in logical_cpu_list])
|
|
||||||
|
|
||||||
def profile(self, is_start: bool = True, profile_prefix: str | None = None):
|
def profile(self, is_start: bool = True, profile_prefix: str | None = None):
|
||||||
if self.profiler is None:
|
if self.profiler is None:
|
||||||
raise RuntimeError("Profiler is not enabled.")
|
raise RuntimeError("Profiler is not enabled.")
|
||||||
|
|||||||
Reference in New Issue
Block a user