[Refactor] Remove dead or duplicate func utils or variables (#35318)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Wentao Ye
2026-02-26 10:57:56 -05:00
committed by GitHub
parent 111d869069
commit 05972ea7e5
9 changed files with 0 additions and 199 deletions

View File

@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
"sglang": async_request_openai_completions,
"llama.cpp": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [
k
for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions, async_request_openai_chat_completions)
]

View File

@@ -1,78 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import math
import os
import time
from types import TracebackType
from typing import Any
def convert_to_pytorch_benchmark_format(
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
extra_info["tensor_parallel_size"]
)
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any):
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
# Collect time and generate time metrics

View File

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils
from collections.abc import Iterable
import torch
@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
# Compressed B, Metadata, Original A, B
return b_compressed, e, a, b
def make_n_rand_sparse_tensors(
num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
ABs = []
for _ in range(num_tensors):
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
if b_comp is not None:
ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
BComps, Es, As, Bs = zip(*ABs)
return list(BComps), list(Es), list(As), list(Bs)

View File

@@ -1,45 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
class RateLimiter:
"""Token bucket rate limiter implementation"""
def __init__(self, rate_limit):
self.rate_limit = rate_limit # Requests per second
self.num_available_tokens = rate_limit # Available tokens
self.last_refill = time.monotonic() # Last token refill time
self.lock = asyncio.Lock() # Synchronization lock
async def acquire(self):
"""Acquire a token from the rate limiter"""
while True:
async with self.lock:
current_time = time.monotonic()
elapsed = current_time - self.last_refill
# Refill num_available_tokens if more than 1 second has passed
if elapsed > 1.0:
self.num_available_tokens = self.rate_limit
self.last_refill = current_time
# Check if num_available_tokens are available
if self.num_available_tokens > 0:
self.num_available_tokens -= 1
return True
# Calculate wait time if no num_available_tokens available
wait_time = 1.0 - elapsed
await asyncio.sleep(wait_time)
async def __aenter__(self):
"""Enter async context manager - acquire token"""
await self.acquire()
return self
async def __aexit__(self, exc_type, exc_value, traceback):
"""Exit async context manager - no cleanup needed"""
pass

View File

@@ -1,39 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from collections import deque
class RequestQueue:
"""Request queue manager with concurrency control"""
def __init__(self, max_concurrent, max_queue_size):
# Maximum concurrent requests
self.max_concurrent = max_concurrent
self.max_queue_size = max_queue_size # Maximum queue size
# Concurrency control
self.semaphore = asyncio.Semaphore(max_concurrent)
self.queue = deque() # Request queue
self.queue_size = 0 # Current queue size
self.lock = asyncio.Lock() # Sync queue Lock
async def enqueue(self, task):
"""Add a request task to the queue"""
async with self.lock:
if self.queue_size >= self.max_queue_size:
return False
self.queue.append(task)
self.queue_size += 1
return True
async def process(self):
"""Process queued requests using semaphore for concurrency control"""
while True:
if self.queue:
async with self.semaphore, self.lock:
task = self.queue.popleft()
self.queue_size -= 1
await task
await asyncio.sleep(0.01) # Yield control to event loop

View File

@@ -7,7 +7,6 @@ import torch
from torch.nn.parameter import Parameter
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.kernels.linear import (
init_fp8_linear_kernel,
)
@@ -26,10 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from vllm.platforms import current_platform
ACTIVATION_SCHEMES = ["static", "dynamic"]
logger = init_logger(__name__)
class PTPCFp8Config(Fp8Config):
"""Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""

View File

@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso
return w2_packed.size(1) * marlin_tile_size
def marlin_make_workspace(
output_size_per_partition: int, device: torch.device
) -> torch.Tensor:
max_workspace_size = (
output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
) * GPTQ_MARLIN_MAX_PARALLEL
return torch.zeros(
max_workspace_size, dtype=torch.int, device=device, requires_grad=False
)
def marlin_make_workspace_new(
device: torch.device, max_blocks_per_sm: int = 1
) -> torch.Tensor:
@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
)
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
return torch.nn.Parameter(
torch.empty(0, dtype=torch.int, device=device), requires_grad=False
)
def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
return g_idx[g_idx_sort_indices], g_idx_sort_indices

View File

@@ -49,7 +49,6 @@ from .utils import (
)
from .vision import get_vision_encoder_info
EOT = "<|endofturn|>"
IMAGE_TOKEN: str = "<|dummy3|>"
VIDEO_TOKEN: str = "<|_unuse_missing_100270|>"

View File

@@ -72,7 +72,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
POLLING_TIMEOUT_S = 2.5
HANDSHAKE_TIMEOUT_MINS = 5
_R = TypeVar("_R") # Return type for collective_rpc