[Refactor] Remove dead or duplicate func utils or variables (#35318)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"sglang": async_request_openai_completions,
|
||||
"llama.cpp": async_request_openai_completions,
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
k
|
||||
for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||
if v in (async_request_openai_completions, async_request_openai_chat_completions)
|
||||
]
|
||||
|
||||
@@ -1,78 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
from types import TracebackType
|
||||
from typing import Any
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
|
||||
) -> list:
|
||||
"""
|
||||
Save the benchmark results in the format used by PyTorch OSS benchmark with
|
||||
on metric per record
|
||||
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
|
||||
"""
|
||||
records = []
|
||||
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
|
||||
return records
|
||||
|
||||
for name, benchmark_values in metrics.items():
|
||||
record = {
|
||||
"benchmark": {
|
||||
"name": "vLLM benchmark",
|
||||
"extra_info": {
|
||||
"args": vars(args),
|
||||
},
|
||||
},
|
||||
"model": {
|
||||
"name": args.model,
|
||||
},
|
||||
"metric": {
|
||||
"name": name,
|
||||
"benchmark_values": benchmark_values,
|
||||
"extra_info": extra_info,
|
||||
},
|
||||
}
|
||||
|
||||
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
|
||||
# Save tensor_parallel_size parameter if it's part of the metadata
|
||||
if not tp and "tensor_parallel_size" in extra_info:
|
||||
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
|
||||
extra_info["tensor_parallel_size"]
|
||||
)
|
||||
|
||||
records.append(record)
|
||||
|
||||
return records
|
||||
|
||||
|
||||
class InfEncoder(json.JSONEncoder):
|
||||
def clear_inf(self, o: Any):
|
||||
if isinstance(o, dict):
|
||||
return {k: self.clear_inf(v) for k, v in o.items()}
|
||||
elif isinstance(o, list):
|
||||
return [self.clear_inf(v) for v in o]
|
||||
elif isinstance(o, float) and math.isinf(o):
|
||||
return "inf"
|
||||
return o
|
||||
|
||||
def iterencode(self, o: Any, *args, **kwargs) -> Any:
|
||||
return super().iterencode(self.clear_inf(o), *args, **kwargs)
|
||||
|
||||
|
||||
def write_to_json(filename: str, records: list) -> None:
|
||||
with open(filename, "w") as f:
|
||||
json.dump(
|
||||
records,
|
||||
f,
|
||||
cls=InfEncoder,
|
||||
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
|
||||
)
|
||||
|
||||
|
||||
# Collect time and generate time metrics
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Cutlass bench utils
|
||||
from collections.abc import Iterable
|
||||
|
||||
import torch
|
||||
|
||||
@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
|
||||
|
||||
# Compressed B, Metadata, Original A, B
|
||||
return b_compressed, e, a, b
|
||||
|
||||
|
||||
def make_n_rand_sparse_tensors(
|
||||
num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
|
||||
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
|
||||
ABs = []
|
||||
for _ in range(num_tensors):
|
||||
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
|
||||
if b_comp is not None:
|
||||
ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
|
||||
BComps, Es, As, Bs = zip(*ABs)
|
||||
return list(BComps), list(Es), list(As), list(Bs)
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
|
||||
class RateLimiter:
|
||||
"""Token bucket rate limiter implementation"""
|
||||
|
||||
def __init__(self, rate_limit):
|
||||
self.rate_limit = rate_limit # Requests per second
|
||||
self.num_available_tokens = rate_limit # Available tokens
|
||||
self.last_refill = time.monotonic() # Last token refill time
|
||||
self.lock = asyncio.Lock() # Synchronization lock
|
||||
|
||||
async def acquire(self):
|
||||
"""Acquire a token from the rate limiter"""
|
||||
while True:
|
||||
async with self.lock:
|
||||
current_time = time.monotonic()
|
||||
elapsed = current_time - self.last_refill
|
||||
|
||||
# Refill num_available_tokens if more than 1 second has passed
|
||||
if elapsed > 1.0:
|
||||
self.num_available_tokens = self.rate_limit
|
||||
self.last_refill = current_time
|
||||
|
||||
# Check if num_available_tokens are available
|
||||
if self.num_available_tokens > 0:
|
||||
self.num_available_tokens -= 1
|
||||
return True
|
||||
|
||||
# Calculate wait time if no num_available_tokens available
|
||||
wait_time = 1.0 - elapsed
|
||||
await asyncio.sleep(wait_time)
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Enter async context manager - acquire token"""
|
||||
await self.acquire()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_value, traceback):
|
||||
"""Exit async context manager - no cleanup needed"""
|
||||
pass
|
||||
@@ -1,39 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
from collections import deque
|
||||
|
||||
|
||||
class RequestQueue:
|
||||
"""Request queue manager with concurrency control"""
|
||||
|
||||
def __init__(self, max_concurrent, max_queue_size):
|
||||
# Maximum concurrent requests
|
||||
self.max_concurrent = max_concurrent
|
||||
self.max_queue_size = max_queue_size # Maximum queue size
|
||||
# Concurrency control
|
||||
self.semaphore = asyncio.Semaphore(max_concurrent)
|
||||
self.queue = deque() # Request queue
|
||||
self.queue_size = 0 # Current queue size
|
||||
self.lock = asyncio.Lock() # Sync queue Lock
|
||||
|
||||
async def enqueue(self, task):
|
||||
"""Add a request task to the queue"""
|
||||
async with self.lock:
|
||||
if self.queue_size >= self.max_queue_size:
|
||||
return False
|
||||
|
||||
self.queue.append(task)
|
||||
self.queue_size += 1
|
||||
return True
|
||||
|
||||
async def process(self):
|
||||
"""Process queued requests using semaphore for concurrency control"""
|
||||
while True:
|
||||
if self.queue:
|
||||
async with self.semaphore, self.lock:
|
||||
task = self.queue.popleft()
|
||||
self.queue_size -= 1
|
||||
await task
|
||||
await asyncio.sleep(0.01) # Yield control to event loop
|
||||
@@ -7,7 +7,6 @@ import torch
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.kernels.linear import (
|
||||
init_fp8_linear_kernel,
|
||||
)
|
||||
@@ -26,10 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
ACTIVATION_SCHEMES = ["static", "dynamic"]
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class PTPCFp8Config(Fp8Config):
|
||||
"""Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
|
||||
|
||||
@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso
|
||||
return w2_packed.size(1) * marlin_tile_size
|
||||
|
||||
|
||||
def marlin_make_workspace(
|
||||
output_size_per_partition: int, device: torch.device
|
||||
) -> torch.Tensor:
|
||||
max_workspace_size = (
|
||||
output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
|
||||
) * GPTQ_MARLIN_MAX_PARALLEL
|
||||
|
||||
return torch.zeros(
|
||||
max_workspace_size, dtype=torch.int, device=device, requires_grad=False
|
||||
)
|
||||
|
||||
|
||||
def marlin_make_workspace_new(
|
||||
device: torch.device, max_blocks_per_sm: int = 1
|
||||
) -> torch.Tensor:
|
||||
@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
|
||||
)
|
||||
|
||||
|
||||
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
|
||||
return torch.nn.Parameter(
|
||||
torch.empty(0, dtype=torch.int, device=device), requires_grad=False
|
||||
)
|
||||
|
||||
|
||||
def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
|
||||
return g_idx[g_idx_sort_indices], g_idx_sort_indices
|
||||
|
||||
@@ -49,7 +49,6 @@ from .utils import (
|
||||
)
|
||||
from .vision import get_vision_encoder_info
|
||||
|
||||
EOT = "<|endofturn|>"
|
||||
IMAGE_TOKEN: str = "<|dummy3|>"
|
||||
VIDEO_TOKEN: str = "<|_unuse_missing_100270|>"
|
||||
|
||||
|
||||
@@ -72,7 +72,6 @@ from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
POLLING_TIMEOUT_S = 2.5
|
||||
HANDSHAKE_TIMEOUT_MINS = 5
|
||||
|
||||
_R = TypeVar("_R") # Return type for collective_rpc
|
||||
|
||||
Reference in New Issue
Block a user