diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml index 3795b046d..823695a92 100644 --- a/.github/workflows/bc-lint.yml +++ b/.github/workflows/bc-lint.yml @@ -6,6 +6,8 @@ on: - opened - synchronize - reopened + - labeled + - unlabeled jobs: bc_lint: diff --git a/README.md b/README.md index b4a3583c2..0c6e5aa6b 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ vLLM is flexible and easy to use with: - Tensor, pipeline, data and expert parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron +- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. - Prefix caching support - Multi-LoRA support diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index 0650cbf3c..c7a4066b3 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -1,77 +1,675 @@ -#!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import time +from collections.abc import Callable +import matplotlib.pyplot as plt +import numpy as np import torch from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - silu_mul_fp8_quant_deep_gemm, + silu_mul_fp8_quant_deep_gemm_cuda, ) from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used -def benchmark(E, T, H, G=128, runs=50): - current_platform.seed_everything(42) - y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda") - tokens_per_expert = torch.randint( - T // 2, T, size=(E,), dtype=torch.int32, device="cuda" +@triton.jit +def _silu_mul_fp8_quant_deep_gemm( + # Pointers ------------------------------------------------------------ + input_ptr, # 16-bit activations (E, T, 2*H) + y_q_ptr, # fp8 quantized activations (E, T, H) + y_s_ptr, # 16-bit scales (E, T, G) + counts_ptr, # int32 num tokens per expert (E) + # Sizes --------------------------------------------------------------- + H: tl.constexpr, # hidden dimension (per output) + GROUP_SIZE: tl.constexpr, # elements per group (usually 128) + # Strides for input (elements) --------------------------------------- + stride_i_e, + stride_i_t, + stride_i_h, + # Strides for y_q (elements) ----------------------------------------- + stride_yq_e, + stride_yq_t, + stride_yq_h, + # Strides for y_s (elements) ----------------------------------------- + stride_ys_e, + stride_ys_t, + stride_ys_g, + # Stride for counts (elements) + stride_counts_e, + # Numeric params ------------------------------------------------------ + eps: tl.constexpr, + fp8_min: tl.constexpr, + fp8_max: tl.constexpr, + use_ue8m0: tl.constexpr, + # Meta --------------------------------------------------------------- + BLOCK: tl.constexpr, + NUM_STAGES: tl.constexpr, +): + G = H // GROUP_SIZE + + # map program id -> (e, g) + pid = tl.program_id(0) + e = pid // G + g = pid % G + + e = e.to(tl.int64) + g = g.to(tl.int64) + + # number of valid tokens for this expert + n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64) + + cols = tl.arange(0, BLOCK).to(tl.int64) + mask = cols < BLOCK + + base_input_offset = e * stride_i_e + g * GROUP_SIZE * 
stride_i_h + base_gate_offset = base_input_offset + cols * stride_i_h + base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h + base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h + base_ys_offset = e * stride_ys_e + g * stride_ys_g + + for t in tl.range(0, n_tokens, num_stages=NUM_STAGES): + gate = tl.load( + input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0 + ).to(tl.float32) + up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0) + + gate = gate * (1.0 / (1.0 + tl.exp(-gate))) + y = gate * up + + y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max + if use_ue8m0: + y_s = tl.exp2(tl.ceil(tl.log2(y_s))) + + y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty) + + tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask) + tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s) + + +def silu_mul_fp8_quant_deep_gemm_triton( + y: torch.Tensor, # (E, T, 2*H) + tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert + num_parallel_tokens, + group_size: int = 128, + eps: float = 1e-10, +) -> tuple[torch.Tensor, torch.Tensor]: + """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales + + y has shape (E, T, 2*H). The first half of the last dimension is + silu-activated, multiplied by the second half, then quantized into FP8. + + Returns `(y_q, y_s)` where + * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H] + * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) + """ + assert y.ndim == 3, "y must be (E, T, 2*H)" + E, T, H2 = y.shape + assert H2 % 2 == 0, "last dim of y must be even (2*H)" + H = H2 // 2 + G = (H + group_size - 1) // group_size + assert H % group_size == 0, "H must be divisible by group_size" + assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, ( + "tokens_per_expert must be shape (E,)" + ) + tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32) + + # allocate outputs + fp8_dtype = torch.float8_e4m3fn + y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) + + # strides (elements) + stride_i_e, stride_i_t, stride_i_h = y.stride() + stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride() + + # desired scale strides (elements): (T*G, 1, T) + stride_ys_e = T * G + stride_ys_t = 1 + stride_ys_g = T + y_s = torch.empty_strided( + (E, T, G), + (stride_ys_e, stride_ys_t, stride_ys_g), + dtype=torch.float32, + device=y.device, ) + stride_cnt_e = tokens_per_expert.stride()[0] + + # Static grid over experts and H-groups. 
+ # A loop inside the kernel handles the token dim + grid = (E * G,) + + f_info = torch.finfo(fp8_dtype) + fp8_max = f_info.max + fp8_min = f_info.min + + _silu_mul_fp8_quant_deep_gemm[grid]( + y, + y_q, + y_s, + tokens_per_expert, + H, + group_size, + stride_i_e, + stride_i_t, + stride_i_h, + stride_yq_e, + stride_yq_t, + stride_yq_h, + stride_ys_e, + stride_ys_t, + stride_ys_g, + stride_cnt_e, + eps, + fp8_min, + fp8_max, + is_deep_gemm_e8m0_used(), + BLOCK=group_size, + NUM_STAGES=4, + num_warps=1, + ) + + return y_q, y_s + + +# Parse generation strategies +strategies = ["uniform", "max_t", "first_t"] + + +def benchmark( + kernel: Callable, + E: int, + T: int, + H: int, + total_tokens: int, + num_parallel_tokens: int = 64, + G: int = 128, + runs: int = 200, + num_warmups: int = 20, + gen_strategy: str = "default", + iterations_per_run: int = 20, +): + def generate_data(seed_offset=0): + """Generate input data with given seed offset""" + current_platform.seed_everything(42 + seed_offset) + y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() + + if gen_strategy == "uniform": + r = torch.rand(size=(E,), device="cuda") + r /= r.sum() + r *= total_tokens + tokens_per_expert = r.int() + tokens_per_expert = torch.minimum( + tokens_per_expert, + torch.ones((E,), device=r.device, dtype=torch.int) * T, + ) + elif gen_strategy == "max_t": + tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") + tokens_per_expert.fill_(total_tokens / E) + elif gen_strategy == "first_t": + tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda") + tokens_per_expert[0] = min(T, total_tokens) + else: + raise ValueError(f"Unknown generation strategy: {gen_strategy}") + return y, tokens_per_expert + + dataset_count = 4 + # Pre-generate different input matrices for each iteration to avoid cache effects + data_sets = [generate_data(i) for i in range(dataset_count)] + # Warmup - for _ in range(10): - silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G) - torch.cuda.synchronize() + y, tokens_per_expert = data_sets[0] + for _ in range(num_warmups): + kernel( + y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G + ) + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) # Benchmark - torch.cuda.synchronize() - start = time.perf_counter() + latencies: list[float] = [] for _ in range(runs): - silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G) - torch.cuda.synchronize() + torch.cuda.synchronize() - avg_time = (time.perf_counter() - start) / runs * 1000 + start_event.record() + for i in range(iterations_per_run): + y, tokens_per_expert = data_sets[i % dataset_count] + kernel( + y, + tokens_per_expert, + num_parallel_tokens=num_parallel_tokens, + group_size=G, + ) + end_event.record() + end_event.synchronize() - # Calculate actual work done (only count valid tokens) + total_time_ms = start_event.elapsed_time(end_event) + per_iter_time_ms = total_time_ms / iterations_per_run + latencies.append(per_iter_time_ms) + + # Use median instead of average for better outlier handling + median_time_ms = np.median(latencies) + median_time_s = median_time_ms / 1000 + + # Calculate actual work done (using first dataset for consistency) + _, tokens_per_expert = data_sets[0] actual_tokens = tokens_per_expert.sum().item() actual_elements = actual_tokens * H # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops 
ops_per_element = 8 total_ops = actual_elements * ops_per_element - gflops = total_ops / (avg_time / 1000) / 1e9 + gflops = total_ops / median_time_s / 1e9 # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes) input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs output_bytes = actual_tokens * H * 1 # H fp8 outputs scale_bytes = actual_tokens * (H // G) * 4 # scales in float32 total_bytes = input_bytes + output_bytes + scale_bytes - memory_bw = total_bytes / (avg_time / 1000) / 1e9 + memory_bw = total_bytes / median_time_s / 1e9 - return avg_time, gflops, memory_bw + HOPPER_BANDWIDTH_TBPS = 3.35 + return ( + median_time_ms, + gflops, + memory_bw, + (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100, + ) +def create_comparison_plot( + ratio, cuda_times, baseline_times, config_labels, strategy_name, id +): + """Create a comparison plot for a specific generation strategy""" + fig, ax = plt.subplots(1, 1, figsize=(16, 6)) + + # Configure x-axis positions + x = np.arange(len(config_labels)) + width = 0.35 + + # Execution Time plot (lower is better) + ax.bar( + x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue" + ) + ax.bar( + x + width / 2, + baseline_times, + width, + label="Baseline", + alpha=0.8, + color="orange", + ) + + # Add speedup labels over each bar pair + for i in range(len(x)): + speedup = ratio[i] + max_height = max(cuda_times[i], baseline_times[i]) + ax.text( + x[i], + max_height + max_height * 0.02, + f"{speedup:.2f}x", + ha="center", + va="bottom", + fontweight="bold", + fontsize=9, + ) + + ax.set_xlabel("Configuration") + ax.set_ylabel("% Utilization") + ax.set_title( + f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" + ) + ax.set_xticks(x) + ax.set_xticklabels(config_labels, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + return fig, ax + + +def create_combined_plot(all_results): + """Create a combined plot with all strategies in one PNG""" + num_strategies = len(all_results) + fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies)) + + if num_strategies == 1: + axes = [axes] + + for idx, ( + strategy_name, + ratio, + cuda_times, + baseline_times, + config_labels, + ) in enumerate(all_results): + ax = axes[idx] + + # Configure x-axis positions + x = np.arange(len(config_labels)) + width = 0.35 + + # Execution Time plot (lower is better) + ax.bar( + x - width / 2, + cuda_times, + width, + label="CUDA Kernel", + alpha=0.8, + color="blue", + ) + ax.bar( + x + width / 2, + baseline_times, + width, + label="Baseline", + alpha=0.8, + color="orange", + ) + + # Add speedup labels over each bar pair + for i in range(len(x)): + speedup = ratio[i] + max_height = max(cuda_times[i], baseline_times[i]) + ax.text( + x[i], + max_height + max_height * 0.02, + f"{speedup:.2f}x", + ha="center", + va="bottom", + fontweight="bold", + fontsize=9, + ) + + ax.set_xlabel("Configuration") + ax.set_ylabel("% Utilization") + ax.set_title( + f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" + ) + ax.set_xticks(x) + ax.set_xticklabels(config_labels, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + filename = "../../silu_bench/silu_benchmark_combined.png" + plt.savefig(filename, dpi=300, bbox_inches="tight") + plt.show() + + return filename + + +outer_dim = 7168 configs = [ - (8, 32, 1024), - (16, 64, 2048), - (32, 128, 4096), # DeepSeekV3 Configs - (256, 16, 7168), - (256, 32, 7168), - 
(256, 64, 7168), - (256, 128, 7168), - (256, 256, 7168), - (256, 512, 7168), + (8, 1024, 7168), + # DeepSeekV3 Configs + (32, 1024, 7168), + # DeepSeekV3 Configs (256, 1024, 7168), ] -print(f"GPU: {torch.cuda.get_device_name()}") -print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}") -print("-" * 50) +runs = 100 +num_warmups = 20 -for E, T, H in configs: - try: - time_ms, gflops, gbps = benchmark(E, T, H) - print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}") - except Exception: - print(f"E={E:3d},T={T:4d},H={H:4d} FAILED") +strategy_descriptions = { + "uniform": "Uniform Random", + "max_t": "Even Assignment", + "first_t": "experts[0] = T, experts[1:] = 0", +} + +print(f"GPU: {torch.cuda.get_device_name()}") +print(f"Testing strategies: {', '.join(strategies)}") +print(f"Configurations: {len(configs)} configs") + +all_results = [] + +# Run benchmarks for each strategy +for id, strategy in enumerate(strategies): + print(f"\n{'=' * 60}") + print(f"Testing strategy: {strategy_descriptions[strategy]}") + print(f"{'=' * 60}") + + # Collect benchmark data for both algorithms + config_labels = [] + config_x_axis = [] + all_cuda_results = [] + all_baseline_results = [] + all_ratios = [] + + for E, T, H in configs: + total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E] + config_x_axis.append(total_tokens_config) + + cuda_results = [] + baseline_results = [] + ratios = [] + + for total_tokens in total_tokens_config: + config_label = f"E={E},T={T},H={H},TT={total_tokens}" + config_labels.append(config_label) + + # CUDA kernel results + time_ms_cuda, gflops, gbps, perc = benchmark( + silu_mul_fp8_quant_deep_gemm_cuda, + E, + T, + H, + total_tokens, + runs=runs, + num_warmups=num_warmups, + gen_strategy=strategy, + ) + cuda_results.append((time_ms_cuda, gflops, gbps, perc)) + + # Baseline results + time_ms_triton, gflops, gbps, perc = benchmark( + silu_mul_fp8_quant_deep_gemm_triton, + E, + T, + H, + total_tokens, + runs=runs, + num_warmups=num_warmups, + gen_strategy=strategy, + ) + baseline_results.append((time_ms_triton, gflops, gbps, perc)) + ratios.append(time_ms_triton / time_ms_cuda) + + print(f"Completed: {config_label}") + all_cuda_results.append(cuda_results) + all_baseline_results.append(baseline_results) + all_ratios.append(ratios) + + # Store results for combined plotting + all_results.append( + ( + strategy_descriptions[strategy], + all_ratios, + all_cuda_results, + all_baseline_results, + config_labels, + config_x_axis, + ) + ) + + # Print summary table for this strategy + print(f"\nSummary Table - {strategy_descriptions[strategy]}:") + print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}") + print("-" * 60) + + for i, (E, T, H) in enumerate(configs): + speedup = baseline_results[i][0] / cuda_results[i][0] + config_label = f"E={E:3d},T={T:4d},H={H:4d}" + print( + f"{config_label:<20} {cuda_results[i][0]:8.5f} " + f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x" + ) + + +def create_total_tokens_plot(all_results): + num_strategies = len(all_results) + num_configs = len(configs) + + # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage + fig, axs = plt.subplots( + num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies) + ) + + # Add main title to the entire figure + fig.suptitle( + "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)", + fontsize=16, + fontweight="bold", + y=0.98, + ) + + # Handle single strategy case + if num_strategies == 1: + axs 
= axs.reshape(1, -1) + + # Handle single config case + if num_configs == 1: + axs = axs.reshape(-1, 2) + + for strategy_idx, result in enumerate(all_results): + ( + strategy_name, + all_ratios, + all_cuda_results, + all_baseline_results, + config_labels, + config_x_axis, + ) = result + + for config_idx in range(num_configs): + # Speedup plot (left column) + ax_speedup = axs[strategy_idx, config_idx * 2] + # Bandwidth plot (right column) + ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1] + + E, T, H = configs[config_idx] + ratios = all_ratios[config_idx] + total_tokens_values = config_x_axis[config_idx] + + # Extract CUDA and Triton bandwidth percentages + cuda_bandwidth_percentages = [ + result[3] for result in all_cuda_results[config_idx] + ] + triton_bandwidth_percentages = [ + result[3] for result in all_baseline_results[config_idx] + ] + + # Plot speedup ratios vs total tokens (left plot) + ax_speedup.plot( + total_tokens_values, ratios, "bo-", linewidth=3, markersize=8 + ) + ax_speedup.set_title( + f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}", + fontsize=12, + fontweight="bold", + ) + ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) + ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) + ax_speedup.grid(True, alpha=0.3) + + ax_bandwidth.plot( + total_tokens_values, + cuda_bandwidth_percentages, + "ro-", + linewidth=3, + markersize=8, + label="CUDA", + ) + ax_bandwidth.plot( + total_tokens_values, + triton_bandwidth_percentages, + "go-", + linewidth=3, + markersize=8, + label="Triton", + ) + ax_bandwidth.set_title( + f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", + fontsize=12, + fontweight="bold", + ) + ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) + ax_bandwidth.set_ylabel( + "% of Peak Bandwidth", fontweight="bold", fontsize=11 + ) + ax_bandwidth.legend(prop={"weight": "bold"}) + ax_bandwidth.grid(True, alpha=0.3) + + # Format x-axis labels for both plots + for ax in [ax_speedup, ax_bandwidth]: + ax.set_xticks(total_tokens_values) + ax.set_xticklabels( + [ + f"{tt // 1000}K" if tt >= 1000 else str(tt) + for tt in total_tokens_values + ], + fontweight="bold", + ) + # Make tick labels bold + for label in ax.get_xticklabels() + ax.get_yticklabels(): + label.set_fontweight("bold") + + # Add value labels on speedup points + for x, y in zip(total_tokens_values, ratios): + ax_speedup.annotate( + f"{y:.2f}x", + (x, y), + textcoords="offset points", + xytext=(0, 12), + ha="center", + fontsize=10, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7), + ) + + # Add value labels on CUDA bandwidth points + for x, y in zip(total_tokens_values, cuda_bandwidth_percentages): + ax_bandwidth.annotate( + f"{y:.1f}%", + (x, y), + textcoords="offset points", + xytext=(0, 12), + ha="center", + fontsize=9, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3), + ) + + # Add value labels on Triton bandwidth points + for x, y in zip(total_tokens_values, triton_bandwidth_percentages): + ax_bandwidth.annotate( + f"{y:.1f}%", + (x, y), + textcoords="offset points", + xytext=(0, -15), + ha="center", + fontsize=9, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3), + ) + + plt.tight_layout() + plt.subplots_adjust(top=0.93) # Make room for main title + filename = "silu_benchmark_total_tokens.png" + plt.savefig(filename, dpi=300, bbox_inches="tight") + plt.show() + + return filename + + +# Create 
combined plot with all strategies +combined_plot_filename = create_total_tokens_plot(all_results) + +print(f"\n{'=' * 60}") +print("Benchmark Complete!") +print(f"Generated combined plot: {combined_plot_filename}") +print(f"{'=' * 60}") diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index c60f1823b..d1874515c 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -43,6 +43,7 @@ void sm100_cutlass_mla_decode( torch::Tensor const& seq_lens, torch::Tensor const& page_table, torch::Tensor const& workspace, + double sm_scale, int64_t num_kv_splits) { TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode"); } diff --git a/csrc/ops.h b/csrc/ops.h index 2ee5df4ca..c65bf4316 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -122,12 +122,6 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, std::optional key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); -void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, - std::optional key, - int64_t head_size, torch::Tensor& cos_sin_cache, - bool is_neox, int64_t rot_dim, - torch::Tensor& cos_sin_cache_offsets); - void silu_and_mul(torch::Tensor& out, torch::Tensor& input); void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, @@ -139,6 +133,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& input_global_scale); #endif +void silu_mul_fp8_quant_deep_gemm_cuda( + const at::Tensor& input, // (E, T, 2*H) + const at::Tensor& counts, // (E) + at::Tensor& y_q, // (E, T, H) [OUT] + at::Tensor& y_s, // (E, T, H//group_size) [OUT] + int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens); void mul_and_silu(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 266f2a066..b5645b33b 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -99,35 +99,6 @@ __global__ void rotary_embedding_kernel( token_idx, query_stride, key_stride, head_stride); } -template -__global__ void batched_rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or - // [num_tokens] - scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, - // head_size] or [num_tokens, num_heads, - // head_size] - scalar_t* __restrict__ key, // nullptr or - // [batch_size, seq_len, num_kv_heads, - // head_size] or [num_tokens, num_kv_heads, - // head_size] - const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // - // 2] - const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] - const int rot_dim, const int64_t query_stride, const int64_t key_stride, - const int64_t head_stride, const int num_heads, const int num_kv_heads, - const int head_size) { - // Each thread block is responsible for one token. - const int token_idx = blockIdx.x; - int64_t pos = positions[token_idx]; - int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; - const scalar_t* cache_ptr = - cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; - - apply_rotary_embedding( - query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, - token_idx, query_stride, key_stride, head_stride); -} - } // namespace vllm void rotary_embedding( @@ -211,96 +182,3 @@ void rotary_embedding( } }); } - -/* -Batched version of rotary embedding, pack multiple LoRAs together -and process in batched manner. 
-*/ -void batched_rotary_embedding( - torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] - torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or - // [num_tokens, num_heads * head_size] or - // [batch_size, seq_len, num_heads, head_size] or - // [num_tokens, num_heads, head_size] - std::optional - key, // null or - // [batch_size, seq_len, num_kv_heads * head_size] or - // [num_tokens, num_kv_heads * head_size] or - // [batch_size, seq_len, num_heads, head_size] or - // [num_tokens, num_heads, head_size] - int64_t head_size, - torch::Tensor& cos_sin_cache, // [max_position, rot_dim] - bool is_neox, int64_t rot_dim, - torch::Tensor& cos_sin_cache_offsets // [num_tokens] or [batch_size] -) { - // num_tokens = batch_size * seq_len - int64_t num_tokens = cos_sin_cache_offsets.size(0); - TORCH_CHECK( - positions.size(0) == num_tokens || positions.numel() == num_tokens, - "positions must have the same num_tokens or batch_size as " - "cos_sin_cache_offsets"); - - int positions_ndim = positions.dim(); - // Make sure num_tokens dim is consistent across positions, query, and key - TORCH_CHECK( - positions_ndim == 1 || positions_ndim == 2, - "positions must have shape [num_tokens] or [batch_size, seq_len]"); - if (positions_ndim == 1) { - TORCH_CHECK(query.size(0) == positions.size(0) && - (!key.has_value() || key->size(0) == positions.size(0)), - "query, key and positions must have the same number of tokens"); - } - if (positions_ndim == 2) { - TORCH_CHECK( - query.size(0) == positions.size(0) && - (!key.has_value() || key->size(0) == positions.size(0)) && - query.size(1) == positions.size(1) && - (!key.has_value() || key->size(1) == positions.size(1)), - "query, key and positions must have the same batch_size and seq_len"); - } - - // Make sure head_size is valid for query and key - int query_hidden_size = query.numel() / num_tokens; - int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0; - TORCH_CHECK(query_hidden_size % head_size == 0); - TORCH_CHECK(key_hidden_size % head_size == 0); - - // Make sure query and key have concistent number of heads - int num_heads = query_hidden_size / head_size; - int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads; - TORCH_CHECK(num_heads % num_kv_heads == 0); - - int seq_dim_idx = positions_ndim - 1; - int64_t query_stride = query.stride(seq_dim_idx); - int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0; - // Determine head stride: for [*, heads, head_size] use stride of last dim; - // for flat [*, heads*head_size], heads blocks are contiguous of size - // head_size - int query_ndim = query.dim(); - int64_t head_stride = - (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size; - - dim3 grid(num_tokens); - dim3 block(std::min(num_heads * rot_dim / 2, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { - if (is_neox) { - vllm::batched_rotary_embedding_kernel - <<>>( - positions.data_ptr(), query.data_ptr(), - key.has_value() ? key->data_ptr() : nullptr, - cos_sin_cache.data_ptr(), - cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, - key_stride, head_stride, num_heads, num_kv_heads, head_size); - } else { - vllm::batched_rotary_embedding_kernel - <<>>( - positions.data_ptr(), query.data_ptr(), - key.has_value() ? 
key->data_ptr() : nullptr, - cos_sin_cache.data_ptr(), - cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, - key_stride, head_stride, num_heads, num_kv_heads, head_size); - } - }); -} diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 74dcc5dca..699ebb276 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -9,6 +9,26 @@ #include "quantization/w8a8/fp8/common.cuh" +#include + +#ifndef USE_ROCM + #include + #include + #include +#else + #include + #include + #include + +typedef __hip_bfloat162 __nv_bfloat162; +typedef __hip_bfloat16 __nv_bfloat16; +typedef __hip_bfloat16_raw __nv_bfloat16_raw; + +typedef __hip_fp8_e4m3 __nv_fp8_e4m3; +typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3; +#endif + +#include "core/registration.h" namespace vllm { template @@ -87,6 +107,337 @@ __global__ void act_and_mul_quant_kernel( } } } + +__device__ __forceinline__ float silu(float x) { + return (__fdividef(x, (1.f + expf(-x)))); +} + +__device__ __forceinline__ float2 silu2(float2 x) { + return make_float2(silu(x.x), silu(x.y)); +} + +#ifndef USE_ROCM +__device__ __forceinline__ float warp_max(float v) { + static constexpr unsigned FULL_MASK = 0xffffffffu; + for (int offset = 1; offset < WARP_SIZE; offset *= 2) { + v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, offset)); + } + return v; +} + +__device__ __forceinline__ __nv_bfloat16 warp_max(__nv_bfloat16 v) { + static constexpr unsigned FULL_MASK = 0xffffffffu; + for (int offset = 1; offset < WARP_SIZE; offset *= 2) { + v = __hmax(v, __shfl_xor_sync(FULL_MASK, v, offset)); + } + return v; +} +#endif + +template +__device__ __forceinline__ void cp_async4(T* _smem_ptr, const U* _glob_ptr) { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + auto smem_ptr = reinterpret_cast(_smem_ptr); + auto glob_ptr = reinterpret_cast(_glob_ptr); + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); +#else + _smem_ptr[0] = _glob_ptr[0]; +#endif +} + +__device__ __forceinline__ void cp_async_fence() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.commit_group;\n" ::); +#else +#endif +} + +template +__device__ __forceinline__ void cp_async_wait() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); +#else +#endif +} + +template <> +__device__ __forceinline__ void cp_async_wait<0>() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_all;\n" ::); +#else +#endif +} + +__device__ __forceinline__ float clip(float v, float mmin, float mmax) { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + return fminf(mmax, fmaxf(v, mmin)); +#else +#endif +} + +__device__ __forceinline__ __nv_bfloat16 clip(__nv_bfloat16 v, + __nv_bfloat16 mmin, + __nv_bfloat16 mmax) { + return __hmin(mmax, __hmax(v, mmin)); +} + +__device__ __forceinline__ __nv_bfloat162 clip(__nv_bfloat162 v, + __nv_bfloat162 mmin, + __nv_bfloat162 mmax) { + return __hmin2(mmax, __hmax2(v, mmin)); +} + +// We use the following values for fp8 min/max: +// __nv_fp8_e4m3 = (-448, +448) +// __nv_fp8_e4m3uz = (-240.0, +240.0) +// It is currently assumed that only +template +constexpr __nv_bfloat16 get_fp8_max() { + static_assert(std::is_same_v || + std::is_same_v); + if constexpr (std::is_same_v) { + return 
__nv_bfloat16(__nv_bfloat16_raw{.x = 17376}); + } else { + return __nv_bfloat16(__nv_bfloat16_raw{.x = 17264}); + } +} + +template +constexpr __nv_bfloat16 get_fp8_min() { + static_assert(std::is_same_v || + std::is_same_v); + if constexpr (std::is_same_v) { + return __nv_bfloat16(__nv_bfloat16_raw{.x = 50144}); + } else { + return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032}); + } +} +#ifndef USE_ROCM +template +__global__ void silu_mul_fp8_quant_deep_gemm_kernel( + const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q, + float* __restrict__ _y_s, const int32_t* __restrict__ counts, + + // sizes + int H, int G, + + // strides (in elements) + Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e, + Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t, + Idx_t stride_ys_g, Idx_t stride_counts_e) { + static constexpr __nv_bfloat16 fp8_min = get_fp8_min(); + static constexpr __nv_bfloat16 fp8_max = get_fp8_max(); + // We assign EPS with its 16-bit unsigned counterpart to allow constexpr. + static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996}); + + // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t. + static constexpr int32_t BFLOAT16_PER_GROUP = 8; + + // We split the shared memory in half, corresponding to gate and up matrices: + // [...gate_i, ...up_i] where 0 <= i < stages. + static constexpr int32_t S_NUM_128 = + 2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES; + static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE; + static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2; + static constexpr int32_t S_NUM_64 = S_NUM_128 * 2; + __shared__ __int128_t __align__(16) s_buff_128[S_NUM_128]; + + const int32_t tid = threadIdx.x; + const int32_t warp_id = tid / WARP_SIZE; + const int32_t lane_id = tid % WARP_SIZE; + + auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128); + + // block handles one (expert e, group g) + int32_t pid = blockIdx.x; + int32_t e = pid / G; + int32_t g = pid % G; + + const int32_t n_tokens = counts[e * stride_counts_e]; + + if (!n_tokens) { + return; // Exit ASAP. + } + + const Idx_t stride_i_t_128 = stride_i_t / 8u; + + int32_t n_tokens_lower, n_tokens_upper; + + // Each block i iterates over tokens of a slice of n_tokens = + // expert_counts[i], with the size of chunk being + // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of + // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling. + if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) { + // Specialize this, but can be likely fused. + if (blockIdx.y >= NUM_PARALLEL_TOKENS) { + return; + } + n_tokens_lower = blockIdx.y; + n_tokens_upper = blockIdx.y + 1; + } else { + auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS; + auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS; + auto calc_id = [&](int32_t id) { + if (id < residual) { + return min(n_tokens, id * (chunk_size + 1)); + } else { + return min(n_tokens, id * chunk_size + residual); + } + }; + n_tokens_lower = calc_id(blockIdx.y); + n_tokens_upper = calc_id(blockIdx.y + 1); + } + + if (n_tokens_lower >= n_tokens_upper) { + return; + } + + // We do calculations here, using constexpr wherever possible. 
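+  // Each block covers NUM_WARPS consecutive GROUP_SIZE-wide groups of expert
+  // e; base_i / base_ys / base_yq below are the block-level offsets into the
+  // input, scale, and quantized-output tensors.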
+ const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h; + const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g; + const Idx_t base_yq = + e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h; + Idx_t gate_off_128 = (base_i / static_cast(8u)); + auto input_128_ptr = reinterpret_cast(_input); + auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) + + stride_i_t_128 * n_tokens_lower; + auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u; + auto y_s_ptr = + _y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t; + auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE + + stride_yq_t * n_tokens_lower + 4 * lane_id; + int32_t t_load = n_tokens_lower, load_stage_id = 0; + auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT); + auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u; + int32_t stage_offset{}; + + static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2); + static constexpr int32_t LOAD_STAGE_MOD = + NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2); + + // Two halves of all threads in a block conduct global loads for gate and up, + // repsectively. + auto load_and_advance_y_pred = [&] { + if (t_load < n_tokens_upper) { + auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset; + auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset; + + // It is very important that LOAD_STAGE_SIZE is constexpr to avoid + // unnecessary ALU ops. + stage_offset += LOAD_STAGE_SIZE; + stage_offset %= LOAD_STAGE_MOD; + + if (tid < HALF_THREAD_COUNT) { + cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr); + gate_128_ptr += stride_i_t_128; + } else { + cp_async4(s_up_stage_128_staged_ptr, up_128_ptr); + up_128_ptr += stride_i_t_128; + } + ++t_load; + ++load_stage_id; + } + // We fence even if there is nothing to load to simplify pipelining. + cp_async_fence(); + }; + + #pragma unroll + for (int i = 0; i < NUM_STAGES - 1; i++) { + load_and_advance_y_pred(); + } + + __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>( + s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) + + lane_id; + __int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2; + + static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u; + static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES; + + int32_t compute_pipeline_offset_64 = 0; + + for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) { + __nv_bfloat16 y_max_bf16 = EPS; + __nv_bfloat162 results_bf162[2]; + + cp_async_wait(); + __syncthreads(); + + // We double-buffer pipelined loads so that the next load will + // concurrently run with compute without overwrites. + load_and_advance_y_pred(); + + auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64; + auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64; + + // STAGE_SIZE must also be constexpr! + compute_pipeline_offset_64 += STAGE_SIZE; + compute_pipeline_offset_64 %= STAGE_MOD; + + // Each thread loads (gate/up) 2X 4X bfloat16 values into registers. + __int64_t gate64 = *s_gate_compute_64; + __nv_bfloat162* s_gate_compute_32 = + reinterpret_cast<__nv_bfloat162*>(&gate64); + + __int64_t up64 = *s_up_compute_64; + __nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64); + + #pragma unroll + for (int i = 0; i < 2; i++) { + // For silu, we make sure that div is emitted. 
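+        // Convert the packed bf16 pair to float2 and apply silu in fp32
+        // (__fdividef in silu() emits the fast approximate division).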
+ float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i])); + results_bf162[i] = __float22bfloat162_rn(gate); + } + + #pragma unroll + for (int i = 0; i < 2; i++) { + results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]); + } + + auto _y_max2 = + __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1])); + + y_max_bf16 = __hmax(_y_max2.x, _y_max2.y); + + // An entire group is assigned to a single warp, so a simple warp reduce + // is used. + __nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max; + + if constexpr (USE_UE8M0) { + y_s = hexp2(hceil(hlog2(y_s))); + } + + auto inv_y = __float2bfloat16_rn(1.f) / y_s; + + auto y_s2 = make_bfloat162(inv_y, inv_y); + + #pragma unroll + for (int32_t i = 0; i < 2; ++i) { + results_bf162[i] = + clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min), + __bfloat162bfloat162(fp8_max)); + } + + auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]); + *reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4; + y_q_ptr += stride_yq_t; + + if (lane_id == 0) { + *y_s_ptr = y_s; + y_s_ptr += stride_ys_t; + } + } +} +#endif + } // namespace vllm // Launch activation, gating, and quantize kernel. @@ -119,3 +470,117 @@ void silu_and_mul_quant(torch::Tensor& out, // [..., d] TORCH_CHECK(input.size(-1) % 2 == 0); LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); } + +void silu_mul_fp8_quant_deep_gemm_cuda( + const at::Tensor& input, // (E, T, 2*H) + const at::Tensor& counts, // (E) + at::Tensor& y_q, // (E, T, H) [OUT] + at::Tensor& y_s, // (E, T, H//group_size) [OUT] + int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) { +#ifndef USE_ROCM + // This kernel relies heavily on cp.async and fp8 support. + // This kernel currently only supports H % 128 == 0 and assumes a + // fixed GROUP_SIZE of 128. + TORCH_CHECK(input.dtype() == torch::kBFloat16); + TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn || + y_q.dtype() == torch::kFloat8_e4m3fnuz); + TORCH_CHECK(y_s.dtype() == torch::kFloat32); + TORCH_CHECK(input.size(-1) % 256 == 0); + + // Check that num_parallel_tokens is of power of 2 and between 1 and 64. 
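+  // num_parallel_tokens is the y-dimension of the launch grid: each expert's
+  // token range can be split across up to this many blocks.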
+ TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64); + TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1))); + + using Idx_t = int64_t; + + Idx_t E = input.size(0); + Idx_t T = input.size(1); + Idx_t H = input.size(2) / 2; + Idx_t stride_i_e = input.stride(0); + Idx_t stride_i_t = input.stride(1); + Idx_t stride_i_h = input.stride(2); + Idx_t stride_yq_e = y_q.stride(0); + Idx_t stride_yq_t = y_q.stride(1); + Idx_t stride_yq_h = y_q.stride(2); + Idx_t stride_ys_e = y_s.stride(0); + Idx_t stride_ys_t = y_s.stride(1); + Idx_t stride_ys_g = y_s.stride(2); + + Idx_t stride_counts_e = counts.stride(0); + + static constexpr int GROUP_SIZE = 128; + + #define KERNEL_FN \ + if (use_ue8m0) { \ + vllm::silu_mul_fp8_quant_deep_gemm_kernel \ + <<>>( \ + reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \ + (fp8_t*)y_q.data_ptr(), y_s.data_ptr(), \ + reinterpret_cast(counts.data_ptr()), H, G, \ + stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \ + stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \ + stride_counts_e); \ + } else { \ + vllm::silu_mul_fp8_quant_deep_gemm_kernel \ + <<>>( \ + reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \ + (fp8_t*)y_q.data_ptr(), y_s.data_ptr(), \ + reinterpret_cast(counts.data_ptr()), H, G, \ + stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \ + stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \ + stride_counts_e); \ + } + + #define KERNEL_CALL_H \ + if (H % (4 * GROUP_SIZE) == 0) { \ + static constexpr int NUM_WARPS = 4; \ + populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \ + KERNEL_FN \ + } else { \ + static constexpr int NUM_WARPS = 1; \ + populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \ + KERNEL_FN \ + } + + #define KERNEL_CALL_TOP_LEVEL \ + if (num_parallel_tokens == 1) { \ + static constexpr int NUM_PARALLEL_TOKENS = 1; \ + KERNEL_CALL_H \ + } else if (num_parallel_tokens == 2) { \ + static constexpr int NUM_PARALLEL_TOKENS = 2; \ + KERNEL_CALL_H \ + } else if (num_parallel_tokens == 4) { \ + static constexpr int NUM_PARALLEL_TOKENS = 4; \ + KERNEL_CALL_H \ + } else if (num_parallel_tokens == 8) { \ + static constexpr int NUM_PARALLEL_TOKENS = 8; \ + KERNEL_CALL_H \ + } else if (num_parallel_tokens == 16) { \ + static constexpr int NUM_PARALLEL_TOKENS = 16; \ + KERNEL_CALL_H \ + } else if (num_parallel_tokens == 32) { \ + static constexpr int NUM_PARALLEL_TOKENS = 32; \ + KERNEL_CALL_H \ + } else if (num_parallel_tokens == 64) { \ + static constexpr int NUM_PARALLEL_TOKENS = 64; \ + KERNEL_CALL_H \ + } + + Idx_t G; + dim3 block, grid; + auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) { + G = H / Idx_t(group_size * num_warps); + grid = dim3(E * G, _num_parallel_tokens); + block = dim3(num_warps * WARP_SIZE); + }; + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(), + "silu_mul_fp8_quant_deep_gemm_kernel", + [&] { KERNEL_CALL_TOP_LEVEL }); + +#endif +} diff --git a/csrc/quantization/w8a8/fp8/common.cuh b/csrc/quantization/w8a8/fp8/common.cuh index 1aad6330c..7838f211c 100644 --- a/csrc/quantization/w8a8/fp8/common.cuh +++ b/csrc/quantization/w8a8/fp8/common.cuh @@ -5,7 +5,9 @@ #include -#ifdef USE_ROCM +#ifndef USE_ROCM + #include "nvidia/quant_utils.cuh" +#else #include "amd/quant_utils.cuh" #endif @@ -48,7 +50,9 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val, float r = fmaxf(-quant_type_max_v, 
fminf(x, quant_type_max_v)); #ifndef USE_ROCM - return static_cast(r); + // Use hardware cvt instruction for fp8 on nvidia + // Currently only support fp8_type = c10::Float8_e4m3fn + return fp8::vec_conversion(r); #else // Use hardware cvt instruction for fp8 on rocm return fp8::cvt_c10(r); diff --git a/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh index 5a397b3ef..60e023f9a 100644 --- a/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh +++ b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh @@ -12,13 +12,26 @@ namespace vllm { namespace fp8 { #ifdef ENABLE_FP8 - #if 0 // Disable the following code to reduce the binary size. template -__inline__ __device__ Tout -vec_conversion(const Tin &x, const __nv_fp8_interpretation_t fp8_type) { +__inline__ __device__ Tout vec_conversion( + const Tin& x, const __nv_fp8_interpretation_t fp8_type = __NV_E4M3) { return x; } +// float -> c10::Float8_e4m3fn +template <> +__inline__ __device__ c10::Float8_e4m3fn +vec_conversion( + const float& a, const __nv_fp8_interpretation_t fp8_type) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return static_cast(a); + #else + return c10::Float8_e4m3fn(__nv_cvt_float_to_fp8(a, __NV_SATFINITE, fp8_type), + c10::Float8_e4m3fn::from_bits()); + #endif +} + + #if 0 // Disable the following code to reduce the binary size. // fp8 -> half template <> __inline__ __device__ uint16_t vec_conversion( diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 9e4586e8f..81aca7b88 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -32,6 +32,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { #define stride_tag #endif + ops.def( + "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! " + "y_q, Tensor! y_s, int group_size, " + "bool use_ue8m0, int num_parallel_tokens) -> ()"); + ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA, + &silu_mul_fp8_quant_deep_gemm_cuda); + ops.def("weak_ref_tensor(Tensor input) -> Tensor"); ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); @@ -214,16 +221,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor cos_sin_cache, bool is_neox) -> ()"); ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding); - // Apply GPT-NeoX or GPT-J style rotary embedding to query and key - // (supports multiple loras). - ops.def( - "batched_rotary_embedding(Tensor positions, Tensor! query," - " Tensor!? key, int head_size," - " Tensor cos_sin_cache, bool is_neox," - " int rot_dim," - " Tensor cos_sin_cache_offsets) -> ()"); - ops.impl("batched_rotary_embedding", torch::kCUDA, &batched_rotary_embedding); - // Quantization ops #ifndef USE_ROCM // Quantized GEMM for AWQ. diff --git a/docs/README.md b/docs/README.md index 683e1d375..ae95717de 100644 --- a/docs/README.md +++ b/docs/README.md @@ -56,7 +56,7 @@ vLLM is flexible and easy to use with: - Tensor, pipeline, data and expert parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. 
- Prefix caching support - Multi-LoRA support diff --git a/docs/examples/README.md b/docs/examples/README.md index 3cf93027f..94f5efc92 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -2,6 +2,6 @@ vLLM's examples are split into three categories: -- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference) -- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving) -- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others) +- If you are using vLLM from within Python code, see the *Offline Inference* section. +- If you are using vLLM from an HTTP application or client, see the *Online Serving* section. +- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the *Others* section. diff --git a/docs/features/README.md b/docs/features/README.md index de23cd0a9..d8e26ec02 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -76,6 +76,3 @@ th:not(:first-child) { | multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ | | best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | - -!!! note - Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 77baa27c7..7fb033723 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -45,6 +45,32 @@ When using multi-modal inputs, vLLM normally hashes each media item by content t print(o.outputs[0].text) ``` +Using UUIDs, you can also skip sending media data entirely if you expect cache hits for respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache. + +??? code + + ```python + from vllm import LLM + from PIL import Image + + # Qwen2.5-VL example with two images + llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct") + + prompt = "USER: \nDescribe the differences.\nASSISTANT:" + img_b = Image.open("/path/to/b.jpg") + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": [None, img_b]}, + # Since img_a is expected to be cached, we can skip sending the actual + # image entirely. + "multi_modal_uuids": {"image": ["sku-1234-a", None]}, + }) + + for o in outputs: + print(o.outputs[0].text) + ``` + !!! warning If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored. @@ -755,6 +781,39 @@ The following example demonstrates how to pass image embeddings to the OpenAI se ) ``` +For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this: + + ```python + # Image/video/audio URL: + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + + # image_embeds + { + "type": "image_embeds", + "image_embeds": None, + "uuid": image_uuid + }, + + # input_audio: + { + "type": "input_audio", + "input_audio": None, + "uuid": audio_uuid + }, + + # PIL Image: + { + "type": "image_pil", + "image_pil": None + "uuid": image_uuid + } + + ``` + !!! note Only one message can contain `{"type": "image_embeds"}`. 
If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 4605ba778..4c8377871 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -43,19 +43,19 @@ th:not(:first-child) { } -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU | -|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | -| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | Google TPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. 
diff --git a/docs/getting_started/installation/.nav.yml b/docs/getting_started/installation/.nav.yml index d4a727c92..ba1f8099a 100644 --- a/docs/getting_started/installation/.nav.yml +++ b/docs/getting_started/installation/.nav.yml @@ -3,5 +3,3 @@ nav: - gpu.md - cpu.md - google_tpu.md - - intel_gaudi.md - - aws_neuron.md diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 8a658b7a9..5e57d23f4 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -12,7 +12,6 @@ vLLM supports the following hardware platforms: - [Apple silicon](cpu.md#apple-silicon) - [IBM Z (S390X)](cpu.md#ibm-z-s390x) - [Google TPU](google_tpu.md) -- [AWS Neuron](aws_neuron.md) ## Hardware Plugins diff --git a/docs/getting_started/installation/aws_neuron.md b/docs/getting_started/installation/aws_neuron.md deleted file mode 100644 index ff2500f03..000000000 --- a/docs/getting_started/installation/aws_neuron.md +++ /dev/null @@ -1,147 +0,0 @@ -# AWS Neuron - -[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and -generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2, -and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores. -This describes how to set up your environment to run vLLM on Neuron. - -!!! warning - There are no pre-built wheels or images for this device, so you must build vLLM from source. - -## Requirements - -- OS: Linux -- Python: 3.9 or newer -- Pytorch 2.5/2.6 -- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips) -- AWS Neuron SDK 2.23 - -## Configure a new environment - -### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies - -The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this -[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image). - -- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance -- Once inside your instance, activate the pre-installed virtual environment for inference by running - -```bash -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -``` - -Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html) -for alternative setup instructions including using Docker and manually installing dependencies. - -!!! note - NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) - library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html). - -## Set up using Python - -### Pre-built wheels - -Currently, there are no pre-built Neuron wheels. 
- -### Build wheel from source - -To build and install vLLM from source, run: - -```bash -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -U -r requirements/neuron.txt -VLLM_TARGET_DEVICE="neuron" pip install -e . -``` - -AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at -, which contains several features in addition to what's -available on vLLM V0. Please utilize the AWS Fork for the following features: - -- Llama-3.2 multi-modal support -- Multi-node distributed inference - -Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) - for more details and usage examples. - -To install the AWS Neuron fork, run the following: - -```bash -git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git -cd upstreaming-to-vllm -pip install -r requirements/neuron.txt -VLLM_TARGET_DEVICE="neuron" pip install -e . -``` - -Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested. - -## Set up using Docker - -### Pre-built images - -Currently, there are no pre-built Neuron images. - -### Build image from source - -See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. - -Make sure to use in place of the default Dockerfile. - -## Extra information - -[](){ #feature-support-through-nxd-inference-backend } - -### Feature support through NxD Inference backend - -The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend -to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most -[features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration. - -To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override -as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include - -```python -override_neuron_config={ - "enable_bucketing":False, -} -``` - -or when launching vLLM from the CLI, pass - -```bash ---override-neuron-config "{\"enable_bucketing\":false}" -``` - -Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts -(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads. - -### Known limitations - -- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this - [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility) - for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI. -- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. 
It is recommended to follow this - [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html) - to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM. -- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at - runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py) -- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed - to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature. -- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer - to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node) - to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main. -- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches - max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt - to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support - for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is - implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic. - -### Environment variables - -- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid - compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the - artifacts under `neuron-compiled-artifacts/{unique_hash}/` subdirectory in the model path. If this environment variable is set, - but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts - under this specified path. -- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend). -- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend). diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index db3dd2c25..6295a2aa8 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -389,6 +389,7 @@ th { | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. 
| | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index 9776f4fe3..0093b63b0 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -42,7 +42,7 @@ def main(): llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct" # Set `enforce_eager=True` to avoid ahead-of-time compilation. - # In real workloads, `enforace_eager` should be `False`. + # In real workloads, `enforce_eager` should be `False`. llm = LLM(**llm_args) outputs = llm.generate(prompts, sampling_params) print("-" * 50) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index b104113b8..4b75eb19f 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1764,6 +1764,7 @@ def apply_image_repeat( probs = [1.0 - image_repeat_prob, image_repeat_prob] inputs = [] + inputs_with_empty_media = [] cur_image = data for i in range(num_prompts): if image_repeat_prob is not None: @@ -1774,14 +1775,25 @@ def apply_image_repeat( new_val = (i // 256 // 256, i // 256, i % 256) cur_image.putpixel((0, 0), new_val) + uuid = "uuid_{}".format(i) + inputs.append( { "prompt": prompts[i % len(prompts)], "multi_modal_data": {modality: cur_image}, + "multi_modal_uuids": {modality: uuid}, } ) - return inputs + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) + + return inputs, inputs_with_empty_media @contextmanager @@ -1860,6 +1872,13 @@ def parse_args(): help="If True, then use different prompt (with the same multi-modal " "data) for each request.", ) + + parser.add_argument( + "--verify-mm-cache-hit-with-uuids", + action="store_true", + help="If True, will send all requests in a second batch with empty mm " + "data to verify cache hits with UUIDs.", + ) return parser.parse_args() @@ -1903,26 +1922,48 @@ def main(args): assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference + uuid = "uuid_0" inputs = { "prompt": prompts[0], "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + inputs_with_empty_media = { + "prompt": prompts[0], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, } else: # Batch inference if args.image_repeat_prob is not None: # Repeat images with specified probability of "image_repeat_prob" - inputs = apply_image_repeat( - args.image_repeat_prob, args.num_prompts, data, prompts, modality + inputs, inputs_with_empty_media = apply_image_repeat( + args.image_repeat_prob, + args.num_prompts, + data, + prompts, + modality, ) else: # Use the same image for all prompts - inputs = [ - { - "prompt": prompts[i % len(prompts)], - "multi_modal_data": {modality: data}, - } - for i in range(args.num_prompts) - ] + inputs = [] + inputs_with_empty_media = [] + for i in range(args.num_prompts): + uuid = "uuid_{}".format(i) + inputs.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + ) + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) # Add LoRA request if applicable lora_request = ( @@ -1942,6 +1983,26 @@ def main(args): print(generated_text) print("-" * 50) + if 
args.verify_mm_cache_hit_with_uuids: + try: + # Verify cache hits with UUIDs + print( + "Sending a second batch of requests with empty media" + " and matching UUIDs." + ) + outputs = llm.generate( + inputs_with_empty_media, + sampling_params=sampling_params, + lora_request=lora_request, + ) + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + except Exception as e: + print(f"Failed to verify cache hits with UUIDs. Error: {e}") + if __name__ == "__main__": args = parse_args() diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index a3b09cc81..fba18f197 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -62,6 +62,8 @@ def _fix_prompt_embed_outputs( @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("async_scheduling", [True, False]) +@pytest.mark.parametrize("model_executor", ["uni", "mp"]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) def test_models( monkeypatch: pytest.MonkeyPatch, @@ -70,6 +72,8 @@ def test_models( backend: str, max_tokens: int, enforce_eager: bool, + async_scheduling: bool, + model_executor: str, enable_prompt_embeds: bool, ) -> None: @@ -77,6 +81,12 @@ def test_models( "VLLM_USE_V1") and envs.VLLM_USE_V1: pytest.skip("enable_prompt_embeds is not supported in v1.") + if not envs.VLLM_USE_V1: + if async_scheduling: + pytest.skip("async_scheduling only supported in v1.") + if model_executor != "uni": + pytest.skip("only test uniproc executor for v0.") + if backend == "XFORMERS" and model == "google/gemma-2-2b-it": pytest.skip( f"{backend} does not support gemma2 with full context length.") @@ -98,11 +108,15 @@ def test_models( prompt_embeds = hf_model.get_prompt_embeddings( example_prompts) - with VllmRunner(model, - max_model_len=8192, - enforce_eager=enforce_eager, - enable_prompt_embeds=enable_prompt_embeds, - gpu_memory_utilization=0.7) as vllm_model: + with VllmRunner( + model, + max_model_len=8192, + enforce_eager=enforce_eager, + enable_prompt_embeds=enable_prompt_embeds, + gpu_memory_utilization=0.7, + async_scheduling=async_scheduling, + distributed_executor_backend=model_executor, + ) as vllm_model: if enable_prompt_embeds: vllm_outputs = vllm_model.generate_greedy( prompt_embeds, max_tokens) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 72819f31d..a324e8666 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -522,6 +522,71 @@ async def test_completions_with_image_with_uuid( assert isinstance(chat_completion.choices[0].message.content, str) assert len(chat_completion.choices[0].message.content) > 0 + # Second request, with empty image but the same uuid. + chat_completion_with_empty_image = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": {}, + "uuid": image_url + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion_with_empty_image.choices[ + 0].message.content is not None + assert isinstance( + chat_completion_with_empty_image.choices[0].message.content, str) + assert len( + chat_completion_with_empty_image.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_completions_with_empty_image_with_uuid_without_cache_hit( + client: openai.AsyncOpenAI, + model_name: str, +): + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": {}, + "uuid": "uuid_not_previously_seen" + }, + ], + }, + ], + model=model_name, + ) + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5149ca346..dd33f5c8c 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -79,6 +79,28 @@ def phi3v_tokenizer(): ) +@pytest.fixture(scope="function") +def qwen2_audio_model_config(): + return ModelConfig( + QWEN2AUDIO_MODEL_ID, + runner="generate", + trust_remote_code=True, + limit_mm_per_prompt={ + "audio": 1, + }, + ) + + +@pytest.fixture(scope="module") +def qwen2_audio_tokenizer(): + return TokenizerGroup( + tokenizer_id=QWEN2AUDIO_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + + @pytest.fixture(scope="function") def qwen25omni_model_config_mm_interleaved(): return ModelConfig( @@ -169,6 +191,7 @@ def audio_url(): def _assert_mm_data_is_image_input( mm_data: Optional[MultiModalDataDict], image_count: int, + skipped_image_indices: Optional[list] = None, ) -> None: assert mm_data is not None assert set(mm_data.keys()) == {"image"} @@ -177,6 +200,9 @@ def _assert_mm_data_is_image_input( assert image_data is not None assert isinstance(image_data, list) and len(image_data) == image_count + if skipped_image_indices is not None: + for i in skipped_image_indices: + assert image_data[i] is None def _assert_mm_uuids( @@ -205,8 +231,10 @@ MultiModalDataCounts = Mapping[ModalityType, int] def _assert_mm_data_inputs( - mm_data: Optional[MultiModalDataDict], - data_count: MultiModalDataCounts, + mm_data: Optional[MultiModalDataDict], + data_count: MultiModalDataCounts, + skipped_media_indices: Optional[dict[ + str, list]] = None, # modality -> list[int] ) -> None: assert mm_data is not None assert set(data_count.keys()) == (set(mm_data.keys())) @@ -216,6 +244,13 @@ def _assert_mm_data_inputs( assert modality_data is not None assert isinstance(modality_data, list) and len(modality_data) == n + if skipped_media_indices is not None: + skipped_media_indices_for_modality = skipped_media_indices.get( + modality) + assert skipped_media_indices_for_modality is not None + for i in skipped_media_indices_for_modality: + assert modality_data[i] is None + def test_parse_chat_messages_single_image( phi3v_model_config, @@ -289,6 +324,41 @@ def test_parse_chat_messages_single_image_with_uuid( _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) +def test_parse_chat_messages_single_empty_image_with_uuid( + 
phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + def test_parse_chat_messages_single_image_with_bad_uuid_format( phi3v_model_config, phi3v_tokenizer, @@ -375,6 +445,96 @@ def test_parse_chat_messages_multiple_images_with_uuids( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) +def test_parse_chat_messages_multiple_empty_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +def test_parse_chat_messages_mixed_empty_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + @pytest.mark.asyncio async def test_parse_chat_messages_single_image_with_uuid_async( phi3v_model_config, @@ -413,6 +573,44 @@ async def test_parse_chat_messages_single_image_with_uuid_async( _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) +@pytest.mark.asyncio +async def test_parse_chat_messages_empty_image_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" 
+ }] + _assert_mm_data_is_image_input(await mm_future, + 1, + skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_uuids_async( phi3v_model_config, @@ -460,6 +658,53 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid1, + }, + { + "type": "image_pil", + "image_pil": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, + 2, + skipped_image_indices=[0, 1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( phi3v_model_config, @@ -653,6 +898,114 @@ def test_parse_chat_messages_multiple_images( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) +def test_parse_chat_messages_empty_pil_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_pil", + "image_pil": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + +def test_parse_chat_messages_empty_image_embeds_with_uuid( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + assert mm_data is not None + assert "image" in mm_data + assert mm_data["image"] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + mm_data = await mm_future + assert mm_data is not None + assert "image" in mm_data + assert mm_data["image"] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_async( phi3v_model_config, @@ -1636,6 +1989,118 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl expected_uuids=["audio_123"]) +def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": None, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": None, + "uuid": "audio_123", + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": None, + "uuid": "image_123", + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": None, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, { + "image": 2, + "video": 1, + "audio": 1 + }, + skipped_media_indices={ + "image": [0, 1], + "video": [0], + "audio": [0] + }) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", "image_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=["audio_123"]) + + def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer, @@ -2355,3 +2820,82 @@ def test_apply_mistral_chat_template_thinking_chunk(): r"[INST]Thanks, what is 3+3?[/INST]") assert string_tokens == expected_tokens + + +def test_parse_chat_messages_single_empty_audio_with_uuid( + qwen2_audio_model_config, + qwen2_audio_tokenizer, +): + audio_uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + { + "type": "text", + "text": "What does the audio say?" + }, + ], + }], + qwen2_audio_model_config, + qwen2_audio_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" 
+ }] + _assert_mm_data_inputs(mm_data, {"audio": 1}) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=[audio_uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_single_empty_audio_with_uuid_async( + qwen2_audio_model_config, + qwen2_audio_tokenizer, +): + audio_uuid = "abcd" + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + { + "type": "text", + "text": "What does the audio say?" + }, + ], + }], + qwen2_audio_model_config, + qwen2_audio_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" + }] + _assert_mm_data_inputs(await mm_future, {"audio": 1}) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=[audio_uuid]) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 3c2aaabac..4d969cf99 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -22,7 +22,10 @@ def clear_cache(): # Define MLA and non-MLA backends separately DEVICE_MLA_BACKENDS = { - "cuda": ["TRITON_MLA", "FLASHMLA", "FLASH_ATTN_MLA", "CUTLASS_MLA"], + "cuda": [ + "TRITON_MLA", "FLASHMLA", "FLASHINFER_MLA", "FLASH_ATTN_MLA", + "CUTLASS_MLA" + ], "hip": ["TRITON_MLA", "ROCM_AITER_MLA"], "cpu": [], } @@ -90,8 +93,8 @@ def test_env( with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - block_size, False) + backend = get_attn_backend(16, torch.float16, None, block_size, + False) assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "hip": @@ -109,7 +112,7 @@ def test_env( with pytest.raises(ValueError) as exc_info: get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -120,7 +123,7 @@ def test_env( with pytest.raises(ValueError) as exc_info: get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -130,7 +133,7 @@ def test_env( # Valid backend-block_size combination backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -139,7 +142,7 @@ def test_env( else: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -153,6 +156,8 @@ def test_env( # CUDA MLA backend logic: # - CUTLASS_MLA: only supported with block_size == 128 # and Blackwell GPUs (SM 10.0), V1 only + # - FLASHINFER_MLA: only supported on Blackwell GPUs + # (SM 10.0+), V1 only # - FLASHMLA: only supported with block_size == 64 # - FLASH_ATTN_MLA: V1 only # - TRITON_MLA: fallback for other cases @@ -169,12 +174,31 @@ def test_env( else: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) expected = "CUTLASS_MLA_VLLM_V1" assert backend.get_name() == expected + elif name == "FLASHINFER_MLA": + if not use_v1: + # FlashInfer MLA only supported on V1 engine + pytest.skip( + "FlashInfer MLA only supported on V1 engine") + elif block_size not in [32, 64]: + # FlashInfer MLA only supports block_size 32 or 64 + pytest.skip( + "FlashInfer MLA only supports block_size 32 " + "or 64") + else: + backend = get_attn_backend(16, + torch.float16, + None, + block_size, + False, + use_mla=use_mla) + 
expected = "FLASHINFER_MLA" + assert backend.get_name() == expected elif name == "FLASHMLA": if block_size != 64: # FlashMLA only supports block_size == 64 @@ -189,7 +213,7 @@ def test_env( else: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -204,7 +228,7 @@ def test_env( else: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -214,7 +238,7 @@ def test_env( # TRITON_MLA or other fallback backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -224,7 +248,7 @@ def test_env( elif name == "FLASHINFER": backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -233,7 +257,7 @@ def test_env( else: backend = get_attn_backend(32, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -243,7 +267,7 @@ def test_env( if use_v1: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -269,15 +293,13 @@ def test_fp32_fallback( with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float32, torch.float32, - 16, False) + backend = get_attn_backend(16, torch.float32, None, 16, False) assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "cuda": with patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = get_attn_backend(16, torch.float32, torch.float32, - 16, False) + backend = get_attn_backend(16, torch.float32, None, 16, False) assert (backend.get_name() == "FLEX_ATTENTION" if use_v1 else "XFORMERS") @@ -331,7 +353,7 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): assert backend.get_name() != STR_FLASH_ATTN_VAL # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + backend = get_attn_backend(16, torch.float16, None, 16, True) assert backend.get_name() != STR_FLASH_ATTN_VAL diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index ab6f1ccf8..bf9b1d9b4 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from itertools import accumulate, product +from itertools import product from typing import Callable, Optional import pytest @@ -111,151 +111,6 @@ def test_rotary_embedding( "expected returned key to be None" -@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) -@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN) -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("seq_len", SEQ_LENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_key", USE_KEY) -@torch.inference_mode() -def test_batched_rotary_embedding( - is_neox_style: bool, - tensor_shape_fn: Callable[[int, int, int, int], tuple[int]], - batch_size: int, - seq_len: int, - num_heads: int, - head_size: int, - rotary_dim: Optional[int], - dtype: torch.dtype, - seed: int, - device: str, - use_key: bool, - max_position: int = 8192, - base: float = 
10000, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - if rotary_dim is None: - rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "rope_type": "linear", - "factor": (1, ) - }) - rope = rope.to(dtype=dtype, device=torch.get_default_device()) - - positions = torch.randint(0, max_position, (batch_size, seq_len)) - query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size) - query = torch.randn(query_shape, dtype=dtype) - key = torch.randn_like(query) if use_key else None - - # slice tensor if required, noop otherwise - query = query[..., :head_size] - key = key[..., :head_size] if use_key else None - - # NOTE(woosuk): The reference implementation should be executed first - # because the custom kernel is in-place. - ref_query, ref_key = rope.forward_native(positions, query, key) - out_query, out_key = rope.forward(positions, - query, - key, - offsets=torch.zeros(batch_size * seq_len, - dtype=torch.long, - device=device)) - # Compare the results. - torch.testing.assert_close(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) - if use_key: - torch.testing.assert_close(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) - else: - assert ref_key is None and out_key is None, \ - "expected returned key to be None" - - -@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("seq_len", SEQ_LENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_key", USE_KEY) -@torch.inference_mode() -def test_batched_rotary_embedding_multi_lora( - is_neox_style: bool, - batch_size: int, - seq_len: int, - num_heads: int, - head_size: int, - rotary_dim: Optional[int], - dtype: torch.dtype, - seed: int, - device: str, - use_key: bool, - max_position: int = 8192, - base: float = 10000, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - if rotary_dim is None: - rotary_dim = head_size - scaling_factors: list[int] = [1, 2, 4] - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "rope_type": "linear", - "factor": tuple(scaling_factors) - }) - rope = rope.to(dtype=dtype, device=torch.get_default_device()) - - positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype) - key = torch.randn_like(query) if use_key else None - - offset_map = torch.tensor( - list( - accumulate([0] + [ - max_position * scaling_factor * 2 - for scaling_factor in scaling_factors[:-1] - ]))) - query_types = torch.randint(0, - len(scaling_factors), (batch_size, seq_len), - device=device) - query_offsets = offset_map[query_types] - - # NOTE(woosuk): The reference implementation should be executed first - # because the custom kernel is in-place. - ref_query, ref_key = rope.forward_native(positions, query, key, - query_offsets) - out_query, out_key = rope.forward(positions, query, key, - query_offsets.flatten()) - # Compare the results. 
- torch.testing.assert_close(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) - if use_key: - torch.testing.assert_close(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) - else: - assert ref_key is None and out_key is None, \ - "expected returned key to be None" - - @torch.inference_mode() def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index d1fd960bf..5857dd5ba 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -16,20 +16,14 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding def rotary_embedding_opcheck(rot, positions: torch.Tensor, query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None): + key: Optional[torch.Tensor] = None): cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype) - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. - if offsets is not None: - opcheck(torch.ops._C.batched_rotary_embedding, - (positions, query, key, rot.head_size, cos_sin_cache, - rot.is_neox_style, rot.rotary_dim, offsets)) - else: - opcheck(torch.ops._C.rotary_embedding, - (positions, query, key, rot.head_size, cos_sin_cache, - rot.is_neox_style)) + # ops.rotary_embedding() is an in-place operation + # that updates the query and key tensors. + opcheck(torch.ops._C.rotary_embedding, + (positions, query, key, rot.head_size, cos_sin_cache, + rot.is_neox_style)) @pytest.mark.parametrize("device", ["cuda"]) @@ -65,10 +59,6 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position, key = key[..., :head_size] if use_key else None rotary_embedding_opcheck(rot, positions, query, key) - offsets = torch.zeros(batch_size * seq_len, - device=device, - dtype=torch.long) - rotary_embedding_opcheck(rot, positions, query, key, offsets) # if we have a contiguous head stride, test the alternate # [..., num_heads * head_dim] shape/layout diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index 9fd72ee15..a3b8f0763 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -771,11 +771,11 @@ def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe( w13_ref = dequant_mxfp4_batches( w13_q.view(torch.uint8), w13_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape( - num_experts, 2 * intermediate_size, hidden_size) + num_experts, 2 * intermediate_size, hidden_size).to(device) w2_ref = dequant_mxfp4_batches( w2_q.view(torch.uint8), w2_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape( - num_experts, hidden_size, intermediate_size) + num_experts, hidden_size, intermediate_size).to(device) # Quantize activations for SM100 path and dequantize for reference hidden_states_q, hidden_states_sf = mxfp8_quantize(hidden_states, True, 32) diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 5a0379dfb..383b5ebfb 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -5,28 +5,52 @@ import pytest import torch from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - silu_mul_fp8_quant_deep_gemm) + silu_mul_fp8_quant_deep_gemm_cuda) from vllm.platforms import current_platform
+from vllm.utils import cdiv + +fp8_dtype = torch.float8_e4m3fn -# (E, T, H, group_size, seed) CASES = [ - (1, 1, 128, 64, 0), - (1, 4, 128, 128, 0), - (2, 4, 256, 128, 0), - (32, 64, 256, 128, 0), - (17, 31, 768, 128, 0), + (1, 1, 128, fp8_dtype), + (1, 4, 128, fp8_dtype), + (2, 4, 256, fp8_dtype), + (32, 64, 256, fp8_dtype), + (17, 31, 768, fp8_dtype), + (1, 1, 128 * 1, fp8_dtype), + (1, 1, 128 * 2, fp8_dtype), + (1, 1, 128 * 3, fp8_dtype), + (1, 1, 128 * 4, fp8_dtype), + (8, 16, 128 * 1, fp8_dtype), + (8, 16, 128 * 2, fp8_dtype), + (8, 16, 128 * 3, fp8_dtype), + (8, 16, 128 * 4, fp8_dtype), + (8, 64, 7168, fp8_dtype), + (8, 128, 7168, fp8_dtype), + (8, 256, 7168, fp8_dtype), + (8, 512, 7168, fp8_dtype), + (8, 1024, 7168, fp8_dtype), + (256, 8, 7168, fp8_dtype), + (256, 16, 7168, fp8_dtype), + (256, 32, 7168, fp8_dtype), + (256, 64, 7168, fp8_dtype), + + # Only add a few fnuz tests to help with long CI times. + (8, 512, 7168, torch.float8_e4m3fnuz), + (8, 1024, 7168, torch.float8_e4m3fnuz), ] -@pytest.mark.parametrize("E,T,H,group_size,seed", CASES) +@pytest.mark.parametrize("E,T,H,fp8_type", CASES) @torch.inference_mode() -def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed): - current_platform.seed_everything(seed) +def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type): + group_size = 128 + current_platform.seed_everything(42) # Input tensor of shape (E, T, 2*H) y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda") tokens_per_expert = torch.randint( - low=0, + low=T // 2, high=T, size=(E, ), dtype=torch.int32, @@ -34,45 +58,59 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed): ) # Run the Triton kernel - y_q, y_s = silu_mul_fp8_quant_deep_gemm(y, - tokens_per_expert, - group_size=group_size, - eps=1e-10) + y_q, y_s = silu_mul_fp8_quant_deep_gemm_cuda(y, + tokens_per_expert, + group_size=group_size) - # Reference implementation - fp8_info = torch.finfo(torch.float8_e4m3fn) + torch.cuda.synchronize() + fp8_info = torch.finfo(fp8_dtype) fp8_max = fp8_info.max fp8_min = fp8_info.min eps = 1e-10 - # Compute silu activation and elementwise multiplication - y1 = y[..., :H] + y1 = y[..., :H].float() y2 = y[..., H:] silu_x = y1 * torch.sigmoid(y1) merged = silu_x * y2 - # Compute reference scales and quantized output, skipping padded tokens for e in range(E): nt = tokens_per_expert[e].item() - ref_s = torch.empty((T, H // group_size), + ref_s = torch.empty((T, cdiv(H, group_size)), dtype=torch.float32, device="cuda") - ref_q = torch.empty((T, H), dtype=torch.float8_e4m3fn, device="cuda") + ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda") + for t in range(nt): - data = merged[e, t] - data_grp = data.view(H // group_size, group_size) - amax = data_grp.abs().amax(dim=1).clamp(min=eps) - scale = amax / fp8_max + data = merged[e, t].float() + ref_q_row = torch.empty_like(data) - scaled = data / scale.repeat_interleave(group_size) - clamped = scaled.clamp(fp8_min, fp8_max) - q = clamped.to(torch.float8_e4m3fn) + # process full groups + n_full_groups = H // group_size + if n_full_groups > 0: + data_grp = data[:n_full_groups * group_size].view( + n_full_groups, group_size) + amax = data_grp.abs().amax(dim=1).clamp(min=eps) + scale = amax / fp8_max + scaled = data[:n_full_groups * + group_size] / scale.repeat_interleave(group_size) + ref_q_row[:n_full_groups * group_size] = scaled.clamp( + fp8_min, fp8_max).to(fp8_dtype) + ref_s[t, :n_full_groups] = scale - ref_s[t] = scale - ref_q[t] = q + # process remainder group + rem = H % group_size + if 
rem > 0: + data_rem = data[-rem:] + amax = data_rem.abs().amax().clamp(min=eps) + scale = amax / fp8_max + scaled = data_rem / scale + ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype) + ref_s[t, -1] = scale - y_se = y_s[e] - y_qe = y_q[e] + ref_q[t] = ref_q_row + + y_se = y_s[e].float() + y_qe = y_q[e].float() torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2) torch.testing.assert_close( diff --git a/tests/models/registry.py b/tests/models/registry.py index 0c77ec5ef..b268bf12a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -301,6 +301,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"), + "Olmo3ForCausalLM": _HfExamplesInfo("shanearora/2025-sep-a-base-model"), "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), "OPTForCausalLM": _HfExamplesInfo("facebook/opt-125m", {"1b": "facebook/opt-iml-max-1.3b"}), diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 1ae8b91c3..0b7e103be 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -178,6 +178,7 @@ class MockAttentionLayer: self._k_scale = torch.tensor(1.0, device=device) self._v_scale = torch.tensor(1.0, device=device) # Add float versions for flashinfer + self._q_scale_float = 1.0 self._k_scale_float = 1.0 self._v_scale_float = 1.0 diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 5c4956624..f07c6eb0e 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -141,6 +141,8 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend", _Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend", + _Backend.FLASHINFER_MLA: + "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend", _Backend.TRITON_MLA_VLLM_V1: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", } diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 0b240b7d4..bf90f50b1 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -117,9 +117,9 @@ def test_ngram_correctness( print(f"ref_output: {ref_output.outputs[0].text}") print(f"spec_output: {spec_output.outputs[0].text}") - # Heuristic: expect at least 68% of the prompts to match exactly + # Heuristic: expect at least 66% of the prompts to match exactly # Upon failure, inspect the outputs to check for inaccuracy. - assert matches >= int(0.68 * len(ref_outputs)) + assert matches >= int(0.66 * len(ref_outputs)) del spec_llm torch.cuda.empty_cache() cleanup_dist_env_and_memory() diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 98265c634..17b136aa4 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -257,9 +257,13 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): def execute_model( self, scheduler_output, + non_block=False, ) -> Future[ModelRunnerOutput]: """Make execute_model non-blocking.""" + # DummyExecutor used only for testing async case. 
+ assert non_block + def _execute(): output = self.collective_rpc("execute_model", args=(scheduler_output, )) diff --git a/tests/v1/tpu/test_pallas.py b/tests/v1/tpu/test_pallas.py index bfba3af57..1bc8dff31 100644 --- a/tests/v1/tpu/test_pallas.py +++ b/tests/v1/tpu/test_pallas.py @@ -33,10 +33,12 @@ def test_ragged_paged_attention(): ) class FakeAttentionLayer: + _q_scale_float: float _k_scale_float: float _v_scale_float: float layer = FakeAttentionLayer() + layer._q_scale_float = 1.0 layer._k_scale_float = 1.0 layer._v_scale_float = 1.0 diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 3b2e859b7..93b4f87ed 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -257,16 +257,6 @@ def rotary_embedding( cos_sin_cache, is_neox) -def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, - key: Optional[torch.Tensor], head_size: int, - cos_sin_cache: torch.Tensor, is_neox: bool, - rot_dim: int, - cos_sin_cache_offsets: torch.Tensor) -> None: - torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, - cos_sin_cache, is_neox, rot_dim, - cos_sin_cache_offsets) - - # layer norm ops def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float) -> None: diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index c2868c040..59b0aed32 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -148,17 +148,6 @@ class ipex_ops: head_size, cos_sin_cache, is_neox, rot_dim) - @staticmethod - def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, - key: torch.Tensor, head_size: int, - cos_sin_cache: torch.Tensor, is_neox: bool, - rot_dim: int, - cos_sin_cache_offsets: torch.Tensor) -> None: - ipex.llm.functional.rotary_embedding_batched(positions, query, key, - head_size, cos_sin_cache, - is_neox, rot_dim, - cos_sin_cache_offsets) - @staticmethod def rms_norm(input: torch.Tensor, weight: torch.Tensor, epsilon: float) -> torch.Tensor: diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 0217bff6a..75bcdc4bb 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -240,6 +240,7 @@ class AttentionLayer(Protocol): _q_scale: torch.Tensor _k_scale: torch.Tensor _v_scale: torch.Tensor + _q_scale_float: float _k_scale_float: float _v_scale_float: float _prob_scale: torch.Tensor diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 9d67580be..e64063047 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -68,6 +68,7 @@ class RequestFuncInput: model: str model_name: Optional[str] = None logprobs: Optional[int] = None + extra_headers: Optional[dict] = None extra_body: Optional[dict] = None multi_modal_content: Optional[Union[dict, list[dict]]] = None ignore_eos: bool = False @@ -129,6 +130,8 @@ async def async_request_openai_completions( headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } + if request_func_input.extra_headers: + headers |= request_func_input.extra_headers if request_func_input.request_id: headers["x-request-id"] = request_func_input.request_id @@ -258,6 +261,8 @@ async def async_request_openai_chat_completions( "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", } + if request_func_input.extra_headers: + headers |= request_func_input.extra_headers if request_func_input.request_id: headers["x-request-id"] = request_func_input.request_id @@ -364,6 
+369,8 @@ async def async_request_openai_audio( headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", } + if request_func_input.extra_headers: + headers |= request_func_input.extra_headers if request_func_input.request_id: headers["x-request-id"] = request_func_input.request_id diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index a98eb2a78..33e831e54 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -389,6 +389,7 @@ async def benchmark( goodput_config_dict: dict[str, float], max_concurrency: Optional[int], lora_modules: Optional[Iterable[str]], + extra_headers: Optional[dict], extra_body: Optional[dict], ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, ramp_up_start_rps: Optional[int] = None, @@ -452,6 +453,7 @@ async def benchmark( logprobs=logprobs, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, + extra_headers=extra_headers, extra_body=extra_body, ) @@ -484,6 +486,7 @@ async def benchmark( logprobs=logprobs, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, + extra_headers=extra_headers, extra_body=extra_body) profile_output = await request_func( request_func_input=profile_input, session=session) @@ -568,6 +571,7 @@ async def benchmark( logprobs=logprobs, multi_modal_content=mm_content, ignore_eos=ignore_eos, + extra_headers=extra_headers, extra_body=extra_body, request_id=request_id,) tasks.append( @@ -815,6 +819,15 @@ def add_cli_args(parser: argparse.ArgumentParser): default="/v1/completions", help="API endpoint.", ) + parser.add_argument( + "--header", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g., --header x-additional-info=0.3.3) " + "for headers to be passed with each request. These headers override " \ + "per backend constants and values set via environment variable, and " \ + "will be overridden by other arguments (such as request ids)." + ) parser.add_argument( "--max-concurrency", type=int, @@ -1104,6 +1117,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: api_url = f"http://{args.host}:{args.port}{args.endpoint}" base_url = f"http://{args.host}:{args.port}" + # Headers + headers = None + if args.header: + headers = {} + for item in args.header: + if "=" in item: + kvstring = item.split("=", 1) + headers[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid header format. Please use KEY=VALUE format." + ) + tokenizer = get_tokenizer(tokenizer_id, tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) @@ -1161,6 +1187,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, + extra_headers=headers, extra_body=sampling_params, ramp_up_strategy=args.ramp_up_strategy, ramp_up_start_rps=args.ramp_up_start_rps, @@ -1184,7 +1211,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: if args.metadata: for item in args.metadata: if "=" in item: - kvstring = item.split("=") + kvstring = item.split("=", 1) result_json[kvstring[0].strip()] = kvstring[1].strip() else: raise ValueError( diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 7d9b32cd4..ae876d131 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -182,7 +182,7 @@ class NaiveBlockAllocator(BlockAllocator): # Increment refcount for each block.
assert block.block_id is not None refcount = self._refcounter.incr(block.block_id) - assert refcount != 1, "can't fork free'd block" + assert refcount != 1, "can't fork freed block" forked_block = self._block_pool.init_block( prev_block=prev_block, diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 7a4a836ee..85ff6bc9c 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -58,7 +58,7 @@ class Evictor(ABC): class BlockMetaData: """Data structure for storing key data describe cached block, so that - evitor could use to make its decision which one to choose for eviction + evictor could use to make its decision which one to choose for eviction Here we use physical block id as the dict key, as there maybe several blocks with the same content hash, but their physical id is unique. diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index d5ab61473..8f8baa7d5 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -337,11 +337,11 @@ class EplbState: Args: model (MixtureOfExperts): The MoE model. is_dummy (bool): If `True`, this is a dummy step and the load - metrics recorded in this forward pass will not count. Defaults - to `False`. + metrics recorded in this forward pass will not count. Defaults + to `False`. is_profile (bool): If `True`, perform a dummy rearrangement - with maximum communication cost. This is used in `profile_run` - to reserve enough memory for the communication buffer. + with maximum communication cost. This is used in `profile_run` + to reserve enough memory for the communication buffer. log_stats (bool): If `True`, log the expert load metrics. # Stats diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py index 879b5b9f1..3564a10df 100644 --- a/vllm/distributed/eplb/rebalance_algo.py +++ b/vllm/distributed/eplb/rebalance_algo.py @@ -102,14 +102,14 @@ def rebalance_experts_hierarchical( num_groups: int, num_nodes: int, num_gpus: int, -): +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: weight: [num_moe_layers, num_logical_experts] num_physical_experts: number of physical experts after replication num_groups: number of expert groups - num_nodes: number of server nodes, where the intra-node network - (e.g, NVLink) is faster + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster num_gpus: number of GPUs, must be a multiple of `num_nodes` Returns: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index cd4561154..7e0b927c5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -149,7 +149,7 @@ class KVConnectorBase_V1(ABC): @abstractmethod def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """ Start loading the KV cache from the connector to vLLM's paged KV buffer. This is called from the forward context before the @@ -182,7 +182,8 @@ class KVConnectorBase_V1(ABC): @abstractmethod def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """ Start saving a layer of KV cache from vLLM's paged buffer to the connector. 
This is called from within attention layer to diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index c99f538ee..2b0abe983 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -30,7 +30,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): # Worker-side methods # ============================== def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """ Start loading the KV cache from the connector to vLLM's paged KV buffer. This is called from the forward context before the @@ -61,7 +61,8 @@ class LMCacheConnectorV1(KVConnectorBase_V1): self._lmcache_engine.wait_for_layer_load(layer_name) def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """ Start saving the a layer of KV cache from vLLM's paged buffer to the connector. This is called from within attention layer to diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 17f5be76c..c306eeb5a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -708,8 +708,6 @@ class NixlConnectorWorker: caches_data = [] # With hybrid allocator, layers can share a kv cache tensor seen_base_addresses = [] - xfer_buffers = (self.host_xfer_buffers - if self.use_host_buffer else kv_caches) # Note(tms): I modified this from the original region setup code. # K and V are now in different regions. Advantage is that we can diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 2485c57d8..ec72905a0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -91,7 +91,7 @@ class P2pNcclConnector(KVConnectorBase_V1): # ============================== def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """Start loading the KV cache from the connector buffer to vLLM's paged KV buffer. @@ -212,7 +212,8 @@ class P2pNcclConnector(KVConnectorBase_V1): return def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer to the connector. @@ -278,7 +279,7 @@ class P2pNcclConnector(KVConnectorBase_V1): def get_finished( self, finished_req_ids: set[str], - **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: + **kwargs: Any) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have finished generating tokens. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index b775276d4..26070488b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -218,8 +218,9 @@ class TensorMemoryPool: return addr - def load_tensor(self, addr: int, dtype: torch.dtype, - shape: tuple[int, ...], device) -> torch.Tensor: + def load_tensor(self, addr: int, dtype: torch.dtype, shape: tuple[int, + ...], + device: torch.device) -> torch.Tensor: """Loads a tensor from pinned host memory to the specified device. Args: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 25460ed29..48fa1a82c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -3,7 +3,7 @@ import hashlib import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Optional import safetensors import torch @@ -90,7 +90,7 @@ class SharedStorageConnector(KVConnectorBase_V1): logger.info("Shared storage path is %s", self._storage_path) def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """Start loading the KV cache from the connector buffer to vLLM's paged KV buffer. @@ -191,7 +191,8 @@ class SharedStorageConnector(KVConnectorBase_V1): return def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer to the connector. diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 66120e9a0..7a79a8cc0 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -251,8 +251,8 @@ class PyNcclPipe(KVPipeBase): """ Receives a tensor and its metadata from the source rank. Blocking call. - Args: - tensor: The received tensor, or `None` if no tensor is received. + Returns: + The received tensor, or `None` if no tensor is received. """ if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bc7ec0f06..ab43c0edc 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1296,11 +1296,8 @@ class EngineArgs: # Async scheduling does not work with the uniprocess backend. 
if self.distributed_executor_backend is None: self.distributed_executor_backend = "mp" - logger.info("Using mp-based distributed executor backend " - "for async scheduling.") - if self.distributed_executor_backend == "uni": - raise ValueError("Async scheduling is not supported with " - "uni-process backend.") + logger.info("Defaulting to mp-based distributed executor " + "backend for async scheduling.") if self.pipeline_parallel_size > 1: raise ValueError("Async scheduling is not supported with " "pipeline-parallel-size > 1.") diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 0a8709db4..2762175c4 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -379,7 +379,7 @@ class LoggingStatLogger(StatLoggerBase): if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): # Compute summary metrics for tracked stats (and log them - # to promethus if applicable). + # to prometheus if applicable). prompt_throughput = get_throughput(self.num_prompt_tokens, now=stats.now, last_log=self.last_local_log) @@ -432,7 +432,7 @@ class LoggingStatLogger(StatLoggerBase): class PrometheusStatLogger(StatLoggerBase): - """PrometheusStatLogger is used LLMEngine to log to Promethus.""" + """PrometheusStatLogger is used LLMEngine to log to Prometheus.""" _metrics_cls = Metrics _gauge_cls = prometheus_client.Gauge diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index b53dbfb3a..aa231de93 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -73,15 +73,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): type: Required[Literal["audio_url"]] """The type of the content part.""" - uuid: Optional[str] - """ - User-provided UUID of a media. User must guarantee that it is properly - generated and unique for different medias. - """ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): - image_embeds: Required[Union[str, dict[str, str]]] + image_embeds: Optional[Union[str, dict[str, str]]] """ The image embeddings. It can be either: - A single base64 string. @@ -108,11 +103,6 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] """The type of the content part.""" - uuid: Optional[str] - """ - User-provided UUID of a media. User must guarantee that it is properly - generated and unique for different medias. - """ class PILImage(BaseModel): @@ -133,7 +123,7 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False): } """ - image_pil: Required[PILImage] + image_pil: Optional[PILImage] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -151,7 +141,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): } """ - image_url: Required[str] + image_url: Optional[str] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -168,7 +158,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): } """ - audio_url: Required[str] + audio_url: Optional[str] class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): @@ -180,7 +170,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): } """ - video_url: Required[str] + video_url: Optional[str] uuid: Optional[str] """ User-provided UUID of a media. 
User must guarantee that it is properly @@ -597,7 +587,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): self._model_config = model_config self._tokenizer = tokenizer - self._items_by_modality = defaultdict[str, list[_T]](list) + self._items_by_modality = defaultdict[str, list[Optional[_T]]](list) self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list) @property @@ -624,14 +614,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return self.mm_registry.create_processor(self.model_config) def add( - self, modality: ModalityStr, item: _T, uuid: Optional[str] = None + self, + modality: ModalityStr, + item: Optional[_T], + uuid: Optional[str] = None, ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. An optional uuid can be added which serves as a unique identifier of the - media. + media. """ input_modality = modality.replace("_embeds", "") num_items = len(self._items_by_modality[modality]) + 1 @@ -708,10 +701,15 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): if not self._items_by_modality: return None mm_inputs = {} - items_by_modality = { - modality: await asyncio.gather(*items) - for modality, items in self._items_by_modality.items() - } + items_by_modality = {} + for modality, items in self._items_by_modality.items(): + coros = [] + for item in items: + if item is not None: + coros.append(item) + else: + coros.append(asyncio.sleep(0)) + items_by_modality[modality] = await asyncio.gather(*coros) if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError( @@ -760,35 +758,40 @@ class BaseMultiModalContentParser(ABC): return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: raise NotImplementedError @abstractmethod def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: raise NotImplementedError @@ -803,15 +806,17 @@ class MultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: - image = self._connector.fetch_image(image_url) + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None + ) -> None: + image = self._connector.fetch_image(image_url) if image_url else None placeholder = self._tracker.add("image", image, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( self, - 
image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: if isinstance(image_embeds, dict): @@ -825,31 +830,49 @@ class MultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) placeholder = self._tracker.add("image_embeds", embedding, uuid) + if image_embeds is None: + placeholder = self._tracker.add("image_embeds", None, uuid) + self._add_placeholder("image", placeholder) def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: placeholder = self._tracker.add("image", image_pil, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: - audio = self._connector.fetch_audio(audio_url) + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: + audio = self._connector.fetch_audio(audio_url) if audio_url else None placeholder = self._tracker.add("audio", audio, uuid) self._add_placeholder("audio", placeholder) def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - audio_url = f"data:audio/{audio_format};base64,{audio_data}" + if input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + if audio_data: + audio_url = f"data:audio/{audio_format};base64,{audio_data}" + else: + # If a UUID is provided, audio data may be empty. + audio_url = None + else: + audio_url = None return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: - video = self._connector.fetch_video(video_url=video_url) + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: + video = ( + self._connector.fetch_video(video_url=video_url) + if video_url + else None + ) placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -865,18 +888,24 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: - image_coro = self._connector.fetch_image_async(image_url) + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None + ) -> None: + image_coro = ( + self._connector.fetch_image_async(image_url) if image_url else None + ) placeholder = self._tracker.add("image", image_coro, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: - future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() + future: asyncio.Future[Union[str, dict[str, str], None]] = ( + asyncio.Future() + ) if isinstance(image_embeds, dict): embeds = { @@ -889,35 +918,58 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) future.set_result(embedding) + if image_embeds is None: + future.set_result(None) + placeholder = self._tracker.add("image_embeds", future, uuid) self._add_placeholder("image", placeholder) def parse_image_pil( - self, 
image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: - future: asyncio.Future[Image.Image] = asyncio.Future() - future.set_result(image_pil) + future: asyncio.Future[Optional[Image.Image]] = asyncio.Future() + if image_pil: + future.set_result(image_pil) + else: + future.set_result(None) placeholder = self._tracker.add("image", future, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: - audio_coro = self._connector.fetch_audio_async(audio_url) + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: + audio_coro = ( + self._connector.fetch_audio_async(audio_url) if audio_url else None + ) placeholder = self._tracker.add("audio", audio_coro, uuid) self._add_placeholder("audio", placeholder) def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - audio_url = f"data:audio/{audio_format};base64,{audio_data}" + if input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + if audio_data: + audio_url = f"data:audio/{audio_format};base64,{audio_data}" + else: + # If a UUID is provided, audio data may be empty. + audio_url = None + else: + audio_url = None return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: - video = self._connector.fetch_video_async(video_url=video_url) + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: + video = ( + self._connector.fetch_video_async(video_url=video_url) + if video_url + else None + ) placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -1130,8 +1182,9 @@ def _parse_chat_message_content_mm_part( part, dict ) # This is needed to avoid mypy errors: part.get() from str part_type = part.get("type", None) + uuid = part.get("uuid", None) - if isinstance(part_type, str) and part_type in MM_PARSER_MAP: + if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None: # noqa: E501 content = MM_PARSER_MAP[part_type](part) # Special case for 'image_url.detail' @@ -1146,25 +1199,54 @@ def _parse_chat_message_content_mm_part( # Handle missing 'type' but provided direct URL fields. # 'type' is required field by pydantic - if part_type is None: - if part.get("image_url") is not None: + if part_type is None or uuid is not None: + if "image_url" in part: image_params = cast( CustomChatCompletionContentSimpleImageParam, part ) - return "image_url", image_params.get("image_url", "") - if part.get("audio_url") is not None: + image_url = image_params.get("image_url", None) + if isinstance(image_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + image_url = image_url.get("url", None) + return "image_url", image_url + if "image_pil" in part: + # "image_pil" could be None if UUID is provided. + image_params = cast( # type: ignore + CustomChatCompletionContentPILImageParam, part + ) + image_pil = image_params.get("image_pil", None) + return "image_pil", image_pil + if "image_embeds" in part: + # "image_embeds" could be None if UUID is provided. 
+ image_params = cast( # type: ignore + ChatCompletionContentPartImageEmbedsParam, part + ) + image_embeds = image_params.get("image_embeds", None) + return "image_embeds", image_embeds + if "audio_url" in part: audio_params = cast( CustomChatCompletionContentSimpleAudioParam, part ) - return "audio_url", audio_params.get("audio_url", "") + audio_url = audio_params.get("audio_url", None) + if isinstance(audio_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + audio_url = audio_url.get("url", None) + return "audio_url", audio_url if part.get("input_audio") is not None: input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params - if part.get("video_url") is not None: + if "video_url" in part: video_params = cast( CustomChatCompletionContentSimpleVideoParam, part ) - return "video_url", video_params.get("video_url", "") + video_url = video_params.get("video_url", None) + if isinstance(video_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + video_url = video_url.get("url", None) + return "video_url", video_url # Raise an error if no 'type' or direct URL is found. raise ValueError("Missing 'type' field in multimodal part.") @@ -1173,15 +1255,9 @@ def _parse_chat_message_content_mm_part( return part_type, "unknown part_type content" -VALID_MESSAGE_CONTENT_MM_PART_TYPES = ( +PART_TYPES_TO_SKIP_NONE_CONTENT = ( "text", "refusal", - "image_url", - "image_embeds", - "image_pil", - "audio_url", - "input_audio", - "video_url", ) @@ -1242,7 +1318,7 @@ def _parse_chat_message_content_part( part_type, content = _parse_chat_message_content_mm_part(part) # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but # content is None, log a warning and skip - if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None: + if part_type in PART_TYPES_TO_SKIP_NONE_CONTENT and content is None: logger.warning( "Skipping multimodal part '%s' (type: '%s') " "with empty / unparsable content.", @@ -1266,7 +1342,10 @@ def _parse_chat_message_content_part( modality = None if part_type == "image_pil": - image_content = cast(Image.Image, content) + if content is not None: + image_content = cast(Image.Image, content) + else: + image_content = None mm_parser.parse_image_pil(image_content, uuid) modality = "image" elif part_type in ("image_url", "input_image"): @@ -1274,7 +1353,10 @@ def _parse_chat_message_content_part( mm_parser.parse_image(str_content, uuid) modality = "image" elif part_type == "image_embeds": - content = cast(Union[str, dict[str, str]], content) + if content is not None: + content = cast(Union[str, dict[str, str]], content) + else: + content = None mm_parser.parse_image_embeds(content, uuid) modality = "image" elif part_type == "audio_url": diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 94749eb88..4b51dbcd8 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1491,6 +1491,11 @@ class LLM: for i, prompt in enumerate(it): + if isinstance(prompt, dict): + self._validate_mm_data_and_uuids( + prompt.get("multi_modal_data"), + prompt.get("multi_modal_uuids")) + param = params[i] if isinstance(params, Sequence) else params tokenization_kwargs: dict[str, Any] = {} @@ -1507,6 +1512,41 @@ class LLM: priority=priority[i] if priority else 0, ) + def _validate_mm_data_and_uuids( + self, + multi_modal_data: Optional[Any], # MultiModalDataDict + multi_modal_uuids: Optional[Any], # MultiModalUUIDDict + ): + """ + 
Validate that if any multi-modal data is skipped (i.e. None), + then its corresponding UUID must be set. + """ + if multi_modal_data is None: + return + + for modality, data in multi_modal_data.items(): + if isinstance(data, list): + for i, d in enumerate(data): + if d is None: + if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[ # noqa: E501 + modality] is None: + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided") + else: + if len( + multi_modal_uuids[modality] + ) <= i or multi_modal_uuids[modality][i] is None: + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided") + else: + if data is None and (multi_modal_uuids is None + or modality not in multi_modal_uuids + or multi_modal_uuids[modality] is None): + raise ValueError(f"Multi-modal data for {modality} is None" + f" but UUID is not provided") + def _add_request( self, prompt: PromptType, diff --git a/vllm/envs.py b/vllm/envs.py index f1f404594..bb10c7cc2 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -476,6 +476,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "FLASHINFER": use flashinfer # - "FLASHMLA": use FlashMLA # - "FLASH_ATTN_MLA": use FlashAttention for MLA + # - "FLASHINFER_MLA": use FlashInfer for MLA + # - "CUTLASS_MLA": use CUTLASS for MLA "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 2e456ecd6..3b566e88a 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import os +from concurrent.futures import Future, ThreadPoolExecutor +from functools import cached_property from multiprocessing import Lock from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -17,6 +18,7 @@ from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, run_method) from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.utils import get_and_update_mm_cache +from vllm.v1.outputs import AsyncModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -31,15 +33,7 @@ class UniProcExecutor(ExecutorBase): """ self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0) - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - local_rank = 0 - # set local rank as the device index if specified - device_info = self.vllm_config.device_config.device.__str__().split( - ":") - if len(device_info) > 1: - local_rank = int(device_info[1]) - rank = 0 + distributed_init_method, rank, local_rank = self._distributed_args() is_driver_worker = True kwargs = dict( vllm_config=self.vllm_config, @@ -50,21 +44,56 @@ class UniProcExecutor(ExecutorBase): ) self.mm_receiver_cache = worker_receiver_cache_from_config( self.vllm_config, MULTIMODAL_REGISTRY, Lock()) + + self.async_output_thread: Optional[ThreadPoolExecutor] = None + if self.max_concurrent_batches > 1: + self.async_output_thread = ThreadPoolExecutor( + max_workers=1, thread_name_prefix="WorkerAsyncOutput") + self.collective_rpc("init_worker", args=([kwargs], )) self.collective_rpc("init_device") self.collective_rpc("load_model") + def _distributed_args(self) -> tuple[str, int, int]: + """Return (distributed_init_method, rank, 
local_rank).""" + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + # set local rank as the device index if specified + device_info = self.vllm_config.device_config.device.__str__().split( + ":") + local_rank = int(device_info[1]) if len(device_info) > 1 else 0 + return distributed_init_method, 0, local_rank + + @cached_property + def max_concurrent_batches(self) -> int: + return 2 if self.scheduler_config.async_scheduling else 1 + def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict] = None, + non_block: bool = False) -> List[Any]: if kwargs is None: kwargs = {} if self.mm_receiver_cache is not None and method == "execute_model": get_and_update_mm_cache(self.mm_receiver_cache, args) - answer = run_method(self.driver_worker, method, args, kwargs) - return [answer] + + if not non_block: + return [run_method(self.driver_worker, method, args, kwargs)] + + try: + result = run_method(self.driver_worker, method, args, kwargs) + if isinstance(result, AsyncModelRunnerOutput): + if (async_thread := self.async_output_thread) is not None: + return [async_thread.submit(result.get_output)] + result = result.get_output() + future = Future[Any]() + future.set_result(result) + except Exception as e: + future = Future[Any]() + future.set_exception(e) + return [future] def check_health(self) -> None: # UniProcExecutor will always be healthy as long as @@ -116,8 +145,9 @@ class ExecutorWithExternalLauncher(UniProcExecutor): assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \ ("To get deterministic execution in V1, " "please set VLLM_ENABLE_V1_MULTIPROCESSING=0") - self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, - rpc_rank=0) + super()._init_executor() + + def _distributed_args(self) -> tuple[str, int, int]: # engines are launched in torchrun-compatible launchers # so we can use the env:// method. 
# required env vars: @@ -128,19 +158,7 @@ class ExecutorWithExternalLauncher(UniProcExecutor): distributed_init_method = "env://" rank = int(os.environ["RANK"]) local_rank = int(os.environ["LOCAL_RANK"]) - is_driver_worker = True - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=is_driver_worker, - ) - self.mm_receiver_cache = worker_receiver_cache_from_config( - self.vllm_config, MULTIMODAL_REGISTRY, Lock()) - self.collective_rpc("init_worker", args=([kwargs], )) - self.collective_rpc("init_device") - self.collective_rpc("load_model") + return distributed_init_method, rank, local_rank def determine_num_available_blocks(self) -> Tuple[int, int]: """ diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index a5326dfe8..0ab6355f4 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from math import log2 from typing import Optional import torch @@ -10,6 +11,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import _resize_cache +from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked, is_deep_gemm_e8m0_used) @@ -24,35 +26,28 @@ def _silu_mul_fp8_quant_deep_gemm( y_q_ptr, # fp8 quantized activations (E, T, H) y_s_ptr, # 16-bit scales (E, T, G) counts_ptr, # int32 num tokens per expert (E) - # Sizes --------------------------------------------------------------- H: tl.constexpr, # hidden dimension (per output) GROUP_SIZE: tl.constexpr, # elements per group (usually 128) - # Strides for input (elements) --------------------------------------- stride_i_e, stride_i_t, stride_i_h, - # Strides for y_q (elements) ----------------------------------------- stride_yq_e, stride_yq_t, stride_yq_h, - # Strides for y_s (elements) ----------------------------------------- stride_ys_e, stride_ys_t, stride_ys_g, - # Stride for counts (elements) stride_counts_e, - # Numeric params ------------------------------------------------------ eps: tl.constexpr, fp8_min: tl.constexpr, fp8_max: tl.constexpr, use_ue8m0: tl.constexpr, - # Meta --------------------------------------------------------------- BLOCK: tl.constexpr, NUM_STAGES: tl.constexpr, @@ -101,17 +96,15 @@ def _silu_mul_fp8_quant_deep_gemm( tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s) -def silu_mul_fp8_quant_deep_gemm( +def silu_mul_fp8_quant_deep_gemm_cuda( y: torch.Tensor, # (E, T, 2*H) tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert + num_parallel_tokens=16, group_size: int = 128, - eps: float = 1e-10, ) -> tuple[torch.Tensor, torch.Tensor]: """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales - - y has shape (E, T, 2*H). The first half of the last dimension is + y has shape (E, T, 2*H). The first half of the last dimension is silu-activated, multiplied by the second half, then quantized into FP8. 
- Returns `(y_q, y_s)` where * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H] * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) @@ -120,22 +113,17 @@ def silu_mul_fp8_quant_deep_gemm( E, T, H2 = y.shape assert H2 % 2 == 0, "last dim of y must be even (2*H)" H = H2 // 2 - G = H // group_size - assert H % group_size == 0, "H must be divisible by group_size" - assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, \ "tokens_per_expert must be shape (E,)" + G = (H + group_size - 1) // group_size + assert H % 8 == 0, "H must be divisible by 8" + assert group_size == 128, "group_size must be 128" + assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E + tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32) - # allocate outputs fp8_dtype = torch.float8_e4m3fn y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) - # strides (elements) - stride_i_e, stride_i_t, stride_i_h = y.stride() - stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride() - - # desired scale strides (elements): (T*G, 1, T) stride_ys_e = T * G stride_ys_t = 1 stride_ys_g = T @@ -144,47 +132,86 @@ def silu_mul_fp8_quant_deep_gemm( dtype=torch.float32, device=y.device) - stride_cnt_e = tokens_per_expert.stride()[0] + use_ue8m0 = is_deep_gemm_e8m0_used() - # Static grid over experts and H-groups. - # A loop inside the kernel handles the token dim - grid = (E * G, ) + if E <= 16: + max_empirical_parallelism = 64 + elif E <= 32: + max_empirical_parallelism = 16 + else: + max_empirical_parallelism = 4 - f_info = torch.finfo(fp8_dtype) - fp8_max = f_info.max - fp8_min = f_info.min + # Never launch more parallel token workers than there are tokens (T); + # clamp num_parallel_tokens to a power of two within that limit. + num_parallel_tokens = max( + 1, + min(max_empirical_parallelism, 2**int(log2(min(num_parallel_tokens, + T))))) + cuda_arch = current_platform.get_device_capability( + device_id=y.device.index).to_int() - _silu_mul_fp8_quant_deep_gemm[grid]( - y, - y_q, - y_s, - tokens_per_expert, - H, - group_size, - stride_i_e, - stride_i_t, - stride_i_h, - stride_yq_e, - stride_yq_t, - stride_yq_h, - stride_ys_e, - stride_ys_t, - stride_ys_g, - stride_cnt_e, - eps, - fp8_min, - fp8_max, - is_deep_gemm_e8m0_used(), - BLOCK=group_size, - NUM_STAGES=4, - num_warps=1, - ) + if cuda_arch >= 80: + torch.ops._C.silu_mul_fp8_quant_deep_gemm_cuda(y, tokens_per_expert, + y_q, y_s, group_size, + use_ue8m0, + num_parallel_tokens) + else: + # Default to triton if not on cuda or if arch is too old + y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) + + stride_cnt_e = tokens_per_expert.stride()[0] + + # Static grid over experts and H-groups.
+ # A loop inside the kernel handles the token dim + grid = (E * G, ) + # strides (elements) + stride_i_e, stride_i_t, stride_i_h = y.stride() + stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride() + + # desired scale strides (elements): (T*G, 1, T) + stride_ys_e = T * G + stride_ys_t = 1 + stride_ys_g = T + y_s = torch.empty_strided( + (E, T, G), + (stride_ys_e, stride_ys_t, stride_ys_g), + dtype=torch.float32, + device=y.device, + ) + f_info = torch.finfo(fp8_dtype) + fp8_max = f_info.max + fp8_min = f_info.min + eps: float = 1e-10 + _silu_mul_fp8_quant_deep_gemm[grid]( + y, + y_q, + y_s, + tokens_per_expert, + H, + group_size, + stride_i_e, + stride_i_t, + stride_i_h, + stride_yq_e, + stride_yq_t, + stride_yq_h, + stride_ys_e, + stride_ys_t, + stride_ys_g, + stride_cnt_e, + eps, + fp8_min, + fp8_max, + is_deep_gemm_e8m0_used(), + BLOCK=group_size, + NUM_STAGES=4, + num_warps=1, + ) return y_q, y_s class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): - # The Deep Gemm kernels only support block size of 128 DEEPGEMM_BLOCK_SHAPE: list[int] = [128, 128] @@ -297,8 +324,8 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): fp8_m_grouped_gemm_nt_masked((a1q, a1q_scale), (w1, w1_scale), workspace1, expert_num_tokens, expected_m) - a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm(workspace1, - expert_num_tokens) + a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm_cuda( + workspace1, expert_num_tokens) fp8_m_grouped_gemm_nt_masked((a2q, a2q_scale), (w2, w2_scale), output, expert_num_tokens, expected_m) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index fd88eac55..773dfeae2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -740,7 +740,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): """ Handle special case for models where MLP layers are already fused on disk. In this case, we have no shard id. This function - determmines the shard id by splitting these layers and then calls + determines the shard id by splitting these layers and then calls the weight loader using the shard id. An example of a model with these fused layers: @@ -914,7 +914,7 @@ class QKVParallelLinear(ColumnParallelLinear): """ Handle special case for models where QKV layers are already fused on disk. In this case, we have no shard id. This function - determmines the shard id by splitting these layers and then calls + determines the shard id by splitting these layers and then calls the weight loader using the shard id. An example of a model with these fused layers: diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index e5604670f..4c6fcda89 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -88,6 +88,7 @@ class BaseKVCacheMethod(QuantizeMethodBase): "Setting it to k_scale. 
This only matters for " "the flash-attn backend.") layer._q_scale.copy_(k_scale) + layer._q_scale_float = k_scale # These are used in the final Attention.forward() layer._k_scale.copy_(k_scale) @@ -124,6 +125,7 @@ class BaseKVCacheMethod(QuantizeMethodBase): # These are used in the final Attention.forward() layer._q_scale.copy_(q_scale) + layer._q_scale_float = q_scale layer._prob_scale.copy_(prob_scale) if layer.kv_cache_dtype == "fp8" and (q_scale == 1.0 or prob_scale == 1.0): diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index be25e90ab..db50eb08d 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -62,11 +62,8 @@ class RotaryEmbedding(CustomOp): positions: torch.Tensor, query: torch.Tensor, key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """A PyTorch-native implementation of forward().""" - if offsets is not None: - positions = positions + offsets positions = positions.flatten() num_tokens = positions.shape[0] cos_sin = self.cos_sin_cache.index_select(0, positions) @@ -96,7 +93,6 @@ class RotaryEmbedding(CustomOp): positions: torch.Tensor, query: torch.Tensor, key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: from vllm import _custom_ops as ops @@ -107,16 +103,10 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. - if offsets is not None: - ops.batched_rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, - self.is_neox_style, self.rotary_dim, - offsets) - else: - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + # ops.rotary_embedding() is an in-place operation + # that updates the query and key tensors. + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) return query, key def forward_xpu( @@ -124,29 +114,21 @@ class RotaryEmbedding(CustomOp): positions: torch.Tensor, query: torch.Tensor, key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: from vllm._ipex_ops import ipex_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device, dtype=query.dtype) - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. + # ops.rotary_embedding() is an in-place operation + # that updates the query and key tensors. 
if key is None: # XPU kernel doesn't support key=None so fall back to native impl # TODO(sarckk): add support for optional key in # ipex.llm.functional.rotary_embedding_batched - return self.forward_native(positions, query, key, offsets) + return self.forward_native(positions, query, key) else: - if offsets is not None: - ops.batched_rotary_embedding(positions, query, key, - self.head_size, - self.cos_sin_cache, - self.is_neox_style, - self.rotary_dim, offsets) - else: - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) return query, key def extra_repr(self) -> str: diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index c915ebac9..aa64d4e09 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -258,7 +258,7 @@ class VocabParallelEmbedding(CustomOp): if params_dtype is None: params_dtype = torch.get_default_dtype() - # Divide the weight matrix along the vocaburaly dimension. + # Divide the weight matrix along the vocabulary dimension. self.num_added_embeddings = self.num_embeddings - self.org_vocab_size self.num_embeddings_per_partition = divide(self.num_embeddings_padded, self.tp_size) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index f49dd36b0..3396c67f4 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -1446,7 +1446,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal, return None # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 3074451e4..663d4da7c 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -586,10 +586,10 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, # ruff: noqa # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the - # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will + # text to account for this. However, the audio preprocessing and encoder do not guarantee they will # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad - # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab. + # the audio feature out to 188 soft tokens with the embedding of the last token in the embed_audio vocab. 
# TODO precompute and cache padding audio_padding_toks = torch.tensor([[self.vocab_size - 1]], dtype=torch.long, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index d5b71b057..8f8e300c8 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -823,7 +823,7 @@ class SupportsEagle3(Protocol): Args: layers: Tuple of layer indices that should output auxiliary - hidden states. + hidden states. """ ... diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index cb4cd60a8..afe33b4d4 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module): batch. **NOTE**: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. + otherwise it will be `(seq_len,)`. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py index b3c44d132..93a3bf5f9 100644 --- a/vllm/model_executor/models/keye_vl1_5.py +++ b/vllm/model_executor/models/keye_vl1_5.py @@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor: return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0) -def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int], - torch.Tensor]): +def get_num_patches(grid_thw: torch.Tensor, + num_frames: Union[list[int], torch.Tensor]) -> list[int]: """ Return num_patches per video. Args: - t: tensor with shape [N, ...] where each item is a list/tensor - cu_seqlens: list indicating the boundaries of groups + grid_thw: Tensor with shape [N, 3] containing temporal, height, width + dimensions + num_frames: List or tensor indicating the number of frames per video Returns: - list of ints representing the sum of products for each group + List of ints representing the number of patches for each video Examples: >>> # Suppose there are 2 videos with a total of 3 grids diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index d692b2783..9591deea0 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. 
Info: [LlavaImageInputs][] diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a63c18493..5e82f9799 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each grid patch for each input image. - image_sizes: The original `(height, width)` for each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. Info: [LlavaNextImageInputs][] diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 08948960b..09479012a 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. Info: [Mistral3ImagePixelInputs][] diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 6dc77666e..2f0e8a2a5 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -387,11 +387,8 @@ class Llama4VisionEncoder(nn.Module): ) -> torch.Tensor: r""" Args: - inputs_embeds (`torch.FloatTensor` of shape - `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you - want more control over how to convert `input_ids` indices into + hidden_states: Input tensor of shape + (batch_size, sequence_length, hidden_size). + Hidden states from the model embeddings, representing + the input tokens. - associated vectors than the model's internal embedding lookup - matrix. """ diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 41a2c836b..caa00763f 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -70,11 +70,15 @@ def multihead_attention( v: torch.Tensor, q_cu_seqlens: Optional[torch.Tensor] = None, k_cu_seqlens: Optional[torch.Tensor] = None, -): +) -> torch.Tensor: """Multi-head attention using flash attention 2. Args: - q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim), + q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim), or (tot_seqlens, num_heads, head_dim) if packing. q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q. The first element should be 0 and the last element should be q.shape[0]. @@ -123,8 +127,14 @@ def sdpa_attention( """SDPA attention. Args: - q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim), + q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim), or (tot_seqlens, num_heads, head_dim) if packing.
+ k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + q_cu_seqlens: Optional cumulative sequence lengths of q. + k_cu_seqlens: Optional cumulative sequence lengths of k. """ seq_length = q.shape[0] attention_mask = torch.zeros([1, seq_length, seq_length], @@ -387,7 +397,7 @@ class MLP2(nn.Module): def __init__(self, dims: list[int], activation, - bias=True, + bias: bool = True, prefix: str = "", use_data_parallel: bool = False): super().__init__() diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index a9c7d8044..acda20274 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -560,7 +560,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, return [] # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image). + # tensor corresponding to a multimodal data item (image). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index bccd1b870..3e4c580a1 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -52,10 +52,11 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( - AutoWeightsLoader, is_pp_missing_parameter, + AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs import Olmo3Config class Olmo2Attention(nn.Module): @@ -68,7 +69,7 @@ class Olmo2Attention(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config - assert isinstance(self.config, Olmo2Config) + assert isinstance(self.config, (Olmo2Config, Olmo3Config)) hidden_size = self.config.hidden_size self.tp_size = get_tensor_model_parallel_world_size() @@ -111,14 +112,14 @@ class Olmo2Attention(nn.Module): self.q_norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) - # Rotary embeddings. - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, # type: ignore - ) self.scaling = self.head_dim**-0.5 + + layer_idx = extract_layer_index(prefix) + sliding_window = None + if ((layer_types := getattr(self.config, "layer_types", None)) + is not None and layer_types[layer_idx] == "sliding_attention"): + sliding_window = self.config.sliding_window + self.attn = Attention( self.num_heads, self.head_dim, @@ -126,7 +127,20 @@ class Olmo2Attention(nn.Module): num_kv_heads=self.num_kv_heads, cache_config=vllm_config.cache_config, quant_config=vllm_config.quant_config, - prefix=prefix, + per_layer_sliding_window=sliding_window, + prefix=f"{prefix}.attn", + ) + + # Rotary embeddings. Rope scaling is only applied on full attention + # layers. 
+ self.rope_scaling = (self.config.rope_scaling + if sliding_window is None else None) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, # type: ignore + rope_scaling=self.rope_scaling, ) # Attention output projection. @@ -176,7 +190,7 @@ class Olmo2MLP(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - assert isinstance(config, Olmo2Config) + assert isinstance(config, (Olmo2Config, Olmo3Config)) hidden_size = config.hidden_size intermediate_size = config.intermediate_size @@ -221,7 +235,7 @@ class Olmo2DecoderLayer(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - assert isinstance(config, Olmo2Config) + assert isinstance(config, (Olmo2Config, Olmo3Config)) # Attention block. self.self_attn = Olmo2Attention(vllm_config=vllm_config, prefix=f"{prefix}.self_attn") @@ -261,7 +275,7 @@ class Olmo2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config - assert isinstance(self.config, Olmo2Config) + assert isinstance(self.config, (Olmo2Config, Olmo3Config)) self.embed_tokens = VocabParallelEmbedding( self.config.vocab_size, @@ -376,7 +390,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - assert isinstance(config, Olmo2Config) + assert isinstance(config, (Olmo2Config, Olmo3Config)) self.config = config self.model = Olmo2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index ab63649b4..25df9e926 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module): Typically used as a very first layer in a model. Args: - input_size: int - layer input size. + config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig) + object containing model parameters. """ def __init__(self, config: Phi4MultimodalAudioConfig): diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 352ae4064..469638281 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1154,7 +1154,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): return None # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index cf15dfa67..d08181c5f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, batch. 
**NOTE**: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. + otherwise it will be `(seq_len,)`. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index a7e0a0035..85429b3a0 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -170,8 +170,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module): return quant_config def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - # NOTE: hidden_states can have either 1D or 2D shape. - orig_shape = hidden_states.shape + assert hidden_states.dim( + ) <= 2, "Qwen3MoeSparseMoeBlock only supports 1D or 2D inputs" + is_input_1d = hidden_states.dim() == 1 hidden_dim = hidden_states.shape[-1] hidden_states = hidden_states.view(-1, hidden_dim) @@ -180,7 +181,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module): final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=router_logits) - return final_hidden_states.view(orig_shape) + # return to 1d if input is 1d + return final_hidden_states.squeeze(0) if is_input_1d else \ + final_hidden_states class Qwen3MoeAttention(nn.Module): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7d7654e84..85759df36 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -120,6 +120,7 @@ _TEXT_GENERATION_MODELS = { "NemotronHForCausalLM": ("nemotron_h", "NemotronHForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "Olmo2ForCausalLM": ("olmo2", "Olmo2ForCausalLM"), + "Olmo3ForCausalLM": ("olmo2", "Olmo2ForCausalLM"), "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index a86700fe6..7d90d3a7e 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module): position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]: """ Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(batch, seq_len, embed_dim)`. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under - returned tensors for more detail. + hidden_states: Input tensor of shape (batch, seq_len, embed_dim). + cu_seqlens: Cumulative sequence lengths tensor. + position_embeddings: Position embeddings tensor. """ residual = hidden_states @@ -534,19 +531,11 @@ class Siglip2Encoder(nn.Module): ) -> torch.Tensor: r""" Args: - inputs_embeds (`torch.FloatTensor` of shape - `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. 
This is useful if - you want more control over how to convert `input_ids` indices - into associated vectors than the model's internal embedding - lookup matrix. - grid_thws (`torch.LongTensor`): - grid shape (num_patches, 3) - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): + inputs_embeds: Input tensor of shape + (batch_size, sequence_length, hidden_size). + Embedded representation of the input tokens. + grid_thws: Grid tensor of shape (num_patches, 3) + containing grid dimensions. Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 9e28b0c44..ad911ebed 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): with the `input_ids`. Args: - audio_features: A batch of audio input chunks [B, N, 80, M]. - audio_lens: Length of audio frames for each audio chunk [B]. - audio_token_len: Length of audio tokens for each audio chunk [B']. - Note: batch dim is different from batch dim in audio chunks. + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. """ diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 1ea317c2f..16a97389c 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -585,12 +585,12 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, r"language_model.model.layers.\1.mlp.down_proj"), (r"layers\.(\d+)\.feed_forward\.w3", r"language_model.model.layers.\1.mlp.up_proj"), - (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo", - r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj" - ), (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)", r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj" ), + (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo", + r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj" + ), (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)", r"whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2"), (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0", diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 34b9c1ad0..86335d48c 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -909,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): prefix: Optional prefix for parameter names Raises: - AssertionError: If prefix caching is enabled - (not supported by Mamba) + AssertionError: If prefix caching is enabled + (not supported by Mamba) """ config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index f8ea3835f..240e34e13 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -85,9 +85,10 @@ which are treated as audio embeddings; these are directly 
passed to the model without HF processing. """ -ModalityData: TypeAlias = Union[_T, list[_T]] +ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None] """ -Either a single data item, or a list of data items. +Either a single data item, or a list of data items. Can only be None if UUID +is provided. The number of data items allowed per modality is restricted by `--limit-mm-per-prompt`. diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 88bb99529..493dd3560 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -36,7 +36,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]): def __init__(self, data: _T, modality: str) -> None: super().__init__() - self.data = data + self.data: _T = data self.modality = modality def __repr__(self) -> str: @@ -177,7 +177,9 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor], class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): - def __init__(self, data: Sequence[HfAudioItem]) -> None: + def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None: + if data is None: + data = [None] super().__init__(data, "audio") def get_audio_length(self, item_idx: int) -> int: @@ -198,7 +200,9 @@ class ImageSize(NamedTuple): class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): - def __init__(self, data: Sequence[HfImageItem]) -> None: + def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None: + if data is None: + data = [None] super().__init__(data, "image") def get_image_size(self, item_idx: int) -> ImageSize: @@ -223,10 +227,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): def __init__( self, - data: Sequence[HfVideoItem], + data: Optional[Sequence[HfVideoItem]], metadata: Optional[Union[dict[str, Any], list[Optional[dict[str, Any]]]]] = None, ) -> None: + if data is None: + data = [None] super().__init__(data, "video") self.metadata = metadata @@ -385,6 +391,9 @@ class MultiModalDataParser: self, data: ModalityData[AudioItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return AudioProcessorItems(None) + # also check single audio item with sampling rate if self._is_empty(data) or (isinstance(data, tuple) and self._is_empty(data[0])): @@ -420,6 +429,9 @@ class MultiModalDataParser: self, data: ModalityData[ImageItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return ImageProcessorItems(None) + if self._is_empty(data): return None @@ -441,6 +453,9 @@ class MultiModalDataParser: self, data: ModalityData[VideoItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return VideoProcessorItems(None) + if self._is_empty(data): return None diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index e5db356b6..7471bfcb4 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1075,7 +1075,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. 
""" mm_items = self.data_parser.parse_mm_data(mm_data) - for modality, items in mm_items.items(): self.validate_num_items(modality, len(items)) @@ -1436,10 +1435,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ] for modality, items_is_cached in mm_is_cached.items() } - mm_missing_data = { - modality: [mm_data_items[modality][idx] for idx in idxs] - for modality, idxs in mm_missing_idxs.items() - } + mm_missing_data = {} + for modality, idxs in mm_missing_idxs.items(): + missing_modality_data = [] + for idx in idxs: + data = mm_data_items[modality][idx] + if data is None: + raise ValueError( + f"Cache miss for {modality} at index {idx} " + f"but data is not provided.") + else: + missing_modality_data.append(data) + mm_missing_data[modality] = missing_modality_data return self._to_mm_items(mm_missing_data) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 77c9a012b..42fff2f07 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -179,6 +179,7 @@ class CudaPlatformBase(Platform): cache_config.block_size = 128 logger.info("Forcing kv cache block size to 128 for " "CUTLASS_MLA backend.") + if use_flashinfer_mla and cache_config.block_size not in [32, 64]: cache_config.block_size = 64 logger.info( @@ -541,7 +542,9 @@ class CudaPlatformBase(Platform): attention_backend = "FLASHMLA" # Only FlashMLA and CUTLASS_MLA support fp8 - if attention_backend in ["FLASHMLA", "CUTLASS_MLA"]: + if attention_backend in [ + "FLASHMLA", "CUTLASS_MLA", "FLASHINFER_MLA" + ]: supported = True else: supported = (not fp8_attention) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2852d16ec..fd19d33ca 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -75,6 +75,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( eagle="EAGLEConfig", speculators="SpeculatorsConfig", nemotron="NemotronConfig", + olmo3="Olmo3Config", ovis="OvisConfig", ultravox="UltravoxConfig", step3_vl="Step3VLConfig", @@ -678,20 +679,21 @@ def get_hf_file_to_dict(file_name: str, @cache -def get_pooling_config(model: str, revision: Optional[str] = 'main'): +def get_pooling_config(model: str, + revision: Optional[str] = 'main') -> Optional[dict]: """ This function gets the pooling and normalize config from the model - only applies to sentence-transformers models. Args: - model (str): The name of the Hugging Face model. - revision (str, optional): The specific version - of the model to use. Defaults to 'main'. + model: The name of the Hugging Face model. + revision: The specific version of the model to use. + Defaults to 'main'. Returns: - dict: A dictionary containing the pooling - type and whether normalization is used. + A dictionary containing the pooling type and whether + normalization is used, or None if no pooling configuration is found. 
""" modules_file_name = "modules.json" diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index cdae59ccc..ca0d5def7 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.moonvit import MoonViTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config +from vllm.transformers_utils.configs.olmo3 import Olmo3Config from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig @@ -45,6 +46,7 @@ __all__ = [ "NemotronConfig", "NemotronHConfig", "Nemotron_Nano_VL_Config", + "Olmo3Config", "OvisConfig", "SpeculatorsConfig", "UltravoxConfig", diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index 767c4ddae..d5ca2c7b4 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -74,10 +74,10 @@ class JAISConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - scale_attn_by_inverse_layer_idx (`bool`, *optional*, - defaults to `False`): - Whether to additionally scale attention weights by - `1 / layer_idx + 1`. + scale_attn_by_inverse_layer_idx + (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights + by `1 / layer_idx + 1`. reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): Whether to scale keys (K) prior to computing attention (dot-product) diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py new file mode 100644 index 000000000..874507db4 --- /dev/null +++ b/vllm/transformers_utils/configs/olmo3.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from transformers.configuration_utils import PretrainedConfig + + +class Olmo3Config(PretrainedConfig): + + model_type = "olmo3" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=50304, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + use_cache=True, + pad_token_id=1, + bos_token_id=None, + eos_token_id=50279, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + rms_norm_eps=1e-5, + sliding_window=4096, + layer_types=None, + **kwargs, + ): + # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM + # in vLLM. 
+ if "architectures" not in kwargs: + kwargs["architectures"] = ["Olmo2ForCausalLM"] + elif "Olmo3ForCausalLM" in kwargs["architectures"]: + kwargs["architectures"].remove("Olmo3ForCausalLM") + kwargs["architectures"].append("Olmo2ForCausalLM") + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + self.rms_norm_eps = rms_norm_eps + + self.sliding_window = sliding_window + self.layer_types = layer_types + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" if (i + 1) % 4 != 0 else "full_attention" + for i in range(self.num_hidden_layers) + ] diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index e67479516..aaf31d84d 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -37,10 +37,6 @@ class UltravoxConfig(transformers.PretrainedConfig): The initialization value for the layer normalization. projector_act (`str`, *optional*, defaults to `"swiglu"`): The activation function used by the multimodal projector. - text_model_lora_config (`LoraConfigSimplified`, *optional*): - The LoRA configuration for finetuning the text model. - audio_model_lora_config (`LoraConfigSimplified`, *optional*): - The LoRA configuration for finetuning the audio model. projector_ln_mid (`bool`, *optional*, defaults to `False`): Whether to apply layer normalization at the middle of the projector or at the end. Versions v0.4.1 and below diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index 5896bde31..d1d117b4e 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -25,6 +25,7 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import math +from typing import Any import torch import torchvision.transforms as T @@ -178,17 +179,15 @@ class DeepseekVLV2Processor(ProcessorMixin): prompt: str, images: list[Image.Image], inference_mode: bool = True, - **kwargs, + **kwargs: Any, ): """ Args: prompt (str): the formatted prompt; - conversations (list[dict]): conversations with a list of messages; images (list[ImageType]): the list of images; inference_mode (bool): if True, then remove the last eos token; - system_prompt (str): the system prompt; - **kwargs: + **kwargs: Additional keyword arguments. 
Returns: outputs (BaseProcessorOutput): the output of the processor, @@ -259,7 +258,7 @@ class DeepseekVLV2Processor(ProcessorMixin): text: str, images: list[Image.Image], inference_mode: bool = True, - **kwargs, + **kwargs: Any, ): """ diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py index 357c180ed..b7bee1974 100644 --- a/vllm/transformers_utils/runai_utils.py +++ b/vllm/transformers_utils/runai_utils.py @@ -33,7 +33,6 @@ def list_safetensors(path: str = "") -> list[str]: Args: path: The object storage path to list from. - allow_pattern: A list of patterns of which files to pull. Returns: list[str]: List of full object storage paths allowed by the pattern @@ -54,8 +53,7 @@ class ObjectStorageModel: dir: The temporary created directory. Methods: - pull_files(): Pull model from object storage to the temporary - directory. + pull_files(): Pull model from object storage to the temporary directory. """ def __init__(self) -> None: diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 62c87c167..d17c1afe9 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch -from typing import Optional +from typing import Any, Optional from vllm.utils import PlaceholderModule @@ -26,7 +26,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]: ] -def glob(s3=None, +def glob(s3: Optional[Any] = None, path: str = "", allow_pattern: Optional[list[str]] = None) -> list[str]: """ @@ -51,7 +51,7 @@ def glob(s3=None, def list_files( - s3, + s3: Any, path: str, allow_pattern: Optional[list[str]] = None, ignore_pattern: Optional[list[str]] = None diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7984f1958..f13381ecd 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2082,6 +2082,7 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, return await task(*args, **kwargs) +@lru_cache def supports_kw( callable: Callable[..., object], kw_name: str, diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index 12233af05..74eb9ae9d 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -209,7 +209,8 @@ class GDNAttentionMetadataBuilder( # prepare tensors for cudagraph if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0 - and num_spec_decodes <= self.decode_cudagraph_max_bs): + and num_spec_decodes <= self.decode_cudagraph_max_bs + and m.num_actual_tokens <= self.decode_cudagraph_max_bs): num_total_tokens = self.vllm_config.pad_for_cudagraph( m.num_actual_tokens) batch_size = num_total_tokens // (self.num_spec + 1) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 036a281f1..a990cb2f1 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -584,7 +584,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): window_left=self._global_hyperparameters.window_left, logits_soft_cap=self._global_hyperparameters.logits_soft_cap, q_data_type=self.model_config.dtype, - kv_data_type=self.kv_cache_spec.dtype, ) # Prepare context prefills @@ -605,7 +604,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): logits_soft_cap=self._global_hyperparameters. 
logits_soft_cap, q_data_type=self.model_config.dtype, - kv_data_type=self.kv_cache_spec.dtype, ) prefill.prefill_main = self._fi_prefill_main diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index 71eb9e0ce..701248670 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -6,8 +6,7 @@ from typing import Optional, Union import torch from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla -from vllm.attention.backends.abstract import (AttentionLayer, AttentionType, - is_quantized_kv_cache) +from vllm.attention.backends.abstract import AttentionLayer, AttentionType from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, @@ -69,11 +68,9 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): "are not implemented for " "FlashInferMLAImpl") - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "FlashInferMLA V1 with FP8 KV cache not yet supported") - self._workspace_buffer = g_fi_workspace + self.bmm1_scale: Optional[float] = None + self.bmm2_scale: Optional[float] = None def _forward_decode( self, @@ -92,6 +89,12 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): # trtllm API requires extra dimension q_len_per_request for MTP q = q.unsqueeze(1) + if self.bmm1_scale is None: + self.bmm1_scale = (layer._q_scale_float * layer._k_scale_float * + self.scale) + if self.bmm2_scale is None: + self.bmm2_scale = layer._v_scale_float + o = trtllm_batch_decode_with_kv_cache_mla( query=q, kv_cache=kv_c_and_k_pe_cache.unsqueeze(1), @@ -102,7 +105,8 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): block_tables=attn_metadata.decode.block_table, seq_lens=attn_metadata.decode.seq_lens, max_seq_len=attn_metadata.max_seq_len, - bmm1_scale=self.scale, + bmm1_scale=self.bmm1_scale, + bmm2_scale=self.bmm2_scale, ) # TODO: Return LSE pending support from Flashinfer API: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 27ee146c4..995e70385 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -159,6 +159,9 @@ class EngineCore: self.request_block_hasher = get_request_block_hasher( block_size, caching_hash_fn) + self.step_fn = (self.step if self.batch_queue is None else + self.step_with_batch_queue) + def _initialize_kv_caches( self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]: start = time.time() @@ -331,7 +334,8 @@ class EngineCore: model_executed = False if self.scheduler.has_requests(): scheduler_output = self.scheduler.schedule() - future = self.model_executor.execute_model(scheduler_output) + future = self.model_executor.execute_model(scheduler_output, + non_block=True) batch_queue.appendleft( (future, scheduler_output)) # type: ignore[arg-type] @@ -534,9 +538,6 @@ class EngineCoreProc(EngineCore): assert addresses.coordinator_input is not None logger.info("Waiting for READY message from DP Coordinator...") - self.step_fn = (self.step if self.batch_queue is None else - self.step_with_batch_queue) - # Mark the startup heap as static so that it's ignored by GC. # Reduces pause times of oldest generation collections. 
gc.collect() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 605bedaf1..bb0f37c6e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -245,8 +245,8 @@ class InprocClient(EngineCoreClient): self.engine_core = EngineCore(*args, **kwargs) def get_output(self) -> EngineCoreOutputs: - outputs, _ = self.engine_core.step() - return outputs.get(0) or EngineCoreOutputs() + outputs, _ = self.engine_core.step_fn() + return outputs and outputs.get(0) or EngineCoreOutputs() def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return self.engine_core.get_supported_tasks() diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index a54a98adf..f3fad15b7 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -260,10 +260,10 @@ class Processor: else: # NOTE: engine_level_backend must be "auto" here, because we have # checked supported_backends above. - # "auto" is an opt-in to opinionated behavior where we try to - # choose a backend based on request contents. This is not the - # default as it is less predictable and subject to change - # between releases as feature support changes. + # In this mode, we set opinionated defaults based on what we think + # will satisfy the most use cases without having to worry about + # this setting. We include fallback behavior here, but not with any + # other setting where a specific backend was specified. try: validate_xgrammar_grammar(params) params.guided_decoding.backend = "xgrammar" diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 68408a0b8..625017d52 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from concurrent.futures import Future -from typing import Callable, Optional, Union +from typing import Any, Callable, Optional, Union import torch import torch.distributed as dist @@ -14,6 +14,7 @@ from vllm.executor.uniproc_executor import ( # noqa from vllm.executor.uniproc_executor import ( # noqa UniProcExecutor as UniProcExecutorV0) from vllm.utils import resolve_obj_by_qualname +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput @@ -86,12 +87,22 @@ class Executor(ExecutorBase): def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]: return self.collective_rpc("get_kv_cache_spec") + def collective_rpc(self, + method: Union[str, Callable], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict] = None, + non_block: bool = False) -> list[Any]: + raise NotImplementedError + def execute_model( self, - scheduler_output, + scheduler_output: SchedulerOutput, + non_block: bool = False, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: output = self.collective_rpc("execute_model", - args=(scheduler_output, )) + args=(scheduler_output, ), + non_block=non_block) return output[0] def execute_dummy_batch(self) -> None: diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 2f0e5aa38..f566c9aee 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -11,7 +11,7 @@ import weakref from concurrent.futures import Future, ThreadPoolExecutor from dataclasses import dataclass from enum import Enum, auto -from functools import partial +from functools import cached_property, partial from 
multiprocessing.connection import Connection from multiprocessing.process import BaseProcess from multiprocessing.synchronize import Lock as LockType @@ -37,6 +37,7 @@ from vllm.multimodal.cache import worker_receiver_cache_from_config from vllm.utils import (decorate_logs, get_distributed_init_method, get_loopback_ip, get_mp_context, get_open_port, set_process_title) +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.executor.utils import get_and_update_mm_cache from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds, @@ -174,9 +175,9 @@ class MultiprocExecutor(Executor): def execute_model( self, - scheduler_output, + scheduler_output: SchedulerOutput, + non_block: bool = False, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: - non_block = self.max_concurrent_batches > 1 if not self.has_connector: # get output only from a single worker (output_rank) @@ -328,7 +329,7 @@ class MultiprocExecutor(Executor): self.collective_rpc("check_health", timeout=10) return - @property + @cached_property def max_concurrent_batches(self) -> int: if self.scheduler_config.async_scheduling: return 2 @@ -632,7 +633,8 @@ class WorkerProc: result = (WorkerProc.ResponseStatus.FAILURE, str(output)) else: result = (WorkerProc.ResponseStatus.SUCCESS, output) - self.worker_response_mq.enqueue(result) + if (response_mq := self.worker_response_mq) is not None: + response_mq.enqueue(result) def handle_output(self, output: Any): """Handles output from the worker. If async scheduling is enabled, diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index 8394ae788..59c9b5662 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -66,11 +66,13 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): def execute_model( self, scheduler_output: SchedulerOutput, + non_block: bool = False, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: """Execute the model on the Ray workers. Args: scheduler_output: The scheduler output to execute. + non_block: If True, the method will return a Future. Returns: The model runner output. @@ -84,7 +86,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): if not self.has_connector: # Get output only from a single worker (output_rank) # When PP is not used, we block here until the result is available. 
- if self.max_concurrent_batches == 1: + if not non_block: return refs[0].get() # When PP is used, we return a FutureWrapper immediately so that @@ -92,7 +94,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): return FutureWrapper(refs) # Get output from all workers when connector is present - if self.max_concurrent_batches == 1: + if not non_block: # Block and get results from all workers outputs = [ref.get() for ref in refs] return self.kv_output_aggregator.aggregate(outputs) @@ -106,4 +108,3 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): if reconfig_request.new_data_parallel_rank == \ ReconfigureRankType.SHUTDOWN_CURRENT_RANK: self.shutdown() - return \ No newline at end of file diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index f2ebd5e10..01d5f0525 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -63,8 +63,7 @@ class LoRAModelRunnerMixin: def _set_active_loras(self, prompt_lora_mapping: tuple[int, ...], token_lora_mapping: tuple[int, ...], lora_requests: set[LoRARequest]) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() # Set is_prefill to True, so we always use the SGMV kernels on # non-cuda platforms. @@ -75,6 +74,11 @@ class LoRAModelRunnerMixin: is_prefill=True) self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + def _ensure_lora_enabled(self) -> None: + if not hasattr(self, "lora_manager"): + raise RuntimeError( + "LoRA is not enabled. Use --enable-lora to enable LoRA.") + def set_active_loras(self, input_batch: InputBatch, num_scheduled_tokens: np.ndarray) -> None: @@ -172,21 +176,17 @@ class LoRAModelRunnerMixin: self.lora_manager.remove_all_adapters() def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() return self.lora_manager.add_adapter(lora_request) def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() return self.lora_manager.remove_adapter(lora_id) def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() return self.lora_manager.pin_adapter(lora_id) def list_loras(self) -> set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() return self.lora_manager.list_adapters()
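
A side note on the final lora_model_runner_mixin.py hunk: with the old `if not self.lora_manager:` guard, an instance that never loaded a LoRA model typically has no `lora_manager` attribute at all, so the check itself would surface as an AttributeError before the intended RuntimeError could be raised; switching to `hasattr` appears to make the actionable message reachable. Below is a minimal, self-contained sketch of the resulting pattern — the class is a hypothetical stand-in for illustration only, not the real vLLM mixin.

# Minimal sketch of the guard-helper pattern from the hunk above. The class and
# manager are hypothetical stand-ins, not vLLM's real LoRAModelRunnerMixin; the
# point is that hasattr() lets the RuntimeError fire when lora_manager was never
# assigned, instead of an AttributeError.
class LoRAMixinSketch:

    def _ensure_lora_enabled(self) -> None:
        if not hasattr(self, "lora_manager"):
            raise RuntimeError(
                "LoRA is not enabled. Use --enable-lora to enable LoRA.")

    def add_lora(self, lora_request) -> bool:
        self._ensure_lora_enabled()
        return self.lora_manager.add_adapter(lora_request)


# Usage: calling add_lora on an instance that never set lora_manager now fails
# with the intended, actionable message.
if __name__ == "__main__":
    try:
        LoRAMixinSketch().add_lora(object())
    except RuntimeError as e:
        print(e)  # -> "LoRA is not enabled. Use --enable-lora to enable LoRA."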