# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random

import pytest
import torch

from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.transformers_utils.model_arch_config_convertor import (
    ModelArchConfigConvertorBase,
)
from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla

skip_unsupported = pytest.mark.skipif(
    not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
    # Testing is supported on Ampere and Ada Lovelace devices.
    # Note: For devices with SM < 90, batch invariance does not support CUDA Graphs.
    reason="Requires CUDA and >= Ampere (SM80)",
)

DEFAULT_MODEL = "Qwen/Qwen3-1.7B"
TEST_MODEL = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL)

BACKENDS: list[str] = [
    "FLASH_ATTN",
    "TRITON_ATTN",
]

# FlashInfer temporarily disabled due to invariant CTA sizes.
# See FlashInfer issue #2424.
# if has_flashinfer():
#     BACKENDS.append("FLASHINFER")

# Only run MLA backends when the requested test model is itself an MLA model.
if os.getenv("VLLM_TEST_MODEL"):
    config = get_config(TEST_MODEL, trust_remote_code=False)
    arch_convertor = ModelArchConfigConvertorBase(config, config.get_text_config())
    if arch_convertor.is_deepseek_mla():
        BACKENDS = ["TRITON_MLA"]
        if flash_attn_supports_mla():
            BACKENDS.append("FLASH_ATTN_MLA")
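

# Illustrative sketch only (not part of the original file): how a test might
# consume `skip_unsupported` and `BACKENDS`. VLLM_ATTENTION_BACKEND is the env
# var vLLM reads to pick an attention backend; the test name and body below
# are placeholders, not a real engine invocation.
@skip_unsupported
@pytest.mark.parametrize("backend", BACKENDS)
def test_backend_env_smoke(backend: str, monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
    assert os.environ["VLLM_ATTENTION_BACKEND"] == backend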


def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
    """Build a prompt from a random template, padded to roughly the target size.

    Uses a mix of common English text patterns so generation actually produces
    varied tokens rather than degenerate repetition.
    """
    prompt_templates = [
        # Question-answer style
        "Question: What is the capital of France?\nAnswer: The capital of France is",
        "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
        "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
        # Story/narrative style
        "Once upon a time in a distant galaxy, there lived",
        "The old man walked slowly down the street, remembering",
        "In the year 2157, humanity finally discovered",
        # Technical/code style
        "To implement a binary search tree in Python, first we need to",
        "The algorithm works by iterating through the array and",
        "Here's how to optimize database queries using indexing:",
        # Factual/informative style
        "The Renaissance was a period in European history that",
        "Climate change is caused by several factors including",
        "The human brain contains approximately 86 billion neurons which",
        # Conversational style
        "I've been thinking about getting a new laptop because",
        "Yesterday I went to the store and bought",
        "My favorite thing about summer is definitely",
    ]

    # Pick a random template.
    base_prompt = random.choice(prompt_templates)

    if max_words < min_words:
        max_words = min_words
    target_words = random.randint(min_words, max_words)

    if target_words > 50:
        # For longer prompts, prepend repeated filler context.
        padding_text = (
            " This is an interesting topic that deserves more explanation. "
            # TODO: Update to * (target_words // 10) to better align with word ratio
            * (target_words // 50)
        )
        base_prompt = padding_text + base_prompt

    return base_prompt
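

# Illustrative helper (not part of the original file): seeding `random` makes
# the template choice and padding length reproducible, which is useful when two
# runs must see identical prompts. The name `_seeded_prompts` is hypothetical.
def _seeded_prompts(n: int, seed: int = 0) -> list[str]:
    rng_state = random.getstate()
    random.seed(seed)
    try:
        return [_random_prompt() for _ in range(n)]
    finally:
        # Restore the global RNG so other tests are unaffected.
        random.setstate(rng_state)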


def _extract_step_logprobs(request_output):
    """Return (per-step logprob tensor, token ids) for the first completion.

    Returns (None, None) when the request output carries no logprobs, e.g.
    because SamplingParams did not request them.
    """
    if getattr(request_output, "outputs", None):
        inner = request_output.outputs[0]
        if hasattr(inner, "logprobs") and inner.logprobs is not None:
            # Each step maps token_id -> Logprob; pick the sampled token's entry.
            t = torch.tensor(
                [
                    inner.logprobs[i][tid].logprob
                    for i, tid in enumerate(inner.token_ids)
                ],
                dtype=torch.float32,
            )
            return t, inner.token_ids

    return None, None
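

# Illustrative sketch only (not part of the original file): a batch-invariance
# check can compare per-step logprobs from two runs of the same prompt. `out_a`
# and `out_b` stand for hypothetical vLLM RequestOutput objects.
def _assert_logprobs_bitwise_equal(out_a, out_b) -> None:
    logprobs_a, ids_a = _extract_step_logprobs(out_a)
    logprobs_b, ids_b = _extract_step_logprobs(out_b)
    assert logprobs_a is not None and logprobs_b is not None, (
        "Request logprobs via SamplingParams so per-step logprobs are returned."
    )
    assert list(ids_a) == list(ids_b), "Sampled token ids diverged between runs."
    # Batch invariance demands bitwise-identical values, not torch.allclose.
    assert torch.equal(logprobs_a, logprobs_b)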


def is_device_capability_below_90() -> bool:
    """Return True when the current device is older than SM90 (pre-Hopper)."""
    return not current_platform.has_device_capability(90)