Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -8,19 +8,23 @@ import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
GeluAndMul, MulAndSilu,
NewGELU, QuickGELU,
SiluAndMul, SwigluOAIAndMul)
from vllm.model_executor.layers.activation import (
FastGELU,
FatreluAndMul,
GeluAndMul,
MulAndSilu,
NewGELU,
QuickGELU,
SiluAndMul,
SwigluOAIAndMul,
)
from vllm.platforms import current_platform
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing
D = [512, 13824] # Arbitrary values for testing
SEEDS = [0]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize(
@@ -73,24 +77,19 @@ def test_act_and_mul(
out = layer(x)
ref_out = layer.forward_native(x)
if activation == "swigluoai_and_mul":
rtol = {
#For fp16, change the relative tolerance from 1e-3 to 2e-3
torch.float16:
2e-3,
torch.bfloat16:
2e-2,
torch.float:
1.3e-6
# For fp16, change the relative tolerance from 1e-3 to 2e-3
torch.float16: 2e-3,
torch.bfloat16: 2e-2,
torch.float: 1.3e-6,
}
def _get_rtol(output) -> float:
return rtol[output.dtype]
torch.testing.assert_close(out,
ref_out,
atol=get_default_atol(out),
rtol=_get_rtol(out))
torch.testing.assert_close(
out, ref_out, atol=get_default_atol(out), rtol=_get_rtol(out)
)
else:
# The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
# equivalent to the native PyTorch implementations, so we can do exact
@@ -98,7 +97,7 @@ def test_act_and_mul(
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, ))
output_shape = x.shape[:-1] + (d,)
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
if activation == "fatrelu":
opcheck(fn, (out, x, threshold))
@@ -108,9 +107,14 @@ def test_act_and_mul(
opcheck(fn, (out, x))
@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
(NewGELU, torch.ops._C.gelu_new),
(QuickGELU, torch.ops._C.gelu_quick)])
@pytest.mark.parametrize(
"activation",
[
(FastGELU, torch.ops._C.gelu_fast),
(NewGELU, torch.ops._C.gelu_new),
(QuickGELU, torch.ops._C.gelu_quick),
],
)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@@ -132,10 +136,9 @@ def test_activation(
fn = activation[1]
out = layer(x)
ref_out = layer.forward_native(x)
torch.testing.assert_close(out,
ref_out,
atol=get_default_atol(out),
rtol=get_default_rtol(out))
torch.testing.assert_close(
out, ref_out, atol=get_default_atol(out), rtol=get_default_rtol(out)
)
out = torch.empty_like(x)
opcheck(fn, (out, x))

View File

@@ -24,9 +24,7 @@ NUM_TOKENS_HIDDEN_SIZES = [
ADD_RESIDUAL = [False, True]
SCALE_UBS = [True, False]
SEEDS = [0]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
EPS = 1e-6
@@ -34,13 +32,12 @@ EPS = 1e-6
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
return torch.as_tensor(x, dtype=torch.float32, device='cuda')
return torch.as_tensor(x, dtype=torch.float32, device="cuda")
def ref_rms_norm(rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor]) \
-> tuple[torch.Tensor, Optional[torch.Tensor]]:
def ref_rms_norm(
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
if residual is not None:
residual = residual.clone()
out, residual = rms_norm_layer.forward_native(x, residual)
@@ -50,12 +47,13 @@ def ref_rms_norm(rms_norm_layer: RMSNorm,
return out, residual
def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor]) \
-> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
def ref_dynamic_per_token_quant(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
if scale_ub is not None:
assert quant_dtype == torch.float8_e4m3fn
@@ -64,9 +62,9 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm,
# Quant
if quant_dtype == torch.float8_e4m3fn:
torch_out, scales = ops.scaled_fp8_quant(torch_out,
scale_ub=scale_ub,
use_per_token_if_dynamic=True)
torch_out, scales = ops.scaled_fp8_quant(
torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True
)
else:
assert quant_dtype == torch.int8
torch_out, scales = ops.scaled_int8_quant(torch_out)
@@ -74,38 +72,41 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm,
return torch_out, scales, residual
def ref_impl(rms_norm_layer: RMSNorm,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor]) \
-> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
return ref_dynamic_per_token_quant(rms_norm_layer, x, quant_dtype,
residual, scale_ub)
def ref_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
return ref_dynamic_per_token_quant(
rms_norm_layer, x, quant_dtype, residual, scale_ub
)
def ops_dynamic_per_token_quant(weight: torch.Tensor,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor]) \
-> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
def ops_dynamic_per_token_quant(
weight: torch.Tensor,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
if residual is not None:
residual = residual.clone()
out, scales = ops.rms_norm_dynamic_per_token_quant(x, weight, EPS,
quant_dtype, scale_ub,
residual)
out, scales = ops.rms_norm_dynamic_per_token_quant(
x, weight, EPS, quant_dtype, scale_ub, residual
)
return out, scales, residual
def ops_impl(weight: torch.Tensor,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor]) \
-> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual,
scale_ub)
def ops_impl(
weight: torch.Tensor,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
@pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES)
@@ -146,12 +147,14 @@ def test_rms_norm(
residual = torch.randn_like(x) * scale if add_residual else None
if scale_ub is not None:
rms_x, _ = ref_rms_norm(layer, x, residual)
scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device='cuda')
scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device="cuda")
ref_out, ref_scales, ref_residual = \
ref_impl(layer, x, quant_dtype, residual, scale_ub)
ops_out, ops_scales, ops_residual = \
ops_impl(layer.weight, x, quant_dtype, residual, scale_ub)
ref_out, ref_scales, ref_residual = ref_impl(
layer, x, quant_dtype, residual, scale_ub
)
ops_out, ops_scales, ops_residual = ops_impl(
layer.weight, x, quant_dtype, residual, scale_ub
)
assert ref_out.dtype == quant_dtype
assert ops_out.dtype == quant_dtype
@@ -160,15 +163,18 @@ def test_rms_norm(
# big atol to account for round-off errors.
assert torch.allclose(ref_out, ops_out, atol=1)
else:
assert torch.allclose(ref_out.to(dtype=torch.float32),
ops_out.to(dtype=torch.float32))
assert torch.allclose(
ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32)
)
if add_residual:
assert torch.allclose(ref_residual, ops_residual)
output = torch.empty_like(x, dtype=quant_dtype)
scales = torch.empty((x.numel() // x.shape[-1], 1),
device=x.device,
dtype=torch.float32)
scales = torch.empty(
(x.numel() // x.shape[-1], 1), device=x.device, dtype=torch.float32
)
opcheck(torch.ops._C.rms_norm_dynamic_per_token_quant,
(output, x, layer.weight, scales, 1e-5, scale_ub, residual))
opcheck(
torch.ops._C.rms_norm_dynamic_per_token_quant,
(output, x, layer.weight, scales, 1e-5, scale_ub, residual),
)

View File

@@ -11,13 +11,22 @@ from vllm.platforms import current_platform
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
8199] # Arbitrary values for testing
HIDDEN_SIZES = [
8,
768,
769,
770,
771,
5120,
5124,
5125,
5126,
8192,
8199,
] # Arbitrary values for testing
ADD_RESIDUAL = [False, True]
SEEDS = [0]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -63,11 +72,14 @@ def test_rms_norm(
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
if residual is not None:
opcheck(torch.ops._C.fused_add_rms_norm,
(x, residual, layer.weight.data, layer.variance_epsilon))
opcheck(
torch.ops._C.fused_add_rms_norm,
(x, residual, layer.weight.data, layer.variance_epsilon),
)
else:
opcheck(torch.ops._C.rms_norm,
(out, x, layer.weight.data, layer.variance_epsilon))
opcheck(
torch.ops._C.rms_norm, (out, x, layer.weight.data, layer.variance_epsilon)
)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -98,7 +110,8 @@ def test_poly_norm(
opcheck(
torch.ops._C.poly_norm,
(out, x, layer.weight.data, layer.bias.data, layer.variance_epsilon))
(out, x, layer.weight.data, layer.bias.data, layer.variance_epsilon),
)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -144,7 +157,8 @@ def test_fused_rms_norm_quant(
if add_residual:
torch.ops._C.fused_add_rms_norm_static_fp8_quant(
out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)
out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6
)
# Unfused kernel is in-place so it goes second
# Also use a separate clone of x to avoid modifying the input
@@ -152,29 +166,32 @@ def test_fused_rms_norm_quant(
x_unfused = x_unfused_base[..., :hidden_size]
assert x_unfused.is_contiguous() != strided_input
torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused.contiguous(),
quant_scale_t)
torch.ops._C.static_scaled_fp8_quant(
out_quant, x_unfused.contiguous(), quant_scale_t
)
torch.cuda.synchronize()
torch.testing.assert_close(residual_fused,
residual,
atol=1e-2,
rtol=1e-2)
torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
opcheck(
torch.ops._C.fused_add_rms_norm_static_fp8_quant,
(out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6))
(out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6),
)
else:
torch.ops._C.rms_norm_static_fp8_quant(out_quant_fused, x, weight,
quant_scale_t, 1e-6)
torch.ops._C.rms_norm_static_fp8_quant(
out_quant_fused, x, weight, quant_scale_t, 1e-6
)
torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm,
quant_scale_t)
torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t)
opcheck(torch.ops._C.rms_norm_static_fp8_quant,
(out_quant_fused, x, weight, quant_scale_t, 1e-6))
opcheck(
torch.ops._C.rms_norm_static_fp8_quant,
(out_quant_fused, x, weight, quant_scale_t, 1e-6),
)
torch.testing.assert_close(out_quant.to(dtype=torch.float32),
out_quant_fused.to(dtype=torch.float32),
atol=1e-3,
rtol=1e-3)
torch.testing.assert_close(
out_quant.to(dtype=torch.float32),
out_quant_fused.to(dtype=torch.float32),
atol=1e-3,
rtol=1e-3,
)

View File

@@ -14,25 +14,25 @@ from vllm.platforms import current_platform
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
head_size: int, max_position_embeddings: int,
dtype: torch.dtype, device: torch.device):
def generate_test_data(
num_tokens: int,
num_q_heads: int,
num_kv_heads: int,
head_size: int,
max_position_embeddings: int,
dtype: torch.dtype,
device: torch.device,
):
"""Generate test data for given configuration."""
current_platform.seed_everything(42)
# Create 2D positions (3, num_tokens) for multimodal case
positions = torch.randint(0,
max_position_embeddings // 4, (3, num_tokens),
device=device)
positions = torch.randint(
0, max_position_embeddings // 4, (3, num_tokens), device=device
)
# Create query and key tensors
query = torch.randn(num_tokens,
num_q_heads * head_size,
dtype=dtype,
device=device)
key = torch.randn(num_tokens,
num_kv_heads * head_size,
dtype=dtype,
device=device)
query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)
return positions, query, key
@@ -59,7 +59,8 @@ MODELS_TO_TEST = [
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
reason="Qwen3-VL only available after Transformers v4.57",
)
]),
],
),
MRoPETestInfo(
model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
marks=[
@@ -67,24 +68,33 @@ MODELS_TO_TEST = [
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
reason="Qwen3-VL only available after Transformers v4.57",
)
]),
],
),
]
num_tokens_list = [11, 8192]
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Skipping CUDA/ROCm only tests.")
@pytest.mark.parametrize("model_info, model_name", [
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
for test_config in MODELS_TO_TEST
])
@pytest.mark.skipif(
not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
)
@pytest.mark.parametrize(
"model_info, model_name",
[
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
for test_config in MODELS_TO_TEST
],
)
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
dtype: torch.dtype, num_tokens: int):
def test_mrope(
model_name: str,
model_info: MRoPETestInfo,
tp_size: int,
dtype: torch.dtype,
num_tokens: int,
):
atol = model_info.atol
rtol = model_info.rtol
@@ -96,8 +106,11 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = (config.head_dim if hasattr(config, "head_dim") else
config.hidden_size // total_num_heads)
head_dim = (
config.head_dim
if hasattr(config, "head_dim")
else config.hidden_size // total_num_heads
)
is_neox_style = True
rope_theta = config.rope_theta
@@ -117,9 +130,9 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
# create q k v input tensors
# create rotary pos emb input tensors
positions, query, key = generate_test_data(num_tokens, num_heads,
num_kv_heads, head_dim,
max_position, dtype, device)
positions, query, key = generate_test_data(
num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
)
query_native, key_native = mrope_helper_class.forward_native(
positions,
@@ -137,19 +150,26 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol)
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Skipping CUDA/ROCm only tests.")
@pytest.mark.parametrize("model_info, model_name", [
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
for test_config in MODELS_TO_TEST
])
@pytest.mark.skipif(
not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
)
@pytest.mark.parametrize(
"model_info, model_name",
[
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
for test_config in MODELS_TO_TEST
],
)
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope_torch_compile_tracing(model_name: str,
model_info: MRoPETestInfo, tp_size: int,
dtype: torch.dtype, num_tokens: int):
def test_mrope_torch_compile_tracing(
model_name: str,
model_info: MRoPETestInfo,
tp_size: int,
dtype: torch.dtype,
num_tokens: int,
):
atol = model_info.atol
rtol = model_info.rtol
@@ -161,8 +181,11 @@ def test_mrope_torch_compile_tracing(model_name: str,
total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = (config.head_dim if hasattr(config, "head_dim") else
config.hidden_size // total_num_heads)
head_dim = (
config.head_dim
if hasattr(config, "head_dim")
else config.hidden_size // total_num_heads
)
is_neox_style = True
rope_theta = config.rope_theta
max_position = config.max_position_embeddings
@@ -180,16 +203,16 @@ def test_mrope_torch_compile_tracing(model_name: str,
).to(device=device)
# Generate test data
positions, query, key = generate_test_data(num_tokens, num_heads,
num_kv_heads, head_dim,
max_position, dtype, device)
positions, query, key = generate_test_data(
num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
)
# Create a wrapper that makes the in-place function appear functional
def functional_forward_cuda(pos, q, k):
"""Wrapper that converts in-place operation to functional style
CUDA Graph does not support in-place operations.
This wrapper creates working copies of the
This wrapper creates working copies of the
input tensors and modifies them.
"""
q_work = q.clone() # Create working copies
@@ -206,11 +229,13 @@ def test_mrope_torch_compile_tracing(model_name: str,
)
try:
compiled_forward_cuda = torch.compile(functional_forward_cuda,
fullgraph=True,
backend="inductor",
mode="reduce-overhead",
dynamic=False)
compiled_forward_cuda = torch.compile(
functional_forward_cuda,
fullgraph=True,
backend="inductor",
mode="reduce-overhead",
dynamic=False,
)
# Run compiled version
query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda(
@@ -225,25 +250,16 @@ def test_mrope_torch_compile_tracing(model_name: str,
mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda)
# Verify results
torch.testing.assert_close(query_compiled_cuda,
query_cuda,
atol=atol,
rtol=rtol)
torch.testing.assert_close(key_compiled_cuda,
key_cuda,
atol=atol,
rtol=rtol)
torch.testing.assert_close(query_compiled_cuda,
query_native,
atol=atol,
rtol=rtol)
torch.testing.assert_close(key_compiled_cuda,
key_native,
atol=atol,
rtol=rtol)
torch.testing.assert_close(
query_compiled_cuda, query_cuda, atol=atol, rtol=rtol
)
torch.testing.assert_close(key_compiled_cuda, key_cuda, atol=atol, rtol=rtol)
torch.testing.assert_close(
query_compiled_cuda, query_native, atol=atol, rtol=rtol
)
torch.testing.assert_close(key_compiled_cuda, key_native, atol=atol, rtol=rtol)
print("✓ forward_cuda successfully traced with torch.compile inductor")
except Exception as e:
pytest.fail(
f"forward_cuda failed to trace with torch.compile inductor: {e}")
pytest.fail(f"forward_cuda failed to trace with torch.compile inductor: {e}")

View File

@@ -8,11 +8,11 @@ from tests.kernels.utils import opcheck
from vllm._custom_ops import permute_cols
@pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)])
@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("shape", [(1, 512), (544, 4096), (67, 8192)])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
def test_permute_cols(shape, dtype):
x = torch.randn(shape, dtype=dtype).cuda()
perm = torch.randperm(x.shape[1]).to(torch.int).cuda()
opcheck(torch.ops._C.permute_cols, (x, perm))
y = permute_cols(x, perm)
torch.testing.assert_close(y, x[:, perm])
torch.testing.assert_close(y, x[:, perm])

View File

@@ -19,30 +19,33 @@ NUM_HEADS = [17] # Arbitrary values for testing
BATCH_SIZES = [5] # Arbitrary values for testing
SEQ_LENS = [11, 8192] # Arbitrary values for testing
SEEDS = [0]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
USE_KEY = [True, False]
def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
head_size: int) -> tuple[int, ...]:
def _get_flat_tensor_shape(
batch_size: int, seq_len: int, num_heads: int, head_size: int
) -> tuple[int, ...]:
return (batch_size, seq_len, num_heads * head_size)
# For testing sliced tensors
def _get_padded_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
head_size: int) -> tuple[int, ...]:
def _get_padded_tensor_shape(
batch_size: int, seq_len: int, num_heads: int, head_size: int
) -> tuple[int, ...]:
return (batch_size, seq_len, num_heads, head_size + 64)
def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
head_size: int) -> tuple[int, ...]:
def _get_batch_tensor_shape(
batch_size: int, seq_len: int, num_heads: int, head_size: int
) -> tuple[int, ...]:
return (batch_size, seq_len, num_heads, head_size)
TENSORS_SHAPES_FN = [
_get_batch_tensor_shape, _get_flat_tensor_shape, _get_padded_tensor_shape
_get_batch_tensor_shape,
_get_flat_tensor_shape,
_get_padded_tensor_shape,
]
@@ -97,41 +100,63 @@ def test_rotary_embedding(
ref_query, ref_key = rope.forward_native(positions, query, key)
out_query, out_key = rope.forward(positions, query, key)
# Compare the results.
torch.testing.assert_close(out_query,
ref_query,
atol=get_default_atol(out_query),
rtol=get_default_rtol(out_query))
torch.testing.assert_close(
out_query,
ref_query,
atol=get_default_atol(out_query),
rtol=get_default_rtol(out_query),
)
if use_key:
torch.testing.assert_close(out_key,
ref_key,
atol=get_default_atol(out_key),
rtol=get_default_rtol(out_key))
torch.testing.assert_close(
out_key,
ref_key,
atol=get_default_atol(out_key),
rtol=get_default_rtol(out_key),
)
else:
assert ref_key is None and out_key is None, \
"expected returned key to be None"
assert ref_key is None and out_key is None, "expected returned key to be None"
@torch.inference_mode()
def test_rope_module_cache():
MAX_POSITIONS = [123, 1234]
BASES = [10000, 1000000]
ROPE_SCALINGS = (None, {
"rope_type": "linear",
"factor": (1, )
}, {
"rope_type": "dynamic",
"factor": 1
})
settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
ROPE_SCALINGS, DTYPES)
ROPE_SCALINGS = (
None,
{"rope_type": "linear", "factor": (1,)},
{"rope_type": "dynamic", "factor": 1},
)
settings = (
HEAD_SIZES,
ROTARY_DIMS,
MAX_POSITIONS,
BASES,
IS_NEOX_STYLE,
ROPE_SCALINGS,
DTYPES,
)
rope_setting_id_map: dict[str, int] = {}
for setting in product(*settings):
head_size, rotary_dim, max_position, base, \
is_neox_stype, rope_scaling, dtype = setting
(
head_size,
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
dtype,
) = setting
if rotary_dim is None:
rotary_dim = head_size
rope = get_rope(head_size, rotary_dim, max_position, base,
is_neox_stype, rope_scaling, dtype)
rope = get_rope(
head_size,
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
dtype,
)
# different settings cannot share the same rope module
assert id(rope) not in rope_setting_id_map.values()
assert all(x.dtype == dtype for x in rope.buffers())
@@ -139,11 +164,25 @@ def test_rope_module_cache():
rope_setting_id_map[str(setting)] = id(rope)
for setting in product(*settings):
head_size, rotary_dim, max_position, base, \
is_neox_stype, rope_scaling, dtype = setting
(
head_size,
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
dtype,
) = setting
if rotary_dim is None:
rotary_dim = head_size
rope = get_rope(head_size, rotary_dim, max_position, base,
is_neox_stype, rope_scaling, dtype)
rope = get_rope(
head_size,
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
dtype,
)
# check that the cache takes effect
assert id(rope) == rope_setting_id_map[str(setting)]

View File

@@ -13,17 +13,20 @@ from tests.kernels.utils import opcheck
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
def rotary_embedding_opcheck(rot,
positions: torch.Tensor,
query: torch.Tensor,
key: Optional[torch.Tensor] = None):
def rotary_embedding_opcheck(
rot,
positions: torch.Tensor,
query: torch.Tensor,
key: Optional[torch.Tensor] = None,
):
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
# ops.rotary_embedding() is an in-place operation
# that updates the query and key tensors.
opcheck(torch.ops._C.rotary_embedding,
(positions, query, key, rot.head_size, cos_sin_cache,
rot.is_neox_style))
opcheck(
torch.ops._C.rotary_embedding,
(positions, query, key, rot.head_size, cos_sin_cache, rot.is_neox_style),
)
@pytest.mark.parametrize("device", ["cuda"])
@@ -34,26 +37,30 @@ def rotary_embedding_opcheck(rot,
@pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.parametrize("use_key", [True, False])
@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
def test_rotary_embedding_opcheck(dist_init, device, max_position,
is_neox_style, rotary_dim, head_size,
seq_len, use_key, head_stride_is_contiguous):
def test_rotary_embedding_opcheck(
dist_init,
device,
max_position,
is_neox_style,
rotary_dim,
head_size,
seq_len,
use_key,
head_stride_is_contiguous,
):
batch_size = 1
base = 10000
num_heads = 7
rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
is_neox_style, torch.float32)
rot = RotaryEmbedding(
head_size, rotary_dim, max_position, base, is_neox_style, torch.float32
)
positions = torch.randint(0,
max_position, (batch_size, seq_len),
device=device)
positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
head_stride = head_size + (64 if head_stride_is_contiguous else 0)
query = torch.randn(batch_size,
seq_len,
num_heads,
head_stride,
dtype=torch.float32,
device=device)
query = torch.randn(
batch_size, seq_len, num_heads, head_stride, dtype=torch.float32, device=device
)
key = torch.randn_like(query) if use_key else None
query = query[..., :head_size]
key = key[..., :head_size] if use_key else None
@@ -64,5 +71,8 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
# [..., num_heads * head_dim] shape/layout
if head_stride_is_contiguous:
rotary_embedding_opcheck(
rot, positions, query.flatten(start_dim=-2),
key.flatten(start_dim=-2) if use_key else None)
rot,
positions,
query.flatten(start_dim=-2),
key.flatten(start_dim=-2) if use_key else None,
)

View File

@@ -5,20 +5,14 @@ import torch
from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_cpu_write(device):
torch.set_default_device(device)
cpu_tensor = torch.zeros(10,
10,
device="cpu",
pin_memory=True,
dtype=torch.int32)
cpu_tensor = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32)
cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
assert cuda_view.device.type == "cuda"
@@ -40,11 +34,7 @@ def test_cpu_write(device):
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_gpu_write(device):
torch.set_default_device(device)
cpu_tensor = torch.zeros(10,
10,
device="cpu",
pin_memory=True,
dtype=torch.int32)
cpu_tensor = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32)
cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
assert cuda_view.device.type == "cuda"
@@ -59,4 +49,4 @@ def test_gpu_write(device):
assert cpu_tensor[0, 0] == 2
assert cpu_tensor[2, 3] == 4
assert cpu_tensor[4, 5] == -2
assert cpu_tensor[4, 5] == -2