[ROCm][Quantization] Add asymmetric INT8 quantization support to TritonInt8ScaledMMLinearKernel (#38501)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
@@ -13,6 +13,7 @@ import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
@@ -89,9 +90,40 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
return results
|
||||
|
||||
|
||||
def _check_rocm_gpu_arch_requirement(eval_config):
    """Skip the test if the model requires a ROCm GPU arch not present.

    Model YAML configs can specify::

        required_gpu_arch:
          - gfx942
          - gfx950

    The check only applies on ROCm. On other platforms (e.g. CUDA) the
    field is ignored so that shared config files work for both NVIDIA and
    AMD CI pipelines.
    """
    required_archs = eval_config.get("required_gpu_arch")

    # Nothing to enforce when the config declares no arch requirement,
    # or when we are not on a ROCm platform at all.
    if not required_archs or not current_platform.is_rocm():
        return

    # Deferred import: only meaningful (and safe) on ROCm builds of vllm.
    from vllm.platforms.rocm import _GCN_ARCH  # noqa: E402

    # NOTE(review): `arch in _GCN_ARCH` is a membership/substring test —
    # presumably _GCN_ARCH is the detected arch string (possibly with
    # feature suffixes); confirm against vllm.platforms.rocm.
    satisfied = any(arch in _GCN_ARCH for arch in required_archs)
    if not satisfied:
        pytest.skip(
            f"Model requires GPU arch {required_archs}, "
            f"but detected arch is '{_GCN_ARCH}'"
        )
|
||||
|
||||
|
||||
def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
|
||||
|
||||
_check_rocm_gpu_arch_requirement(eval_config)
|
||||
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
|
||||
rtol = eval_config.get("rtol", DEFAULT_RTOL)
|
||||
|
||||
Reference in New Issue
Block a user