[W8A8 Block Linear Refactor][2/N] Remove W8A8Fp8BlockLinearOp and adopt Fp8 block linear kernel selections. (#33892)
Signed-off-by: maral <maralbahari.98@gmail.com> Signed-off-by: Maral <maralbahari.98@gmail.com>
This commit is contained in:
@@ -12,6 +12,7 @@ import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
+from vllm.config.model import ModelConfig
 
 
 @pytest.fixture(scope="function", autouse=True)
@@ -46,7 +47,7 @@ def _snapshot_download_or_skip(model_id: str) -> str:
     not is_quant_method_supported("modelopt"),
     reason="ModelOpt FP8 is not supported on this GPU type.",
 )
-def test_modelopt_fp8_checkpoint_setup(vllm_runner):
+def test_modelopt_fp8_checkpoint_setup(default_vllm_config, vllm_runner):
     """Test ModelOpt FP8 checkpoint loading and structure validation."""
     # TODO: provide a small publicly available test checkpoint
     model_path = (
@@ -61,6 +62,8 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
         "This test requires a local ModelOpt FP8 checkpoint."
     )
 
+    # Set model config as model_config.dtype is required in ModelOptFp8LinearMethod.
+    default_vllm_config.model_config = ModelConfig()
     with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
 
         def check_model(model):
@@ -120,11 +123,13 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
     not is_quant_method_supported("modelopt"),
     reason="ModelOpt FP8 is not supported on this GPU type.",
 )
-def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
+def test_modelopt_fp8_pc_pt_checkpoint_setup(default_vllm_config, vllm_runner):
     """Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup."""
     model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
     model_path = _snapshot_download_or_skip(model_id)
 
+    # Set model config as model_config.dtype is required in ModelOptFp8LinearMethod.
+    default_vllm_config.model_config = ModelConfig()
     with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
 
         def check_model(model):
@@ -181,11 +186,13 @@ def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
     not is_quant_method_supported("modelopt"),
     reason="ModelOpt FP8 is not supported on this GPU type.",
 )
-def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
+def test_modelopt_fp8_pb_wo_checkpoint_setup(default_vllm_config, vllm_runner):
     """Test ModelOpt FP8_PB_WO checkpoint setup."""
     model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
    model_path = _snapshot_download_or_skip(model_id)
 
+    # Set model config as model_config.dtype is required in ModelOptFp8LinearMethod.
+    default_vllm_config.model_config = ModelConfig()
     with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
 
         def check_model(model):
Reference in New Issue
Block a user