[Kernel]: Cutlass 2:4 Sparsity + FP8/Int8 Quant Support (#10995)
Co-authored-by: Faraz Shahsavan <faraz.shahsavan@gmail.com> Co-authored-by: ilmarkov <markovilya197@gmail.com> Co-authored-by: Rahul Tuli <rahul@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
This commit is contained in:
@@ -1,14 +1,21 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Test settings. MAX_MODEL_LEN is fixed; each of the others is read from the
# environment variable of the same name, falling back to the default shown.
MAX_MODEL_LEN = 1024
|
||||
MODEL_NAME = os.environ.get("MODEL_NAME",
|
||||
"robertgshaw2/zephyr-7b-beta-channelwise-gptq")
|
||||
REVISION = os.environ.get("REVISION", "main")
|
||||
QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
|
||||
# Stays a string here; the skipif condition below converts it with int().
MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "89")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.has_device_capability(int(MIN_CAPABILITY)),
|
||||
reason="Current system does not have minimum capability.")
|
||||
def test_weight_loading(vllm_runner):
|
||||
"""
|
||||
Test parameter weight loading with tp>1.
|
||||
|
||||
Reference in New Issue
Block a user