[Kernel]: Cutlass 2:4 Sparsity + FP8/Int8 Quant Support (#10995)
Co-authored-by: Faraz Shahsavan <faraz.shahsavan@gmail.com> Co-authored-by: ilmarkov <markovilya197@gmail.com> Co-authored-by: Rahul Tuli <rahul@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
This commit is contained in:
@@ -21,6 +21,8 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
|
||||
compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
|
||||
compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
|
||||
compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
|
||||
compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-FP8-Dynamic-testing, main, 90
|
||||
compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing, main, 90
|
||||
awq, casperhansen/mixtral-instruct-awq, main
|
||||
awq_marlin, casperhansen/mixtral-instruct-awq, main
|
||||
fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
|
||||
|
||||
@@ -26,6 +26,10 @@ do
|
||||
export QUANTIZATION=${array[0]}
|
||||
export MODEL_NAME=${array[1]}
|
||||
export REVISION=${array[2]}
|
||||
# If array length is larger than 3, then MIN_CAPABILITY is provided
|
||||
if [ ${#array[@]} -gt 3 ]; then
|
||||
export MIN_CAPABILITY=${array[3]}
|
||||
fi
|
||||
pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$?
|
||||
|
||||
if [[ $LOCAL_SUCCESS == 0 ]]; then
|
||||
|
||||
@@ -1,14 +1,21 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
MODEL_NAME = os.environ.get("MODEL_NAME",
|
||||
"robertgshaw2/zephyr-7b-beta-channelwise-gptq")
|
||||
REVISION = os.environ.get("REVISION", "main")
|
||||
QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
|
||||
MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "89")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.has_device_capability(int(MIN_CAPABILITY)),
|
||||
reason="Current system does not have minimum capability.")
|
||||
def test_weight_loading(vllm_runner):
|
||||
"""
|
||||
Test parameter weight loading with tp>1.
|
||||
|
||||
Reference in New Issue
Block a user