[Attention] MLA decode optimizations (#12528)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
This commit is contained in:
@@ -20,7 +20,7 @@ compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
|
||||
compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
|
||||
compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
|
||||
compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
|
||||
compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
|
||||
#compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
|
||||
compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-FP8-Dynamic-testing, main, 90
|
||||
compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing, main, 90
|
||||
awq, casperhansen/mixtral-instruct-awq, main
|
||||
|
||||
@@ -3,7 +3,7 @@ SUCCESS=0
|
||||
|
||||
while getopts "c:" OPT; do
|
||||
case ${OPT} in
|
||||
c )
|
||||
c )
|
||||
CONFIG="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
@@ -18,9 +18,14 @@ IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
|
||||
|
||||
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||
do
|
||||
if [[ $MODEL_CONFIG == \#* ]]; then
|
||||
echo "=== SKIPPING MODEL: $MODEL_CONFIG ==="
|
||||
continue
|
||||
fi
|
||||
|
||||
LOCAL_SUCCESS=0
|
||||
IFS=', ' read -r -a array <<< "$MODEL_CONFIG"
|
||||
|
||||
|
||||
echo "=== RUNNING MODEL: $MODEL_CONFIG ==="
|
||||
|
||||
export QUANTIZATION=${array[0]}
|
||||
|
||||
Reference in New Issue
Block a user