[ROCm][Quantization][1/N] Refactor quark_moe w_mxfp4 w/ oracle (#38774)
Signed-off-by: Bowen Bao <bowenbao@amd.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
model_name: amd/gpt-oss-20b-w-mxfp4-a-bf16
|
||||
metric_threshold: 0.568
|
||||
reasoning_effort: low
|
||||
server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN --moe-backend aiter"
|
||||
env:
|
||||
VLLM_ROCM_USE_AITER: "1"
|
||||
@@ -0,0 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
model_name: amd/gpt-oss-20b-w-mxfp4-a-bf16
|
||||
metric_threshold: 0.568
|
||||
reasoning_effort: low
|
||||
server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN --moe-backend triton"
|
||||
@@ -1,4 +1,6 @@
|
||||
# GFX950 model configurations for GPQA evaluation
|
||||
# Tests different environment variable and server argument combinations
|
||||
gpt-oss-20b-rocm-baseline.yaml
|
||||
gpt-oss-20b-rocm-mxfp4-fp8.yaml
|
||||
gpt-oss-20b-rocm-quark-mxfp4-bf16-aiter.yaml
|
||||
gpt-oss-20b-rocm-quark-mxfp4-bf16-triton.yaml
|
||||
gpt-oss-20b-rocm-quark-mxfp4-fp8-triton.yaml
|
||||
|
||||
Reference in New Issue
Block a user