[Kernels][MoE] Fix legacy_routing to use bitmatrix-based routing path (#38504)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
@@ -3,4 +3,4 @@
 model_name: openai/gpt-oss-20b
 metric_threshold: 0.568
 reasoning_effort: low
-server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN"
+server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN --tensor-parallel-size 2"
@@ -3,6 +3,6 @@
 model_name: amd/gpt-oss-20b-w-mxfp4-a-bf16
 metric_threshold: 0.568
 reasoning_effort: low
-server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN --moe-backend aiter"
+server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN --moe-backend aiter --tokenizer openai/gpt-oss-20b --tensor-parallel-size 2"
 env:
   VLLM_ROCM_USE_AITER: "1"
@@ -3,4 +3,4 @@
 model_name: amd/gpt-oss-20b-w-mxfp4-a-bf16
 metric_threshold: 0.568
 reasoning_effort: low
-server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN --moe-backend triton"
+server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN --moe-backend triton --tokenizer openai/gpt-oss-20b --tensor-parallel-size 2"
@@ -3,6 +3,6 @@
 model_name: amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8
 metric_threshold: 0.568
 reasoning_effort: low
-server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN"
+server_args: "--attention-backend ROCM_AITER_UNIFIED_ATTN --tensor-parallel-size 2"
 env:
   VLLM_ROCM_USE_AITER: "1"
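
For reference, each server_args string is what the accuracy harness hands to the server at launch, with env applied as environment variables. Assuming the harness forwards these flags to a standard vllm serve invocation (the launcher itself is not part of this diff), the updated aiter config corresponds roughly to:

    # Hedged sketch: how the updated server_args and env plausibly combine at launch.
    # The exact harness invocation is not shown in this diff; `vllm serve` is assumed.
    VLLM_ROCM_USE_AITER=1 vllm serve amd/gpt-oss-20b-w-mxfp4-a-bf16 \
        --attention-backend ROCM_AITER_UNIFIED_ATTN \
        --moe-backend aiter \
        --tokenizer openai/gpt-oss-20b \
        --tensor-parallel-size 2

The common thread across all four hunks is adding --tensor-parallel-size 2, i.e. running each accuracy check across two GPUs; the two amd/gpt-oss-20b-w-mxfp4-a-bf16 configs additionally pin --tokenizer openai/gpt-oss-20b so tokenization matches the base model.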