diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index e07d6c776..626b3b160 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.worker.workspace import init_workspace_manager # Weight shapes for different models: [num_experts, topk, hidden_size, # intermediate_size] @@ -297,6 +298,10 @@ def bench_run( def main(args): + # Initialize workspace manager (required for CUTLASS MoE kernels) + device = torch.device("cuda:0") + init_workspace_manager(device) + print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py similarity index 98% rename from benchmarks/kernels/benchmark_cutlass_fp4_moe.py rename to benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py index 7982cbb14..d6b5820a5 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.worker.workspace import init_workspace_manager WEIGHT_SHAPES_MOE = { "nvidia/DeepSeek-R1-FP4": [ @@ -441,6 +442,10 @@ def bench_run( def main(args): + # Initialize workspace manager (required for CUTLASS MoE kernels) + device = torch.device("cuda:0") + init_workspace_manager(device) + print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 9b426d8d5..4390be877 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, ) from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.worker.workspace import init_workspace_manager DEFAULT_MODELS = [ "mistralai/Mixtral-8x7B-Instruct-v0.1", @@ -364,6 +365,10 @@ def bench_run( def main(args): + # Initialize workspace manager (required for CUTLASS MoE kernels) + device = torch.device("cuda:0") + init_workspace_manager(device) + print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}")