# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys import time import numpy as np import torch from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import set_random_seed # Check if CPU MoE operations are available try: from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight except (ImportError, AttributeError) as e: print("ERROR: CPU fused MoE operations are not available on this platform.") print("This benchmark requires x86 CPU with proper vLLM CPU extensions compiled.") print( "The cpu_fused_moe kernel is typically available on Linux x86_64 " "with AVX2/AVX512." ) print(f"Import error: {e}") sys.exit(1) # ISA selection following test_cpu_fused_moe.py pattern ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] @torch.inference_mode() def main( batch_size: int, expert_num: int, hidden_size: int, intermediate_size: int, topk_num: int, use_bias: bool = False, dtype: torch.dtype = torch.bfloat16, activation: str = "silu", isa: str = "vec", seed: int = 0, iters: int = 20, ) -> None: set_random_seed(seed) # up_dim = 2 * intermediate_size for gate + up projection up_dim = 2 * intermediate_size input_tensor = torch.randn((batch_size, hidden_size), dtype=dtype) / ( 0.5 * hidden_size**0.5 ) w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / ( 0.5 * hidden_size**0.5 ) w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / ( 0.5 * intermediate_size**0.5 ) w13_bias = None w2_bias = None if use_bias: w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5) w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / ( 0.5 * hidden_size**0.5 ) router_logits = torch.randn((batch_size, expert_num), dtype=dtype) score = torch.softmax(router_logits, dim=-1, dtype=torch.float32) topk_weights, topk_ids = torch.topk(score, topk_num) topk_ids = topk_ids.to(torch.int32) packed_w13 = cpu_prepack_moe_weight(w13, isa) packed_w2 = cpu_prepack_moe_weight(w2, isa) def run_benchmark(iters: int) -> list[float]: times = [] for _ in range(iters): start_time = time.perf_counter_ns() _ = cpu_fused_moe( input_tensor, packed_w13, packed_w2, w13_bias, w2_bias, topk_weights, topk_ids, activation, isa, ) end_time = time.perf_counter_ns() times.append((end_time - start_time) / 1e6) return times # warmup run_benchmark(5) # benchmark times = run_benchmark(iters) if not times: print("No iterations to measure. Set --iters > 0.") return time_min = min(times) time_max = max(times) time_mean = np.mean(times) time_std = np.std(times) print("\tmin (ms) = ", time_min) print("\tmax (ms) = ", time_max) print("\tmean (ms) = ", time_mean) print("\tstd = ", time_std) print("\tmedian (ms) = ", np.median(times)) # Calculate throughput metrics # FLOPs estimation: 2 * batch * topk * (hidden * up_dim + intermediate * hidden) flops_per_token = ( 2 * topk_num * (hidden_size * up_dim + intermediate_size * hidden_size) ) total_flops = batch_size * flops_per_token tflops = total_flops / (time_mean * 1e-3) / 1e12 print(f"\tthroughput (TFLOP/s) = {tflops:.4f}") if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the CPU fused MoE kernel.") parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--expert-num", type=int, default=8) parser.add_argument("--hidden-size", type=int, default=2880) parser.add_argument("--intermediate-size", type=int, default=2880) parser.add_argument( "--topk-num", type=int, default=None, help="Number of experts to route each token to (default: expert_num // 2)", ) parser.add_argument("--use-bias", action="store_true") parser.add_argument( "--activation", type=str, choices=["silu", "swigluoai"], default="silu", help="Activation function", ) parser.add_argument( "--isa", type=str, choices=ISA_CHOICES, default=ISA_CHOICES[0], help=f"ISA to use (available: {ISA_CHOICES})", ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--iters", type=int, default=20) args = parser.parse_args() # Default topk_num to expert_num // 2, minimum 1 topk_num = ( args.topk_num if args.topk_num is not None else max(args.expert_num // 2, 1) ) print(args) main( batch_size=args.batch_size, expert_num=args.expert_num, hidden_size=args.hidden_size, intermediate_size=args.intermediate_size, topk_num=topk_num, use_bias=args.use_bias, dtype=torch.bfloat16, # Following test_cpu_fused_moe.py activation=args.activation, isa=args.isa, seed=args.seed, iters=args.iters, )