[ { "test_name": "serving_llama8B_bf16_tp1_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_bf16_tp2_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_bf16_tp4_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_bf16_tp1_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000 } }, { "test_name": "serving_llama8B_bf16_tp2_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000 } }, { "test_name": "serving_llama8B_bf16_tp4_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "num_prompts": 1000 } }, { "test_name": "serving_llama8B_int8_tp1_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_int8_tp2_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_int8_tp4_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_int8_tp1_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000 } }, { "test_name": "serving_llama8B_int8_tp2_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000 } }, { "test_name": "serving_llama8B_int8_tp4_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000 } }, { "test_name": "serving_llama8B_int4_tp1_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_int4_tp2_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_int4_tp4_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } }, { "test_name": "serving_llama8B_int4_tp1_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000 } }, { "test_name": "serving_llama8B_int4_tp2_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000 } }, { "test_name": "serving_llama8B_int4_tp4_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000 } } ]