Compare commits
9 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 966f933ee1 | |
| | 1a504aff6c | |
| | 01ca85bbd8 | |
| | d82b9487ea | |
| | be13281d4b | |
| | 54e084f7fb | |
| | 9e8f089d08 | |
| | 16e9064f84 | |
| | 5ac1a8e6e4 | |
@@ -8,12 +8,12 @@ import zipfile
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 
 
 def print_top_10_largest_files(zip_file):
     """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, "r") as z:
+    with zipfile.ZipFile(zip_file, 'r') as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:

@@ -28,18 +28,14 @@ def check_wheel_size(directory):
                 wheel_path = os.path.join(root, file_name)
                 wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                 if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(
-                        f"Not allowed: Wheel {wheel_path} is larger "
-                        f"({wheel_size_mb:.2f} MB) than the limit "
-                        f"({VLLM_MAX_SIZE_MB} MB)."
-                    )
+                    print(f"Not allowed: Wheel {wheel_path} is larger "
+                          f"({wheel_size_mb:.2f} MB) than the limit "
+                          f"({VLLM_MAX_SIZE_MB} MB).")
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
-                    print(
-                        f"Wheel {wheel_path} is within the allowed size "
-                        f"({wheel_size_mb:.2f} MB)."
-                    )
+                    print(f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb:.2f} MB).")
                     return 0
 
 
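Both sides of the hunks above read the size limit from the same VLLM_MAX_SIZE_MB environment variable; only the string quoting changes. A minimal sketch of the override behaviour (the 300 MB value is a made-up example, not taken from the source):

```python
import os

# Hypothetical override: the default stays 400 MB when the variable is unset.
os.environ["VLLM_MAX_SIZE_MB"] = "300"
limit_mb = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
print(limit_mb)  # -> 300
```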
@@ -22,5 +22,5 @@ with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
-    )
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:

@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:

@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.335
-  - name: "exact_match,flexible-extract"
-    value: 0.323
-limit: 1319
-num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:

@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"

@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
-model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.30
-  - name: "exact_match,flexible-extract"
-    value: 0.465
-limit: 1319
-num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
-model_name: "Qwen/Qwen2.5-1.5B-Instruct"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.54
-  - name: "exact_match,flexible-extract"
-    value: 0.59
-limit: 1319
-num_fewshot: 5

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.47
-  - name: "exact_match,flexible-extract"
-    value: 0.64
-limit: 1319
-num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:
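All of the eval configs touched above share one schema (model_name, a tasks list with per-metric ground-truth values, limit, num_fewshot). As a hedged sketch of how such a file is consumed — mirroring the yaml.safe_load call in test_lm_eval_correctness.py later in this diff; the path is just one of the configs named above:

```python
import yaml

# Load one of the lm-eval-harness config files shown in the hunks above.
with open(".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model_name"], cfg["limit"], cfg["num_fewshot"])
for task in cfg["tasks"]:
    for metric in task["metrics"]:
        print(task["name"], metric["name"], metric["value"])
```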
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml
@@ -1,6 +1,10 @@
-Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml
@@ -1,43 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-from pathlib import Path
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--config-list-file",
-        action="store",
-        help="Path to the file listing model config YAMLs (one per line)",
-    )
-    parser.addoption(
-        "--tp-size",
-        action="store",
-        default="1",
-        help="Tensor parallel size to use for evaluation",
-    )
-
-
-@pytest.fixture(scope="session")
-def config_list_file(pytestconfig, config_dir):
-    rel_path = pytestconfig.getoption("--config-list-file")
-    return config_dir / rel_path
-
-
-@pytest.fixture(scope="session")
-def tp_size(pytestconfig):
-    return pytestconfig.getoption("--tp-size")
-
-
-def pytest_generate_tests(metafunc):
-    if "config_filename" in metafunc.fixturenames:
-        rel_path = metafunc.config.getoption("--config-list-file")
-        config_list_file = Path(rel_path).resolve()
-        config_dir = config_list_file.parent
-        with open(config_list_file, encoding="utf-8") as f:
-            configs = [
-                config_dir / line.strip()
-                for line in f
-                if line.strip() and not line.startswith("#")
-            ]
-        metafunc.parametrize("config_filename", configs)
.buildkite/lm-eval-harness/run-tests.sh (new file, 59 lines)

@@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using vllm and compares to "
+    echo "precomputed baseline (measured by HF transformers.)"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c )
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
@@ -3,52 +3,67 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
 
-pytest -s -v test_lm_eval_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4
+* pytest -s test_lm_eval_correctness.py
 """
 
+import os
+from pathlib import Path
+
 import lm_eval
-import numpy as np
+import numpy
+import pytest
 import yaml
 
-RTOL = 0.08
+RTOL = 0.05
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
+
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
 
 
-def launch_lm_eval(eval_config, tp_size):
-    trust_remote_code = eval_config.get("trust_remote_code", False)
-    model_args = (
-        f"pretrained={eval_config['model_name']},"
-        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
-        f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code}"
-    )
+def launch_lm_eval(eval_config):
+    trust_remote_code = eval_config.get('trust_remote_code', False)
+    model_args = f"pretrained={eval_config['model_name']}," \
+                 f"tensor_parallel_size={TP_SIZE}," \
+                 f"add_bos_token=true," \
+                 f"trust_remote_code={trust_remote_code}"
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
-    )
+        batch_size="auto")
     return results
 
 
-def test_lm_eval_correctness_param(config_filename, tp_size):
-    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+
+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")
 
-    results = launch_lm_eval(eval_config, tp_size)
+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)
 
+    # Confirm scores match ground truth.
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
-            print(
-                f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
-            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            print(f'{task["name"]} | {metric["name"]}: '
+                  f'ground_truth={ground_truth} | measured={measured_value}')
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
 
+    # Assert at the end, print all scores even on failure for debugging.
     assert success
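Both versions of the test above end with the same relative-tolerance check; only the RTOL constant (0.08 vs 0.05) and the numpy alias differ. A minimal sketch with illustrative numbers (not taken from any real run):

```python
import numpy as np

RTOL = 0.05              # older script; the newer one loosens this to 0.08
ground_truth = 0.54      # expected metric value from the config YAML
measured_value = 0.56    # hypothetical score returned by lm_eval

# Passes because |0.54 - 0.56| is within 5% of the measured value.
assert np.isclose(ground_truth, measured_value, rtol=RTOL)
```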
@@ -65,18 +65,18 @@ def read_markdown(file):
 
 
 def results_to_json(latency, throughput, serving):
-    return json.dumps(
-        {
-            "latency": latency.to_dict(),
-            "throughput": throughput.to_dict(),
-            "serving": serving.to_dict(),
-        }
-    )
+    return json.dumps({
+        'latency': latency.to_dict(),
+        'throughput': throughput.to_dict(),
+        'serving': serving.to_dict()
+    })
 
 
 if __name__ == "__main__":
 
     # collect results
     for test_file in results_folder.glob("*.json"):
 
         with open(test_file) as f:
             raw_result = json.loads(f.read())
 
@@ -120,8 +120,7 @@ if __name__ == "__main__":
             for perc in [10, 25, 50, 75, 90, 99]:
                 # Multiply 1000 to convert the time unit from s to ms
                 raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
-                )
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
             raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
 
             # add the result to raw_result
@@ -154,27 +153,26 @@ if __name__ == "__main__":
     serving_results = pd.DataFrame.from_dict(serving_results)
     throughput_results = pd.DataFrame.from_dict(throughput_results)
 
-    raw_results_json = results_to_json(
-        latency_results, throughput_results, serving_results
-    )
+    raw_results_json = results_to_json(latency_results, throughput_results,
+                                       serving_results)
 
     # remapping the key, for visualization purpose
     if not latency_results.empty:
-        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
-            columns=latency_column_mapping
-        )
+        latency_results = latency_results[list(
+            latency_column_mapping.keys())].rename(
+                columns=latency_column_mapping)
     if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            columns=serving_column_mapping
-        )
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
     if not throughput_results.empty:
-        throughput_results = throughput_results[
-            list(throughput_results_column_mapping.keys())
-        ].rename(columns=throughput_results_column_mapping)
+        throughput_results = throughput_results[list(
+            throughput_results_column_mapping.keys())].rename(
+                columns=throughput_results_column_mapping)
 
-    processed_results_json = results_to_json(
-        latency_results, throughput_results, serving_results
-    )
+    processed_results_json = results_to_json(latency_results,
+                                             throughput_results,
+                                             serving_results)
 
     for df in [latency_results, serving_results, throughput_results]:
         if df.empty:
@@ -186,39 +184,38 @@ if __name__ == "__main__":
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
-        )
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
 
     # get markdown tables
-    latency_md_table = tabulate(
-        latency_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    serving_md_table = tabulate(
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    throughput_md_table = tabulate(
-        throughput_results, headers="keys", tablefmt="pipe", showindex=False
-    )
+    latency_md_table = tabulate(latency_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    throughput_md_table = tabulate(throughput_results,
+                                   headers='keys',
+                                   tablefmt='pipe',
+                                   showindex=False)
 
     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
-        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
-            + "performance-benchmarks-descriptions.md"
-        )
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                                "performance-benchmarks-descriptions.md")
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,
             serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json,
-        )
+            benchmarking_results_in_json_string=processed_results_json)
         f.write(results)
 
     # document benchmarking results in json
     with open(results_folder / "benchmark_results.json", "w") as f:
-        results = (
-            latency_results.to_dict(orient="records")
-            + throughput_results.to_dict(orient="records")
-            + serving_results.to_dict(orient="records")
-        )
+        results = latency_results.to_dict(
+            orient='records') + throughput_results.to_dict(
+                orient='records') + serving_results.to_dict(orient='records')
         f.write(json.dumps(results))
@@ -14,12 +14,15 @@ def main(model, cachedir):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer"
-    )
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
-    parser.add_argument(
-        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
-    )
+        description="Download and save Hugging Face tokenizer")
+    parser.add_argument("--model",
+                        type=str,
+                        required=True,
+                        help="Name of the model")
+    parser.add_argument("--cachedir",
+                        type=str,
+                        required=True,
+                        help="Directory to save the tokenizer")
 
     args = parser.parse_args()
     main(args.model, args.cachedir)
@@ -11,33 +11,33 @@ from tabulate import tabulate
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description="Parse command line arguments for summary-nightly-results script."
-    )
-    parser.add_argument(
-        "--results-folder",
-        type=str,
-        required=True,
-        help="The folder where the results are stored.",
-    )
-    parser.add_argument(
-        "--description", type=str, required=True, help="Description of the results."
-    )
+        description=
+        'Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder',
+                        type=str,
+                        required=True,
+                        help='The folder where the results are stored.')
+    parser.add_argument('--description',
+                        type=str,
+                        required=True,
+                        help='Description of the results.')
 
     args = parser.parse_args()
     return args
 
 
 def get_perf(df, method, model, metric):
 
     means = []
 
     for qps in [2, 4, 8, 16, "inf"]:
-        target = df["Test name"].str.contains(model)
-        target = target & df["Engine"].str.contains(method)
-        target = target & df["Test name"].str.contains("qps_" + str(qps))
+        target = df['Test name'].str.contains(model)
+        target = target & df['Engine'].str.contains(method)
+        target = target & df['Test name'].str.contains("qps_" + str(qps))
         filtered_df = df[target]
 
         if filtered_df.empty:
-            means.append(0.0)
+            means.append(0.)
         else:
             means.append(filtered_df[metric].values[0])
 

@@ -45,6 +45,7 @@ def get_perf(df, method, model, metric):
 
 
 def get_perf_w_std(df, method, model, metric):
+
     if metric in ["TTFT", "ITL"]:
         mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
         mean = mean.tolist()

@@ -59,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
     else:
         assert metric == "Tput"
         mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)"
-        )
+            df, method, model, "Output Tput (tok/s)")
         mean = mean.tolist()
         std = None
 

@@ -80,17 +80,18 @@ def main(args):
     # generate markdown table
     df = pd.DataFrame.from_dict(results)
 
-    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
 
     with open(args.description) as f:
         description = f.read()
 
-    description = description.format(nightly_results_benchmarking_table=md_table)
+    description = description.format(
+        nightly_results_benchmarking_table=md_table)
 
     with open("nightly_results.md", "w") as f:
         f.write(description)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     args = parse_arguments()
     main(args)
@@ -10,24 +10,15 @@ set -x
 set -o pipefail
 
 check_gpus() {
-  if command -v nvidia-smi; then
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
-  fi
-
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   if [[ $gpu_count -gt 0 ]]; then
     echo "GPU found."
   else
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
-  if command -v nvidia-smi; then
-    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
-  elif command -v amd-smi; then
-    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
-  fi
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   echo "GPU type is $gpu_type"
 }
 

@@ -99,15 +90,9 @@ kill_gpu_processes() {
 
 
   # wait until GPU memory usage smaller than 1GB
-  if command -v nvidia-smi; then
-    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-      sleep 1
-    done
-  elif command -v amd-smi; then
-    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
-      sleep 1
-    done
-  fi
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
 
   # remove vllm config file
   rm -rf ~/.config/vllm

@@ -376,7 +361,7 @@ main() {
   # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOGGING_LEVEL="WARNING"
+  export VLLM_LOG_LEVEL="WARNING"
 
   # prepare for benchmarking
   cd benchmarks || exit 1
@@ -34,8 +34,10 @@ serving_column_mapping = {
 }
 
 if __name__ == "__main__":
+
     # collect results
     for test_file in results_folder.glob("*.json"):
+
         with open(test_file) as f:
             raw_result = json.loads(f.read())
 

@@ -54,16 +56,17 @@ if __name__ == "__main__":
     serving_results = pd.DataFrame.from_dict(serving_results)
 
     if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            columns=serving_column_mapping
-        )
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
 
-    serving_md_table_with_headers = tabulate(
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
-    )
+    serving_md_table_with_headers = tabulate(serving_results,
+                                             headers='keys',
+                                             tablefmt='pipe',
+                                             showindex=False)
     # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split("\n")
-    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
+    serving_md_table_lines = serving_md_table_with_headers.split('\n')
+    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
 
     prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
     prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")

@@ -73,9 +76,10 @@ if __name__ == "__main__":
     # document results with header.
     # for those who wants to reproduce our benchmark.
     f.write(serving_md_table_with_headers)
-    f.write("\n")
+    f.write('\n')
 
     # document benchmarking results in json
     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient="records")
+
+        results = serving_results.to_dict(orient='records')
         f.write(json.dumps(results))
@@ -64,11 +64,9 @@
             "disable_log_requests": "",
             "tensor_parallel_size": 4,
             "swap_space": 16,
-            "speculative_config": {
-                "model": "turboderp/Qwama-0.5B-Instruct",
-                "num_speculative_tokens": 4,
-                "draft_tensor_parallel_size": 1
-            }
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1
         },
         "client_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
@@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
@@ -1,23 +1,23 @@
 steps:
-  - label: "Build wheel - CUDA 12.8"
+  - label: "Build wheel - CUDA 12.4"
    agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 12.6"
+  - label: "Build wheel - CUDA 12.1"
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 

@@ -31,10 +31,10 @@ steps:
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 

@@ -48,7 +48,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
   - label: "Build and publish TPU release image"

@@ -57,14 +57,12 @@ steps:
     agents:
       queue: tpu_queue_postmerge
     commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
       - "docker push vllm/vllm-tpu:nightly"
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:
       - docker-login#v3.0.0:
-          username: vllmbot
+          username: vllm
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"

@@ -84,22 +82,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
-
-  - block: "Build Neuron release image"
-    key: block-neuron-release-image-build
-    depends_on: ~
-
-  - label: "Build and publish Neuron release image"
-    depends_on: block-neuron-release-image-build
-    agents:
-      queue: neuron-postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
-    env:
-      DOCKER_BUILDKIT: "1"
@@ -3,9 +3,6 @@
|
|||||||
# This script runs test inside the corresponding ROCm docker container.
|
# This script runs test inside the corresponding ROCm docker container.
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
# Export Python path
|
|
||||||
export PYTHONPATH=".."
|
|
||||||
|
|
||||||
# Print ROCm version
|
# Print ROCm version
|
||||||
echo "--- Confirming Clean Initial State"
|
echo "--- Confirming Clean Initial State"
|
||||||
while true; do
|
while true; do
|
||||||
@@ -77,102 +74,50 @@ HF_MOUNT="/root/.cache/huggingface"

commands=$@
echo "Commands:$commands"

- if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
- commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
- fi

- if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
- commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
- fi

- if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
- commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
- fi

- if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
- commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
- fi

#ignore certain kernels tests
- if [[ $commands == *" kernels/core"* ]]; then
+ if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
- --ignore=kernels/core/test_fused_quant_layernorm.py \
- --ignore=kernels/core/test_permute_cols.py"
- fi
- if [[ $commands == *" kernels/attention"* ]]; then
- commands="${commands} \
- --ignore=kernels/attention/stest_attention_selector.py \
- --ignore=kernels/attention/test_blocksparse_attention.py \
- --ignore=kernels/attention/test_encoder_decoder_attn.py \
- --ignore=kernels/attention/test_attention_selector.py \
- --ignore=kernels/attention/test_flash_attn.py \
- --ignore=kernels/attention/test_flashinfer.py \
- --ignore=kernels/attention/test_prefix_prefill.py \
- --ignore=kernels/attention/test_cascade_flash_attn.py \
- --ignore=kernels/attention/test_mha_attn.py \
- --ignore=kernels/attention/test_lightning_attn.py \
- --ignore=kernels/attention/test_attention.py"
- fi
- if [[ $commands == *" kernels/quantization"* ]]; then
- commands="${commands} \
- --ignore=kernels/quantization/test_int8_quant.py \
- --ignore=kernels/quantization/test_aqlm.py \
- --ignore=kernels/quantization/test_machete_mm.py \
- --ignore=kernels/quantization/test_block_fp8.py \
- --ignore=kernels/quantization/test_block_int8.py \
- --ignore=kernels/quantization/test_marlin_gemm.py \
- --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
- --ignore=kernels/quantization/test_int8_kernel.py"
- fi
-
- if [[ $commands == *" kernels/mamba"* ]]; then
- commands="${commands} \
- --ignore=kernels/mamba/test_mamba_mixer2.py \
- --ignore=kernels/mamba/test_causal_conv1d.py \
- --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
- fi
-
- if [[ $commands == *" kernels/moe"* ]]; then
- commands="${commands} \
- --ignore=kernels/moe/test_moe.py \
- --ignore=kernels/moe/test_cutlass_moe.py \
- --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+ --ignore=kernels/test_attention_selector.py \
+ --ignore=kernels/test_blocksparse_attention.py \
+ --ignore=kernels/test_causal_conv1d.py \
+ --ignore=kernels/test_cutlass.py \
+ --ignore=kernels/test_encoder_decoder_attn.py \
+ --ignore=kernels/test_flash_attn.py \
+ --ignore=kernels/test_flashinfer.py \
+ --ignore=kernels/test_int8_quant.py \
+ --ignore=kernels/test_machete_gemm.py \
+ --ignore=kernels/test_mamba_ssm.py \
+ --ignore=kernels/test_marlin_gemm.py \
+ --ignore=kernels/test_moe.py \
+ --ignore=kernels/test_prefix_prefill.py \
+ --ignore=kernels/test_rand.py \
+ --ignore=kernels/test_sampler.py \
+ --ignore=kernels/test_cascade_flash_attn.py \
+ --ignore=kernels/test_mamba_mixer2.py \
+ --ignore=kernels/test_aqlm.py \
+ --ignore=kernels/test_machete_mm.py \
+ --ignore=kernels/test_mha_attn.py \
+ --ignore=kernels/test_block_fp8.py \
+ --ignore=kernels/test_permute_cols.py"
fi

#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
+ --ignore=entrypoints/openai/test_chat.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
- --ignore=entrypoints/openai/test_lora_adapters.py \
- --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
- --ignore=entrypoints/openai/test_root_path.py \
- --ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi

#ignore certain Entrypoints/llm tests
- if [[ $commands == *" entrypoints/llm "* ]]; then
- commands=${commands//" entrypoints/llm "/" entrypoints/llm \
- --ignore=entrypoints/llm/test_chat.py \
- --ignore=entrypoints/llm/test_accuracy.py \
- --ignore=entrypoints/llm/test_init.py \
- --ignore=entrypoints/llm/test_generate_multiple_loras.py \
- --ignore=entrypoints/llm/test_prompt_validation.py "}
+ if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+ commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
fi

- #Obsolete currently
- ##ignore certain Entrypoints/llm tests
- #if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
- # commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
- #fi

# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
@@ -181,8 +126,6 @@ fi


PARALLEL_JOB_COUNT=8
- MYPYTHONPATH=".."

# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
@@ -191,10 +134,9 @@ if [[ $commands == *"--shard-id="* ]]; then
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
echo "Shard ${GPU} commands:$commands_gpu"
- echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
- --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
- --network=host \
+ --device /dev/kfd --device /dev/dri \
+ --network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
@@ -203,7 +145,6 @@ if [[ $commands == *"--shard-id="* ]]; then
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
- -e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
@@ -222,10 +163,9 @@ if [[ $commands == *"--shard-id="* ]]; then
fi
done
else
- echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
- --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
- --network=host \
+ --device /dev/kfd --device /dev/dri \
+ --network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
@@ -234,7 +174,6 @@ else
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
- -e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
@@ -5,8 +5,8 @@
set -ex
set -o pipefail

- # cd 2 levels into the working directory
- cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+ # cd into parent directory of this file
+ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

@@ -10,4 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
- docker build -t cpu-test -f docker/Dockerfile.s390x .
+ docker build -t cpu-test -f Dockerfile.ppc64le .

@@ -8,19 +8,15 @@ set -ex
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

+ # Try building the docker image
+ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .

# Setup cleanup
- remove_docker_container() {
- set -e;
- docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
- docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
- }
+ remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container

- # Try building the docker image
- numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
- numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@@ -40,8 +36,8 @@ function cpu_tests() {
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
- pytest -v -s tests/kernels/test_cache.py -m cpu_model
- pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
+ pip install -r vllm/requirements/test.txt
+ pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -9,13 +9,11 @@ python3 use_existing_torch.py

# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
- --file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t gh200-test \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
- --build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"

@@ -25,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and test offline inference
- docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+ docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'
@@ -5,22 +5,20 @@
set -ex

# Try building the docker image
- docker build -t hpu-test-env -f docker/Dockerfile.hpu .
+ docker build -t hpu-test-env -f Dockerfile.hpu .

# Setup cleanup
# certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use
- # separate remove_docker_containers and remove_docker_containers_and_exit
+ # separate remove_docker_container and remove_docker_container_and_exit
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
- remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
- remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
- trap remove_docker_containers_and_exit EXIT
- remove_docker_containers
+ remove_docker_container() { docker rm -f hpu-test || true; }
+ remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
+ trap remove_docker_container_and_exit EXIT
+ remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
- docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2

EXITCODE=$?
@@ -3,7 +3,7 @@
set -euox pipefail

if [[ $# -lt 4 ]]; then
- echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+ echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi

@@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
- HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)

NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
- aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
+ aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
@@ -36,7 +35,7 @@ else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi

- docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+ docker build -t "${image_name}" -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() {
@@ -48,16 +47,8 @@ trap remove_docker_container EXIT
docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
- -e "HF_TOKEN=${HF_TOKEN}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
- /bin/bash -c "
- python3 /workspace/vllm/examples/offline_inference/neuron.py;
- python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
- for f in /workspace/vllm/tests/neuron/2_core/*.py; do
- echo 'Running test file: '$f;
- python3 -m pytest \$f -v --capture=tee-sys;
- done
- "
+ /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
.buildkite/run-openvino-test.sh (executable file, 16 lines)
@@ -0,0 +1,16 @@
+ #!/bin/bash
+
+ # This script build the OpenVINO docker image and run the offline inference inside the container.
+ # It serves a sanity check for compilation and basic model usage.
+ set -ex
+
+ # Try building the docker image
+ docker build -t openvino-test -f Dockerfile.openvino .
+
+ # Setup cleanup
+ remove_docker_container() { docker rm -f openvino-test || true; }
+ trap remove_docker_container EXIT
+ remove_docker_container
+
+ # Run the image and launch offline inference
+ docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
.buildkite/run-tpu-test.sh (executable file, 25 lines)
@@ -0,0 +1,25 @@
+ #!/bin/bash
+
+ set -e
+
+ # Build the docker image.
+ docker build -f Dockerfile.tpu -t vllm-tpu .
+
+ # Set up cleanup.
+ remove_docker_container() { docker rm -f tpu-test || true; }
+ trap remove_docker_container EXIT
+ # Remove the container that might not be cleaned up in the previous run.
+ remove_docker_container
+
+ # For HF_TOKEN.
+ source /etc/environment
+ # Run a simple end-to-end example.
+ docker run --privileged --net host --shm-size=16G -it \
+ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+ && python3 -m pip install pytest \
+ && python3 -m pip install lm_eval[api]==0.4.4 \
+ && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+ && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+ && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+ && python3 /workspace/vllm/examples/offline_inference/tpu.py"
.buildkite/run-tpu-v1-test.sh (executable file, 27 lines)
@@ -0,0 +1,27 @@
+ #!/bin/bash
+
+ set -e
+
+ # Build the docker image.
+ docker build -f Dockerfile.tpu -t vllm-tpu .
+
+ # Set up cleanup.
+ remove_docker_container() { docker rm -f tpu-test || true; }
+ trap remove_docker_container EXIT
+ # Remove the container that might not be cleaned up in the previous run.
+ remove_docker_container
+
+ # For HF_TOKEN.
+ source /etc/environment
+ # Run a simple end-to-end example.
+ docker run --privileged --net host --shm-size=16G -it \
+ -e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
+ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+ && python3 -m pip install pytest \
+ && python3 -m pip install lm_eval[api]==0.4.4 \
+ && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+ && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+ && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+ && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+ && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+ && python3 /workspace/vllm/examples/offline_inference/tpu.py"
@@ -8,15 +8,14 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# Try building the docker image
- docker build -t ${image_name} -f docker/Dockerfile.xpu .
+ docker build -t ${image_name} -f Dockerfile.xpu .

# Setup cleanup
remove_docker_container() {
- docker rm -f "${container_name}" || true;
- docker image rm -f "${image_name}" || true;
- docker system prune -f || true;
+ docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT
+ remove_docker_container

# Run the image and test offline inference/tensor parallel
docker run \
@@ -26,6 +25,6 @@ docker run \
--name "${container_name}" \
"${image_name}" \
sh -c '
- VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
- VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'
@@ -1,48 +0,0 @@
- #!/bin/bash
-
- # This script build the CPU docker image and run the offline inference inside the container.
- # It serves a sanity check for compilation and basic model usage.
- set -ex
-
- # Setup cleanup
- remove_docker_container() {
- if [[ -n "$container_id" ]]; then
- podman rm -f "$container_id" || true
- fi
- podman system prune -f
- }
- trap remove_docker_container EXIT
- remove_docker_container
-
- # Try building the docker image
- podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
-
- # Run the image
- container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
-
- function cpu_tests() {
-
- # offline inference
- podman exec -it "$container_id" bash -c "
- set -e
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
- # Run basic model test
- podman exec -it "$container_id" bash -c "
- set -e
- pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
- pip install sentence-transformers datamodel_code_generator
- pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
- pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
- pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
- }
-
- # All of CPU tests are expected to be finished less than 40 mins.
-
- export container_id
- export -f cpu_tests
- timeout 40m bash -c cpu_tests
-
@@ -1,103 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -xu
|
|
||||||
|
|
||||||
# Build the docker image.
|
|
||||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
|
||||||
|
|
||||||
# Set up cleanup.
|
|
||||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# For HF_TOKEN.
|
|
||||||
source /etc/environment
|
|
||||||
# Run a simple end-to-end example.
|
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
|
||||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
|
||||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
|
||||||
&& python3 -m pip install pytest pytest-asyncio tpu-info \
|
|
||||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
|
||||||
&& export VLLM_XLA_CACHE_PATH= \
|
|
||||||
&& export VLLM_USE_V1=1 \
|
|
||||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
|
||||||
&& echo HARDWARE \
|
|
||||||
&& tpu-info \
|
|
||||||
&& { \
|
|
||||||
echo TEST_0: Running test_perf.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
|
|
||||||
echo TEST_0_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_1: Running test_compilation.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
|
|
||||||
echo TEST_1_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_2: Running test_basic.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
|
|
||||||
echo TEST_2_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
|
||||||
echo TEST_3_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_4: Running test_quantization_accuracy.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
|
|
||||||
echo TEST_4_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_5: Running examples/offline_inference/tpu.py; \
|
|
||||||
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
|
|
||||||
echo TEST_5_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_6: Running test_tpu_model_runner.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
|
|
||||||
echo TEST_6_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_7: Running test_sampler.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
|
|
||||||
echo TEST_7_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_8: Running test_topk_topp_sampler.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
|
|
||||||
echo TEST_8_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_9: Running test_multimodal.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
|
|
||||||
echo TEST_9_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_10: Running test_pallas.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
|
|
||||||
echo TEST_10_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_11: Running test_struct_output_generate.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
|
|
||||||
echo TEST_11_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_12: Running test_moe_pallas.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
|
|
||||||
echo TEST_12_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
# Disable the TPU LoRA tests until the feature is activated
|
|
||||||
# & { \
|
|
||||||
# echo TEST_13: Running test_moe_pallas.py; \
|
|
||||||
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
|
|
||||||
# echo TEST_13_EXIT_CODE: \$?; \
|
|
||||||
# } & \
|
|
||||||
wait \
|
|
||||||
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
|
|
||||||
"
|
|
||||||
|
|
||||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
|
||||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
|
||||||
@@ -8,7 +8,6 @@
|
|||||||
# Documentation
|
# Documentation
|
||||||
# label(str): the name of the test. emoji allowed.
|
# label(str): the name of the test. emoji allowed.
|
||||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
|
||||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||||
# command(str): the single command to run for tests. incompatible with commands.
|
# command(str): the single command to run for tests. incompatible with commands.
|
||||||
@@ -32,17 +31,16 @@ steps:
|
|||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
|
|
||||||
- label: Documentation Build # 2min
|
- label: Documentation Build # 2min
|
||||||
mirror_hardwares: [amdexperimental]
|
working_dir: "/vllm-workspace/test_docs/docs"
|
||||||
working_dir: "/vllm-workspace/test_docs"
|
|
||||||
fast_check: true
|
fast_check: true
|
||||||
no_gpu: True
|
no_gpu: True
|
||||||
commands:
|
commands:
|
||||||
- pip install -r ../requirements/docs.txt
|
- pip install -r ../../requirements/docs.txt
|
||||||
# TODO: add `--strict` once warnings in docstrings are fixed
|
- SPHINXOPTS=\"-W\" make html
|
||||||
- mkdocs build
|
# Check API reference (if it fails, you may have missing mock imports)
|
||||||
|
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/mq_llm_engine
|
- tests/mq_llm_engine
|
||||||
@@ -58,13 +56,11 @@ steps:
|
|||||||
- pytest -v -s async_engine # AsyncLLMEngine
|
- pytest -v -s async_engine # AsyncLLMEngine
|
||||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s test_outputs.py
|
|
||||||
- pytest -v -s multimodal
|
- pytest -v -s multimodal
|
||||||
- pytest -v -s test_utils.py # Utils
|
- pytest -v -s test_utils.py # Utils
|
||||||
- pytest -v -s worker # Worker
|
- pytest -v -s worker # Worker
|
||||||
|
|
||||||
- label: Python-only Installation Test
|
- label: Python-only Installation Test
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tests/standalone_tests/python_only_compile.sh
|
- tests/standalone_tests/python_only_compile.sh
|
||||||
- setup.py
|
- setup.py
|
||||||
@@ -72,9 +68,8 @@ steps:
|
|||||||
- bash standalone_tests/python_only_compile.sh
|
- bash standalone_tests/python_only_compile.sh
|
||||||
|
|
||||||
- label: Basic Correctness Test # 30min
|
- label: Basic Correctness Test # 30min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
#mirror_hardwares: [amd]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_basic_correctness
|
- tests/basic_correctness/test_basic_correctness
|
||||||
@@ -89,7 +84,6 @@ steps:
|
|||||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
|
|
||||||
- label: Chunked Prefill Test
|
- label: Chunked Prefill Test
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_chunked_prefill
|
- tests/basic_correctness/test_chunked_prefill
|
||||||
@@ -98,7 +92,7 @@ steps:
|
|||||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||||
|
|
||||||
- label: Core Test # 10min
|
- label: Core Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/core
|
- vllm/core
|
||||||
@@ -108,10 +102,9 @@ steps:
|
|||||||
- pytest -v -s core
|
- pytest -v -s core
|
||||||
|
|
||||||
- label: Entrypoints Test # 40min
|
- label: Entrypoints Test # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/entrypoints/llm
|
- tests/entrypoints/llm
|
||||||
@@ -125,12 +118,11 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs) # 10min
|
- label: Distributed Tests (4 GPUs) # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -138,36 +130,25 @@ steps:
|
|||||||
- vllm/core/
|
- vllm/core/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
- tests/distributed/test_pynccl
|
- tests/distributed/test_pynccl
|
||||||
- tests/distributed/test_events
|
|
||||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||||
- tests/compile/test_basic_correctness
|
- tests/compile/test_basic_correctness
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
|
||||||
commands:
|
commands:
|
||||||
# test with tp=2 and external_dp=2
|
|
||||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
# test with tp=2 and pp=2
|
|
||||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
# test with internal dp
|
|
||||||
- python3 ../examples/offline_inference/data_parallel.py
|
- python3 ../examples/offline_inference/data_parallel.py
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
- pushd ../examples/offline_inference
|
- pushd ../examples/offline_inference
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
- python3 rlhf.py
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||||
- popd
|
- popd
|
||||||
|
|
||||||
- label: Metrics, Tracing Test # 10min
|
- label: Metrics, Tracing Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -175,13 +156,18 @@ steps:
|
|||||||
- tests/tracing
|
- tests/tracing
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s metrics
|
- pytest -v -s metrics
|
||||||
|
- "pip install \
|
||||||
|
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-api>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
|
||||||
- pytest -v -s tracing
|
- pytest -v -s tracing
|
||||||
|
|
||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
|
|
||||||
- label: Regression Test # 5min
|
- label: Regression Test # 5min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/test_regression
|
- tests/test_regression
|
||||||
@@ -191,7 +177,7 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/tests" # optional
|
working_dir: "/vllm-workspace/tests" # optional
|
||||||
|
|
||||||
- label: Engine Test # 10min
|
- label: Engine Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/engine
|
- tests/engine
|
||||||
@@ -205,24 +191,22 @@ steps:
|
|||||||
- pytest -v -s tokenization
|
- pytest -v -s tokenization
|
||||||
|
|
||||||
- label: V1 Test
|
- label: V1 Test
|
||||||
mirror_hardwares: [amdexperimental]
|
#mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s v1/core
|
- pytest -v -s v1/core
|
||||||
|
- pytest -v -s v1/entrypoints
|
||||||
- pytest -v -s v1/engine
|
- pytest -v -s v1/engine
|
||||||
- pytest -v -s v1/entrypoints
|
- pytest -v -s v1/entrypoints
|
||||||
- pytest -v -s v1/sample
|
- pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/worker
|
- pytest -v -s v1/worker
|
||||||
- pytest -v -s v1/structured_output
|
- pytest -v -s v1/structured_output
|
||||||
- pytest -v -s v1/spec_decode
|
- pytest -v -s v1/test_stats.py
|
||||||
- pytest -v -s v1/kv_connector/unit
|
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
|
||||||
- pytest -v -s v1/test_utils.py
|
- pytest -v -s v1/test_utils.py
|
||||||
- pytest -v -s v1/test_oracle.py
|
- pytest -v -s v1/test_oracle.py
|
||||||
- pytest -v -s v1/test_metrics_reader.py
|
|
||||||
# TODO: accuracy does not match, whether setting
|
# TODO: accuracy does not match, whether setting
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||||
- pytest -v -s v1/e2e
|
- pytest -v -s v1/e2e
|
||||||
@@ -231,8 +215,8 @@ steps:
|
|||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
- label: Examples Test # 25min
|
- label: Examples Test # 25min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
|
#mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/entrypoints
|
- vllm/entrypoints
|
||||||
- examples/
|
- examples/
|
||||||
@@ -247,7 +231,7 @@ steps:
|
|||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/encoder_decoder.py
|
- python3 offline_inference/encoder_decoder.py
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/basic/classify.py
|
||||||
@@ -256,7 +240,7 @@ steps:
|
|||||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||||
|
|
||||||
- label: Prefix Caching Test # 9min
|
- label: Prefix Caching Test # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/prefix_caching
|
- tests/prefix_caching
|
||||||
@@ -264,7 +248,6 @@ steps:
|
|||||||
- pytest -v -s prefix_caching
|
- pytest -v -s prefix_caching
|
||||||
|
|
||||||
- label: Samplers Test # 36min
|
- label: Samplers Test # 36min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/layers
|
- vllm/model_executor/layers
|
||||||
- vllm/sampling_metadata.py
|
- vllm/sampling_metadata.py
|
||||||
@@ -275,7 +258,7 @@ steps:
|
|||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
|
||||||
- label: LogitsProcessor Test # 5min
|
- label: LogitsProcessor Test # 5min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/layers
|
- vllm/model_executor/layers
|
||||||
- vllm/model_executor/guided_decoding
|
- vllm/model_executor/guided_decoding
|
||||||
@@ -286,7 +269,6 @@ steps:
|
|||||||
- pytest -v -s model_executor/test_guided_processors.py
|
- pytest -v -s model_executor/test_guided_processors.py
|
||||||
|
|
||||||
- label: Speculative decoding tests # 40min
|
- label: Speculative decoding tests # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/spec_decode
|
- vllm/spec_decode
|
||||||
- tests/spec_decode
|
- tests/spec_decode
|
||||||
@@ -297,29 +279,14 @@ steps:
|
|||||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||||
|
|
||||||
- label: LoRA Test %N # 15min each
|
- label: LoRA Test %N # 15min each
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
|
||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
- label: PyTorch Compilation Unit Tests
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/compile
|
|
||||||
commands:
|
|
||||||
- pytest -v -s compile/test_pass_manager.py
|
|
||||||
- pytest -v -s compile/test_fusion.py
|
|
||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
|
||||||
- pytest -v -s compile/test_sequence_parallelism.py
|
|
||||||
- pytest -v -s compile/test_async_tp.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@@ -330,110 +297,58 @@ steps:
|
|||||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 18min
|
- label: PyTorch Fullgraph Test # 18min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_full_graph.py
|
- pytest -v -s compile/test_full_graph.py
|
||||||
|
|
||||||
- label: Kernels Core Operation Test
|
- label: Kernels Test %N # 1h each
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/core
|
|
||||||
|
|
||||||
- label: Kernels Attention Test %N
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/attention/
|
|
||||||
- vllm/attention
|
- vllm/attention
|
||||||
- vllm/v1/attention
|
- tests/kernels
|
||||||
- tests/kernels/attention
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 4
|
||||||
|
|
||||||
- label: Kernels Quantization Test %N
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
- tests/kernels/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
|
||||||
parallelism: 2
|
|
||||||
|
|
||||||
- label: Kernels MoE Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/moe/
|
|
||||||
- tests/kernels/moe
|
|
||||||
- vllm/model_executor/layers/fused_moe/
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/moe
|
|
||||||
|
|
||||||
- label: Kernels Mamba Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/mamba/
|
|
||||||
- tests/kernels/mamba
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/mamba
|
|
||||||
|
|
||||||
- label: Tensorizer Test # 11min
|
- label: Tensorizer Test # 11min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amd]
|
||||||
soft_fail: true
|
soft_fail: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/model_loader
|
- vllm/model_executor/model_loader
|
||||||
- tests/tensorizer_loader
|
- tests/tensorizer_loader
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s tensorizer_loader
|
- pytest -v -s tensorizer_loader
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
|
|
||||||
- label: Benchmarks # 9min
|
- label: Benchmarks # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
working_dir: "/vllm-workspace/.buildkite"
|
working_dir: "/vllm-workspace/.buildkite"
|
||||||
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- benchmarks/
|
- benchmarks/
|
||||||
commands:
|
commands:
|
||||||
- bash scripts/run-benchmarks.sh
|
- bash run-benchmarks.sh
|
||||||
|
|
||||||
- label: Benchmarks CLI Test # 10min
|
- label: Quantization Test # 33min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/benchmarks/
|
|
||||||
commands:
|
|
||||||
- pytest -v -s benchmarks/
|
|
||||||
|
|
||||||
- label: Quantization Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
- tests/quantization
|
- tests/quantization
|
||||||
commands:
|
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||||
|
|
||||||
- label: OpenAI API correctness
|
- label: OpenAI API correctness
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/entrypoints/openai/
|
- vllm/entrypoints/openai/
|
||||||
@@ -442,7 +357,6 @@ steps:
- pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 5min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/encoder_decoder
@@ -450,117 +364,98 @@ steps:
- pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 20 min
mirror_hardwares: [amdexperimental]
fast_check: false
mirror_hardwares: [ amd ]
source_file_dependencies:
- vllm/
- tests/tool_use
- tests/mistral_tool_use
commands:
- pytest -v -s tool_use
- pytest -v -s mistral_tool_use

##### models test #####

- label: Basic Models Test # 24min
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- pytest -v -s models/test_vision.py
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard)
- label: Language Models Test (Standard) # 32min
mirror_hardwares: [amdexperimental]
#mirror_hardwares: [amd]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/embedding/language -m core_model
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model

- label: Language Models Test (Extended Generation) # 1hr20min
- label: Language Models Test (Extended) # 1h10min
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/embedding/language -m 'not core_model'
- pytest -v -s models/language/generation -m 'not core_model'

- label: Language Models Test (Extended Pooling) # 36min
- label: Multi-Modal Models Test (Standard) # 40min
mirror_hardwares: [amdexperimental]
#mirror_hardwares: [amd]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
- tests/models/decoder_only/audio_language
commands:
- tests/models/decoder_only/vision_language
- pytest -v -s models/language/pooling -m 'not core_model'
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- label: Multi-Modal Models Test (Standard)
- tests/models/encoder_decoder/vision_language
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal
- pytest -v -s models/multimodal/processing
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Multi-Modal Models Test (Extended) 1
- label: Multi-Modal Models Test (Extended) 1 # 48m
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2
- label: Multi-Modal Models Test (Extended) 2 # 38m
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
- tests/models/decoder_only/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

- label: Multi-Modal Models Test (Extended) 3
mirror_hardwares: [amdexperimental, amdproduction]
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

- label: Quantized Models Test
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/model_executor/layers/quantization
- tests/models/quantization
commands:
- pytest -v -s models/quantization

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
mirror_hardwares: [amdexperimental, amdproduction]
optional: true
commands:
- echo 'Testing custom models...'
@@ -572,7 +467,6 @@ steps:
##### multi gpus test #####

- label: Distributed Comm Ops Test # 7min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
@@ -583,7 +477,6 @@ steps:
- pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total) # 16min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
@@ -602,7 +495,7 @@ steps:
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'

- label: Distributed Tests (2 GPUs) # 40min
mirror_hardwares: [amdexperimental]
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
@@ -616,37 +509,33 @@ steps:
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- vllm/v1/engine/
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

- label: Plugin Tests (2 GPUs) # 40min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
commands:
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
# begin platform plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
@@ -657,10 +546,8 @@ steps:
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -681,7 +568,6 @@ steps:
- pytest -v -s multi_step/test_correctness_llm.py

- label: Pipeline Parallelism Test # 45min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -695,7 +581,6 @@ steps:
- pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA TP Test (Distributed)
mirror_hardwares: [amdexperimental, amdproduction]
num_gpus: 4
source_file_dependencies:
- vllm/lora
@@ -704,14 +589,17 @@ steps:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py

- label: Weight Loading Multiple GPU Test # 33min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
@@ -721,7 +609,6 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
@@ -760,4 +647,4 @@ steps:
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- bash ./run-tests.sh -c configs/models-large.txt -t 4
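The step entries above all follow the same schema: a label, optional hardware mirrors, `source_file_dependencies` that gate when the step runs, and a list of shell commands. As a hedged illustration only (assuming a working vLLM install and the `.buildkite/lm-eval-harness` working directory used by the pipeline), the "LM Eval Small Models" step reduces to:

```sh
# Sketch: reproduce the "LM Eval Small Models" step locally.
# Assumption: vLLM is installed and we are inside .buildkite/lm-eval-harness.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
pytest -s -v test_lm_eval_correctness.py \
  --config-list-file=configs/models-small.txt --tp-size=1
```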
@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu126 wheels"
echo "Skipping index files for cu121 wheels"
else
# only upload index.html for cu128 wheels (default wheels)
# only upload index.html for cu124 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -66,13 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu126 wheels"
echo "Skipping index files for cu121 wheels"
else
# only upload index.html for cu128 wheels (default wheels)
# only upload index.html for cu124 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
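These hunks only change which CUDA variant is treated as the default (the index is uploaded only for the default wheels); the branching itself is unchanged. A minimal sketch of that decision, assuming `$normal_wheel` holds the wheel filename as in the script:

```sh
# Sketch of the index-upload decision in the wheel upload script; variable
# names follow the script, and the default CUDA tag differs between the two
# sides of this diff (cu128 vs cu124).
if [[ $normal_wheel == *"cu118"* ]]; then
  echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
  echo "Skipping index files for cu126 wheels"
else
  # Default wheels: publish the per-commit index.
  aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
fi
```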
7 .github/CODEOWNERS vendored
@@ -12,8 +12,6 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
CMakeLists.txt @tlrmchlsmth

# vLLM V1
@@ -41,8 +39,3 @@ CMakeLists.txt @tlrmchlsmth
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb
/tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee

# Docs
/docs @hmellor
mkdocs.yaml @hmellor
2 .github/ISSUE_TEMPLATE/200-installation.yml vendored
@@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
2 .github/ISSUE_TEMPLATE/300-usage.yml vendored
@@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
12 .github/ISSUE_TEMPLATE/400-bug-report.yml vendored
@@ -14,14 +14,14 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
<details>
<summary>The output of <code>python collect_env.py</code></summary>
<summary>The output of `python collect_env.py`</summary>

```text
Your output of `python collect_env.py` here
@@ -75,20 +75,20 @@ body:
```

```
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
The error message you got, with the full traceback.
```
validations:
required: true
- type: markdown
attributes:
value: |
value: >
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:

- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).

- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.

Thanks for reporting 🙏!
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
69 .github/ISSUE_TEMPLATE/450-ci-failure.yml vendored
@@ -1,69 +0,0 @@
name: 🧪 CI failure report
description: Report a failing test.
title: "[CI Failure]: "
labels: ["ci-failure"]

body:
- type: markdown
attributes:
value: >
#### Include the name of the failing Buildkite step and test file in the title.
- type: input
attributes:
label: Name of failing test
description: |
Paste in the fully-qualified name of the failing test from the logs.
placeholder: |
`path/to/test_file.py::test_name[params]`
validations:
required: true
- type: checkboxes
attributes:
label: Basic information
description: Select all items that apply to the failing test.
options:
- label: Flaky test
- label: Can reproduce locally
- label: Caused by external libraries (e.g. bug in `transformers`)
- type: textarea
attributes:
label: 🧪 Describe the failing test
description: |
Please provide a clear and concise description of the failing test.
placeholder: |
A clear and concise description of the failing test.

```
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
```
validations:
required: true
- type: textarea
attributes:
label: 📝 History of failing test
description: |
Since when did the test start to fail?
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).

If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:

- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.

- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.

- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
placeholder: |
Approximate timeline and/or problematic PRs

A link to the Buildkite analytics of the failing test (if available)
validations:
required: true
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
- type: markdown
attributes:
value: >
Thanks for reporting 🙏!
2 .github/ISSUE_TEMPLATE/600-new-model.yml vendored
@@ -9,7 +9,7 @@ body:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.
@@ -35,7 +35,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
28 .github/ISSUE_TEMPLATE/800-misc-discussion.yml vendored Normal file
@@ -0,0 +1,28 @@
name: 🎲 Misc/random discussions that do not fit into the above categories.
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
title: "[Misc]: "
labels: ["misc"]

body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
attributes:
label: Anything you want to discuss about vllm.
description: >
Anything you want to discuss about vllm.
validations:
required: true
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true
4 .github/ISSUE_TEMPLATE/config.yml vendored
@@ -1,5 +1 @@
blank_issues_enabled: false
contact_links:
- name: Questions
url: https://discuss.vllm.ai
about: Ask questions and discuss with other vLLM community members
2 .github/PULL_REQUEST_TEMPLATE.md vendored
@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)

<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
75 .github/mergify.yml vendored
@@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/
- files~=^cmake/
- files=CMakeLists.txt
- files~=^docker/Dockerfile
- files~=^Dockerfile
- files~=^requirements.*\.txt
- files=setup.py
actions:
@@ -55,19 +55,11 @@ pull_request_rules:
description: Automatically apply structured-output label
conditions:
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/
- files=benchmarks/benchmark_serving_guided.py
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files=benchmarks/benchmark_guided.py
- files~=^vllm/v1/structured_output/
actions:
label:
add:
@@ -96,56 +88,6 @@ pull_request_rules:
add:
- v1

- name: label-tpu
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- or:
- files~=tpu.py
- files~=_tpu
- files~=tpu_
- files~=/tpu/
- files~=pallas
actions:
label:
add:
- tpu

- name: label-tpu-remove
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- and:
- -files~=tpu.py
- -files~=_tpu
- -files~=tpu_
- -files~=/tpu/
- -files~=pallas
actions:
label:
remove:
- tpu

- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling

- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict
@@ -161,17 +103,6 @@ pull_request_rules:

https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

- name: assign reviewer for tensorizer changes
conditions:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/tensorizer_loader/
actions:
assign:
users:
- "sangstar"

- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
2 .github/scripts/cleanup_pr_body.sh vendored
@@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"

# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF
import regex as re
import re

with open("${NEW}", "r") as file:
content = file.read()
2 .github/workflows/add_label_automerge.yml vendored
@@ -1,6 +1,4 @@
name: Add label on auto-merge enabled
permissions:
pull-requests: write
on:
pull_request_target:
types:
7 .github/workflows/cleanup_pr_body.yml vendored
@@ -20,12 +20,7 @@ jobs:
with:
python-version: '3.12'

- name: Install Python dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install regex

- name: Update PR description
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
5 .github/workflows/lint-and-deploy.yaml vendored
@@ -2,9 +2,6 @@ name: Lint and Deploy Charts

on: pull_request

permissions:
contents: read

jobs:
lint-and-deploy:
runs-on: ubuntu-latest
@@ -53,7 +50,7 @@ jobs:
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

- name: Build the Docker image vllm cpu
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

- name: Configuration of docker images, network and namespace for the kind cluster
run: |
3 .github/workflows/pre-commit.yml vendored
@@ -5,9 +5,6 @@ on:
push:
branches: [main]

permissions:
contents: read

jobs:
pre-commit:
runs-on: ubuntu-latest
2 .github/workflows/reminder_comment.yml vendored
@@ -1,6 +1,4 @@
name: PR Reminder Comment Bot
permissions:
pull-requests: write
on:
pull_request_target:
types: [opened]
10 .gitignore vendored
@@ -2,7 +2,7 @@
/vllm/_version.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
vllm/vllm_flash_attn/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -77,6 +77,10 @@ instance/
# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/

# PyBuilder
.pybuilder/
target/
@@ -146,7 +150,6 @@ venv.bak/

# mkdocs documentation
/site
docs/examples

# mypy
.mypy_cache/
@@ -199,6 +202,3 @@ benchmarks/**/*.json
# Linting
actionlint
shellcheck*/

# Ingore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
@@ -1,6 +1,3 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages:
- pre-commit # Run locally
- manual # Run in CI
@@ -11,45 +8,43 @@ repos:
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
rev: v0.9.3
hooks:
- id: ruff
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
rev: v2.4.0
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3
rev: v19.1.7
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29
rev: v0.9.27
hooks:
- id: pymarkdown
exclude: '.*\.inc\.md'
args: [fix]
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.17
rev: 0.6.2
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
args: [requirements/test.in, -o, requirements/test.txt]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@@ -104,8 +99,8 @@ repos:
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
fi
language: system
verbose: true
@@ -124,25 +119,6 @@ repos:
language: system
always_run: true
pass_filenames: false
- id: update-dockerfile-graph
name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh
language: script
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/check_triton_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
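The hook set above runs both locally (the `pre-commit` stage) and in CI (the `manual` stage). As a usage sketch only, assuming `pre-commit` is available in the environment, the same checks can be exercised locally like this:

```sh
# Install the git hooks, then run the full suite once, including the hooks
# marked for the "manual" stage that CI uses.
pip install pre-commit
pre-commit install
pre-commit run --all-files --hook-stage manual
```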
@@ -8,8 +8,12 @@ build:
tools:
python: "3.12"

mkdocs:
sphinx:
configuration: mkdocs.yaml
configuration: docs/source/conf.py
fail_on_warning: true

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []

# Optionally declare the Python requirements required to build your docs
python:
214 CMakeLists.txt
@@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)

# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

@@ -29,8 +30,11 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
#
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")

#
# Supported/expected torch versions for CUDA/ROCm.
@@ -40,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")

#
# Try to find python package with an executable that exactly matches
@@ -76,15 +80,6 @@ endif()
#
find_package(Torch REQUIRED)

# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
endif()

#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -232,34 +227,28 @@ endif()
#

set(VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/cache_kernels.cu"
"csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu"
"csrc/attention/merge_attn_states.cu"
"csrc/attention/vertical_slash_index.cu"
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
# Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")

# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -277,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG ${CUTLASS_REVISION}
GIT_TAG v3.8.0
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -289,16 +278,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_MakeAvailable(cutlass)

list(APPEND VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/cutlass_extensions/common.cpp")
"csrc/attention/mla/cutlass_mla_entry.cu")

set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -307,55 +297,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||||
# are not supported by Machete yet.
|
# are not supported by Machete yet.
|
||||||
# 9.0 for latest bf16 atomicAdd PTX
|
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
if (MARLIN_ARCHS)
|
if (MARLIN_ARCHS)
|
||||||
|
|
||||||
#
|
|
||||||
# For the Marlin kernels we automatically generate sources for various
|
|
||||||
# preselected input type pairs and schedules.
|
|
||||||
# Generate sources:
|
|
||||||
set(MARLIN_GEN_SCRIPT
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
|
||||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
|
||||||
|
|
||||||
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
|
|
||||||
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
|
|
||||||
execute_process(
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
|
||||||
PYTHONPATH=$PYTHONPATH
|
|
||||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
|
|
||||||
RESULT_VARIABLE marlin_generation_result
|
|
||||||
OUTPUT_VARIABLE marlin_generation_result
|
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
|
||||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
|
||||||
)
|
|
||||||
|
|
||||||
if (NOT marlin_generation_result EQUAL 0)
|
|
||||||
message(FATAL_ERROR "Marlin generation failed."
|
|
||||||
" Result: \"${marlin_generation_result}\""
|
|
||||||
"\nCheck the log for details: "
|
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
|
||||||
else()
|
|
||||||
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
CACHE STRING "Last run Marlin generate script hash" FORCE)
|
|
||||||
message(STATUS "Marlin generation completed successfully.")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
|
||||||
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
|
||||||
|
|
||||||
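The block removed above regenerates the GPTQ Marlin kernel sources whenever the MD5 of the generator script changes, caching the hash between CMake runs. If that step ever needs to be exercised by hand while debugging, running the generator directly from the repository root should be roughly equivalent (an illustrative sketch, not a documented workflow):

$ python csrc/quantization/gptq_marlin/generate_kernels.py
$ ls csrc/quantization/gptq_marlin/kernel_*.cu   # the generated files picked up by file(GLOB ...) above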
  set(MARLIN_SRCS
+ "csrc/quantization/fp8/fp8_marlin.cu"
  "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
  "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
  "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -427,7 +372,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(SRCS
  "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
  "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
  )
  set_gencode_flags_for_srcs(
  SRCS "${SRCS}"
@@ -452,9 +396,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
- # (Build 8.9 for FP8)
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
- "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
+ "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
@@ -505,9 +448,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
  set(SRCS
  "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
- "csrc/quantization/fp4/nvfp4_experts_quant.cu"
- "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
- "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
+ "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
  set_gencode_flags_for_srcs(
  SRCS "${SRCS}"
  CUDA_ARCHS "${FP4_ARCHS}")
@@ -520,52 +461,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(FP4_ARCHS)
  endif()

- # CUTLASS MLA Archs and flags
- cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
- set(SRCS
- "csrc/attention/mla/cutlass_mla_kernels.cu")
- set_gencode_flags_for_srcs(
- SRCS "${SRCS}"
- CUDA_ARCHS "${MLA_ARCHS}")
- list(APPEND VLLM_EXT_SRC "${SRCS}")
- list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
- # Add MLA-specific include directories only to MLA source files
- set_source_files_properties(${SRCS}
- PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
- message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
- else()
- message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
- # clear MLA_ARCHS
- set(MLA_ARCHS)
- endif()
-
- # CUTLASS MoE kernels
-
- # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
- # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
- # to compile MoE kernels that use its output.
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
- "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
- set_gencode_flags_for_srcs(
- SRCS "${SRCS}"
- CUDA_ARCHS "${SCALED_MM_ARCHS}")
- list(APPEND VLLM_EXT_SRC "${SRCS}")
- list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
- message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
- else()
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
- message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
- "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
- "if you intend on running FP8 quantized MoE models on Hopper.")
- else()
- message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
- "in CUDA target architectures")
- endif()
- endif()
-
  #
  # Machete kernels

@@ -683,54 +578,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  CUDA_ARCHS "${CUDA_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
+ set(MARLIN_MOE_SRC
+ "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
+ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
+ "csrc/moe/marlin_moe_ops.cu")
-
- #
- # For the Marlin MOE kernels we automatically generate sources for various
- # preselected input type pairs and schedules.
- # Generate sources:
- set(MOE_MARLIN_GEN_SCRIPT
- ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
- file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
-
- message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
-
- if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
- execute_process(
- COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
- RESULT_VARIABLE moe_marlin_generation_result
- OUTPUT_VARIABLE moe_marlin_generation_output
- OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
- ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
- )
-
- if (NOT moe_marlin_generation_result EQUAL 0)
- message(FATAL_ERROR "Marlin MOE generation failed."
- " Result: \"${moe_marlin_generation_result}\""
- "\nCheck the log for details: "
- "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
- else()
- set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
- CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
- message(STATUS "Marlin MOE generation completed successfully.")
- endif()
- else()
- message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
- endif()
-
- file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
  set_gencode_flags_for_srcs(
- SRCS "${MOE_WNAA16_MARLIN_SRC}"
+ SRCS "${MARLIN_MOE_SRC}"
  CUDA_ARCHS "${MARLIN_MOE_ARCHS}")

- list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+ list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")

  message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
  message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@@ -738,17 +602,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  endif()

- if(VLLM_GPU_LANG STREQUAL "CUDA")
- set(MOE_PERMUTE_SRC
- "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
- "csrc/moe/moe_permute_unpermute_op.cu")
-
- set_gencode_flags_for_srcs(
- SRCS "${MARLIN_PERMUTE_SRC}"
- CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
-
- list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
- endif()
  message(STATUS "Enabling moe extension.")
  define_gpu_extension_target(
  _moe_C
@@ -757,8 +610,6 @@ define_gpu_extension_target(
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
- INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
- INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

@@ -768,7 +619,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  set(VLLM_ROCM_EXT_SRC
  "csrc/rocm/torch_bindings.cpp"
- "csrc/rocm/skinny_gemms.cu"
  "csrc/rocm/attention.cu")

  define_gpu_extension_target(

@@ -1,3 +1,3 @@
  # Contributing to vLLM

- You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
+ You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).

@@ -2,42 +2,33 @@
  # to run the OpenAI compatible server.

  # Please update any changes made here to
- # docs/contributing/dockerfile/dockerfile.md and
- # docs/assets/contributing/dockerfile-stages-dependency.png
+ # docs/source/contributing/dockerfile/dockerfile.md and
+ # docs/source/assets/contributing/dockerfile-stages-dependency.png

- ARG CUDA_VERSION=12.8.1
+ ARG CUDA_VERSION=12.4.1
  #################### BASE BUILD IMAGE ####################
  # prepare basic build environment
  FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
- ARG CUDA_VERSION=12.8.1
+ ARG CUDA_VERSION=12.4.1
  ARG PYTHON_VERSION=3.12
  ARG TARGETPLATFORM
  ENV DEBIAN_FRONTEND=noninteractive

- # Install Python and other dependencies
- RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
- && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
- && apt-get update -y \
- && apt-get install -y ccache software-properties-common git curl sudo \
- && for i in 1 2 3; do \
- add-apt-repository -y ppa:deadsnakes/ppa && break || \
- { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
- done \
- && apt-get update -y \
- && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
- && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
- && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
- && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
- && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
- && python3 --version && python3 -m pip --version
- # Install uv for faster pip installs
- RUN --mount=type=cache,target=/root/.cache/uv \
- python3 -m pip install uv
+ # Install minimal dependencies and uv
+ RUN apt-get update -y \
+ && apt-get install -y ccache git curl wget sudo \
+ && curl -LsSf https://astral.sh/uv/install.sh | sh
+ # Add uv to PATH
+ ENV PATH="/root/.local/bin:$PATH"
+ # Create venv with specified Python and activate by placing at the front of path
+ ENV VIRTUAL_ENV="/opt/venv"
+ RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"

  # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
  # Reference: https://github.com/astral-sh/uv/pull/1694
  ENV UV_HTTP_TIMEOUT=500
- ENV UV_INDEX_STRATEGY="unsafe-best-match"

  # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
  # as it was causing spam when compiling the CUTLASS kernels
@@ -55,29 +46,25 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

  WORKDIR /workspace

- # install build and runtime dependencies
-
  # arm64 (GH200) build follows the practice of "use existing pytorch" build,
  # we need to install torch and torchvision from the nightly builds first,
  # pytorch will not appear as a vLLM dependency in all of the following steps
  # after this step
  RUN --mount=type=cache,target=/root/.cache/uv \
  if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
- uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
- uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+ uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
  fi

  COPY requirements/common.txt requirements/common.txt
  COPY requirements/cuda.txt requirements/cuda.txt
  RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system -r requirements/cuda.txt \
- --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+ uv pip install -r requirements/cuda.txt

  # cuda arch list used by torch
  # can be useful for both `dev` and `test`
  # explicitly set the list to avoid issues with torch 2.2
  # see https://github.com/pytorch/pytorch/pull/123243
- ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
+ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
  ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
  # Override the arch list for flash-attn to reduce the binary size
  ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
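For orientation only (this note is not part of the diff): CUDA_VERSION, PYTHON_VERSION and torch_cuda_arch_list above are ordinary Docker build arguments, so a local image built from this Dockerfile can pin them on the command line; the values below are illustrative, not recommendations:

$ DOCKER_BUILDKIT=1 docker build -f Dockerfile \
    --build-arg CUDA_VERSION=12.4.1 \
    --build-arg torch_cuda_arch_list='8.0 9.0+PTX' \
    -t vllm-local .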
@@ -94,11 +81,9 @@ COPY requirements/build.txt requirements/build.txt
  # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
  # Reference: https://github.com/astral-sh/uv/pull/1694
  ENV UV_HTTP_TIMEOUT=500
- ENV UV_INDEX_STRATEGY="unsafe-best-match"

  RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system -r requirements/build.txt \
- --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+ uv pip install -r requirements/build.txt

  COPY . .
  ARG GIT_REPO_CHECK=0
@@ -165,60 +150,43 @@ FROM base as dev
  # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
  # Reference: https://github.com/astral-sh/uv/pull/1694
  ENV UV_HTTP_TIMEOUT=500
- ENV UV_INDEX_STRATEGY="unsafe-best-match"

- # Workaround for #17068
- RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
-
  COPY requirements/lint.txt requirements/lint.txt
  COPY requirements/test.txt requirements/test.txt
  COPY requirements/dev.txt requirements/dev.txt
  RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system -r requirements/dev.txt \
- --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+ uv pip install -r requirements/dev.txt
  #################### DEV IMAGE ####################

  #################### vLLM installation IMAGE ####################
  # image with vLLM installed
  # TODO: Restore to base image after FlashInfer AOT wheel fixed
  FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
- ARG CUDA_VERSION=12.8.1
+ ARG CUDA_VERSION=12.4.1
  ARG PYTHON_VERSION=3.12
  WORKDIR /vllm-workspace
  ENV DEBIAN_FRONTEND=noninteractive
  ARG TARGETPLATFORM

- SHELL ["/bin/bash", "-c"]
-
  RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
  echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

- # Install Python and other dependencies
- RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
- && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
- && apt-get update -y \
- && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
- && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
- && for i in 1 2 3; do \
- add-apt-repository -y ppa:deadsnakes/ppa && break || \
- { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
- done \
- && apt-get update -y \
- && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
- && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
- && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
- && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
- && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
- && python3 --version && python3 -m pip --version
- # Install uv for faster pip installs
- RUN --mount=type=cache,target=/root/.cache/uv \
- python3 -m pip install uv
+ # Install minimal dependencies and uv
+ RUN apt-get update -y \
+ && apt-get install -y ccache git curl wget sudo vim \
+ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
+ && curl -LsSf https://astral.sh/uv/install.sh | sh
+ # Add uv to PATH
+ ENV PATH="/root/.local/bin:$PATH"
+ # Create venv with specified Python and activate by placing at the front of path
+ ENV VIRTUAL_ENV="/opt/venv"
+ RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"

  # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
  # Reference: https://github.com/astral-sh/uv/pull/1694
  ENV UV_HTTP_TIMEOUT=500
- ENV UV_INDEX_STRATEGY="unsafe-best-match"

  # Workaround for https://github.com/openai/triton/issues/2507 and
  # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -232,15 +200,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
  # after this step
  RUN --mount=type=cache,target=/root/.cache/uv \
  if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
- uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
- uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+ uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
  fi

  # Install vllm wheel first, so that torch etc will be installed.
  RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
  --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system dist/*.whl --verbose \
- --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+ uv pip install dist/*.whl --verbose

  # If we need to build FlashInfer wheel before its release:
  # $ export FLASHINFER_ENABLE_AOT=1
@@ -255,36 +221,18 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
  # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl

  RUN --mount=type=cache,target=/root/.cache/uv \
- . /etc/environment && \
  if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
- # FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
- if [[ "$CUDA_VERSION" == 12.8* ]]; then \
- uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
- else \
- export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
- CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
- if [ "$CUDA_MAJOR" -lt 12 ]; then \
- export FLASHINFER_ENABLE_SM90=0; \
- fi; \
- uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
- fi \
+ uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
  fi
  COPY examples examples
- COPY benchmarks benchmarks
- COPY ./vllm/collect_env.py .

- RUN --mount=type=cache,target=/root/.cache/uv \
- . /etc/environment && \
- uv pip list
-
- # Even when we build Flashinfer with AOT mode, there's still
+ # Although we build Flashinfer with AOT mode, there's still
  # some issues w.r.t. JIT compilation. Therefore we need to
  # install build dependencies for JIT compilation.
  # TODO: Remove this once FlashInfer AOT wheel is fixed
  COPY requirements/build.txt requirements/build.txt
  RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system -r requirements/build.txt \
- --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+ uv pip install -r requirements/build.txt

  #################### vLLM installation IMAGE ####################

@@ -298,26 +246,18 @@ ADD . /vllm-workspace/
  # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
  # Reference: https://github.com/astral-sh/uv/pull/1694
  ENV UV_HTTP_TIMEOUT=500
- ENV UV_INDEX_STRATEGY="unsafe-best-match"

- # Workaround for #17068
- RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
-
  # install development dependencies (for testing)
  RUN --mount=type=cache,target=/root/.cache/uv \
- CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
- if [ "$CUDA_MAJOR" -ge 12 ]; then \
- uv pip install --system -r requirements/dev.txt; \
- fi
+ uv pip install -r requirements/dev.txt

  # install development dependencies (for testing)
  RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system -e tests/vllm_test_utils
+ uv pip install -e tests/vllm_test_utils

  # enable fast downloads from hf (for testing)
  RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system hf_transfer
+ uv pip install hf_transfer
  ENV HF_HUB_ENABLE_HF_TRANSFER 1

  # Copy in the v1 package for testing (it isn't distributed yet)
@@ -328,15 +268,12 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
  # will not be imported by other tests
  RUN mkdir test_docs
  RUN mv docs test_docs/
- RUN cp -r examples test_docs/
  RUN mv vllm test_docs/
- RUN mv mkdocs.yaml test_docs/
  #################### TEST IMAGE ####################

  #################### OPENAI API SERVER ####################
  # base openai image with additional requirements, for any subsequent openai-style images
  FROM vllm-base AS vllm-openai-base
- ARG TARGETPLATFORM

  # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
  # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -345,9 +282,9 @@ ENV UV_HTTP_TIMEOUT=500
  # install additional dependencies for openai api server
  RUN --mount=type=cache,target=/root/.cache/uv \
  if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
- uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+ uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
  else \
- uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+ uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
  fi

  ENV VLLM_USAGE_SOURCE production-docker-image
Dockerfile.cpu (Normal file, 69 lines)
@@ -0,0 +1,69 @@
+ # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+
+ FROM ubuntu:22.04 AS cpu-test-1
+
+ ENV CCACHE_DIR=/root/.cache/ccache
+
+ ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+ RUN --mount=type=cache,target=/var/cache/apt \
+ apt-get update -y \
+ && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+ # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+ # intel-openmp provides additional performance improvement vs. openmp
+ # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
+ RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install intel-openmp==2025.0.1
+
+ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+
+ RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+ RUN pip install intel_extension_for_pytorch==2.6.0
+
+ WORKDIR /workspace
+
+ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+ RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+ pip install --upgrade pip && \
+ pip install -r requirements/build.txt
+
+ FROM cpu-test-1 AS build
+
+ WORKDIR /workspace/vllm
+
+ RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
+ --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
+ pip install -v -r requirements/cpu.txt
+
+ COPY . .
+ ARG GIT_REPO_CHECK=0
+ RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+ # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ ARG VLLM_CPU_DISABLE_AVX512
+ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+ RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=cache,target=/root/.cache/ccache \
+ --mount=type=bind,source=.git,target=.git \
+ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+ pip install dist/*.whl && \
+ rm -rf dist
+
+ WORKDIR /workspace/
+
+ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ # install development dependencies (for testing)
+ RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -e tests/vllm_test_utils
+
+ ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
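As a usage sketch for this new Dockerfile.cpu (the image tag and model name below are placeholders, and the server flag is the standard OpenAI-compatible server option rather than anything defined in this file):

$ docker build -f Dockerfile.cpu --build-arg VLLM_CPU_DISABLE_AVX512="true" -t vllm-cpu .
$ docker run --rm -p 8000:8000 vllm-cpu --model facebook/opt-125m   # extra args go to the api_server entrypoint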
@@ -1,4 +1,4 @@
- FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+ FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

  COPY ./ /workspace/vllm

@@ -1,6 +1,6 @@
  # default base image
  # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
- ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
+ ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"

  FROM $BASE_IMAGE

@@ -21,8 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
  WORKDIR ${APP_MOUNT}/vllm

  RUN python3 -m pip install --upgrade pip
- RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
- RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+ RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
+ RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
  RUN python3 -m pip install pytest

  # uninstall transformers-neuronx package explicitly to avoid version conflict
@@ -48,8 +49,6 @@ RUN python3 -m pip install -e tests/vllm_test_utils
  # FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
  RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps

- RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
-
  # overwrite entrypoint to run bash script
  RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py

Dockerfile.openvino (Normal file, 29 lines)
@@ -0,0 +1,29 @@
+ # The vLLM Dockerfile is used to construct vLLM image that can be directly used
+ # to run the OpenAI compatible server.
+
+ FROM ubuntu:22.04 AS dev
+
+ RUN apt-get update -y && \
+ apt-get install -y \
+ git python3-pip \
+ ffmpeg libsm6 libxext6 libgl1
+ WORKDIR /workspace
+
+ COPY . .
+ ARG GIT_REPO_CHECK=0
+ RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+ RUN python3 -m pip install -U pip
+ # install build requirements
+ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
+ # build vLLM with OpenVINO backend
+ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
+
+ COPY examples/ /workspace/examples
+ COPY benchmarks/ /workspace/benchmarks
+
+ # install development dependencies (for testing)
+ RUN python3 -m pip install -e tests/vllm_test_utils
+
+ CMD ["/bin/bash"]
Dockerfile.ppc64le (Normal file, 37 lines)
@@ -0,0 +1,37 @@
+ FROM mambaorg/micromamba
+ ARG MAMBA_DOCKERFILE_ACTIVATE=1
+ USER root
+
+ ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+
+ RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
+
+ # Some packages in requirements/cpu are installed here
+ # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
+ # Currently these may not be available for venv or pip directly
+ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
+
+ COPY ./ /workspace/vllm
+
+ WORKDIR /workspace/vllm
+ ARG GIT_REPO_CHECK=0
+ RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+
+ RUN --mount=type=cache,target=/root/.cache/pip \
+ RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+ -r requirements/cpu.txt \
+ xformers uvloop==0.20.0
+
+ RUN --mount=type=bind,source=.git,target=.git \
+ VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+ # install development dependencies (for testing)
+ RUN python3 -m pip install -e tests/vllm_test_utils
+
+ WORKDIR /workspace/
+
+ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -12,8 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}

  # Install some basic utilities
  RUN apt-get update -q -y && apt-get install -q -y \
- sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
- apt-transport-https ca-certificates wget curl
+ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
  # Remove sccache
  RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
  RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
@@ -41,7 +40,7 @@ ARG USE_CYTHON
  RUN cd vllm \
  && python3 -m pip install -r requirements/rocm.txt \
  && python3 setup.py clean --all \
- && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
+ && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
  && python3 setup.py bdist_wheel --dist-dir=dist
  FROM scratch AS export_vllm
  ARG COMMON_WORKDIR
@@ -114,16 +113,8 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
  ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
  ENV TOKENIZERS_PARALLELISM=false

- # ENV that can improve safe tensor loading, and end-to-end time
- ENV SAFETENSORS_FAST_GPU=1
-
- # User-friendly environment setting for multi-processing to avoid below RuntimeError.
- # RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
- # you must use the 'spawn' start method
- # See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
- ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-
  # Performance environment variable.
  ENV HIP_FORCE_DEV_KERNARG=1

  CMD ["/bin/bash"]

@@ -1,26 +1,24 @@
  ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
- ARG HIPBLASLT_BRANCH="db8e93b4"
+ ARG HIPBLASLT_BRANCH="4d40e36"
  ARG HIPBLAS_COMMON_BRANCH="7c1566b"
  ARG LEGACY_HIPBLASLT_OPTION=
  ARG RCCL_BRANCH="648a58d"
  ARG RCCL_REPO="https://github.com/ROCm/rccl"
  ARG TRITON_BRANCH="e5be006"
  ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
- ARG PYTORCH_BRANCH="295f2ed4"
- ARG PYTORCH_VISION_BRANCH="v0.21.0"
+ ARG PYTORCH_BRANCH="3a585126"
+ ARG PYTORCH_VISION_BRANCH="v0.19.1"
  ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
  ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
- ARG FA_BRANCH="1a7f4dfa"
- ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
- ARG AITER_BRANCH="c1debd8"
- ARG AITER_REPO="https://github.com/ROCm/aiter.git"
+ ARG FA_BRANCH="b7d29fb"
+ ARG FA_REPO="https://github.com/ROCm/flash-attention.git"

  FROM ${BASE_IMAGE} AS base

  ENV PATH=/opt/rocm/llvm/bin:$PATH
  ENV ROCM_PATH=/opt/rocm
  ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
- ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
+ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
  ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

  ARG PYTHON_VERSION=3.12
@@ -31,11 +29,8 @@ ENV DEBIAN_FRONTEND=noninteractive

  # Install Python and other dependencies
  RUN apt-get update -y \
- && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
- && for i in 1 2 3; do \
- add-apt-repository -y ppa:deadsnakes/ppa && break || \
- { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
- done \
+ && apt-get install -y software-properties-common git curl sudo vim less \
+ && add-apt-repository ppa:deadsnakes/ppa \
  && apt-get update -y \
  && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
  python${PYTHON_VERSION}-lib2to3 python-is-python3 \
@@ -45,7 +40,7 @@ RUN apt-get update -y \
  && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
  && python3 --version && python3 -m pip --version

- RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+ RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython

  FROM base AS build_hipblaslt
  ARG HIPBLASLT_BRANCH
@@ -63,8 +58,7 @@ RUN cd hipBLAS-common \
  RUN git clone https://github.com/ROCm/hipBLASLt
  RUN cd hipBLASLt \
  && git checkout ${HIPBLASLT_BRANCH} \
- && apt-get install -y llvm-dev \
- && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+ && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
  && cd build/release \
  && make package
  RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@@ -114,24 +108,11 @@ RUN git clone ${FA_REPO}
  RUN cd flash-attention \
  && git checkout ${FA_BRANCH} \
  && git submodule update --init \
- && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+ && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
  RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
  && cp /app/vision/dist/*.whl /app/install \
  && cp /app/flash-attention/dist/*.whl /app/install

- FROM base AS build_aiter
- ARG AITER_BRANCH
- ARG AITER_REPO
- RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
- pip install /install/*.whl
- RUN git clone --recursive ${AITER_REPO}
- RUN cd aiter \
- && git checkout ${AITER_BRANCH} \
- && git submodule update --init --recursive \
- && pip install -r requirements.txt
- RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
- RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
-
  FROM base AS final
  RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
  dpkg -i /install/*deb \
@@ -147,11 +128,8 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
  pip install /install/*.whl
  RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
  pip install /install/*.whl
- RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
- pip install /install/*.whl

  ARG BASE_IMAGE
- ARG HIPBLAS_COMMON_BRANCH
  ARG HIPBLASLT_BRANCH
  ARG LEGACY_HIPBLASLT_OPTION
  ARG RCCL_BRANCH
@@ -164,8 +142,6 @@ ARG PYTORCH_REPO
  ARG PYTORCH_VISION_REPO
  ARG FA_BRANCH
  ARG FA_REPO
- ARG AITER_BRANCH
- ARG AITER_REPO
  RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
  && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
  && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@@ -179,5 +155,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
  && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
  && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
  && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
- && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
- && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
+ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
@@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
RUN microdnf install -y \
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
-openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \
+openssl-devel openblas openblas-devel autoconf automake libtool cmake && \
microdnf clean all

# Python Installation
@@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
cd ../../python && \
export PYARROW_PARALLEL=4 && \
export ARROW_BUILD_TYPE=release && \
-uv pip install -r requirements-build.txt && \
+uv pip install -r requirements/build.txt && \
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel

FROM python-install AS numa-build
@@ -84,58 +84,18 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
rustup default stable && \
rustup show

-FROM python-install AS torch
-ARG TORCH_VERSION=2.7.0
-ENV export _GLIBCXX_USE_CXX11_ABI=1
-ENV CARGO_HOME=/root/.cargo
-ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"

-WORKDIR /tmp

-RUN --mount=type=cache,target=/root/.cache/uv \
---mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
---mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-git clone https://github.com/pytorch/pytorch.git && \
-cd pytorch && \
-git checkout v2.7.0 && \
-git submodule sync && \
-git submodule update --init --recursive && \
-uv pip install cmake ninja && \
-uv pip install -r requirements.txt && \
-python setup.py bdist_wheel

FROM python-install AS torch-vision
# Install torchvision
-ARG TORCH_VERSION=2.7.0
+ARG TORCH_VERSION=2.7.0.dev20250304
ARG TORCH_VISION_VERSION=v0.20.1
WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \
---mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
git clone https://github.com/pytorch/vision.git && \
cd vision && \
git checkout $TORCH_VISION_VERSION && \
-TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
-uv pip install -v $TORCH_WHL_FILE && \
+uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
python setup.py bdist_wheel

-FROM python-install AS hf-xet-builder
-# Install hf-xet
-WORKDIR /tmp
-ENV CARGO_HOME=/root/.cargo
-ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-RUN --mount=type=cache,target=/root/.cache/uv \
---mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
---mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-git clone https://github.com/huggingface/xet-core.git && \
-cd xet-core/hf_xet/ && \
-uv pip install maturin patchelf && \
-python -m maturin build --release --out dist && \
-mkdir -p /tmp/hf-xet/dist && \
-cp dist/*.whl /tmp/hf-xet/dist/

# Final build stage
FROM python-install AS vllm-cpu
ARG PYTHON_VERSION
@@ -147,7 +107,6 @@ ENV UV_LINK_MODE=copy
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1

COPY . /workspace/vllm
WORKDIR /workspace/vllm
@@ -161,18 +120,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
---mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
---mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
sed -i '/^torch/d' requirements/build.txt && \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
-HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
-TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
uv pip install -v \
$ARROW_WHL_FILE \
$VISION_WHL_FILE \
-$HF_XET_WHL_FILE \
-$TORCH_WHL_FILE \
+--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
--index-strategy unsafe-best-match \
-r requirements/build.txt \
-r requirements/cpu.txt
@@ -196,4 +150,3 @@ WORKDIR /home/vllm

# Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
@@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 -m pip install \
-r requirements/tpu.txt
-RUN python3 -m pip install -e .
+RUN python3 setup.py develop

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
@@ -1,7 +1,11 @@
-# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
-FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
+FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base

-RUN rm /etc/apt/sources.list.d/intel-graphics.list
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
@@ -17,6 +21,8 @@ RUN apt-get update -y && \
python3 \
python3-dev \
python3-pip \
+libze-intel-gpu-dev \
+libze-intel-gpu1 \
wget

WORKDIR /workspace/vllm
@@ -40,6 +46,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 setup.py install

+# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
+# FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
+RUN --mount=type=cache,target=/root/.cache/pip \
+pip install intel-extension-for-pytorch==2.6.10+xpu \
+--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/

CMD ["/bin/bash"]

FROM vllm-base AS vllm-openai
README.md
@@ -1,7 +1,7 @@
<p align="center">
<picture>
-<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
+<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
-<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
+<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
</picture>
</p>

@@ -10,24 +10,15 @@ Easy, fast, and cheap LLM serving for everyone
</h3>

<p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>

----

*Latest News* 🔥
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
-- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
-- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
-- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

-<details>
-<summary>Previous News</summary>
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).

-- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
+- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
@@ -43,9 +34,8 @@ Easy, fast, and cheap LLM serving for everyone
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

-</details>

---

## About

vLLM is a fast and easy-to-use library for LLM inference and serving.
@@ -58,7 +48,7 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding
- Chunked prefill
@@ -74,7 +64,7 @@ vLLM is flexible and easy to use with:
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Prefix caching support
-- Multi-LoRA support
+- Multi-lora support

vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
@@ -100,14 +90,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing

We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

## Sponsors

vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!

<!-- Note: Please sort them in alphabetical order. -->
-<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
+<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
Cash Donations:
- a16z
- Dropbox
@@ -123,7 +113,6 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
-- Intel
- Lambda Lab
- Nebius
- Novita AI
@@ -154,11 +143,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

## Contact Us

-- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
+- For technical questions and feature requests, please use GitHub issues or discussions.
-- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
+- For discussing with fellow users and coordinating contributions and development, please use Slack.
-- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For security disclosures, please use GitHub's security advisory feature.
-- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
+- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
-- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)

## Media Kit
@@ -41,39 +41,29 @@ become available.
<td><code>synthetic</code></td>
</tr>
<tr>
-<td><strong>HuggingFace-VisionArena</strong></td>
+<td><strong>HuggingFace</strong></td>
<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
+<td style="text-align: center;">🟡</td>
-<td><code>lmarena-ai/VisionArena-Chat</code></td>
+<td>Specify your dataset path on HuggingFace</td>
</tr>
<tr>
-<td><strong>HuggingFace-InstructCoder</strong></td>
+<td><strong>VisionArena</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">✅</td>
-<td><code>likaixin/InstructCoder</code></td>
+<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
-</tr>
-<tr>
-<td><strong>HuggingFace-AIMO</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
-</tr>
-<tr>
-<td><strong>HuggingFace-Other</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr>
</tbody>
</table>

✅: supported

-🟡: Partial support

🚧: to be supported

-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
+formats, please consider contributing.

+**Note**: VisionArena’s `dataset-name` should be set to `hf`
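For example, a serving-benchmark run against a HuggingFace-hosted dataset passes `--dataset-name hf` together with the dataset id as `--dataset-path`. The sketch below is illustrative only; the model, dataset, and flag values are taken from the examples later in this document and should be adjusted to your own setup:

```bash
# Illustrative sketch: benchmark against a HuggingFace-hosted dataset.
# Model and dataset are placeholders borrowed from the examples below.
python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path lmms-lab/LLaVA-OneVision-Data \
  --hf-split train \
  --num-prompts 10
```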
---
## Example - Online Benchmark
@@ -81,7 +71,8 @@ become available.
First start serving your model

```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+vllm serve ${MODEL_NAME} --disable-log-requests
```

Then run the benchmarking script
@@ -89,13 +80,12 @@ Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
---backend vllm \
---model NousResearch/Hermes-3-Llama-3.1-8B \
---endpoint /v1/completions \
---dataset-name sharegpt \
---dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
---num-prompts 10
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+NUM_PROMPTS=10
+BACKEND="vllm"
+DATASET_NAME="sharegpt"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```

If successful, you will see the following output
@@ -132,104 +122,37 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```

```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT='train'

python3 vllm/benchmarks/benchmark_serving.py \
---backend openai-chat \
+--backend "${BACKEND}" \
---model Qwen/Qwen2-VL-7B-Instruct \
+--model "${MODEL_NAME}" \
---endpoint /v1/chat/completions \
+--endpoint "/v1/chat/completions" \
---dataset-name hf \
+--dataset-name "${DATASET_NAME}" \
---dataset-path lmarena-ai/VisionArena-Chat \
+--dataset-path "${DATASET_PATH}" \
---hf-split train \
+--hf-split "${DATASET_SPLIT}" \
---num-prompts 1000
+--num-prompts "${NUM_PROMPTS}"
-```

-### InstructCoder Benchmark with Speculative Decoding

-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
---ngram_prompt_lookup_min 2 \
---ngram-prompt-lookup-max 5 \
---speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
-```

-``` bash
-python3 benchmarks/benchmark_serving.py \
---model meta-llama/Meta-Llama-3-8B-Instruct \
---dataset-name hf \
---dataset-path likaixin/InstructCoder \
---num-prompts 2048
-```

-### Other HuggingFaceDataset Examples

-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
-```

-**`lmms-lab/LLaVA-OneVision-Data`**

-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
---backend openai-chat \
---model Qwen/Qwen2-VL-7B-Instruct \
---endpoint /v1/chat/completions \
---dataset-name hf \
---dataset-path lmms-lab/LLaVA-OneVision-Data \
---hf-split train \
---hf-subset "chart2text(cauldron)" \
---num-prompts 10
-```

-**`Aeala/ShareGPT_Vicuna_unfiltered`**

-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
---backend openai-chat \
---model Qwen/Qwen2-VL-7B-Instruct \
---endpoint /v1/chat/completions \
---dataset-name hf \
---dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
---hf-split train \
---num-prompts 10
-```

-**`AI-MO/aimo-validation-aime`**

-``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
---model Qwen/QwQ-32B \
---dataset-name hf \
---dataset-path AI-MO/aimo-validation-aime \
---num-prompts 10 \
---seed 42
-```

-### Running With Sampling Parameters

-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:

-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
---backend vllm \
---model NousResearch/Hermes-3-Llama-3.1-8B \
---endpoint /v1/completions \
---dataset-name sharegpt \
---dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
---top-k 10 \
---top-p 0.9 \
---temperature 0.5 \
---num-prompts 10
```

---
## Example - Offline Throughput Benchmark

```bash
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+NUM_PROMPTS=10
+DATASET_NAME="sonnet"
+DATASET_PATH="vllm/benchmarks/sonnet.txt"

python3 vllm/benchmarks/benchmark_throughput.py \
---model NousResearch/Hermes-3-Llama-3.1-8B \
+--model "${MODEL_NAME}" \
---dataset-name sonnet \
+--dataset-name "${DATASET_NAME}" \
---dataset-path vllm/benchmarks/sonnet.txt \
+--dataset-path "${DATASET_PATH}" \
---num-prompts 10
+--num-prompts "${NUM_PROMPTS}"
```

If successful, you will see the following output
@@ -243,13 +166,19 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models

``` bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT="train"

python3 vllm/benchmarks/benchmark_throughput.py \
---model Qwen/Qwen2-VL-7B-Instruct \
+--model "${MODEL_NAME}" \
---backend vllm-chat \
+--backend "vllm-chat" \
---dataset-name hf \
+--dataset-name "${DATASET_NAME}" \
---dataset-path lmarena-ai/VisionArena-Chat \
+--dataset-path "${DATASET_PATH}" \
---num-prompts 1000 \
+--num-prompts "${NUM_PROMPTS}" \
---hf-split train
+--hf-split "${DATASET_SPLIT}"
```

The `num prompt tokens` now includes image token counts
@@ -260,82 +189,29 @@ Total num prompt tokens: 14527
Total num output tokens: 1280
```

-### InstructCoder Benchmark with Speculative Decoding

-``` bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
---dataset-name=hf \
---dataset-path=likaixin/InstructCoder \
---model=meta-llama/Meta-Llama-3-8B-Instruct \
---input-len=1000 \
---output-len=100 \
---num-prompts=2048 \
---async-engine \
---ngram_prompt_lookup_min=2 \
---ngram-prompt-lookup-max=5 \
---speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
-```

-```
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens: 261136
-Total num output tokens: 204800
-```

-### Other HuggingFaceDataset Examples

-**`lmms-lab/LLaVA-OneVision-Data`**

-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
---model Qwen/Qwen2-VL-7B-Instruct \
---backend vllm-chat \
---dataset-name hf \
---dataset-path lmms-lab/LLaVA-OneVision-Data \
---hf-split train \
---hf-subset "chart2text(cauldron)" \
---num-prompts 10
-```

-**`Aeala/ShareGPT_Vicuna_unfiltered`**

-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
---model Qwen/Qwen2-VL-7B-Instruct \
---backend vllm-chat \
---dataset-name hf \
---dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
---hf-split train \
---num-prompts 10
-```

-**`AI-MO/aimo-validation-aime`**

-```bash
-python3 benchmarks/benchmark_throughput.py \
---model Qwen/QwQ-32B \
---backend vllm \
---dataset-name hf \
---dataset-path AI-MO/aimo-validation-aime \
---hf-split train \
---num-prompts 10
-```

### Benchmark with LoRA Adapters

``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+MODEL_NAME="meta-llama/Llama-2-7b-hf"
+BACKEND="vllm"
+DATASET_NAME="sharegpt"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+NUM_PROMPTS=10
+MAX_LORAS=2
+MAX_LORA_RANK=8
+ENABLE_LORA="--enable-lora"
+LORA_PATH="yard1/llama-2-7b-sql-lora-test"

python3 vllm/benchmarks/benchmark_throughput.py \
---model meta-llama/Llama-2-7b-hf \
+--model "${MODEL_NAME}" \
---backend vllm \
+--backend "${BACKEND}" \
---dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+--dataset_path "${DATASET_PATH}" \
---dataset_name sharegpt \
+--dataset_name "${DATASET_NAME}" \
---num-prompts 10 \
+--num-prompts "${NUM_PROMPTS}" \
---max-loras 2 \
+--max-loras "${MAX_LORAS}" \
---max-lora-rank 8 \
+--max-lora-rank "${MAX_LORA_RANK}" \
---enable-lora \
+${ENABLE_LORA} \
---lora-path yard1/llama-2-7b-sql-lora-test
+--lora-path "${LORA_PATH}"
```
@@ -1,212 +0,0 @@
-#!/bin/bash

-# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
-# The current server parameter combination is max_num_seqs and max_num_batched_tokens
-# It also supports additional requirement: e2e latency and prefix cache.

-# Pre-requisite:
-# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version.
-# 2. If the model is customized, replace the MODEL's config with the customized config.
-# 3. Set variables (ALL REQUIRED)
-# BASE: your directory for vllm repo
-# MODEL: the model served by vllm
-# DOWNLOAD_DIR: directory to download and load model weights.
-# INPUT_LEN: request input len
-# OUTPUT_LEN: request output len
-# MIN_CACHE_HIT_PCT: prefix cache rate
-# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
-# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
-# 5. The final result will be saved in RESULT file.

-# Example use cases
-# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
-# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
-# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
-# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
-# 3. If we want to reach 60% prefix cache, what's the best server parameter?
-# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
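As a hedged illustration of use case 1 above (not part of the removed script itself), the required variables might be filled in roughly as follows; `BASE` is a hypothetical path and the other values are taken from the script's own comments and defaults:

```bash
# Illustrative settings for example use case 1 (highest throughput, no latency cap).
# BASE="$HOME/src"                          # assumed path containing your vllm checkout
# MODEL="meta-llama/Llama-3.1-8B-Instruct"  # default model from the script below
# DOWNLOAD_DIR=""                           # leave empty to use the default weight cache
# INPUT_LEN=1800
# OUTPUT_LEN=20
# MIN_CACHE_HIT_PCT=0
# MAX_LATENCY_ALLOWED_MS=100000000000
```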
-TAG=$(date +"%Y_%m_%d_%H_%M")
-BASE=""
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MIN_CACHE_HIT_PCT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000

-LOG_FOLDER="$BASE/auto-benchmark/$TAG"
-RESULT="$LOG_FOLDER/result.txt"

-echo "result file$ $RESULT"
-echo "model: $MODEL"
-echo

-rm -rf $LOG_FOLDER
-mkdir -p $LOG_FOLDER

-cd "$BASE/vllm"
-# create sonnet-4x.txt so that we can sample 2048 tokens for input
-echo "" > benchmarks/sonnet_4x.txt
-for _ in {1..4}
-do
-cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
-done

-pip install datasets

-current_hash=$(git rev-parse HEAD)
-echo "hash:$current_hash" >> "$RESULT"
-echo "current_hash: $current_hash"

-best_throughput=0
-best_max_num_seqs=0
-best_num_batched_tokens=0
-best_goodput=0
-run_benchmark() {
-local max_num_seqs=$1
-local max_num_batched_tokens=$2
-echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
-local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
-echo "vllm_log: $vllm_log"
-echo
-rm -f $vllm_log

-# start the server
-VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
---disable-log-requests \
---port 8004 \
---gpu-memory-utilization 0.98 \
---max-num-seqs $max_num_seqs \
---max-num-batched-tokens $max_num_batched_tokens \
---tensor-parallel-size 1 \
---enable-prefix-caching \
---load-format dummy \
---download-dir $DOWNLOAD_DIR \
---max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
-echo "wait for 10 minutes.."
-echo
-# wait for 10 minutes...
-server_started=0
-for i in {1..60}; do
-if grep -Fq "Application startup complete" "$vllm_log"; then
-echo "Application started"
-server_started=1
-break
-else
-# echo "wait for 10 seconds..."
-sleep 10
-fi
-done

-if (( ! server_started )); then
-echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
-echo "pkill -f vllm"
-echo
-pkill vllm
-sleep 10
-return 1
-fi

-echo "run benchmark test..."
-echo
-meet_latency_requirement=0
-# get a basic qps by using request-rate inf
-bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
-prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
-python benchmarks/benchmark_serving.py \
---backend vllm \
---model $MODEL \
---dataset-name sonnet \
---dataset-path benchmarks/sonnet_4x.txt \
---sonnet-input-len $INPUT_LEN \
---sonnet-output-len $OUTPUT_LEN \
---ignore-eos \
---disable-tqdm \
---request-rate inf \
---percentile-metrics ttft,tpot,itl,e2el \
---goodput e2el:$MAX_LATENCY_ALLOWED_MS \
---num-prompts 100 \
---sonnet-prefix-len $prefix_len \
---port 8004 > "$bm_log"
-through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
-e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
-goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

-if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
-meet_latency_requirement=1
-fi

-if (( ! meet_latency_requirement )); then
-# start from request-rate as int(through_put) + 1
-request_rate=$((${through_put%.*} + 1))
-while ((request_rate > 0)); do
-# clear prefix cache
-curl -X POST http://0.0.0.0:8004/reset_prefix_cache
-sleep 5
-bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-python benchmarks/benchmark_serving.py \
---backend vllm \
---model $MODEL \
---dataset-name sonnet \
---dataset-path benchmarks/sonnet_4x.txt \
---sonnet-input-len $INPUT_LEN \
---sonnet-output-len $OUTPUT_LEN \
---ignore_eos \
---disable-tqdm \
---request-rate $request_rate \
---percentile-metrics ttft,tpot,itl,e2el \
---goodput e2el:$MAX_LATENCY_ALLOWED_MS \
---num-prompts 100 \
---sonnet-prefix-len $prefix_len \
---port 8004 > "$bm_log"
-through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
-e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
-goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
-if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
-meet_latency_requirement=1
-break
-fi
-request_rate=$((request_rate-1))
-done
-fi
-# write the results and update the best result.
-if ((meet_latency_requirement)); then
-echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
-echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
-if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
-best_throughput=$through_put
-best_max_num_seqs=$max_num_seqs
-best_num_batched_tokens=$max_num_batched_tokens
-best_goodput=$goodput
-fi
-else
-echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
-echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
-fi

-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

-echo "pkill -f vllm"
-echo
-pkill vllm
-sleep 10
-rm -f $vllm_log
-printf '=%.0s' $(seq 1 20)
-return 0
-}

-num_seqs_list="128 256"
-num_batched_tokens_list="512 1024 2048 4096"
-for num_seqs in $num_seqs_list; do
-for num_batched_tokens in $num_batched_tokens_list; do
-run_benchmark $num_seqs $num_batched_tokens
-exit 0
-done
-done
-echo "finish permutations"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

-import io
import json
import os
import sys
@@ -12,7 +11,8 @@ from typing import Optional, Union
import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
-from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+PreTrainedTokenizerFast)

# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.
@@ -32,7 +32,6 @@ class RequestFuncInput:
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
-language: Optional[str] = None


@dataclass
@@ -42,7 +41,8 @@ class RequestFuncOutput:
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0  # Time to first token
-itl: list[float] = field(default_factory=list)  # list of inter-token latencies
+itl: list[float] = field(
+default_factory=list)  # list of inter-token latencies
tpot: float = 0.0  # avg next-token latencies
prompt_len: int = 0
error: str = ""
@@ -55,16 +55,15 @@ async def async_request_tgi(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
) as session:
|
|
||||||
params = {
|
params = {
|
||||||
"max_new_tokens": request_func_input.output_len,
|
"max_new_tokens": request_func_input.output_len,
|
||||||
"do_sample": True,
|
"do_sample": True,
|
||||||
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
||||||
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
||||||
"truncate": request_func_input.prompt_len,
|
"truncate": request_func_input.prompt_len,
|
||||||
"ignore_eos_token": request_func_input.ignore_eos,
|
# TGI does not accept ignore_eos flag.
|
||||||
}
|
}
|
||||||
payload = {
|
payload = {
|
||||||
"inputs": request_func_input.prompt,
|
"inputs": request_func_input.prompt,
|
||||||
@@ -72,10 +71,6 @@ async def async_request_tgi(
|
|||||||
}
|
}
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
if request_func_input.ignore_eos:
|
|
||||||
output.output_tokens = request_func_input.output_len
|
|
||||||
else:
|
|
||||||
output.output_tokens = None
|
|
||||||
|
|
||||||
ttft = 0.0
|
ttft = 0.0
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
@@ -104,7 +99,8 @@ async def async_request_tgi(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp - most_recent_timestamp)
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@@ -131,9 +127,8 @@ async def async_request_trt_llm(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
) as session:
|
|
||||||
payload = {
|
payload = {
|
||||||
"accumulate_tokens": True,
|
"accumulate_tokens": True,
|
||||||
"text_input": request_func_input.prompt,
|
"text_input": request_func_input.prompt,
|
||||||
@@ -158,7 +153,8 @@ async def async_request_trt_llm(
|
|||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data:")
|
||||||
|
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
output.generated_text += data["text_output"]
|
output.generated_text += data["text_output"]
|
||||||
@@ -170,7 +166,8 @@ async def async_request_trt_llm(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp - most_recent_timestamp)
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@@ -194,23 +191,15 @@ async def async_request_deepspeed_mii(
|
|||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
assert api_url.endswith(("completions", "profile")), (
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
"OpenAI Completions API URL must end with 'completions' or 'profile'."
|
|
||||||
)
|
|
||||||
|
|
||||||
async with aiohttp.ClientSession(
|
|
||||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
|
||||||
) as session:
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model,
|
|
||||||
"prompt": request_func_input.prompt,
|
"prompt": request_func_input.prompt,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
||||||
"top_p": 1.0,
|
"top_p": 1.0,
|
||||||
}
|
}
|
||||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
|
||||||
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
@@ -221,22 +210,12 @@ async def async_request_deepspeed_mii(
|
|||||||
|
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
async with session.post(
|
async with session.post(url=request_func_input.api_url,
|
||||||
url=api_url, json=payload, headers=headers
|
json=payload) as response:
|
||||||
) as response:
|
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
parsed_resp = await response.json()
|
parsed_resp = await response.json()
|
||||||
output.latency = time.perf_counter() - st
|
output.latency = time.perf_counter() - st
|
||||||
if "choices" in parsed_resp:
|
|
||||||
output.generated_text = parsed_resp["choices"][0]["text"]
|
|
||||||
elif "text" in parsed_resp:
|
|
||||||
output.generated_text = parsed_resp["text"][0]
|
output.generated_text = parsed_resp["text"][0]
|
||||||
else:
|
|
||||||
output.error = (
|
|
||||||
"Unexpected response format: "
|
|
||||||
"neither 'choices' nor 'text' found"
|
|
||||||
)
|
|
||||||
output.success = False
|
|
||||||
output.success = True
|
output.success = True
|
||||||
else:
|
else:
|
||||||
output.error = response.reason or ""
|
output.error = response.reason or ""
|
||||||
@@ -256,20 +235,17 @@ async def async_request_openai_completions(
     pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
-    assert api_url.endswith(("completions", "profile")), (
-        "OpenAI Completions API URL must end with 'completions' or 'profile'."
-    )
+    assert api_url.endswith(
+        ("completions", "profile")
+    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
-            "model": request_func_input.model_name
-            if request_func_input.model_name
-            else request_func_input.model,
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
             "prompt": request_func_input.prompt,
             "temperature": 0.0,
-            "repetition_penalty": 1.0,
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
             "stream": True,
@@ -281,7 +257,9 @@ async def async_request_openai_completions(
             payload["ignore_eos"] = request_func_input.ignore_eos
         if request_func_input.extra_body:
             payload.update(request_func_input.extra_body)
-        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }

         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
@@ -290,9 +268,8 @@ async def async_request_openai_completions(
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
-            async with session.post(
-                url=api_url, json=payload, headers=headers
-            ) as response:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
                 if response.status == 200:
                     first_chunk_received = False
                     async for chunk_bytes in response.content:
@@ -300,7 +277,8 @@ async def async_request_openai_completions(
                         if not chunk_bytes:
                             continue

-                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
                         if chunk != "[DONE]":
                             data = json.loads(chunk)

@@ -320,20 +298,21 @@ async def async_request_openai_completions(

                             # Decoding phase
                             else:
-                                output.itl.append(timestamp - most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)

                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
                         elif usage := data.get("usage"):
-                            output.output_tokens = usage.get("completion_tokens")
+                            output.output_tokens = usage.get(
+                                "completion_tokens")
                     if first_chunk_received:
                         output.success = True
                     else:
                         output.success = False
                         output.error = (
                             "Never received a valid chunk to calculate TTFT."
-                            "This response will be marked as failed!"
-                        )
+                            "This response will be marked as failed!")
                 output.generated_text = generated_text
                 output.latency = most_recent_timestamp - st
             else:
@@ -354,22 +333,23 @@ async def async_request_openai_chat_completions(
     pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
-    assert api_url.endswith(("chat/completions", "profile")), (
-        "OpenAI Chat Completions API URL must end with 'chat/completions'."
-    )
+    assert api_url.endswith(
+        ("chat/completions", "profile")
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         content = [{"type": "text", "text": request_func_input.prompt}]
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)
         payload = {
-            "model": request_func_input.model_name
-            if request_func_input.model_name
-            else request_func_input.model,
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
             "messages": [
-                {"role": "user", "content": content},
+                {
+                    "role": "user",
+                    "content": content
+                },
             ],
             "temperature": 0.0,
             "max_completion_tokens": request_func_input.output_len,
@@ -395,16 +375,16 @@ async def async_request_openai_chat_completions(
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
-            async with session.post(
-                url=api_url, json=payload, headers=headers
-            ) as response:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
                 if response.status == 200:
                     async for chunk_bytes in response.content:
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue

-                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
                         if chunk != "[DONE]":
                             timestamp = time.perf_counter()
                             data = json.loads(chunk)
@@ -418,118 +398,13 @@ async def async_request_openai_chat_completions(

                             # Decoding phase
                             else:
-                                output.itl.append(timestamp - most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)
-                            generated_text += content or ""
-                        elif usage := data.get("usage"):
-                            output.output_tokens = usage.get("completion_tokens")
-
-                            most_recent_timestamp = timestamp
-
-                output.generated_text = generated_text
-                output.success = True
-                output.latency = most_recent_timestamp - st
-            else:
-                output.error = response.reason or ""
-                output.success = False
-    except Exception:
-        output.success = False
-        exc_info = sys.exc_info()
-        output.error = "".join(traceback.format_exception(*exc_info))
-
-    if pbar:
-        pbar.update(1)
-    return output
-
-
-async def async_request_openai_audio(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    # Lazy import without PlaceholderModule to avoid vllm dep.
-    import soundfile
-
-    api_url = request_func_input.api_url
-    assert api_url.endswith(("transcriptions", "translations")), (
-        "OpenAI Chat Completions API URL must end with 'transcriptions' "
-        "or `translations`."
-    )
-
-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
-        content = [{"type": "text", "text": request_func_input.prompt}]
-        payload = {
-            "model": request_func_input.model_name
-            if request_func_input.model_name
-            else request_func_input.model,
-            "temperature": 0.0,
-            "max_completion_tokens": request_func_input.output_len,
-            "stream": True,
-            "language": "en",
-            # Flattened due to multipart/form-data
-            "stream_include_usage": True,
-            "stream_continuous_usage_stats": True,
-        }
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        }
-
-        # Send audio file
-        def to_bytes(y, sr):
-            buffer = io.BytesIO()
-            soundfile.write(buffer, y, sr, format="WAV")
-            buffer.seek(0)
-            return buffer
-
-        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
-            form = aiohttp.FormData()
-            form.add_field("file", f, content_type="audio/wav")
-            for key, value in payload.items():
-                form.add_field(key, str(value))
-
-            output = RequestFuncOutput()
-            output.prompt_len = request_func_input.prompt_len
-
-            generated_text = ""
-            ttft = 0.0
-            st = time.perf_counter()
-            most_recent_timestamp = st
-            try:
-                async with session.post(
-                    url=api_url, data=form, headers=headers
-                ) as response:
-                    if response.status == 200:
-                        async for chunk_bytes in response.content:
-                            chunk_bytes = chunk_bytes.strip()
-                            if not chunk_bytes:
-                                continue
-
-                            chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
-                            if chunk != "[DONE]":
-                                timestamp = time.perf_counter()
-                                data = json.loads(chunk)
-
-                                if choices := data.get("choices"):
-                                    content = choices[0]["delta"].get("content")
-                                    # First token
-                                    if ttft == 0.0:
-                                        ttft = timestamp - st
-                                        output.ttft = ttft
-
-                                    # Decoding phase
-                                    else:
-                                        output.itl.append(
-                                            timestamp - most_recent_timestamp
-                                        )
-
                                 generated_text += content or ""
                             elif usage := data.get("usage"):
                                 output.output_tokens = usage.get(
-                                    "completion_tokens"
-                                )
+                                    "completion_tokens")

                                 most_recent_timestamp = timestamp
@@ -550,7 +425,7 @@ async def async_request_openai_audio(


 def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
+    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download

         from vllm.model_executor.model_loader.weight_utils import get_lock
@@ -561,8 +436,7 @@ def get_model(pretrained_model_name_or_path: str) -> str:
             model_path = snapshot_download(
                 model_id=pretrained_model_name_or_path,
                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
-            )
+                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

             return model_path
     return pretrained_model_name_or_path
@@ -575,23 +449,23 @@ def get_tokenizer(
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     if pretrained_model_name_or_path is not None and not os.path.exists(
-        pretrained_model_name_or_path
-    ):
-        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+            pretrained_model_name_or_path):
+        pretrained_model_name_or_path = get_model(
+            pretrained_model_name_or_path)
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
-            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
         kwargs["use_fast"] = False
     if tokenizer_mode == "mistral":
         try:
             from vllm.transformers_utils.tokenizer import MistralTokenizer
         except ImportError as e:
-            raise ImportError(
-                "MistralTokenizer requires vllm package.\n"
-                "Please install it with `pip install vllm` "
-                "to use mistral tokenizer mode."
-            ) from e
-        return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
+            raise ImportError("MistralTokenizer requires vllm package.\n"
+                              "Please install it with `pip install vllm` "
+                              "to use mistral tokenizer mode.") from e
+        return MistralTokenizer.from_pretrained(
+            str(pretrained_model_name_or_path))
     else:
         return AutoTokenizer.from_pretrained(
             pretrained_model_name_or_path,
@@ -607,14 +481,7 @@ ASYNC_REQUEST_FUNCS = {
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
-    "openai-audio": async_request_openai_audio,
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k
-    for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions, async_request_openai_chat_completions)
-]
@@ -17,14 +17,12 @@ SampleRequest instances, similar to the approach used in ShareGPT.
 import base64
 import io
 import json
-import logging
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass
 from functools import cache
-from io import BytesIO
-from typing import Any, Callable, Optional, Union
+from typing import Any, Optional, Union

 import numpy as np
 import pandas as pd
@@ -35,11 +33,8 @@ from transformers import PreTrainedTokenizerBase
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer

-logger = logging.getLogger(__name__)
-
 # -----------------------------------------------------------------------------
 # Data Classes
 # -----------------------------------------------------------------------------
@@ -65,7 +60,9 @@ class SampleRequest:

 class BenchmarkDataset(ABC):
     DEFAULT_SEED = 0
-    IS_MULTIMODAL = False
+
+    # num_requests has default 1000 in both the benchmark_serving.py and
+    # benchmark_throughput.py

     def __init__(
         self,
@@ -83,16 +80,18 @@ class BenchmarkDataset(ABC):
         self.dataset_path = dataset_path
         # Set the random seed, ensuring that a None value is replaced with the
         # default seed.
-        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
+        self.random_seed = (random_seed
+                            if random_seed is not None else self.DEFAULT_SEED)
         self.data = None

     def apply_multimodal_chat_transformation(
-        self, prompt: str, mm_content: Optional[MultiModalDataDict] = None
-    ) -> list[dict]:
+            self,
+            prompt: str,
+            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
         """
         Transform a prompt and optional multimodal content into a chat format.
-        This method is used for chat models that expect a specific conversation
-        format.
+        This method is used for chat models that expect a specific
+        conversation format.
         """
         content = [{"text": prompt, "type": "text"}]
         if mm_content is not None:
@@ -110,7 +109,8 @@ class BenchmarkDataset(ABC):
             NotImplementedError: If a subclass does not implement this method.
         """
         # TODO (jenniferzhao): add support for downloading data
-        raise NotImplementedError("load_data must be implemented in subclasses.")
+        raise NotImplementedError(
+            "load_data must be implemented in subclasses.")

     def get_random_lora_request(
         self,
@@ -156,9 +156,8 @@ class BenchmarkDataset(ABC):
         return lora_request, lora_tokenizer_cache[lora_id] or tokenizer

     @abstractmethod
-    def sample(
-        self, tokenizer: PreTrainedTokenizerBase, num_requests: int
-    ) -> list[SampleRequest]:
+    def sample(self, tokenizer: PreTrainedTokenizerBase,
+               num_requests: int) -> list[SampleRequest]:
         """
         Abstract method to generate sample requests from the dataset.

@@ -176,23 +175,6 @@ class BenchmarkDataset(ABC):
         """
         raise NotImplementedError("sample must be implemented in subclasses.")

-    def maybe_oversample_requests(
-        self, requests: list[SampleRequest], num_requests: int
-    ) -> None:
-        """
-        Oversamples the list of requests if its size is less than the desired
-        number.
-
-        Args:
-            requests (List[SampleRequest]): The current list of sampled
-            requests. num_requests (int): The target number of requests.
-        """
-        if len(requests) < num_requests:
-            random.seed(self.random_seed)
-            additional = random.choices(requests, k=num_requests - len(requests))
-            requests.extend(additional)
-            logger.info("Oversampled requests to reach %d total samples.", num_requests)
-

 # -----------------------------------------------------------------------------
 # Utility Functions and Global Caches
@@ -216,14 +198,14 @@ def is_valid_sequence(
     """
     # Check for invalid conditions
     prompt_too_short = prompt_len < min_len
-    output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
+    output_too_short = (not skip_min_output_len_check) and (output_len
+                                                            < min_len)
     prompt_too_long = prompt_len > max_prompt_len
     combined_too_long = (prompt_len + output_len) > max_total_len

     # Return True if none of the invalid conditions are met
-    return not (
-        prompt_too_short or output_too_short or prompt_too_long or combined_too_long
-    )
+    return not (prompt_too_short or output_too_short or prompt_too_long
+                or combined_too_long)


 @cache
@@ -239,44 +221,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
     """
     Process a single image input and return a multimedia content dictionary.

-    Supports three input types:
-
-    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
-    containing raw image data. - Loads the bytes as a PIL.Image.Image.
-
-    2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
-    a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
-    a dictionary with the image as a base64 data URL.
-
-    3. String input: - Treats the string as a URL or local file path. -
-    Prepends "file://" if the string doesn't start with "http://" or
-    "file://". - Returns a dictionary with the image URL.
+    For a PIL.Image.Image input:
+    - Converts the image to RGB.
+    - Saves the image as a JPEG in-memory.
+    - Encodes the JPEG data as a base64 string.
+    - Returns a dictionary with the image as a base64 data URL.
+
+    For a string input:
+    - Treats the string as a URL or file path.
+    - Prepends "file://" if the string doesn't start with "http://" or
+      "file://".
+    - Returns a dictionary with the image URL.

     Raises:
-        ValueError: If the input is not a supported type.
+        ValueError: If the input is neither a PIL.Image.Image nor a string.
     """
-    if isinstance(image, dict) and "bytes" in image:
-        image = Image.open(BytesIO(image["bytes"]))
     if isinstance(image, Image.Image):
-        image = convert_image_mode(image, "RGB")
+        image = image.convert("RGB")
         with io.BytesIO() as image_data:
             image.save(image_data, format="JPEG")
-            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
+            image_base64 = base64.b64encode(
+                image_data.getvalue()).decode("utf-8")
         return {
             "type": "image_url",
-            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+            "image_url": {
+                "url": f"data:image/jpeg;base64,{image_base64}"
+            },
         }

     if isinstance(image, str):
-        image_url = (
-            image if image.startswith(("http://", "file://")) else f"file://{image}"
-        )
+        image_url = (image if image.startswith(
+            ("http://", "file://")) else f"file://{image}")
         return {"type": "image_url", "image_url": {"url": image_url}}

     raise ValueError(
-        f"Invalid image input {image}. Must be a PIL.Image.Image"
-        " or str or dictionary with raw image bytes."
-    )
+        f"Invalid image input {image}. Must be a PIL.Image.Image or str.")


 # -----------------------------------------------------------------------------
@@ -287,7 +266,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
 class RandomDataset(BenchmarkDataset):
     # Default values copied from benchmark_serving.py for the random dataset.
     DEFAULT_PREFIX_LEN = 0
-    DEFAULT_RANGE_RATIO = 0.0
+    DEFAULT_RANGE_RATIO = 1.0
     DEFAULT_INPUT_LEN = 1024
     DEFAULT_OUTPUT_LEN = 128

@@ -297,72 +276,44 @@ class RandomDataset(BenchmarkDataset):
     ) -> None:
         super().__init__(**kwargs)

-    def sample(
-        self,
+    def sample(self,
                tokenizer: PreTrainedTokenizerBase,
                num_requests: int,
                prefix_len: int = DEFAULT_PREFIX_LEN,
                range_ratio: float = DEFAULT_RANGE_RATIO,
                input_len: int = DEFAULT_INPUT_LEN,
                output_len: int = DEFAULT_OUTPUT_LEN,
-        **kwargs,
-    ) -> list[SampleRequest]:
-        # Enforce range_ratio < 1
-        assert range_ratio < 1.0, (
-            "random_range_ratio must be < 1.0 to ensure a valid sampling range"
-        )
-
+               **kwargs) -> list[SampleRequest]:
         vocab_size = tokenizer.vocab_size
-        num_special_tokens = tokenizer.num_special_tokens_to_add()
-        real_input_len = input_len - num_special_tokens

-        prefix_token_ids = (
-            np.random.randint(0, vocab_size, size=prefix_len).tolist()
-            if prefix_len > 0
-            else []
-        )
+        prefix_token_ids = (np.random.randint(
+            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])

-        # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(real_input_len * (1 - range_ratio))
-        input_high = int(real_input_len * (1 + range_ratio))
-        output_low = int(output_len * (1 - range_ratio))
-        output_high = int(output_len * (1 + range_ratio))
-
-        # Add logging for debugging
-        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
-        logger.info("Sampling output_len from [%s, %s]", output_low, output_high)
-
-        input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
-        output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
+        input_low = int(input_len * range_ratio)
+        output_low = int(output_len * range_ratio)
+
+        input_lens = np.random.randint(input_low,
+                                       input_len + 1,
+                                       size=num_requests)
+        output_lens = np.random.randint(output_low,
+                                        output_len + 1,
+                                        size=num_requests)
         offsets = np.random.randint(0, vocab_size, size=num_requests)

         requests = []
         for i in range(num_requests):
-            inner_seq = (
-                (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
-            ).tolist()
+            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
+                         vocab_size).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
-            # After decoding the prompt we have to encode and decode it again.
-            # This is done because in some cases N consecutive tokens
-            # give a string tokenized into != N number of tokens.
-            # For example for GPT2Tokenizer:
-            # [6880, 6881] -> ['Ġcalls', 'here'] ->
-            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
-            # To avoid uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decode again.
-            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
-                : input_lens[i]
-            ]
-            prompt = tokenizer.decode(re_encoded_sequence)
             total_input_len = prefix_len + int(input_lens[i])
             requests.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=total_input_len,
                     expected_output_len=int(output_lens[i]),
-                )
-            )
+                ))
         return requests
@@ -389,56 +340,49 @@ class ShareGPTDataset(BenchmarkDataset):
             self.data = json.load(f)
         # Filter entries with at least two conversation turns.
         self.data = [
-            entry
-            for entry in self.data
+            entry for entry in self.data
             if "conversations" in entry and len(entry["conversations"]) >= 2
         ]
         random.seed(self.random_seed)
         random.shuffle(self.data)

-    def sample(
-        self,
+    def sample(self,
                tokenizer: PreTrainedTokenizerBase,
                num_requests: int,
                lora_path: Optional[str] = None,
                max_loras: Optional[int] = None,
                output_len: Optional[int] = None,
                enable_multimodal_chat: bool = False,
-        **kwargs,
-    ) -> list:
+               **kwargs) -> list:
         samples: list = []
         for entry in self.data:
             if len(samples) >= num_requests:
                 break
-            prompt, completion = (
-                entry["conversations"][0]["value"],
-                entry["conversations"][1]["value"],
-            )
+            prompt, completion = entry["conversations"][0]["value"],\
+                entry["conversations"][1]["value"]

             lora_request, tokenizer = self.get_random_lora_request(
-                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
-            )
+                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
             prompt_ids = tokenizer(prompt).input_ids
             completion_ids = tokenizer(completion).input_ids
             prompt_len = len(prompt_ids)
-            new_output_len = len(completion_ids) if output_len is None else output_len
-            if not is_valid_sequence(
-                prompt_len,
-                new_output_len,
-                skip_min_output_len_check=output_len is not None,
-            ):
+            new_output_len = (len(completion_ids)
+                              if output_len is None else output_len)
+            if not is_valid_sequence(prompt_len,
+                                     new_output_len,
+                                     skip_min_output_len_check=output_len
+                                     is not None):
                 continue
             if enable_multimodal_chat:
-                prompt = self.apply_multimodal_chat_transformation(prompt, None)
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, None)
             samples.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=prompt_len,
                     expected_output_len=new_output_len,
                     lora_request=lora_request,
-                )
-            )
-        self.maybe_oversample_requests(samples, num_requests)
+                ))
         return samples
@@ -471,57 +415,53 @@ class SonnetDataset(BenchmarkDataset):
         with open(self.dataset_path, encoding="utf-8") as f:
             self.data = f.readlines()

-    def sample(
-        self,
+    def sample(self,
                tokenizer,
                num_requests: int,
                prefix_len: int = DEFAULT_PREFIX_LEN,
                input_len: int = DEFAULT_INPUT_LEN,
                output_len: int = DEFAULT_OUTPUT_LEN,
                return_prompt_formatted: bool = False,
-        **kwargs,
-    ) -> list:
+               **kwargs) -> list:
         # Calculate average token length for a poem line.
         tokenized_lines = [tokenizer(line).input_ids for line in self.data]
-        avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
+        avg_len = sum(len(tokens)
+                      for tokens in \
+                      tokenized_lines) / len(tokenized_lines)

         # Build the base prompt.
         base_prompt = "Pick as many lines as you can from these poem lines:\n"
         base_msg = [{"role": "user", "content": base_prompt}]
-        base_fmt = tokenizer.apply_chat_template(
-            base_msg, add_generation_prompt=True, tokenize=False
-        )
+        base_fmt = tokenizer.apply_chat_template(base_msg,
+                                                 add_generation_prompt=True,
+                                                 tokenize=False)
         base_offset = len(tokenizer(base_fmt).input_ids)
         if input_len <= base_offset:
             raise ValueError(
                 f"'input_len' must be higher than the base prompt length "
-                f"({base_offset})."
-            )
+                f"({base_offset}).")

         # Determine how many poem lines to use.
         num_input_lines = round((input_len - base_offset) / avg_len)
-        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
+        num_prefix_lines = round((prefix_len - base_offset) / avg_len)
         prefix_lines = self.data[:num_prefix_lines]

         samples = []
-        while len(samples) < num_requests:
-            extra_lines = random.choices(
-                self.data, k=num_input_lines - num_prefix_lines
-            )
+        for _ in range(num_requests):
+            extra_lines = random.choices(self.data,
+                                         k=num_input_lines - num_prefix_lines)
             prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
             msg = [{"role": "user", "content": prompt}]
             prompt_formatted = tokenizer.apply_chat_template(
-                msg, add_generation_prompt=True, tokenize=False
-            )
+                msg, add_generation_prompt=True, tokenize=False)
             prompt_len = len(tokenizer(prompt_formatted).input_ids)
-            if prompt_len <= input_len:
             samples.append(
                 SampleRequest(
-                    prompt=prompt_formatted if return_prompt_formatted else prompt,
+                    prompt=prompt_formatted
+                    if return_prompt_formatted else prompt,
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
-                )
-            )
+                ))
         return samples
@@ -541,9 +481,7 @@ class BurstGPTDataset(BenchmarkDataset):
         super().__init__(**kwargs)
         self.load_data()

-    def load_data(
-        self,
-    ):
+    def load_data(self, ):
         if self.dataset_path is None:
             raise ValueError("dataset_path must be provided for loading data.")

@@ -557,7 +495,8 @@ class BurstGPTDataset(BenchmarkDataset):

     def _sample_loaded_data(self, num_requests: int) -> list:
         if num_requests <= len(self.data):
-            data = self.data.sample(n=num_requests, random_state=self.random_seed)
+            data = self.data.sample(n=num_requests,
+                                    random_state=self.random_seed)
         else:
             data = self.data.sample(
                 n=num_requests,
@@ -567,22 +506,19 @@ class BurstGPTDataset(BenchmarkDataset):
         # Convert the dataframe to a list of lists.
         return data.values.tolist()

-    def sample(
-        self,
+    def sample(self,
                tokenizer: PreTrainedTokenizerBase,
                num_requests: int,
                max_loras: Optional[int] = None,
                lora_path: Optional[str] = None,
-        **kwargs,
-    ) -> list[SampleRequest]:
+               **kwargs) -> list[SampleRequest]:
         samples = []
         data = self._sample_loaded_data(num_requests=num_requests)
         for i in range(num_requests):
             input_len = int(data[i][2])
             output_len = int(data[i][3])
             lora_req, tokenizer = self.get_random_lora_request(
-                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
-            )
+                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
             vocab_size = tokenizer.vocab_size
             # Generate a synthetic prompt: a list of token IDs computed as (i +
             # j) modulo vocab_size.
@@ -594,71 +530,65 @@ class BurstGPTDataset(BenchmarkDataset):
                     prompt_len=input_len,
                     expected_output_len=output_len,
                     lora_request=lora_req,
-                )
-            )
+                ))
         return samples


 # -----------------------------------------------------------------------------
-# HuggingFace Dataset Base Implementation
+# HuggingFace Dataset Implementation
 # -----------------------------------------------------------------------------
-class HuggingFaceDataset(BenchmarkDataset):
-    """Base class for datasets hosted on HuggingFace."""

-    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
+
+class HuggingFaceDataset(BenchmarkDataset):
+    """
+    Dataset class for processing a HuggingFace dataset with conversation data
+    and optional images.
+    """
+    DEFAULT_NUM_REQUESTS = 1000

     def __init__(
         self,
-        dataset_path: str,
         dataset_split: str,
         dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
-        super().__init__(dataset_path=dataset_path, **kwargs)
+        super().__init__(**kwargs)

         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset

         self.load_data()

     def load_data(self) -> None:
-        """Load data from HuggingFace datasets."""
+        if not self.dataset_path:
+            raise ValueError("dataset_path must be provided for loading data.")
+
         self.data = load_dataset(
             self.dataset_path,
             name=self.dataset_subset,
             split=self.dataset_split,
             streaming=True,
         )
-        self.data = self.data.shuffle(seed=self.random_seed)
-
-
-# -----------------------------------------------------------------------------
-# Conversation Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ConversationDataset(HuggingFaceDataset):
-    """Dataset for conversation data with multimodal support."""
-
-    SUPPORTED_DATASET_PATHS = {
-        "lmms-lab/LLaVA-OneVision-Data",
-        "Aeala/ShareGPT_Vicuna_unfiltered",
-    }
-    IS_MULTIMODAL = True
-
-    def sample(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        output_len: Optional[int] = None,
-        enable_multimodal_chat: bool = False,
-        **kwargs,
-    ) -> list:
-        # Filter examples with at least 2 conversations
-        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
+        if self.data.features is None or "conversations" \
+                not in self.data.features:
+            raise ValueError(
+                "HuggingFaceDataset currently only supports datasets with "
+                "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+                "Please consider contributing if you would like to add "
+                "support for additional dataset formats.")
+        # Shuffle and filter examples with at least 2 conversations.
+        self.data = self.data.shuffle(seed=self.random_seed).filter(
+            lambda x: len(x["conversations"]) >= 2)
+
+    def sample(self,
+               tokenizer: PreTrainedTokenizerBase,
+               num_requests: int,
+               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
+               **kwargs) -> list:
         sampled_requests = []
         dynamic_output = output_len is None

-        for item in filtered_data:
+        for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
             conv = item["conversations"]
@@ -670,23 +600,24 @@ class ConversationDataset(HuggingFaceDataset):
             completion_len = len(completion_ids)
             output_len = completion_len if dynamic_output else output_len
             assert isinstance(output_len, int) and output_len > 0
-            if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
+            if dynamic_output and not is_valid_sequence(
+                    prompt_len, completion_len):
                 continue
-            mm_content = process_image(item["image"]) if "image" in item else None
+            mm_content = process_image(
+                item["image"]) if "image" in item else None
             if enable_multimodal_chat:
                 # Note: when chat is enabled the request prompt_len is no longer
                 # accurate and we will be using request output to count the
                 # actual prompt len and output len
-                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
-                )
-            )
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+                ))
         return sampled_requests

@@ -701,370 +632,57 @@ class VisionArenaDataset(HuggingFaceDataset):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
DEFAULT_OUTPUT_LEN = 128
|
DEFAULT_OUTPUT_LEN = 128
|
||||||
SUPPORTED_DATASET_PATHS = {
|
DEFAULT_NUM_REQUESTS = 1000
|
||||||
"lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
|
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
|
||||||
"lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
|
|
||||||
}
|
|
||||||
IS_MULTIMODAL = True
|
|
||||||
|
|
||||||
def sample(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
**kwargs,
|
||||||
|
) -> None:
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
|
||||||
|
raise ValueError(f"Only support Vision Arena dataset.\
|
||||||
|
This data path {self.dataset_path} is not valid.")
|
||||||
|
if self.dataset_subset is None and self.dataset_split != "train":
|
||||||
|
raise ValueError("Dataset split must be 'train'.")
|
||||||
|
|
||||||
|
self.load_data()
|
||||||
|
|
||||||
|
def load_data(self) -> None:
|
||||||
|
dataset = load_dataset(
|
||||||
|
self.dataset_path,
|
||||||
|
name=self.dataset_subset,
|
||||||
|
split=self.dataset_split,
|
||||||
|
streaming=True,
|
||||||
|
)
|
||||||
|
self.data = dataset.shuffle(seed=self.random_seed)
|
||||||
|
|
||||||
|
def sample(self,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
output_len: Optional[int] = None,
|
output_len: Optional[int] = None,
|
||||||
enable_multimodal_chat: bool = False,
|
enable_multimodal_chat: bool = False,
|
||||||
**kwargs,
|
**kwargs) -> list:
|
||||||
) -> list:
|
output_len = (output_len
|
||||||
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
|
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
|
||||||
sampled_requests = []
|
sampled_requests = []
|
||||||
for item in self.data:
|
for item in self.data:
|
||||||
if len(sampled_requests) >= num_requests:
|
if len(sampled_requests) >= num_requests:
|
||||||
break
|
break
|
||||||
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
|
prompt = item["turns"][0][0]["content"]
|
||||||
if parser_fn is None:
|
|
||||||
raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
|
|
||||||
prompt = parser_fn(item)
|
|
||||||
mm_content = process_image(item["images"][0])
|
mm_content = process_image(item["images"][0])
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
prompt_len = len(tokenizer(prompt).input_ids)
|
||||||
if enable_multimodal_chat:
|
if enable_multimodal_chat:
|
||||||
# Note: when chat is enabled the request prompt_len is no longer
|
# Note: when chat is enabled the request prompt_len is no longer
|
||||||
# accurate and we will be using request output to count the
|
# accurate and we will be using request output to count the
|
||||||
# actual prompt len
|
# actual prompt len
|
||||||
prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
|
prompt = self.apply_multimodal_chat_transformation(
|
||||||
|
prompt, mm_content)
|
||||||
sampled_requests.append(
|
sampled_requests.append(
|
||||||
SampleRequest(
|
SampleRequest(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
expected_output_len=output_len,
|
expected_output_len=output_len,
|
||||||
multi_modal_data=mm_content,
|
multi_modal_data=mm_content,
|
||||||
)
|
))
|
||||||
)
|
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
|
||||||
return sampled_requests
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Instruct Coder Dataset Implementation
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class InstructCoderDataset(HuggingFaceDataset):
|
|
||||||
"""
|
|
||||||
InstructCoder Dataset.
|
|
||||||
https://huggingface.co/datasets/likaixin/InstructCoder
|
|
||||||
|
|
||||||
InstructCoder is the dataset designed for general code editing. It consists
|
|
||||||
of 114,239 instruction-input-output triplets, and covers multiple distinct
|
|
||||||
code editing scenario.
|
|
||||||
"""
|
|
||||||
|
|
||||||
DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
|
|
||||||
SUPPORTED_DATASET_PATHS = {
|
|
||||||
"likaixin/InstructCoder",
|
|
||||||
}
|
|
||||||
|
|
||||||
def sample(
|
|
||||||
self,
|
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
num_requests: int,
|
|
||||||
output_len: Optional[int] = None,
|
|
||||||
enable_multimodal_chat: bool = False,
|
|
||||||
**kwargs,
|
|
||||||
) -> list:
|
|
||||||
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
|
|
||||||
sampled_requests = []
|
|
||||||
for item in self.data:
|
|
||||||
if len(sampled_requests) >= num_requests:
|
|
||||||
break
|
|
||||||
prompt = f"{item['instruction']}:\n{item['input']}"
|
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
|
||||||
sampled_requests.append(
|
|
||||||
SampleRequest(
|
|
||||||
prompt=prompt,
|
|
||||||
prompt_len=prompt_len,
|
|
||||||
expected_output_len=output_len,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
|
||||||
return sampled_requests
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# MT-Bench Dataset Implementation
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class MTBenchDataset(HuggingFaceDataset):
|
|
||||||
"""
|
|
||||||
MT-Bench Dataset.
|
|
||||||
https://huggingface.co/datasets/philschmid/mt-bench
|
|
||||||
|
|
||||||
We create a single turn dataset for MT-Bench.
|
|
||||||
This is similar to Spec decoding benchmark setup in vLLM
|
|
||||||
https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
|
|
||||||
""" # noqa: E501
|
|
||||||
|
|
||||||
DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM
|
|
||||||
SUPPORTED_DATASET_PATHS = {
|
|
||||||
"philschmid/mt-bench",
|
|
||||||
}
|
|
||||||
|
|
||||||
def sample(
|
|
||||||
self,
|
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
num_requests: int,
|
|
||||||
output_len: Optional[int] = None,
|
|
||||||
enable_multimodal_chat: bool = False,
|
|
||||||
**kwargs,
|
|
||||||
) -> list:
|
|
||||||
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
|
|
||||||
sampled_requests = []
|
|
||||||
|
|
||||||
for item in self.data:
|
|
||||||
if len(sampled_requests) >= num_requests:
|
|
||||||
break
|
|
||||||
prompt = item["turns"][0]
|
|
||||||
|
|
||||||
# apply template
|
|
||||||
prompt = tokenizer.apply_chat_template(
|
|
||||||
[{"role": "user", "content": prompt}],
|
|
||||||
add_generation_prompt=True,
|
|
||||||
tokenize=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
|
||||||
sampled_requests.append(
|
|
||||||
SampleRequest(
|
|
||||||
prompt=prompt,
|
|
||||||
prompt_len=prompt_len,
|
|
||||||
expected_output_len=output_len,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
|
||||||
return sampled_requests


# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------


class AIMODataset(HuggingFaceDataset):
    """
    Dataset class for processing a AIMO dataset with reasoning questions.
    """

    SUPPORTED_DATASET_PATHS = {
        "AI-MO/aimo-validation-aime",
        "AI-MO/NuminaMath-1.5",
        "AI-MO/NuminaMath-CoT",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        **kwargs,
    ) -> list:
        sampled_requests = []
        dynamic_output = output_len is None

        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt, completion = item["problem"], item["solution"]

            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            completion_len = len(completion_ids)
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
            if dynamic_output and not is_valid_sequence(
                prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
            ):
                continue
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=None,
                )
            )
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
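
The is_valid_sequence() helper used above is defined elsewhere in this file. As a rough sketch of the filtering rule applied when the output length is taken from the dataset (prompt capped at 2048 tokens, prompt plus completion capped at 32000), something along these lines; the real helper may apply additional checks such as minimum lengths:

    # Hedged sketch of the AIMO length filter; names here are illustrative.
    def _looks_valid(prompt_len: int, completion_len: int,
                     max_prompt_len: int = 2048,
                     max_total_len: int = 32000) -> bool:
        return (prompt_len <= max_prompt_len
                and prompt_len + completion_len <= max_total_len)

    assert _looks_valid(1000, 5000)
    assert not _looks_valid(4096, 100)  # prompt alone already too long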


# -----------------------------------------------------------------------------
# Next Edit Prediction Dataset Implementation
# -----------------------------------------------------------------------------


zeta_prompt = """### Instruction:
You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.

### User Edits:

{}

### User Excerpt:

{}

### Response:

"""  # noqa: E501


def _format_zeta_prompt(
    sample: dict, original_start_marker: str = "<|editable_region_start|>"
) -> dict:
    """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.

    This function formats examples from the NEP dataset
    into prompts and expected outputs. It could be
    further extended to support more NEP datasets.

    Args:
        sample: The dataset sample containing events,
            inputs, and outputs.
        original_start_marker: The marker indicating the
            start of the editable region. Defaults to
            "<|editable_region_start|>".

    Returns:
        A dictionary with the formatted prompts and expected outputs.
    """
    events = sample["events"]
    input = sample["input"]
    output = sample["output"]
    prompt = zeta_prompt.format(events, input)

    # following the original implementation, extract the focused region
    # from the raw output
    output_start_index = output.find(original_start_marker)
    output_focused_region = output[output_start_index:]
    expected_output = output_focused_region

    return {"prompt": prompt, "expected_output": expected_output}
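
A toy invocation of _format_zeta_prompt() illustrates what the function returns; the sample below is fabricated purely for illustration and only mimics the events/input/output layout of zed-industries/zeta:

    # Hypothetical sample; field contents are made up.
    toy_sample = {
        "events": "User edited foo.py: renamed variable x to count",
        "input": "<|editable_region_start|>\ncount = 0\n<|editable_region_end|>",
        "output": "header\n<|editable_region_start|>\ncount = 1\n<|editable_region_end|>",
    }
    formatted = _format_zeta_prompt(toy_sample)
    # formatted["prompt"] embeds the events and excerpt in the zeta_prompt
    # template; formatted["expected_output"] starts at the editable-region marker.
    assert formatted["expected_output"].startswith("<|editable_region_start|>")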


class NextEditPredictionDataset(HuggingFaceDataset):
    """
    Dataset class for processing a Next Edit Prediction dataset.
    """

    SUPPORTED_DATASET_PATHS = {
        "zed-industries/zeta",
    }
    MAPPING_PROMPT_FUNCS = {
        "zed-industries/zeta": _format_zeta_prompt,
    }

    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs):
        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
        if formatting_prompt_func is None:
            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
        samples = []
        for sample in self.data:
            sample = formatting_prompt_func(sample)
            samples.append(
                SampleRequest(
                    prompt=sample["prompt"],
                    prompt_len=len(tokenizer(sample["prompt"]).input_ids),
                    expected_output_len=len(
                        tokenizer(sample["expected_output"]).input_ids
                    ),
                )
            )
            if len(samples) >= num_requests:
                break
        self.maybe_oversample_requests(samples, num_requests)
        return samples


# -----------------------------------------------------------------------------
# ASR Dataset Implementation
# -----------------------------------------------------------------------------


class ASRDataset(HuggingFaceDataset):
    """
    Dataset class for processing a ASR dataset for transcription.
    Tested on the following set:

    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
    |                |                                        |                          | release3-speaker-adaptation |
    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr, ...         |
    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
    +----------------+----------------------------------------+--------------------------+-----------------------------+

    """  # noqa: E501

    SUPPORTED_DATASET_PATHS = {
        "openslr/librispeech_asr",
        "facebook/voxpopuli",
        "LIUM/tedlium",
        "edinburghcstr/ami",
        "speechcolab/gigaspeech",
        "kensho/spgispeech",
    }

    DEFAULT_OUTPUT_LEN = 128
    IS_MULTIMODAL = True

    # TODO Whisper-specific. Abstract interface when more models are supported.
    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
    skip_long_audios: bool = True

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        **kwargs,
    ) -> list:
        import librosa

        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
        prompt_len = len(tokenizer(prompt).input_ids)
        sampled_requests = []
        skipped = 0
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            audio = item["audio"]
            y, sr = audio["array"], audio["sampling_rate"]
            duration_s = librosa.get_duration(y=y, sr=sr)
            # Whisper max supported duration
            if self.skip_long_audios and duration_s > 30:
                skipped += 1
                continue

            mm_content = {"audio": (y, sr)}
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                )
            )
        if skipped:
            logger.warning(
                "%d samples discarded from dataset due to"
                " their length being greater than"
                " what Whisper supports.",
                skipped,
            )
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
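
The 30-second cut-off above mirrors Whisper's fixed input window. A self-contained sketch of the same duration check, using synthetic audio and librosa only, could look like this:

    # Sketch of the ASRDataset duration filter; 45 s of silence stands in for
    # a real clip and would be skipped because it exceeds Whisper's 30 s limit.
    import numpy as np
    import librosa

    sr = 16000
    y = np.zeros(45 * sr, dtype=np.float32)
    duration_s = librosa.get_duration(y=y, sr=sr)
    print(duration_s, duration_s > 30)  # 45.0 True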
@@ -11,9 +11,9 @@ from typing import Any, Optional

 import numpy as np
 import torch
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm

-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
@@ -21,14 +21,13 @@ from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser


-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any]
-) -> None:
+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: dict[str, Any]) -> None:
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={"latency": results["latencies"]},
-        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
-    )
+        extra_info={k: results[k]
+                    for k in ["avg_latency", "percentiles"]})
     if pt_records:
         pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
         write_to_json(pt_file, pt_records)
@@ -43,11 +42,9 @@ def main(args: argparse.Namespace):
     # the engine will automatically process the request in multiple batches.
     llm = LLM(**dataclasses.asdict(engine_args))
     assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len + args.output_len
-    ), (
-        "Please ensure that max_model_len is greater than"
-        " the sum of input_len and output_len."
-    )
+        args.input_len +
+        args.output_len), ("Please ensure that max_model_len is greater than"
+                           " the sum of input_len and output_len.")

     sampling_params = SamplingParams(
         n=args.n,
@@ -58,16 +55,18 @@ def main(args: argparse.Namespace):
         detokenize=not args.disable_detokenize,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(
-        10000, size=(args.batch_size, args.input_len)
-    )
-    dummy_prompts: list[PromptType] = [
-        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
-    ]
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_prompts: list[PromptType] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]

     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
+            llm.generate(dummy_prompts,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
         else:
             llm.beam_search(
                 dummy_prompts,
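
Both formattings above build the same synthetic batch of prompt token ids. A standalone sketch of that construction, with small sizes for illustration:

    import numpy as np

    batch_size, input_len = 2, 4
    dummy_prompt_token_ids = np.random.randint(10000, size=(batch_size, input_len))
    dummy_prompts = [{"prompt_token_ids": batch}
                     for batch in dummy_prompt_token_ids.tolist()]
    # e.g. [{'prompt_token_ids': [5417, 901, 7762, 14]}, {...}]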
@@ -86,8 +85,7 @@ def main(args: argparse.Namespace):
                     torch.profiler.ProfilerActivity.CUDA,
                 ],
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    str(profile_dir)
-                ),
+                    str(profile_dir)),
             ) as p:
                 llm_generate()
             print(p.key_averages().table(sort_by="self_cuda_time_total"))
@@ -105,9 +103,8 @@ def main(args: argparse.Namespace):
     if args.profile:
         profile_dir = args.profile_result_dir
         if not profile_dir:
-            profile_dir = (
-                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
-            )
+            profile_dir = (Path(".") / "vllm_benchmark_result" /
+                           f"latency_result_{time.time()}")
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -138,8 +135,7 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description="Benchmark the latency of processing a single batch of "
-        "requests till completion."
-    )
+        "requests till completion.")
     parser.add_argument("--input-len", type=int, default=32)
     parser.add_argument("--output-len", type=int, default=128)
     parser.add_argument("--batch-size", type=int, default=8)
@@ -156,9 +152,10 @@ if __name__ == "__main__":
         default=10,
         help="Number of iterations to run for warmup.",
     )
-    parser.add_argument(
-        "--num-iters", type=int, default=30, help="Number of iterations to run."
-    )
+    parser.add_argument("--num-iters",
+                        type=int,
+                        default=30,
+                        help="Number of iterations to run.")
     parser.add_argument(
         "--profile",
         action="store_true",
@@ -168,10 +165,8 @@ if __name__ == "__main__":
         "--profile-result-dir",
         type=str,
         default=None,
-        help=(
-            "path to save the pytorch profiler output. Can be visualized "
-            "with ui.perfetto.dev or Tensorboard."
-        ),
+        help=("path to save the pytorch profiler output. Can be visualized "
+              "with ui.perfetto.dev or Tensorboard."),
     )
     parser.add_argument(
         "--output-json",
@@ -182,15 +177,10 @@ if __name__ == "__main__":
     parser.add_argument(
         "--disable-detokenize",
         action="store_true",
-        help=(
-            "Do not detokenize responses (i.e. do not include "
-            "detokenization time in the latency measurement)"
-        ),
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
     )

     parser = EngineArgs.add_cli_args(parser)
-    # V1 enables prefix caching by default which skews the latency
-    # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=False)
     args = parser.parse_args()
     main(args)
@@ -86,21 +86,20 @@ def repeat_prompts(prompts, repeat_count, mode: str):
         ValueError: If an invalid mode is provided.
     """
     print("Repeat mode: ", mode)
-    if mode == "random":
+    if mode == 'random':
         repeated_prompts = prompts * repeat_count
         random.shuffle(repeated_prompts)
         return repeated_prompts
-    elif mode == "tile":
+    elif mode == 'tile':
         return prompts * repeat_count
-    elif mode == "interleave":
+    elif mode == 'interleave':
         repeated_prompts = []
         for prompt in prompts:
             repeated_prompts.extend([prompt] * repeat_count)
         return repeated_prompts
     else:
-        raise ValueError(
-            f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
-        )
+        raise ValueError(f"Invalid mode: {mode}, only support "
+                         "'random', 'tile', 'interleave'")


 def main(args):
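
The three repeat modes accepted by repeat_prompts() behave as follows on a toy input (sketch, independent of the diff):

    import random

    prompts, repeat_count = ["a", "b"], 2
    print(prompts * repeat_count)          # tile: ['a', 'b', 'a', 'b']
    interleaved = [p for p in prompts for _ in range(repeat_count)]
    print(interleaved)                     # interleave: ['a', 'a', 'b', 'b']
    shuffled = prompts * repeat_count
    random.shuffle(shuffled)               # random: same multiset, shuffled order
    print(shuffled)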
@@ -110,16 +109,16 @@ def main(args):
     # we append the document id at the beginning to avoid any of the document
     # being the prefix of other documents
     prompts = [
-        str(i) + " ".join(["hi"] * args.document_length)
+        str(i) + ' '.join(['hi'] * args.document_length)
         for i in range(args.num_documents)
     ]

     prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)

     warmup_prompts = [
-        "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length)
-        for i in range(args.num_documents)
-    ]
+        "This is warm up request " + str(i) + \
+            ' '.join(['hi'] * args.document_length)
+        for i in range(args.num_documents)]

     # Create the LLM engine
     engine_args = EngineArgs.from_cli_args(args)
@@ -143,52 +142,42 @@ def main(args):

 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description="Benchmark the performance with or "
-        "without automatic prefix caching."
-    )
+        description=
+        'Benchmark the performance with or without automatic prefix caching.')

     parser.add_argument(
-        "--document-length",
+        '--document-length',
         type=int,
         # Roughly the number of tokens for a system paper,
         # excluding images
         default=20000,
-        help="Range of input lengths for sampling prompts, "
-        'specified as "min:max" (e.g., "128:256").',
-    )
+        help='Range of input lengths for sampling prompts,'
+        'specified as "min:max" (e.g., "128:256").')

-    parser.add_argument(
-        "--num-documents",
-        type=int,
-        default=8,
-        help="Range of input lengths for sampling prompts, "
-        'specified as "min:max" (e.g., "128:256").',
-    )
+    parser.add_argument('--num-documents',
+                        type=int,
+                        default=8,
+                        help='Range of input lengths for sampling prompts,'
+                        'specified as "min:max" (e.g., "128:256").')

-    parser.add_argument("--output-len", type=int, default=10)
+    parser.add_argument('--output-len', type=int, default=10)

-    parser.add_argument(
-        "--repeat-count",
-        type=int,
-        default=2,
-        help="Number of times to repeat each prompt",
-    )
+    parser.add_argument('--repeat-count',
+                        type=int,
+                        default=2,
+                        help='Number of times to repeat each prompt')

-    parser.add_argument(
-        "--repeat-mode",
-        type=str,
-        default="random",
-        help="The mode to repeat prompts. The supported "
-        'modes are "random", "tile", and "interleave". '
-        "See repeat_prompts() in the source code for details.",
-    )
+    parser.add_argument("--repeat-mode",
+                        type=str,
+                        default='random',
+                        help='The mode to repeat prompts. The supported '
+                        'modes are "random", "tile", and "interleave". '
+                        'See repeat_prompts() in the source code for details.')

-    parser.add_argument(
-        "--shuffle-seed",
-        type=int,
-        default=0,
-        help='Random seed when the repeat mode is "random"',
-    )
+    parser.add_argument("--shuffle-seed",
+                        type=int,
+                        default=0,
+                        help='Random seed when the repeat mode is "random"')

     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
@@ -63,15 +63,14 @@ class Request:
     output_len: int


-def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
     vocab = tokenizer.get_vocab()
-    all_special_ids = set(tokenizer.all_special_ids)

     # Remove the special tokens.
-    return random.choices(
-        [v for k, v in vocab.items() if k not in all_special_ids],
-        k=length,
-    )
+    vocab = {
+        k: v
+        for k, v in vocab.items() if k not in tokenizer.all_special_ids
+    }
+    return random.choices(list(vocab.values()), k=length)


 def sample_requests_from_dataset(
@@ -90,10 +89,8 @@ def sample_requests_from_dataset(
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     # Only keep the first two turns of each conversation.
-    dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
-        for data in dataset
-    ]
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]

     # Shuffle the dataset.
     random.shuffle(dataset)
@@ -114,9 +111,8 @@ def sample_requests_from_dataset(
         completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
-        output_len = (
-            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
-        )
+        output_len = (len(completion_token_ids)
+                      if fixed_output_len is None else fixed_output_len)
         if min_len <= prompt_len <= max_len:
             filtered_requests.append(Request(prompt, prompt_len, output_len))

@@ -130,27 +126,27 @@ def sample_requests_from_random(
     fixed_output_len: Optional[int],
     prefix_len: int,
 ) -> list[Request]:

     requests = []
     prefix_token_ids = sample_tokens(tokenizer, prefix_len)
     min_len, max_len = input_length_range

     for i in range(num_requests):
         unique_part_token_ids = sample_tokens(
-            tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
-        )
+            tokenizer,
+            random.randint(min_len - prefix_len, max_len - prefix_len))
         prompt_token_ids = prefix_token_ids + unique_part_token_ids
         prompt = tokenizer.decode(prompt_token_ids)
         prompt_len = len(prompt_token_ids)
-        assert min_len <= prompt_len <= max_len, (
-            f"prompt_len {prompt_len} out of range {min_len}:84,254"
-        )
+        assert (min_len <= prompt_len <= max_len
+                ), f"prompt_len {prompt_len} out of range {min_len}:84,254"
         requests.append(Request(prompt, prompt_len, fixed_output_len))
     return requests


-def repeat_and_sort_requests(
-    requests: list[Request], repeat_count: int, sort: bool = False
-) -> list[str]:
+def repeat_and_sort_requests(requests: list[Request],
+                             repeat_count: int,
+                             sort: bool = False) -> list[str]:
     repeated_requests = requests * repeat_count
     if sort:
         repeated_requests.sort(key=lambda x: x[1])
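
The prefix-caching benchmark builds every prompt as a shared prefix followed by a unique tail, which is what makes the prefix reusable across requests. A token-id-level sketch of that construction, with a stand-in vocabulary instead of a real tokenizer:

    import random

    vocab_ids = list(range(5, 1000))   # stand-in for a vocab minus special tokens
    prefix_len, min_len, max_len = 8, 12, 16

    prefix_token_ids = random.choices(vocab_ids, k=prefix_len)
    unique_part = random.choices(
        vocab_ids, k=random.randint(min_len - prefix_len, max_len - prefix_len))
    prompt_token_ids = prefix_token_ids + unique_part
    assert min_len <= len(prompt_token_ids) <= max_len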
@@ -161,14 +157,14 @@ def repeat_and_sort_requests(

 def main(args):
     tokenizer = get_tokenizer(args.model, trust_remote_code=True)
-    input_length_range = tuple(map(int, args.input_length_range.split(":")))
+    input_length_range = tuple(map(int, args.input_length_range.split(':')))
     random.seed(args.seed)
     if args.dataset_path is not None:
         if args.prefix_len > 0:
-            raise ValueError(
-                "prefix-len is not supported when dataset-path is provided."
-            )
-        print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
+            raise ValueError("prefix-len is not supported when "
+                             "dataset-path is provided.")
+        print(f"Start to sample {args.num_prompts} prompts "
+              f"from {args.dataset_path}")
         filtered_requests = sample_requests_from_dataset(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
@@ -198,16 +194,14 @@ def main(args):

     llm = LLM(**dataclasses.asdict(engine_args))

-    sampling_params = SamplingParams(
-        temperature=0,
-        max_tokens=args.output_len,
-        detokenize=not args.disable_detokenize,
-    )
+    sampling_params = SamplingParams(temperature=0,
+                                     max_tokens=args.output_len,
+                                     detokenize=not args.disable_detokenize)

     print("Testing filtered requests")
-    prompts = repeat_and_sort_requests(
-        filtered_requests, repeat_count=args.repeat_count, sort=args.sort
-    )
+    prompts = repeat_and_sort_requests(filtered_requests,
+                                       repeat_count=args.repeat_count,
+                                       sort=args.sort)

     print("------start generating------")
     test_prefix(
@@ -219,35 +213,29 @@ def main(args):

 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description="Benchmark the performance with or without "
-        "automatic prefix caching."
-    )
-    parser.add_argument(
-        "--dataset-path", type=str, default=None, help="Path to the dataset."
-    )
-    parser.add_argument("--output-len", type=int, default=10)
-    parser.add_argument(
-        "--num-prompts",
-        type=int,
-        required=True,
-        help="Number of the prompts sampled from dataset",
-    )
-    parser.add_argument(
-        "--repeat-count",
-        type=int,
-        default=1,
-        help="Number of times to repeat each prompt",
-    )
-    parser.add_argument(
-        "--sort", action="store_true", help="Sort prompts by input length"
-    )
-    parser.add_argument(
-        "--input-length-range",
-        type=str,
-        required=True,
-        help="Range of input lengths for sampling prompts,"
-        'specified as "min:max" (e.g., "128:256").',
-    )
+        description=
+        'Benchmark the performance with or without automatic prefix caching.')
+    parser.add_argument("--dataset-path",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        required=True,
+                        help="Number of the prompts sampled from dataset")
+    parser.add_argument('--repeat-count',
+                        type=int,
+                        default=1,
+                        help='Number of times to repeat each prompt')
+    parser.add_argument('--sort',
+                        action='store_true',
+                        help='Sort prompts by input length')
+    parser.add_argument('--input-length-range',
+                        type=str,
+                        required=True,
+                        help='Range of input lengths for sampling prompts,'
+                        'specified as "min:max" (e.g., "128:256").')
     parser.add_argument(
         "--prefix-len",
         type=int,
@@ -258,12 +246,10 @@ if __name__ == "__main__":
         "when dataset-path is not provided.",
     )
     parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize responses (i.e. do not include "
-            "detokenization time in the latency measurement)"
-        ),
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
     )

     parser = EngineArgs.add_cli_args(parser)
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark offline prioritization."""
-
 import argparse
 import dataclasses
 import json
@@ -34,10 +33,8 @@ def sample_requests(
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     # Only keep the first two turns of each conversation.
-    dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
-        for data in dataset
-    ]
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]

     # Shuffle the dataset.
     random.shuffle(dataset)
@@ -54,9 +51,8 @@ def sample_requests(
         completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
-        output_len = (
-            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
-        )
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
         if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
@@ -78,16 +74,13 @@ def run_vllm(
     disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams

     llm = LLM(**dataclasses.asdict(engine_args))

     assert all(
         llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
-        for request in requests
-    ), (
-        "Please ensure that max_model_len is greater than the sum of"
-        " input_len and output_len for all requests."
-    )
+        for request in requests), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " input_len and output_len for all requests.")

     # Add the requests to the engine.
     prompts = []
@@ -104,8 +97,7 @@ def run_vllm(
                 ignore_eos=True,
                 max_tokens=output_len,
                 detokenize=not disable_detokenize,
-            )
-        )
+            ))

     start = time.perf_counter()
     llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
@@ -119,33 +111,26 @@ def main(args: argparse.Namespace):

     # Sample the requests.
     tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer, trust_remote_code=args.trust_remote_code
-    )
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
     if args.dataset is None:
         # Synthesize a prompt with the given input length.
         prompt = "hi" * (args.input_len - 1)
-        requests = [
-            (prompt, args.input_len, args.output_len, get_random_flag())
-            for _ in range(args.num_prompts)
-        ]
+        requests = [(prompt, args.input_len, args.output_len,
+                     get_random_flag()) for _ in range(args.num_prompts)]
     else:
-        requests = sample_requests(
-            args.dataset, args.num_prompts, tokenizer, args.output_len
-        )
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)

     if args.backend == "vllm":
-        elapsed_time = run_vllm(
-            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
-        )
+        elapsed_time = run_vllm(requests, args.n,
+                                EngineArgs.from_cli_args(args),
+                                args.disable_detokenize)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(
-        prompt_len + output_len for _, prompt_len, output_len, priority in requests
-    )
-    print(
-        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-        f"{total_num_tokens / elapsed_time:.2f} tokens/s"
-    )
+    total_num_tokens = sum(prompt_len + output_len
+                           for _, prompt_len, output_len, priority in requests)
+    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")

     # Output JSON results if specified
    if args.output_json:
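
The throughput figures printed above are simple ratios of request count and token count to wall-clock time; for example, 200 requests totalling 48,000 prompt plus output tokens completed in 25 s report 8.00 requests/s and 1920.00 tokens/s:

    num_requests, total_num_tokens, elapsed_time = 200, 48_000, 25.0
    print(f"Throughput: {num_requests / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
    # Throughput: 8.00 requests/s, 1920.00 tokens/s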
@@ -162,44 +147,41 @@ def main(args: argparse.Namespace):

 if __name__ == "__main__":
     parser = FlexibleArgumentParser(description="Benchmark the throughput.")
-    parser.add_argument(
-        "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
-    )
-    parser.add_argument(
-        "--dataset", type=str, default=None, help="Path to the dataset."
-    )
-    parser.add_argument(
-        "--input-len",
-        type=int,
-        default=None,
-        help="Input prompt length for each request",
-    )
-    parser.add_argument(
-        "--output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the "
-        "output length from the dataset.",
-    )
-    parser.add_argument(
-        "--n", type=int, default=1, help="Number of generated sequences per prompt."
-    )
-    parser.add_argument(
-        "--num-prompts", type=int, default=200, help="Number of prompts to process."
-    )
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
+                        default="vllm")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=200,
+                        help="Number of prompts to process.")
     parser.add_argument(
-        "--output-json",
+        '--output-json',
         type=str,
         default=None,
-        help="Path to save the throughput results in JSON format.",
-    )
+        help='Path to save the throughput results in JSON format.')
     parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize responses (i.e. do not include "
-            "detokenization time in the latency measurement)"
-        ),
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
     )

     parser = EngineArgs.add_cli_args(parser)