[doc] use MkDocs collapsible blocks - supplement (#19973)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
@@ -61,23 +61,25 @@ To address the above issues, I have designed and developed a local Tensor memory
|
||||
|
||||
# Install vLLM
|
||||
|
||||
```shell
|
||||
# Enter the home directory or your working directory.
|
||||
cd /home
|
||||
??? Commands
|
||||
|
||||
# Download the installation package, and I will update the commit-id in time. You can directly copy the command.
|
||||
wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
```shell
|
||||
# Enter the home directory or your working directory.
|
||||
cd /home
|
||||
|
||||
# Download the code repository.
|
||||
git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
|
||||
cd vllm
|
||||
# Download the installation package, and I will update the commit-id in time. You can directly copy the command.
|
||||
wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
|
||||
# Set the installation package path.
|
||||
export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
# Download the code repository.
|
||||
git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
|
||||
cd vllm
|
||||
|
||||
# installation
|
||||
pip install -e . -v
|
||||
```
|
||||
# Set the installation package path.
|
||||
export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
|
||||
# installation
|
||||
pip install -e . -v
|
||||
```
|
||||
|
||||
# Run xPyD
|
||||
|
||||
@@ -104,83 +106,91 @@ python3 disagg_prefill_proxy_xpyd.py &
|
||||
|
||||
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20005 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20005 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
|
||||
### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20009 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20009 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
|
||||
### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20003 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20003 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
|
||||
### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20008 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20008 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
|
||||
## Run 3P1D
|
||||
|
||||
@@ -193,83 +203,91 @@ python3 disagg_prefill_proxy_xpyd.py &
|
||||
|
||||
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20005 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20005 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
|
||||
### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20009 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20009 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
|
||||
### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20003 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20003 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
|
||||
### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20008 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
|
||||
--host 0.0.0.0 \
|
||||
--port 20008 \
|
||||
--tensor-parallel-size 1 \
|
||||
--seed 1024 \
|
||||
--served-model-name base_model \
|
||||
--dtype float16 \
|
||||
--max-model-len 10000 \
|
||||
--max-num-batched-tokens 10000 \
|
||||
--max-num-seqs 256 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--disable-log-request \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
|
||||
```
|
||||
|
||||
# Single request
|
||||
|
||||
@@ -286,25 +304,27 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
|
||||
|
||||
# Benchmark
|
||||
|
||||
```shell
|
||||
python3 benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model base_model \
|
||||
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name "random" \
|
||||
--host 10.0.1.1 \
|
||||
--port 10001 \
|
||||
--random-input-len 1024 \
|
||||
--random-output-len 1024 \
|
||||
--ignore-eos \
|
||||
--burstiness 100 \
|
||||
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||
--metric-percentiles "90,95,99" \
|
||||
--seed $(date +%s) \
|
||||
--trust-remote-code \
|
||||
--request-rate 3 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
??? Command
|
||||
|
||||
```shell
|
||||
python3 benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model base_model \
|
||||
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name "random" \
|
||||
--host 10.0.1.1 \
|
||||
--port 10001 \
|
||||
--random-input-len 1024 \
|
||||
--random-output-len 1024 \
|
||||
--ignore-eos \
|
||||
--burstiness 100 \
|
||||
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||
--metric-percentiles "90,95,99" \
|
||||
--seed $(date +%s) \
|
||||
--trust-remote-code \
|
||||
--request-rate 3 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
# Shut down
|
||||
|
||||
|
||||
Reference in New Issue
Block a user