[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
@@ -36,23 +36,25 @@ docker build . -f Dockerfile.nginx --tag nginx-lb
|
||||
|
||||
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. The example below starts with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
|
||||
|
||||
```console
|
||||
upstream backend {
|
||||
least_conn;
|
||||
server vllm0:8000 max_fails=3 fail_timeout=10000s;
|
||||
server vllm1:8000 max_fails=3 fail_timeout=10000s;
|
||||
}
|
||||
server {
|
||||
listen 80;
|
||||
location / {
|
||||
proxy_pass http://backend;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
??? Config
|
||||
|
||||
```console
|
||||
upstream backend {
|
||||
least_conn;
|
||||
server vllm0:8000 max_fails=3 fail_timeout=10000s;
|
||||
server vllm1:8000 max_fails=3 fail_timeout=10000s;
|
||||
}
|
||||
}
|
||||
```
|
||||
server {
|
||||
listen 80;
|
||||
location / {
|
||||
proxy_pass http://backend;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-vllm-container }
|
||||
|
||||
@@ -93,30 +95,32 @@ Notes:
|
||||
- The example below assumes a GPU backend is used. If you are using a CPU backend, remove `--gpus device=ID` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
|
||||
- If you don't want to use `Llama-2-7b-chat-hf`, change the model name to the model you want your vLLM servers to use.
|
||||
|
||||
```console
|
||||
mkdir -p ~/.cache/huggingface/hub/
|
||||
hf_cache_dir=~/.cache/huggingface/
|
||||
docker run \
|
||||
-itd \
|
||||
--ipc host \
|
||||
--network vllm_nginx \
|
||||
--gpus device=0 \
|
||||
--shm-size=10.24gb \
|
||||
-v $hf_cache_dir:/root/.cache/huggingface/ \
|
||||
-p 8081:8000 \
|
||||
--name vllm0 vllm \
|
||||
--model meta-llama/Llama-2-7b-chat-hf
|
||||
docker run \
|
||||
-itd \
|
||||
--ipc host \
|
||||
--network vllm_nginx \
|
||||
--gpus device=1 \
|
||||
--shm-size=10.24gb \
|
||||
-v $hf_cache_dir:/root/.cache/huggingface/ \
|
||||
-p 8082:8000 \
|
||||
--name vllm1 vllm \
|
||||
--model meta-llama/Llama-2-7b-chat-hf
|
||||
```
|
||||
??? Commands
|
||||
|
||||
```console
|
||||
mkdir -p ~/.cache/huggingface/hub/
|
||||
hf_cache_dir=~/.cache/huggingface/
|
||||
docker run \
|
||||
-itd \
|
||||
--ipc host \
|
||||
--network vllm_nginx \
|
||||
--gpus device=0 \
|
||||
--shm-size=10.24gb \
|
||||
-v $hf_cache_dir:/root/.cache/huggingface/ \
|
||||
-p 8081:8000 \
|
||||
--name vllm0 vllm \
|
||||
--model meta-llama/Llama-2-7b-chat-hf
|
||||
docker run \
|
||||
-itd \
|
||||
--ipc host \
|
||||
--network vllm_nginx \
|
||||
--gpus device=1 \
|
||||
--shm-size=10.24gb \
|
||||
-v $hf_cache_dir:/root/.cache/huggingface/ \
|
||||
-p 8082:8000 \
|
||||
--name vllm1 vllm \
|
||||
--model meta-llama/Llama-2-7b-chat-hf
|
||||
```
|
||||
|
||||
!!! note
|
||||
If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
|
||||
|
||||
Reference in New Issue
Block a user