[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 13:24:23 +08:00
parent 493c275352
commit f17aec0d63
50 changed files with 3455 additions and 3180 deletions
--- a/docs/deployment/frameworks/autogen.md
+++ b/docs/deployment/frameworks/autogen.md
@@ -30,51 +30,53 @@ python -m vllm.entrypoints.openai.api_server \

 - Call it with AutoGen:

-```python
-import asyncio
-from autogen_core.models import UserMessage
-from autogen_ext.models.openai import OpenAIChatCompletionClient
-from autogen_core.models import ModelFamily
+??? Code
+
+    ```python
+    import asyncio
+    from autogen_core.models import UserMessage
+    from autogen_ext.models.openai import OpenAIChatCompletionClient
+    from autogen_core.models import ModelFamily


-async def main() -> None:
-    # Create a model client
-    model_client = OpenAIChatCompletionClient(
-        model="mistralai/Mistral-7B-Instruct-v0.2",
-        base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
-        api_key="EMPTY",
-        model_info={
-            "vision": False,
-            "function_calling": False,
-            "json_output": False,
-            "family": ModelFamily.MISTRAL,
-            "structured_output": True,
-        },
-    )
+    async def main() -> None:
+        # Create a model client
+        model_client = OpenAIChatCompletionClient(
+            model="mistralai/Mistral-7B-Instruct-v0.2",
+            base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
+            api_key="EMPTY",
+            model_info={
+                "vision": False,
+                "function_calling": False,
+                "json_output": False,
+                "family": ModelFamily.MISTRAL,
+                "structured_output": True,
+            },
+        )

-    messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]
+        messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]

-    # Create a stream.
-    stream = model_client.create_stream(messages=messages)
+        # Create a stream.
+        stream = model_client.create_stream(messages=messages)

-    # Iterate over the stream and print the responses.
-    print("Streamed responses:")
-    async for response in stream:
-        if isinstance(response, str):
-            # A partial response is a string.
-            print(response, flush=True, end="")
-        else:
-            # The last response is a CreateResult object with the complete message.
-            print("\n\n------------\n")
-            print("The complete response:", flush=True)
-            print(response.content, flush=True)
+        # Iterate over the stream and print the responses.
+        print("Streamed responses:")
+        async for response in stream:
+            if isinstance(response, str):
+                # A partial response is a string.
+                print(response, flush=True, end="")
+            else:
+                # The last response is a CreateResult object with the complete message.
+                print("\n\n------------\n")
+                print("The complete response:", flush=True)
+                print(response.content, flush=True)

-    # Close the client when done.
-    await model_client.close()
+        # Close the client when done.
+        await model_client.close()


-asyncio.run(main())
-```
+    asyncio.run(main())
+    ```

 For details, see the tutorial:

--- a/docs/deployment/frameworks/cerebrium.md
+++ b/docs/deployment/frameworks/cerebrium.md
@@ -34,25 +34,27 @@ vllm = "latest"

 Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example). Add the following code to your `main.py`:

-```python
-from vllm import LLM, SamplingParams
+??? Code

-llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
+    ```python
+    from vllm import LLM, SamplingParams

-def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
+    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")

-    sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
-    outputs = llm.generate(prompts, sampling_params)
+    def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):

-    # Print the outputs.
-    results = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        results.append({"prompt": prompt, "generated_text": generated_text})
+        sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
+        outputs = llm.generate(prompts, sampling_params)

-    return {"results": results}
-```
+        # Print the outputs.
+        results = []
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            results.append({"prompt": prompt, "generated_text": generated_text})
+
+        return {"results": results}
+    ```

 Then, run the following code to deploy it to the cloud:

@@ -62,47 +64,51 @@ cerebrium deploy

 If successful, you should be returned a cURL command that you can run inference against. Just remember to end the URL with the function name you are calling (in our case, `/run`).

-```python
-curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
- -H 'Content-Type: application/json' \
- -H 'Authorization: <JWT TOKEN>' \
- --data '{
-   "prompts": [
-     "Hello, my name is",
-     "The president of the United States is",
-     "The capital of France is",
-     "The future of AI is"
-   ]
- }'
-```
+??? Command
+
+    ```bash
+    curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: <JWT TOKEN>' \
+    --data '{
+    "prompts": [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is"
+    ]
+    }'
+    ```

 You should get a response like:

-```python
-{
-    "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
-    "result": {
-        "result": [
-            {
-                "prompt": "Hello, my name is",
-                "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
-            },
-            {
-                "prompt": "The president of the United States is",
-                "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
-            },
-            {
-                "prompt": "The capital of France is",
-                "generated_text": " Paris.\n"
-            },
-            {
-                "prompt": "The future of AI is",
-                "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
-            }
-        ]
-    },
-    "run_time_ms": 152.53663063049316
-}
-```
+??? Response
+
+    ```json
+    {
+        "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
+        "result": {
+            "result": [
+                {
+                    "prompt": "Hello, my name is",
+                    "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
+                },
+                {
+                    "prompt": "The president of the United States is",
+                    "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
+                },
+                {
+                    "prompt": "The capital of France is",
+                    "generated_text": " Paris.\n"
+                },
+                {
+                    "prompt": "The future of AI is",
+                    "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
+                }
+            ]
+        },
+        "run_time_ms": 152.53663063049316
+    }
+    ```

 You now have an autoscaling endpoint where you only pay for the compute you use!
--- a/docs/deployment/frameworks/dstack.md
+++ b/docs/deployment/frameworks/dstack.md
@@ -26,75 +26,81 @@ dstack init

 Next, to provision a VM instance with the LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:

-```yaml
-type: service
+??? Config

-python: "3.11"
-env:
-    - MODEL=NousResearch/Llama-2-7b-chat-hf
-port: 8000
-resources:
-    gpu: 24GB
-commands:
-    - pip install vllm
-    - vllm serve $MODEL --port 8000
-model:
-    format: openai
-    type: chat
-    name: NousResearch/Llama-2-7b-chat-hf
-```
+    ```yaml
+    type: service
+
+    python: "3.11"
+    env:
+        - MODEL=NousResearch/Llama-2-7b-chat-hf
+    port: 8000
+    resources:
+        gpu: 24GB
+    commands:
+        - pip install vllm
+        - vllm serve $MODEL --port 8000
+    model:
+        format: openai
+        type: chat
+        name: NousResearch/Llama-2-7b-chat-hf
+    ```

 Then, run the following CLI for provisioning:

-```console
-$ dstack run . -f serve.dstack.yml
+??? Command

-⠸ Getting run plan...
- Configuration  serve.dstack.yml
- Project        deep-diver-main
- User           deep-diver
- Min resources  2..xCPU, 8GB.., 1xGPU (24GB)
- Max price      -
- Max duration   -
- Spot policy    auto
- Retry policy   no
+    ```console
+    $ dstack run . -f serve.dstack.yml

- #  BACKEND  REGION       INSTANCE       RESOURCES                               SPOT  PRICE
- 1  gcp   us-central1  g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
- 2  gcp   us-east1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
- 3  gcp   us-west1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
-    ...
- Shown 3 of 193 offers, $5.876 max
+    ⠸ Getting run plan...
+    Configuration  serve.dstack.yml
+    Project        deep-diver-main
+    User           deep-diver
+    Min resources  2..xCPU, 8GB.., 1xGPU (24GB)
+    Max price      -
+    Max duration   -
+    Spot policy    auto
+    Retry policy   no

-Continue? [y/n]: y
-⠙ Submitting run...
-⠏ Launching spicy-treefrog-1 (pulling)
-spicy-treefrog-1 provisioning completed (running)
-Service is published at ...
-```
+    #  BACKEND  REGION       INSTANCE       RESOURCES                               SPOT  PRICE
+    1  gcp   us-central1  g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
+    2  gcp   us-east1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
+    3  gcp   us-west1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
+        ...
+    Shown 3 of 193 offers, $5.876 max
+
+    Continue? [y/n]: y
+    ⠙ Submitting run...
+    ⠏ Launching spicy-treefrog-1 (pulling)
+    spicy-treefrog-1 provisioning completed (running)
+    Service is published at ...
+    ```

 After the provisioning, you can interact with the model by using the OpenAI SDK:

-```python
-from openai import OpenAI
+??? Code

-client = OpenAI(
-    base_url="https://gateway.<gateway domain>",
-    api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
-)
+    ```python
+    from openai import OpenAI

-completion = client.chat.completions.create(
-    model="NousResearch/Llama-2-7b-chat-hf",
-    messages=[
-        {
-            "role": "user",
-            "content": "Compose a poem that explains the concept of recursion in programming.",
-        }
-    ]
-)
+    client = OpenAI(
+        base_url="https://gateway.<gateway domain>",
+        api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
+    )

-print(completion.choices[0].message.content)
-```
+    completion = client.chat.completions.create(
+        model="NousResearch/Llama-2-7b-chat-hf",
+        messages=[
+            {
+                "role": "user",
+                "content": "Compose a poem that explains the concept of recursion in programming.",
+            }
+        ]
+    )
+
+    print(completion.choices[0].message.content)
+    ```

 !!! note
    dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on materials on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
--- a/docs/deployment/frameworks/haystack.md
+++ b/docs/deployment/frameworks/haystack.md
@@ -27,29 +27,29 @@ vllm serve mistralai/Mistral-7B-Instruct-v0.1

 - Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.

-```python
-from haystack.components.generators.chat import OpenAIChatGenerator
-from haystack.dataclasses import ChatMessage
-from haystack.utils import Secret
+??? Code

-generator = OpenAIChatGenerator(
-    # for compatibility with the OpenAI API, a placeholder api_key is needed
-    api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
-    model="mistralai/Mistral-7B-Instruct-v0.1",
-    api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
-    generation_kwargs = {"max_tokens": 512}
-)
+    ```python
+    from haystack.components.generators.chat import OpenAIChatGenerator
+    from haystack.dataclasses import ChatMessage
+    from haystack.utils import Secret

-response = generator.run(
-  messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
-)
+    generator = OpenAIChatGenerator(
+        # for compatibility with the OpenAI API, a placeholder api_key is needed
+        api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
+        model="mistralai/Mistral-7B-Instruct-v0.1",
+        api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
+        generation_kwargs = {"max_tokens": 512}
+    )

-print("-"*30)
-print(response)
-print("-"*30)
-```
+    response = generator.run(
+      messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
+    )

-Output e.g.:
+    print("-"*30)
+    print(response)
+    print("-"*30)
+    ```

 ```console
 ------------------------------
--- a/docs/deployment/frameworks/litellm.md
+++ b/docs/deployment/frameworks/litellm.md
@@ -34,21 +34,23 @@ vllm serve qwen/Qwen1.5-0.5B-Chat

 - Call it with litellm:

-```python
-import litellm 
+??? Code

-messages = [{ "content": "Hello, how are you?","role": "user"}]
+    ```python
+    import litellm 

-# hosted_vllm is prefix key word and necessary
-response = litellm.completion(
-            model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
-            messages=messages,
-            api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
-            temperature=0.2,
-            max_tokens=80)
+    messages = [{ "content": "Hello, how are you?","role": "user"}]

-print(response)
-```
+    # hosted_vllm is prefix key word and necessary
+    response = litellm.completion(
+                model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
+                messages=messages,
+                api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
+                temperature=0.2,
+                max_tokens=80)
+
+    print(response)
+    ```

 ### Embeddings

--- a/docs/deployment/frameworks/lws.md
+++ b/docs/deployment/frameworks/lws.md
@@ -17,99 +17,101 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber

 Deploy the following yaml file `lws.yaml`

-```yaml
-apiVersion: leaderworkerset.x-k8s.io/v1
-kind: LeaderWorkerSet
-metadata:
-  name: vllm
-spec:
-  replicas: 2
-  leaderWorkerTemplate:
-    size: 2
-    restartPolicy: RecreateGroupOnPodRestart
-    leaderTemplate:
-      metadata:
-        labels:
-          role: leader
-      spec:
-        containers:
-          - name: vllm-leader
-            image: docker.io/vllm/vllm-openai:latest
-            env:
-              - name: HUGGING_FACE_HUB_TOKEN
-                value: <your-hf-token>
-            command:
-              - sh
-              - -c
-              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); 
-                 python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
-            resources:
-              limits:
-                nvidia.com/gpu: "8"
-                memory: 1124Gi
-                ephemeral-storage: 800Gi
-              requests:
-                ephemeral-storage: 800Gi
-                cpu: 125
-            ports:
-              - containerPort: 8080
-            readinessProbe:
-              tcpSocket:
-                port: 8080
-              initialDelaySeconds: 15
-              periodSeconds: 10
-            volumeMounts:
-              - mountPath: /dev/shm
-                name: dshm
-        volumes:
-        - name: dshm
-          emptyDir:
-            medium: Memory
-            sizeLimit: 15Gi
-    workerTemplate:
-      spec:
-        containers:
-          - name: vllm-worker
-            image: docker.io/vllm/vllm-openai:latest
-            command:
-              - sh
-              - -c
-              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
-            resources:
-              limits:
-                nvidia.com/gpu: "8"
-                memory: 1124Gi
-                ephemeral-storage: 800Gi
-              requests:
-                ephemeral-storage: 800Gi
-                cpu: 125
-            env:
-              - name: HUGGING_FACE_HUB_TOKEN
-                value: <your-hf-token>
-            volumeMounts:
-              - mountPath: /dev/shm
-                name: dshm   
-        volumes:
-        - name: dshm
-          emptyDir:
-            medium: Memory
-            sizeLimit: 15Gi
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-leader
-spec:
-  ports:
-    - name: http
-      port: 8080
-      protocol: TCP
-      targetPort: 8080
-  selector:
-    leaderworkerset.sigs.k8s.io/name: vllm
-    role: leader
-  type: ClusterIP
-```
+??? Yaml
+
+    ```yaml
+    apiVersion: leaderworkerset.x-k8s.io/v1
+    kind: LeaderWorkerSet
+    metadata:
+      name: vllm
+    spec:
+      replicas: 2
+      leaderWorkerTemplate:
+        size: 2
+        restartPolicy: RecreateGroupOnPodRestart
+        leaderTemplate:
+          metadata:
+            labels:
+              role: leader
+          spec:
+            containers:
+              - name: vllm-leader
+                image: docker.io/vllm/vllm-openai:latest
+                env:
+                  - name: HUGGING_FACE_HUB_TOKEN
+                    value: <your-hf-token>
+                command:
+                  - sh
+                  - -c
+                  - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); 
+                    python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
+                resources:
+                  limits:
+                    nvidia.com/gpu: "8"
+                    memory: 1124Gi
+                    ephemeral-storage: 800Gi
+                  requests:
+                    ephemeral-storage: 800Gi
+                    cpu: 125
+                ports:
+                  - containerPort: 8080
+                readinessProbe:
+                  tcpSocket:
+                    port: 8080
+                  initialDelaySeconds: 15
+                  periodSeconds: 10
+                volumeMounts:
+                  - mountPath: /dev/shm
+                    name: dshm
+            volumes:
+            - name: dshm
+              emptyDir:
+                medium: Memory
+                sizeLimit: 15Gi
+        workerTemplate:
+          spec:
+            containers:
+              - name: vllm-worker
+                image: docker.io/vllm/vllm-openai:latest
+                command:
+                  - sh
+                  - -c
+                  - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
+                resources:
+                  limits:
+                    nvidia.com/gpu: "8"
+                    memory: 1124Gi
+                    ephemeral-storage: 800Gi
+                  requests:
+                    ephemeral-storage: 800Gi
+                    cpu: 125
+                env:
+                  - name: HUGGING_FACE_HUB_TOKEN
+                    value: <your-hf-token>
+                volumeMounts:
+                  - mountPath: /dev/shm
+                    name: dshm   
+            volumes:
+            - name: dshm
+              emptyDir:
+                medium: Memory
+                sizeLimit: 15Gi
+    ---
+    apiVersion: v1
+    kind: Service
+    metadata:
+      name: vllm-leader
+    spec:
+      ports:
+        - name: http
+          port: 8080
+          protocol: TCP
+          targetPort: 8080
+      selector:
+        leaderworkerset.sigs.k8s.io/name: vllm
+        role: leader
+      type: ClusterIP
+    ```

 ```bash
 kubectl apply -f lws.yaml
@@ -175,25 +177,27 @@ curl http://localhost:8080/v1/completions \

 The output should be similar to the following

-```text
-{
-  "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
-  "object": "text_completion",
-  "created": 1715138766,
-  "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
-  "choices": [
+??? Output
+
+    ```text
    {
-      "index": 0,
-      "text": " top destination for foodies, with",
-      "logprobs": null,
-      "finish_reason": "length",
-      "stop_reason": null
+      "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
+      "object": "text_completion",
+      "created": 1715138766,
+      "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+      "choices": [
+        {
+          "index": 0,
+          "text": " top destination for foodies, with",
+          "logprobs": null,
+          "finish_reason": "length",
+          "stop_reason": null
+        }
+      ],
+      "usage": {
+        "prompt_tokens": 5,
+        "total_tokens": 12,
+        "completion_tokens": 7
+      }
    }
-  ],
-  "usage": {
-    "prompt_tokens": 5,
-    "total_tokens": 12,
-    "completion_tokens": 7
-  }
-}
-```
+    ```
--- a/docs/deployment/frameworks/skypilot.md
+++ b/docs/deployment/frameworks/skypilot.md
@@ -24,48 +24,50 @@ sky check

 See the vLLM SkyPilot YAML for serving, [serve.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).

-```yaml
-resources:
-  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
-  use_spot: True
-  disk_size: 512  # Ensure model checkpoints can fit.
-  disk_tier: best
-  ports: 8081  # Expose to internet traffic.
+??? Yaml

-envs:
-  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-  HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+    ```yaml
+    resources:
+      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+      use_spot: True
+      disk_size: 512  # Ensure model checkpoints can fit.
+      disk_tier: best
+      ports: 8081  # Expose to internet traffic.

-setup: |
-  conda create -n vllm python=3.10 -y
-  conda activate vllm
+    envs:
+      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

-  pip install vllm==0.4.0.post1
-  # Install Gradio for web UI.
-  pip install gradio openai
-  pip install flash-attn==2.5.7
+    setup: |
+      conda create -n vllm python=3.10 -y
+      conda activate vllm

-run: |
-  conda activate vllm
-  echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
-    --port 8081 \
-    --model $MODEL_NAME \
-    --trust-remote-code \
-    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-    2>&1 | tee api_server.log &
+      pip install vllm==0.4.0.post1
+      # Install Gradio for web UI.
+      pip install gradio openai
+      pip install flash-attn==2.5.7

-  echo 'Waiting for vllm api server to start...'
-  while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+    run: |
+      conda activate vllm
+      echo 'Starting vllm api server...'
+      python -u -m vllm.entrypoints.openai.api_server \
+        --port 8081 \
+        --model $MODEL_NAME \
+        --trust-remote-code \
+        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+        2>&1 | tee api_server.log &

-  echo 'Starting gradio server...'
-  git clone https://github.com/vllm-project/vllm.git || true
-  python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-    -m $MODEL_NAME \
-    --port 8811 \
-    --model-url http://localhost:8081/v1 \
-    --stop-token-ids 128009,128001
-```
+      echo 'Waiting for vllm api server to start...'
+      while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+
+      echo 'Starting gradio server...'
+      git clone https://github.com/vllm-project/vllm.git || true
+      python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
+        -m $MODEL_NAME \
+        --port 8811 \
+        --model-url http://localhost:8081/v1 \
+        --stop-token-ids 128009,128001
+    ```

 Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):

@@ -93,68 +95,67 @@ HF_TOKEN="your-huggingface-token" \

 SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a `service` section to the YAML file.

-```yaml
-service:
-  replicas: 2
-  # An actual request for readiness probe.
-  readiness_probe:
-    path: /v1/chat/completions
-    post_data:
-    model: $MODEL_NAME
-    messages:
-      - role: user
-        content: Hello! What is your name?
-  max_completion_tokens: 1
-```
+??? Yaml

-<details>
-<summary>Click to see the full recipe YAML</summary>
-
-```yaml
-service:
-  replicas: 2
-  # An actual request for readiness probe.
-  readiness_probe:
-    path: /v1/chat/completions
-    post_data:
-      model: $MODEL_NAME
-      messages:
-        - role: user
-          content: Hello! What is your name?
+    ```yaml
+    service:
+      replicas: 2
+      # An actual request for readiness probe.
+      readiness_probe:
+        path: /v1/chat/completions
+        post_data:
+        model: $MODEL_NAME
+        messages:
+          - role: user
+            content: Hello! What is your name?
      max_completion_tokens: 1
+    ```

-resources:
-  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
-  use_spot: True
-  disk_size: 512  # Ensure model checkpoints can fit.
-  disk_tier: best
-  ports: 8081  # Expose to internet traffic.
+??? Yaml

-envs:
-  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-  HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+    ```yaml
+    service:
+      replicas: 2
+      # An actual request for readiness probe.
+      readiness_probe:
+        path: /v1/chat/completions
+        post_data:
+          model: $MODEL_NAME
+          messages:
+            - role: user
+              content: Hello! What is your name?
+          max_completion_tokens: 1

-setup: |
-  conda create -n vllm python=3.10 -y
-  conda activate vllm
+    resources:
+      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+      use_spot: True
+      disk_size: 512  # Ensure model checkpoints can fit.
+      disk_tier: best
+      ports: 8081  # Expose to internet traffic.

-  pip install vllm==0.4.0.post1
-  # Install Gradio for web UI.
-  pip install gradio openai
-  pip install flash-attn==2.5.7
+    envs:
+      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

-run: |
-  conda activate vllm
-  echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
-    --port 8081 \
-    --model $MODEL_NAME \
-    --trust-remote-code \
-    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-    2>&1 | tee api_server.log
-```
+    setup: |
+      conda create -n vllm python=3.10 -y
+      conda activate vllm

-</details>
+      pip install vllm==0.4.0.post1
+      # Install Gradio for web UI.
+      pip install gradio openai
+      pip install flash-attn==2.5.7
+
+    run: |
+      conda activate vllm
+      echo 'Starting vllm api server...'
+      python -u -m vllm.entrypoints.openai.api_server \
+        --port 8081 \
+        --model $MODEL_NAME \
+        --trust-remote-code \
+        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+        2>&1 | tee api_server.log
+    ```

 Start serving the Llama-3 8B model on multiple replicas:

@@ -170,8 +171,7 @@ Wait until the service is ready:
 watch -n10 sky serve status vllm
 ```

-<details>
-<summary>Example outputs:</summary>
+Example outputs:

 ```console
 Services
@@ -184,29 +184,29 @@ vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP([Spot]{'L4': 1})  R
 vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
 ```

-</details>
-
 After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:

-```console
-ENDPOINT=$(sky serve status --endpoint 8081 vllm)
-curl -L http://$ENDPOINT/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
-    "messages": [
-    {
-      "role": "system",
-      "content": "You are a helpful assistant."
-    },
-    {
-      "role": "user",
-      "content": "Who are you?"
-    }
-    ],
-    "stop_token_ids": [128009,  128001]
-  }'
-```
+??? Commands
+
+    ```bash
+    ENDPOINT=$(sky serve status --endpoint 8081 vllm)
+    curl -L http://$ENDPOINT/v1/chat/completions \
+      -H "Content-Type: application/json" \
+      -d '{
+        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant."
+        },
+        {
+          "role": "user",
+          "content": "Who are you?"
+        }
+        ],
+        "stop_token_ids": [128009,  128001]
+      }'
+    ```

 To enable autoscaling, you could replace the `replicas` with the following configs in `service`:

@@ -220,57 +220,54 @@ service:

 This will scale the service up when the QPS exceeds 2 for each replica.

-<details>
-<summary>Click to see the full recipe YAML</summary>
+??? Yaml

-```yaml
-service:
-  replica_policy:
-    min_replicas: 2
-    max_replicas: 4
-    target_qps_per_replica: 2
-  # An actual request for readiness probe.
-  readiness_probe:
-    path: /v1/chat/completions
-    post_data:
-      model: $MODEL_NAME
-      messages:
-        - role: user
-          content: Hello! What is your name?
-      max_completion_tokens: 1
+    ```yaml
+    service:
+      replica_policy:
+        min_replicas: 2
+        max_replicas: 4
+        target_qps_per_replica: 2
+      # An actual request for readiness probe.
+      readiness_probe:
+        path: /v1/chat/completions
+        post_data:
+          model: $MODEL_NAME
+          messages:
+            - role: user
+              content: Hello! What is your name?
+          max_completion_tokens: 1

-resources:
-  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
-  use_spot: True
-  disk_size: 512  # Ensure model checkpoints can fit.
-  disk_tier: best
-  ports: 8081  # Expose to internet traffic.
+    resources:
+      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+      use_spot: True
+      disk_size: 512  # Ensure model checkpoints can fit.
+      disk_tier: best
+      ports: 8081  # Expose to internet traffic.

-envs:
-  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-  HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+    envs:
+      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.

-setup: |
-  conda create -n vllm python=3.10 -y
-  conda activate vllm
+    setup: |
+      conda create -n vllm python=3.10 -y
+      conda activate vllm

-  pip install vllm==0.4.0.post1
-  # Install Gradio for web UI.
-  pip install gradio openai
-  pip install flash-attn==2.5.7
+      pip install vllm==0.4.0.post1
+      # Install Gradio for web UI.
+      pip install gradio openai
+      pip install flash-attn==2.5.7

-run: |
-  conda activate vllm
-  echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
-    --port 8081 \
-    --model $MODEL_NAME \
-    --trust-remote-code \
-    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-    2>&1 | tee api_server.log
-```
-
-</details>
+    run: |
+      conda activate vllm
+      echo 'Starting vllm api server...'
+      python -u -m vllm.entrypoints.openai.api_server \
+        --port 8081 \
+        --model $MODEL_NAME \
+        --trust-remote-code \
+        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+        2>&1 | tee api_server.log
+    ```

 To update the service with the new config:

@@ -288,38 +285,35 @@ sky serve down vllm

 It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.

-<details>
-<summary>Click to see the full GUI YAML</summary>
+??? Yaml

-```yaml
-envs:
-  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-  ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
+    ```yaml
+    envs:
+      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+      ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.

-resources:
-  cpus: 2
+    resources:
+      cpus: 2

-setup: |
-  conda create -n vllm python=3.10 -y
-  conda activate vllm
+    setup: |
+      conda create -n vllm python=3.10 -y
+      conda activate vllm

-  # Install Gradio for web UI.
-  pip install gradio openai
+      # Install Gradio for web UI.
+      pip install gradio openai

-run: |
-  conda activate vllm
-  export PATH=$PATH:/sbin
+    run: |
+      conda activate vllm
+      export PATH=$PATH:/sbin

-  echo 'Starting gradio server...'
-  git clone https://github.com/vllm-project/vllm.git || true
-  python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-    -m $MODEL_NAME \
-    --port 8811 \
-    --model-url http://$ENDPOINT/v1 \
-    --stop-token-ids 128009,128001 | tee ~/gradio.log
-```
-
-</details>
+      echo 'Starting gradio server...'
+      git clone https://github.com/vllm-project/vllm.git || true
+      python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
+        -m $MODEL_NAME \
+        --port 8811 \
+        --model-url http://$ENDPOINT/v1 \
+        --stop-token-ids 128009,128001 | tee ~/gradio.log
+    ```

 1. Start the chat web UI: