[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
Author: Reid
Date: 2025-06-23 13:24:23 +08:00 (committed by GitHub)
Commit: f17aec0d63 (parent: 493c275352)
50 changed files with 3455 additions and 3180 deletions

View File

@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
the third parameter is the path to the LoRA adapter.
??? Code

    ```python
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=256,
        stop=["[/assistant]"]
    )

    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
    ]

    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
    )
    ```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
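For readers who want the async path mentioned above, here is a minimal sketch (not taken from the linked example) of submitting a LoRA request through `AsyncLLMEngine`; the engine-args and `generate` signature follow the legacy async API, so treat the exact names as assumptions and use the linked script as the reference:

```python
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.lora.request import LoRARequest

async def main():
    # enable_lora must be set when the engine is built.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="meta-llama/Llama-2-7b-hf", enable_lora=True))

    sampling_params = SamplingParams(temperature=0, max_tokens=256, stop=["[/assistant]"])

    # Each request carries its own LoRARequest, so different adapters can be
    # served concurrently; sql_lora_path is the adapter path downloaded earlier.
    results = engine.generate(
        "[user] Write a SQL query ... [/user] [assistant]",
        sampling_params,
        request_id="lora-demo-0",
        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
    )
    async for request_output in results:
        final_output = request_output
    print(final_output.outputs[0].text)

asyncio.run(main())
```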
@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
??? Command

    ```bash
    curl localhost:8000/v1/models | jq .
    {
        "object": "list",
        "data": [
            {
                "id": "meta-llama/Llama-2-7b-hf",
                "object": "model",
                ...
            },
            {
                "id": "sql-lora",
                "object": "model",
                ...
            }
        ]
    }
    ```
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface.
    ??? Example of a simple S3 LoRAResolver implementation

        ```python
        import os

        import s3fs
        from vllm.lora.request import LoRARequest
        from vllm.lora.resolver import LoRAResolver


        class S3LoRAResolver(LoRAResolver):
            def __init__(self):
                self.s3 = s3fs.S3FileSystem()
                self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
                self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")

            async def resolve_lora(self, base_model_name, lora_name):
                s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
                local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)

                # Download the LoRA from S3 to the local path
                await self.s3._get(
                    s3_path, local_path, recursive=True, maxdepth=1
                )

                lora_request = LoRARequest(
                    lora_name=lora_name,
                    lora_path=local_path,
                    lora_int_id=abs(hash(lora_name))
                )

                return lora_request
        ```
2. Register `LoRAResolver` plugin.
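    As a rough illustration of this step (not part of the diff above), a plugin module could expose a hook that registers the resolver; the `LoRAResolverRegistry` name and `register_resolver` call are assumptions about `vllm.lora.resolver`, so check that module for the exact API:

    ```python
    # Hypothetical registration hook for the S3 resolver defined in step 1.
    # Assumes vllm.lora.resolver exposes a LoRAResolverRegistry with a
    # register_resolver(name, resolver) method.
    from vllm.lora.resolver import LoRAResolverRegistry

    def register_s3_resolver():
        # Invoked when vLLM loads plugins; makes the resolver discoverable
        # under the name "s3_resolver".
        LoRAResolverRegistry.register_resolver("s3_resolver", S3LoRAResolver())
    ```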
@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the LoRA adapter.
??? Command output

    ```bash
    $ curl http://localhost:8000/v1/models

    {
        "object": "list",
        "data": [
            {
                "id": "meta-llama/Llama-2-7b-hf",
                "object": "model",
                "created": 1715644056,
                "owned_by": "vllm",
                "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
                "parent": null,
                "permission": [
                    {
                        .....
                    }
                ]
            },
            {
                "id": "sql-lora",
                "object": "model",
                "created": 1715644056,
                "owned_by": "vllm",
                "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
                "parent": "meta-llama/Llama-2-7b-hf",
                "permission": [
                    {
                        ....
                    }
                ]
            }
        ]
    }
    ```

View File

@@ -20,111 +20,117 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
??? Code

    ```python
    from vllm import LLM

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

    # Load the image using PIL.Image
    image = PIL.Image.open(...)

    # Single prompt inference
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)

    # Batch inference
    image_1 = PIL.Image.open(...)
    image_2 = PIL.Image.open(...)
    outputs = llm.generate(
        [
            {
                "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
                "multi_modal_data": {"image": image_1},
            },
            {
                "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
                "multi_modal_data": {"image": image_2},
            }
        ]
    )

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
Full example: <gh-file:examples/offline_inference/vision_language.py>
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
??? Code

    ```python
    from vllm import LLM

    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,  # Required to load Phi-3.5-vision
        max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
        limit_mm_per_prompt={"image": 2},  # The maximum number to accept
    )

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"

    # Load the images using PIL.Image
    image1 = PIL.Image.open(...)
    image2 = PIL.Image.open(...)

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": [image1, image2]
        },
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
??? Code

    ```python
    from vllm import LLM

    # Specify the maximum number of frames per video to be 4. This can be changed.
    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})

    # Create the request payload.
    video_frames = ...  # load your video making sure it only has the number of frames specified earlier.
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
        ],
    }
    for i in range(len(video_frames)):
        base64_image = encode_image(video_frames[i])  # base64 encoding.
        new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
        message["content"].append(new_image)

    # Perform inference and log output.
    outputs = llm.chat([message])

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
### Video Inputs
@@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
??? Code

    ```python
    from vllm import LLM

    # Inference with image embeddings as input
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

    # Embeddings for single image
    # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
    image_embeds = torch.load(...)

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image_embeds},
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
??? Code

    ```python
    # Construct the prompt based on your model
    prompt = ...

    # Embeddings for multiple images
    # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
    image_embeds = torch.load(...)

    # Qwen2-VL
    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
    mm_data = {
        "image": {
            "image_embeds": image_embeds,
            # image_grid_thw is needed to calculate positional encoding.
            "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3),
        }
    }

    # MiniCPM-V
    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
    mm_data = {
        "image": {
            "image_embeds": image_embeds,
            # image_sizes is needed to calculate details of the sliced image.
            "image_sizes": [image.size for image in images],  # list of image sizes
        }
    }

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": mm_data,
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    ```
## Online Serving
@@ -235,51 +245,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
Then, you can use the OpenAI client as follows:
??? Code

    ```python
    from openai import OpenAI

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Single-image input inference
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

    chat_response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
                {"type": "text", "text": "What's in this image?"},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }],
    )
    print("Chat completion output:", chat_response.choices[0].message.content)

    # Multi-image input inference
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"

    chat_response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "What are the animals in these images?"},
                {"type": "image_url", "image_url": {"url": image_url_duck}},
                {"type": "image_url", "image_url": {"url": image_url_lion}},
            ],
        }],
    )
    print("Chat completion output:", chat_response.choices[0].message.content)
    ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@@ -311,44 +323,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
Then, you can use the OpenAI client as follows:
??? Code

    ```python
    from openai import OpenAI

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

    ## Use video url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": video_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from video url:", result)
    ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@@ -373,84 +387,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
Then, you can use the OpenAI client as follows:
??? Code

    ```python
    import base64

    import requests
    from openai import OpenAI

    from vllm.assets.audio import AudioAsset

    def encode_base64_content_from_url(content_url: str) -> str:
        """Encode a content retrieved from a remote url to base64 format."""

        with requests.get(content_url) as response:
            response.raise_for_status()
            result = base64.b64encode(response.content).decode('utf-8')

        return result

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Any format supported by librosa is supported
    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": audio_base64,
                        "format": "wav"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from input audio:", result)
    ```
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
??? Code

    ```python
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": audio_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from audio url:", result)
    ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@@ -470,61 +488,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server:
??? Code

    ```python
    image_embedding = torch.load(...)
    grid_thw = torch.load(...)  # Required by Qwen/Qwen2-VL-2B-Instruct

    buffer = io.BytesIO()
    torch.save(image_embedding, buffer)
    buffer.seek(0)
    binary_data = buffer.read()
    base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')

    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Basic usage - this is equivalent to the LLaVA example for offline inference
    model = "llava-hf/llava-1.5-7b-hf"
    embeds = {
        "type": "image_embeds",
        "image_embeds": f"{base64_image_embedding}"
    }

    # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
    model = "Qwen/Qwen2-VL-2B-Instruct"
    embeds = {
        "type": "image_embeds",
        "image_embeds": {
            "image_embeds": f"{base64_image_embedding}",  # Required
            "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
        },
    }

    model = "openbmb/MiniCPM-V-2_6"
    embeds = {
        "type": "image_embeds",
        "image_embeds": {
            "image_embeds": f"{base64_image_embedding}",  # Required
            "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
        },
    }

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": [
                {
                    "type": "text",
                    "text": "What's in this image?",
                },
                embeds,
            ],
            },
        ],
        model=model,
    )
    ```
!!! note
    Only one message can contain `{"type": "image_embeds"}`.

View File

@@ -15,29 +15,31 @@ pip install autoawq
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
??? Code

    ```python
    from awq import AutoAWQForCausalLM
    from transformers import AutoTokenizer

    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
    quant_path = 'mistral-instruct-v0.2-awq'
    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

    # Load model
    model = AutoAWQForCausalLM.from_pretrained(
        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Quantize
    model.quantize(tokenizer, quant_config=quant_config)

    # Save quantized model
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)

    print(f'Model is quantized and saved at "{quant_path}"')
    ```
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
AWQ models are also supported directly through the LLM entrypoint:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```

View File

@@ -43,17 +43,19 @@ llm = LLM(
## Read gptq format checkpoint
??? Code

    ```python
    from vllm import LLM
    import torch

    # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
    model_id = "hxbgsyxh/llama-13b-4bit-g-1"
    llm = LLM(
        model=model_id,
        dtype=torch.float16,
        trust_remote_code=True,
        quantization="bitblas",
        max_model_len=1024
    )
    ```

View File

@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # Configure the simple PTQ quantization
    recipe = QuantizationModifier(
        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

    # Apply the quantization algorithm.
    oneshot(model=model, recipe=recipe)

    # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
    ```
### 3. Evaluating Accuracy

View File

@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also use the GGUF model directly through the LLM entrypoint:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # In this script, we demonstrate how to pass input to the chat method:
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": "Hello"
        },
        {
            "role": "assistant",
            "content": "Hello! How can I assist you today?"
        },
        {
            "role": "user",
            "content": "Write an essay about the importance of higher education.",
        },
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.chat(conversation, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```

View File

@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
??? Code

    ```python
    from datasets import load_dataset
    from gptqmodel import GPTQModel, QuantizeConfig

    model_id = "meta-llama/Llama-3.2-1B-Instruct"
    quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"

    calibration_dataset = load_dataset(
        "allenai/c4",
        data_files="en/c4-train.00001-of-01024.json.gz",
        split="train"
    ).select(range(1024))["text"]

    quant_config = QuantizeConfig(bits=4, group_size=128)

    model = GPTQModel.load(model_id, quant_config)

    # increase `batch_size` to match gpu/vram specs to speed up quantization
    model.quantize(calibration_dataset, batch_size=2)

    model.save(quant_path)
    ```
## Running a quantized model with vLLM
@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
GPTQModel quantized models are also supported directly through the LLM entrypoint:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

    # Create an LLM.
    llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-"*50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-"*50)
    ```

View File

@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
??? Code

    ```python
    from datasets import load_dataset

    NUM_CALIBRATION_SAMPLES = 512
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess the dataset
    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def preprocess(example):
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    ```
### 3. Applying Quantization
Now, apply the quantization algorithms:
??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import GPTQModifier
    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

    # Configure the quantization algorithms
    recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
    SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```
This process creates a W4A16 model with weights quantized to 4-bit integers.
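As a quick sanity check (a usage sketch, not part of the snippet above), the saved `SAVE_DIR` can be loaded back into vLLM directly; the path below assumes the default directory name produced by the script:

```python
from vllm import LLM, SamplingParams

# vLLM reads the compressed-tensors quantization config from the checkpoint,
# so no extra quantization flag is needed here.
llm = LLM(model="./Meta-Llama-3-8B-Instruct-W4A16-G128")

sampling_params = SamplingParams(temperature=0.7, max_tokens=64)
output = llm.generate("The capital of France is", sampling_params)[0]
print(output.outputs[0].text)
```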
@@ -137,34 +141,36 @@ $ lm_eval --model vllm \
The following is an example of an expanded quantization recipe you can tune to your own use case:
??? Code

    ```python
    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationScheme,
        QuantizationStrategy,
        QuantizationType,
    )

    recipe = GPTQModifier(
        targets="Linear",
        config_groups={
            "config_group": QuantizationScheme(
                targets=["Linear"],
                weights=QuantizationArgs(
                    num_bits=4,
                    type=QuantizationType.INT,
                    strategy=QuantizationStrategy.GROUP,
                    group_size=128,
                    symmetric=True,
                    dynamic=False,
                    actorder="weight",
                ),
            ),
        },
        ignore=["lm_head"],
        update_size=NUM_CALIBRATION_SAMPLES,
        dampening_frac=0.01
    )
    ```
## Troubleshooting and Support

View File

@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
??? Code

    ```python
    from datasets import load_dataset

    NUM_CALIBRATION_SAMPLES = 512
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess the dataset
    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def preprocess(example):
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    ```
### 3. Applying Quantization
Now, apply the quantization algorithms:
??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import GPTQModifier
    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

    # Configure the quantization algorithms
    recipe = [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ]

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
    SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```
This process creates a W8A8 model with weights and activations quantized to 8-bit integers.

View File

@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
Below is an example showing how to quantize a model using modelopt's PTQ API:
??? Code

    ```python
    import modelopt.torch.quantization as mtq
    from transformers import AutoModelForCausalLM

    # Load the model from HuggingFace
    model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")

    # Select the quantization config, for example, FP8
    config = mtq.FP8_DEFAULT_CFG

    # Define a forward loop function for calibration
    def forward_loop(model):
        for data in calib_set:
            model(data)

    # PTQ with in-place replacement of quantized modules
    model = mtq.quantize(model, config, forward_loop)
    ```
After the model is quantized, you can export it to a quantized checkpoint using the export API:
@@ -48,31 +50,33 @@ with torch.inference_mode():
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
??? Code

    ```python
    from vllm import LLM, SamplingParams

    def main():

        model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
        # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
        llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]

        outputs = llm.generate(prompts, sampling_params)

        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    if __name__ == "__main__":
        main()
    ```

View File

@@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
Here is an example of how to enable FP8 quantization:
??? Code

    ```python
    # To calculate kv cache scales on the fly enable the calculate_kv_scales
    # parameter
    from vllm import LLM, SamplingParams

    sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
              kv_cache_dtype="fp8",
              calculate_kv_scales=True)
    prompt = "London is the capital of"
    out = llm.generate(prompt, sampling_params)[0].outputs[0].text
    print(out)
    ```
The `kv_cache_dtype` argument specifies the data type for KV cache storage:
- `"auto"`: Uses the model's default "unquantized" data type
@@ -71,67 +73,69 @@ pip install llmcompressor
Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
??? Code

    ```python
    from datasets import load_dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from llmcompressor.transformers import oneshot

    # Select model and load it
    MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Select calibration dataset
    DATASET_ID = "HuggingFaceH4/ultrachat_200k"
    DATASET_SPLIT = "train_sft"

    # Configure calibration parameters
    NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess dataset
    ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def process_and_tokenize(example):
        text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
        return tokenizer(
            text,
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

    # Configure quantization settings
    recipe = """
    quant_stage:
        quant_modifiers:
            QuantizationModifier:
                kv_cache_scheme:
                    num_bits: 8
                    type: float
                    strategy: tensor
                    dynamic: false
                    symmetric: true
    """

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```
The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
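To use the calibrated scales at inference time, a minimal sketch (assuming the default `SAVE_DIR` name above) is to point vLLM at the saved folder and keep the KV cache in FP8:

```python
from vllm import LLM, SamplingParams

# Load the checkpoint with calibrated KV-cache scales and store the KV cache in FP8.
llm = LLM(model="./Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
out = llm.generate("London is the capital of", sampling_params)[0].outputs[0].text
print(out)
```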

View File

@@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below:
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
to fetch model and tokenizer.
??? Code

    ```python
    from transformers import AutoTokenizer, AutoModelForCausalLM

    MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
    MAX_SEQ_LEN = 512

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map="auto", torch_dtype="auto",
    )
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
    tokenizer.pad_token = tokenizer.eos_token
    ```
### 2. Prepare the Calibration Dataloader
@@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
??? Code

    ```python
    from datasets import load_dataset
    from torch.utils.data import DataLoader

    BATCH_SIZE = 1
    NUM_CALIBRATION_DATA = 512

    # Load the dataset and get calibration data.
    dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
    text_data = dataset["text"][:NUM_CALIBRATION_DATA]

    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
                                  padding=True, truncation=True, max_length=MAX_SEQ_LEN)
    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
                                  batch_size=BATCH_SIZE, drop_last=True)
    ```
### 3. Set the Quantization Configuration
@@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
```python
from quark.torch.quantization import (Config, QuantizationConfig,
FP8E4M3PerTensorSpec,
load_quant_algo_config_from_file)
??? Code
# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
is_dynamic=False).to_quantization_spec()
```python
from quark.torch.quantization import (Config, QuantizationConfig,
FP8E4M3PerTensorSpec,
load_quant_algo_config_from_file)
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC)
# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
is_dynamic=False).to_quantization_spec()
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name :
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
layer_quant_config = kv_cache_quant_config.copy()
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC)
# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name :
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
layer_quant_config = kv_cache_quant_config.copy()
EXCLUDE_LAYERS = ["lm_head"]
quant_config = Config(
global_quant_config=global_quant_config,
layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS,
algo_config=algo_config)
```
# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
EXCLUDE_LAYERS = ["lm_head"]
quant_config = Config(
global_quant_config=global_quant_config,
layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS,
algo_config=algo_config)
```
### 4. Quantize the Model and Export
@@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more details on exporting formats.
```python
import torch
from quark.torch import ModelQuantizer, ModelExporter
from quark.torch.export import ExporterConfig, JsonExporterConfig
??? Code
# Apply quantization.
quantizer = ModelQuantizer(quant_config)
quant_model = quantizer.quantize_model(model, calib_dataloader)
```python
import torch
from quark.torch import ModelQuantizer, ModelExporter
from quark.torch.export import ExporterConfig, JsonExporterConfig
# Freeze quantized model to export.
freezed_model = quantizer.freeze(model)
# Apply quantization.
quantizer = ModelQuantizer(quant_config)
quant_model = quantizer.quantize_model(model, calib_dataloader)
# Define export config.
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
# Freeze quantized model to export.
freezed_model = quantizer.freeze(model)
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer)
```
# Define export config.
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer)
```
### 5. Evaluation in vLLM
Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
```python
from vllm import LLM, SamplingParams
??? Code
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
```python
from vllm import LLM, SamplingParams
# Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype='fp8', quantization='quark')
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}")
print(f"Output: {generated_text!r}")
print("-" * 60)
```
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype='fp8', quantization='quark')
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}")
print(f"Output: {generated_text!r}")
print("-" * 60)
```
Or, you can use `lm_eval` to evaluate accuracy:
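For instance, a minimal `lm_eval` run against the exported checkpoint might look like the sketch below; the task and the extra `--model_args` entries are illustrative assumptions rather than required settings:

```bash
# Illustrative sketch: evaluate the exported Quark checkpoint with lm-evaluation-harness.
lm_eval --model vllm \
    --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,quantization=quark,kv_cache_dtype=fp8 \
    --tasks gsm8k \
    --batch_size auto
```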

View File

@@ -15,26 +15,28 @@ pip install \
## Quantizing HuggingFace Models
You can quantize your own HuggingFace model with torchao, e.g. with [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to the HuggingFace Hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) using the following example code:
```Python
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig
??? Code
model_name = "meta-llama/Meta-Llama-3-8B"
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
```Python
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig
hub_repo = "YOUR_HUB_REPO_ID"  # replace with your Hugging Face Hub repo id
tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```
model_name = "meta-llama/Meta-Llama-3-8B"
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
hub_repo = "YOUR_HUB_REPO_ID"  # replace with your Hugging Face Hub repo id
tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.

View File

@@ -33,34 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
Next, make a request to the model that should return the reasoning content in the response.
```python
from openai import OpenAI
??? Code
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
```python
from openai import OpenAI
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
models = client.models.list()
model = models.data[0].id
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
response = client.chat.completions.create(model=model, messages=messages)
models = client.models.list()
model = models.data[0].id
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content:", reasoning_content)
print("content:", content)
```
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
print("reasoning_content:", reasoning_content)
print("content:", content)
```
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
@@ -68,77 +70,81 @@ The `reasoning_content` field contains the reasoning steps that led to the final
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
```json
{
"id": "chatcmpl-123",
"object": "chat.completion.chunk",
"created": 1694268190,
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"system_fingerprint": "fp_44709d6fcb",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"reasoning_content": "is"
},
"logprobs": null,
"finish_reason": null
}
]
}
```
??? Json
```json
{
"id": "chatcmpl-123",
"object": "chat.completion.chunk",
"created": 1694268190,
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"system_fingerprint": "fp_44709d6fcb",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant",
"reasoning_content": "is"
},
"logprobs": null,
"finish_reason": null
}
]
}
```
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does support extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:
```python
from openai import OpenAI
??? Code
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
```python
from openai import OpenAI
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
models = client.models.list()
model = models.data[0].id
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)
models = client.models.list()
model = models.data[0].id
print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)
for chunk in stream:
reasoning_content = None
content = None
# Check the content is reasoning_content or content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content
print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
```
for chunk in stream:
reasoning_content = None
content = None
# Check the content is reasoning_content or content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content
if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
```
Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
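For non-streaming responses, a minimal defensive-access sketch (assuming the `response` object from the non-streaming example above) could look like this:

```python
# Minimal sketch: access reasoning_content only if the parser produced it.
message = response.choices[0].message
reasoning = getattr(message, "reasoning_content", None)
if reasoning is not None:
    print("reasoning_content:", reasoning)
print("content:", message.content)
```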
@@ -146,41 +152,43 @@ Remember to check whether the `reasoning_content` exists in the response before
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
```python
from openai import OpenAI
??? Code
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
```python
from openai import OpenAI
tools = [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
tools = [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
}
}
}
}]
}]
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools,
tool_choice="auto"
)
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools,
tool_choice="auto"
)
print(response)
tool_call = response.choices[0].message.tool_calls[0].function
print(response)
tool_call = response.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
```
print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
@@ -192,85 +200,89 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
```python
# import the required packages
??? Code
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage)
```python
# import the required packages
# define a reasoning parser and register it to vllm
# the name list in register_module can be used
# in --reasoning-parser.
@ReasoningParserManager.register_module(["example"])
class ExampleParser(ReasoningParser):
def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer)
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage)
def extract_reasoning_content_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]:
"""
Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and
streaming. Has to be an instance method because it requires state -
the current tokens/diffs, but also the information about what has
previously been parsed and extracted (see constructor)
"""
# define a reasoning parser and register it to vllm
# the name list in register_module can be used
# in --reasoning-parser.
@ReasoningParserManager.register_module(["example"])
class ExampleParser(ReasoningParser):
def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer)
def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[Optional[str], Optional[str]]:
"""
Extract reasoning content from a complete model-generated string.
def extract_reasoning_content_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]:
"""
Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and
streaming. Has to be an instance method because it requires state -
the current tokens/diffs, but also the information about what has
previously been parsed and extracted (see constructor)
"""
Used for non-streaming responses where we have the entire model response
available before sending to the client.
def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[Optional[str], Optional[str]]:
"""
Extract reasoning content from a complete model-generated string.
Parameters:
model_output: str
The model-generated string to extract reasoning content from.
Used for non-streaming responses where we have the entire model response
available before sending to the client.
request: ChatCompletionRequest
The request object that was used to generate the model_output.
Parameters:
model_output: str
The model-generated string to extract reasoning content from.
Returns:
tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
"""
```
request: ChatCompletionRequest
The request object that was used to generate the model_output.
Returns:
tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
"""
```
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
```python
@dataclass
class DeepSeekReasoner(Reasoner):
"""
Reasoner for DeepSeek R series models.
"""
start_token_id: int
end_token_id: int
??? Code
start_token: str = "<think>"
end_token: str = "</think>"
```python
@dataclass
class DeepSeekReasoner(Reasoner):
"""
Reasoner for DeepSeek R series models.
"""
start_token_id: int
end_token_id: int
@classmethod
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
return cls(start_token_id=tokenizer.encode(
"<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>",
add_special_tokens=False)[0])
start_token: str = "<think>"
end_token: str = "</think>"
def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids
...
```
@classmethod
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
return cls(start_token_id=tokenizer.encode(
"<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>",
add_special_tokens=False)[0])
def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids
...
```
A structured output engine such as [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check whether reasoning content is present in the model output, and will skip structured output if that is the case.
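Conceptually, that gating can be sketched as follows; this is an illustrative snippet, not the actual xgrammar or vLLM implementation:

```python
# Hypothetical sketch: only enforce the grammar once the reasoning section has ended.
def should_enforce_grammar(reasoner, generated_token_ids: list[int]) -> bool:
    # While reasoning is still in progress, structured output is skipped.
    return reasoner.is_reasoning_end(generated_token_ids)
```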

View File

@@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
```python
from vllm import LLM, SamplingParams
??? Code
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
```python
from vllm import LLM, SamplingParams
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_config={
"model": "facebook/opt-125m",
"num_speculative_tokens": 5,
},
)
outputs = llm.generate(prompts, sampling_params)
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_config={
"model": "facebook/opt-125m",
"num_speculative_tokens": 5,
},
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
To perform the same in online mode, launch the server:
@@ -60,69 +62,73 @@ python -m vllm.entrypoints.openai.api_server \
Then use a client:
```python
from openai import OpenAI
??? Code
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
```python
from openai import OpenAI
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
models = client.models.list()
model = models.data[0].id
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Completion API
stream = False
completion = client.completions.create(
model=model,
prompt="The future of AI is",
echo=False,
n=1,
stream=stream,
)
models = client.models.list()
model = models.data[0].id
print("Completion results:")
if stream:
for c in completion:
print(c)
else:
print(completion)
```
# Completion API
stream = False
completion = client.completions.create(
model=model,
prompt="The future of AI is",
echo=False,
n=1,
stream=stream,
)
print("Completion results:")
if stream:
for c in completion:
print(c)
else:
print(completion)
```
## Speculating by matching n-grams in the prompt
The following code configures vLLM to use speculative decoding where proposals are generated by
matching n-grams in the prompt. For more information, read [this thread](https://x.com/joao_gante/status/1747322413006643259).
```python
from vllm import LLM, SamplingParams
??? Code
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
```python
from vllm import LLM, SamplingParams
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_config={
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 4,
},
)
outputs = llm.generate(prompts, sampling_params)
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_config={
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 4,
},
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
## Speculating using MLP speculators
@@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam
For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
[this technical report](https://arxiv.org/abs/2404.19124).
```python
from vllm import LLM, SamplingParams
??? Code
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
```python
from vllm import LLM, SamplingParams
llm = LLM(
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
tensor_parallel_size=4,
speculative_config={
"model": "ibm-ai-platform/llama3-70b-accelerator",
"draft_tensor_parallel_size": 1,
},
)
outputs = llm.generate(prompts, sampling_params)
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
llm = LLM(
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
tensor_parallel_size=4,
speculative_config={
"model": "ibm-ai-platform/llama3-70b-accelerator",
"draft_tensor_parallel_size": 1,
},
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the
@@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub:
The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
```python
from vllm import LLM, SamplingParams
??? Code
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
```python
from vllm import LLM, SamplingParams
llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
tensor_parallel_size=4,
speculative_config={
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1,
},
)
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(prompts, sampling_params)
llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
tensor_parallel_size=4,
speculative_config={
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1,
},
)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
outputs = llm.generate(prompts, sampling_params)
```
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A few important things to consider when using the EAGLE based draft models:

View File

@@ -33,39 +33,43 @@ text.
Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="-",
)
model = client.models.list().data[0].id
??? Code
completion = client.chat.completions.create(
model=model,
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
```
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="-",
)
model = client.models.list().data[0].id
completion = client.chat.completions.create(
model=model,
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
```
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
```python
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
}
],
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
)
print(completion.choices[0].message.content)
```
??? Code
```python
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
}
],
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
)
print(completion.choices[0].message.content)
```
One of the most relevant features in structured text generation is the option to generate valid JSON with pre-defined fields and formats.
For this we can use the `guided_json` parameter in two different ways:
@@ -75,41 +79,43 @@ For this we can use the `guided_json` parameter in two different ways:
The next example shows how to use the `guided_json` parameter with a Pydantic model:
```python
from pydantic import BaseModel
from enum import Enum
??? Code
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
truck = "Truck"
coupe = "Coupe"
```python
from pydantic import BaseModel
from enum import Enum
class CarDescription(BaseModel):
brand: str
model: str
car_type: CarType
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
truck = "Truck"
coupe = "Coupe"
json_schema = CarDescription.model_json_schema()
class CarDescription(BaseModel):
brand: str
model: str
car_type: CarType
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
}
],
response_format={
"type": "json_schema",
"json_schema": {
"name": "car-description",
"schema": CarDescription.model_json_schema()
json_schema = CarDescription.model_json_schema()
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
}
],
response_format={
"type": "json_schema",
"json_schema": {
"name": "car-description",
"schema": CarDescription.model_json_schema()
},
},
},
)
print(completion.choices[0].message.content)
```
)
print(completion.choices[0].message.content)
```
!!! tip
While not strictly necessary, normally it's better to indicate in the prompt the
@@ -121,33 +127,35 @@ difficult to use, but it´s really powerful. It allows us to define complete
languages like SQL queries. It works by using a context-free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries:
```python
simplified_sql_grammar = """
root ::= select_statement
??? Code
select_statement ::= "SELECT " column " from " table " where " condition
```python
simplified_sql_grammar = """
root ::= select_statement
column ::= "col_1 " | "col_2 "
select_statement ::= "SELECT " column " from " table " where " condition
table ::= "table_1 " | "table_2 "
column ::= "col_1 " | "col_2 "
condition ::= column "= " number
table ::= "table_1 " | "table_2 "
number ::= "1 " | "2 "
"""
condition ::= column "= " number
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)
```
number ::= "1 " | "2 "
"""
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)
```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@@ -161,34 +169,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r
Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
```python
from pydantic import BaseModel
??? Code
```python
from pydantic import BaseModel
class People(BaseModel):
name: str
age: int
class People(BaseModel):
name: str
age: int
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": "Generate a JSON with the name and age of one random person.",
}
],
response_format={
"type": "json_schema",
"json_schema": {
"name": "people",
"schema": People.model_json_schema()
}
},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```
completion = client.chat.completions.create(
model=model,
messages=[
{
"role": "user",
"content": "Generate a JSON with the name and age of one random person.",
}
],
response_format={
"type": "json_schema",
"json_schema": {
"name": "people",
"schema": People.model_json_schema()
}
},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@@ -202,33 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.
Here is a simple example demonstrating how to get structured output using Pydantic models:
```python
from pydantic import BaseModel
from openai import OpenAI
??? Code
class Info(BaseModel):
name: str
age: int
```python
from pydantic import BaseModel
from openai import OpenAI
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
model = client.models.list().data[0].id
completion = client.beta.chat.completions.parse(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
],
response_format=Info,
)
class Info(BaseModel):
name: str
age: int
message = completion.choices[0].message
print(message)
assert message.parsed
print("Name:", message.parsed.name)
print("Age:", message.parsed.age)
```
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
model = client.models.list().data[0].id
completion = client.beta.chat.completions.parse(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
],
response_format=Info,
)
Output:
message = completion.choices[0].message
print(message)
assert message.parsed
print("Name:", message.parsed.name)
print("Age:", message.parsed.age)
```
```console
ParsedChatCompletionMessage[Info](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Info(name='Cameron', age=28))
@@ -238,35 +248,37 @@ Age: 28
Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
```python
from typing import List
from pydantic import BaseModel
from openai import OpenAI
??? Code
class Step(BaseModel):
explanation: str
output: str
```python
from typing import List
from pydantic import BaseModel
from openai import OpenAI
class MathResponse(BaseModel):
steps: list[Step]
final_answer: str
class Step(BaseModel):
explanation: str
output: str
completion = client.beta.chat.completions.parse(
model=model,
messages=[
{"role": "system", "content": "You are a helpful expert math tutor."},
{"role": "user", "content": "Solve 8x + 31 = 2."},
],
response_format=MathResponse,
)
class MathResponse(BaseModel):
steps: list[Step]
final_answer: str
message = completion.choices[0].message
print(message)
assert message.parsed
for i, step in enumerate(message.parsed.steps):
print(f"Step #{i}:", step)
print("Answer:", message.parsed.final_answer)
```
completion = client.beta.chat.completions.parse(
model=model,
messages=[
{"role": "system", "content": "You are a helpful expert math tutor."},
{"role": "user", "content": "Solve 8x + 31 = 2."},
],
response_format=MathResponse,
)
message = completion.choices[0].message
print(message)
assert message.parsed
for i, step in enumerate(message.parsed.steps):
print(f"Step #{i}:", step)
print("Answer:", message.parsed.final_answer)
```
Output:
@@ -296,19 +308,21 @@ These parameters can be used in the same way as the parameters from the Online
Serving examples above. One example for the usage of the `choice` parameter is
shown below:
```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
??? Code
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
prompts="Classify this sentiment: vLLM is wonderful!",
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
prompts="Classify this sentiment: vLLM is wonderful!",
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)

View File

@@ -15,44 +15,46 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \
Next, make a request to the model that should result in it using the available tools:
```python
from openai import OpenAI
import json
??? Code
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
```python
from openai import OpenAI
import json
def get_weather(location: str, unit: str):
return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather}
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
tools = [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
def get_weather(location: str, unit: str):
return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather}
tools = [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
}
}
}
}]
}]
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools,
tool_choice="auto"
)
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools,
tool_choice="auto"
)
tool_call = response.choices[0].message.tool_calls[0].function
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
```
tool_call = response.choices[0].message.tool_calls[0].function
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
```
Example output:
@@ -301,49 +303,51 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen
Here is a summary of a plugin file:
```python
??? Code
# import the required packages
```python
# define a tool parser and register it to vllm
# the name list in register_module can be used
# in --tool-call-parser. you can define as many
# tool parsers as you want here.
@ToolParserManager.register_module(["example"])
class ExampleToolParser(ToolParser):
def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer)
# import the required packages
# adjust request. e.g.: set skip special tokens
# to False for tool call output.
def adjust_request(
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
return request
# define a tool parser and register it to vllm
# the name list in register_module can be used
# in --tool-call-parser. you can define as many
# tool parsers as you want here.
@ToolParserManager.register_module(["example"])
class ExampleToolParser(ToolParser):
def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer)
# implement the tool call parse for stream call
def extract_tool_calls_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
return delta
# adjust request. e.g.: set skip special tokens
# to False for tool call output.
def adjust_request(
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
return request
# implement the tool parse for non-stream call
def extract_tool_calls(
self,
model_output: str,
request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
return ExtractedToolCallInformation(tools_called=False,
tool_calls=[],
content=model_output)
# implement the tool call parse for stream call
def extract_tool_calls_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
return delta
```
# implement the tool parse for non-stream call
def extract_tool_calls(
self,
model_output: str,
request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
return ExtractedToolCallInformation(tools_called=False,
tool_calls=[],
content=model_output)
```
Then you can use this plugin in the command line like this.
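For example, assuming the plugin above is saved as `/path/to/example_tool_parser.py` (a hypothetical path) and registers the parser under the name `example`, the server could be launched roughly as follows:

```bash
vllm serve meta-llama/Llama-3.1-8B-Instruct \
    --enable-auto-tool-choice \
    --tool-parser-plugin /path/to/example_tool_parser.py \
    --tool-call-parser example \
    --chat-template <path to your chat template>
```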