[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 13:24:23 +08:00
parent 493c275352
commit f17aec0d63
50 changed files with 3455 additions and 3180 deletions
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory

 The following code configures vLLM in offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.

-```python
-from vllm import LLM, SamplingParams
+??? Code

-prompts = [
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ```python
+    from vllm import LLM, SamplingParams

-llm = LLM(
-    model="facebook/opt-6.7b",
-    tensor_parallel_size=1,
-    speculative_config={
-        "model": "facebook/opt-125m",
-        "num_speculative_tokens": 5,
-    },
-)
-outputs = llm.generate(prompts, sampling_params)
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
+    llm = LLM(
+        model="facebook/opt-6.7b",
+        tensor_parallel_size=1,
+        speculative_config={
+            "model": "facebook/opt-125m",
+            "num_speculative_tokens": 5,
+        },
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```

 To do the same in online mode, launch the server:

@@ -60,69 +62,73 @@ python -m vllm.entrypoints.openai.api_server \

 Then use a client:

-```python
-from openai import OpenAI
+??? Code

-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
+    ```python
+    from openai import OpenAI

-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
+    # Modify OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"

-models = client.models.list()
-model = models.data[0].id
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )

-# Completion API
-stream = False
-completion = client.completions.create(
-    model=model,
-    prompt="The future of AI is",
-    echo=False,
-    n=1,
-    stream=stream,
-)
+    models = client.models.list()
+    model = models.data[0].id

-print("Completion results:")
-if stream:
-    for c in completion:
-        print(c)
-else:
-    print(completion)
-```
+    # Completion API
+    stream = False
+    completion = client.completions.create(
+        model=model,
+        prompt="The future of AI is",
+        echo=False,
+        n=1,
+        stream=stream,
+    )
+
+    print("Completion results:")
+    if stream:
+        for c in completion:
+            print(c)
+    else:
+        print(completion)
+    ```

 ## Speculating by matching n-grams in the prompt

 The following code configures vLLM to use speculative decoding where proposals are generated by
 matching n-grams in the prompt. For more information, read [this thread](https://x.com/joao_gante/status/1747322413006643259).

-```python
-from vllm import LLM, SamplingParams
+??? Code

-prompts = [
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ```python
+    from vllm import LLM, SamplingParams

-llm = LLM(
-    model="facebook/opt-6.7b",
-    tensor_parallel_size=1,
-    speculative_config={
-        "method": "ngram",
-        "num_speculative_tokens": 5,
-        "prompt_lookup_max": 4,
-    },
-)
-outputs = llm.generate(prompts, sampling_params)
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
+    llm = LLM(
+        model="facebook/opt-6.7b",
+        tensor_parallel_size=1,
+        speculative_config={
+            "method": "ngram",
+            "num_speculative_tokens": 5,
+            "prompt_lookup_max": 4,
+        },
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```

 ## Speculating using MLP speculators

@@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam
 For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
 [this technical report](https://arxiv.org/abs/2404.19124).

-```python
-from vllm import LLM, SamplingParams
+??? Code

-prompts = [
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ```python
+    from vllm import LLM, SamplingParams

-llm = LLM(
-    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
-    tensor_parallel_size=4,
-    speculative_config={
-        "model": "ibm-ai-platform/llama3-70b-accelerator",
-        "draft_tensor_parallel_size": 1,
-    },
-)
-outputs = llm.generate(prompts, sampling_params)
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
+        tensor_parallel_size=4,
+        speculative_config={
+            "model": "ibm-ai-platform/llama3-70b-accelerator",
+            "draft_tensor_parallel_size": 1,
+        },
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```

 Note that these speculative models currently need to be run without tensor parallelism, although
 it is possible to run the main model using tensor parallelism (see example above). Since the
@@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub:
 The following code configures vLLM to use speculative decoding where proposals are generated by
 an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request-level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).

-```python
-from vllm import LLM, SamplingParams
+??? Code

-prompts = [
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ```python
+    from vllm import LLM, SamplingParams

-llm = LLM(
-    model="meta-llama/Meta-Llama-3-8B-Instruct",
-    tensor_parallel_size=4,
-    speculative_config={
-        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
-        "draft_tensor_parallel_size": 1,
-    },
-)
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-outputs = llm.generate(prompts, sampling_params)
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3-8B-Instruct",
+        tensor_parallel_size=4,
+        speculative_config={
+            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+            "draft_tensor_parallel_size": 1,
+        },
+    )

-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    outputs = llm.generate(prompts, sampling_params)

-```
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    ```

 A few important things to consider when using the EAGLE-based draft models: