[doc] improve readability (#18675)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
Author: Reid
Date: 2025-05-25 16:40:31 +08:00 (committed by GitHub)
parent 624b77a2b3
commit 279f854519
20 changed files with 206 additions and 59 deletions

View File

@@ -42,7 +42,9 @@ print(f'Model is quantized and saved at "{quant_path}"')
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
```console
python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
python examples/offline_inference/llm_engine_example.py \
--model TheBloke/Llama-2-7b-Chat-AWQ \
--quantization awq
```
AWQ models are also supported directly through the LLM entrypoint:
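The Python snippet that follows is not shown in this diff; a minimal sketch of that usage, reusing the AWQ checkpoint from the command above, might look like:

```python
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Pass the pre-quantized AWQ checkpoint; quantization="awq" makes the method explicit.
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="awq")

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
```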

View File

@@ -33,7 +33,12 @@ import torch
# "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint.
model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas")
llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True,
quantization="bitblas"
)
```
## Read gptq format checkpoint
@@ -44,5 +49,11 @@ import torch
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024)
llm = LLM(
model=model_id,
dtype=torch.float16,
trust_remote_code=True,
quantization="bitblas",
max_model_len=1024
)
```
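Both snippets above stop after constructing the `LLM` object; a short, hedged usage sketch for actually generating text with the gptq-format checkpoint from the second snippet could be:

```python
import torch
from vllm import LLM, SamplingParams

# Rebuild the LLM from the pre-quantized gptq-format checkpoint shown above.
llm = LLM(
    model="hxbgsyxh/llama-13b-4bit-g-1",
    dtype=torch.float16,
    trust_remote_code=True,
    quantization="bitblas",
    max_model_len=1024,
)
outputs = llm.generate(
    ["What is the capital of France?"],
    SamplingParams(temperature=0.7, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```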

View File

@@ -27,7 +27,11 @@ from vllm import LLM
import torch
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True
)
```
## Inflight quantization: load as 4bit quantization
@@ -38,8 +42,12 @@ For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify
from vllm import LLM
import torch
model_id = "huggyllama/llama-7b"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes")
llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True,
quantization="bitsandbytes"
)
```
## OpenAI Compatible Server
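The body of this section is not included in the diff. As a rough sketch: once a model is served with BitsAndBytes quantization (for example `vllm serve huggyllama/llama-7b --quantization bitsandbytes`), it can be queried through the OpenAI-compatible API:

```python
from openai import OpenAI

# Points at a locally running vLLM OpenAI-compatible server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="huggyllama/llama-7b",
    prompt="San Francisco is a",
    max_tokens=32,
)
print(completion.choices[0].text)
```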

View File

@@ -14,14 +14,17 @@ To run a GGUF model with vLLM, you can download and use the local GGUF model fro
```console
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
# We recommend using the tokenizer from the base model to avoid a slow and potentially buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
```
You can also add `--tensor-parallel-size 2` to enable tensor-parallel inference across 2 GPUs:
```console
# We recommend using the tokenizer from the base model to avoid a slow and potentially buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
--tensor-parallel-size 2
```
!!! warning
@@ -31,7 +34,9 @@ GGUF assumes that huggingface can convert the metadata to a config file. In case
```console
# If your model is not supported by huggingface, you can manually provide a huggingface-compatible config path
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
--hf-config-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
```
You can also use the GGUF model directly through the LLM entrypoint:
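The example that follows is not part of this diff; a minimal sketch of such usage, pointing at the GGUF file downloaded above, might be:

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    # Reuse the base model's tokenizer, as recommended above.
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```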

View File

@@ -59,7 +59,8 @@ model.save(quant_path)
To run a GPTQModel-quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
```console
python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
python examples/offline_inference/llm_engine_example.py \
--model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
```
## Using GPTQModel with vLLM's Python API
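The code under this heading is not shown in the diff; a hedged sketch of loading the same checkpoint through the Python API could look like:

```python
from vllm import LLM, SamplingParams

# vLLM detects the GPTQ quantization from the checkpoint config.
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=128)

outputs = llm.generate(["Explain quantization in one sentence."], sampling_params)
print(outputs[0].outputs[0].text)
```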

View File

@@ -7,7 +7,9 @@ We recommend installing the latest torchao nightly with
```console
# Install the latest TorchAO nightly build
# Choose the CUDA version that matches your system (cu126, cu128, etc.)
pip install --pre torchao>=10.0.0 --index-url https://download.pytorch.org/whl/nightly/cu126
pip install \
--pre "torchao>=10.0.0" \
--index-url https://download.pytorch.org/whl/nightly/cu126
```
## Quantizing HuggingFace Models
@@ -20,7 +22,12 @@ from torchao.quantization import Int8WeightOnlyConfig
model_name = "meta-llama/Meta-Llama-3-8B"
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
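# The diff cuts the example off here. A hedged sketch of how it typically continues:
# run a quick generation to sanity-check the quantized model, then push it to the
# Hub so vLLM can load it later (the repo name below is hypothetical).
output = quantized_model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))

hub_repo = "your-username/Meta-Llama-3-8B-int8wo"  # hypothetical repo name
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
tokenizer.push_to_hub(hub_repo)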