[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 13:24:23 +08:00
parent 493c275352
commit f17aec0d63
50 changed files with 3455 additions and 3180 deletions
--- a/docs/features/quantization/gptqmodel.md
+++ b/docs/features/quantization/gptqmodel.md
@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t

 Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:

-```python
-from datasets import load_dataset
-from gptqmodel import GPTQModel, QuantizeConfig
+??? Code

-model_id = "meta-llama/Llama-3.2-1B-Instruct"
-quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
+    ```python
+    from datasets import load_dataset
+    from gptqmodel import GPTQModel, QuantizeConfig

-calibration_dataset = load_dataset(
-    "allenai/c4",
-    data_files="en/c4-train.00001-of-01024.json.gz",
-    split="train"
-  ).select(range(1024))["text"]
+    model_id = "meta-llama/Llama-3.2-1B-Instruct"
+    quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"

-quant_config = QuantizeConfig(bits=4, group_size=128)
+    calibration_dataset = load_dataset(
+        "allenai/c4",
+        data_files="en/c4-train.00001-of-01024.json.gz",
+        split="train"
+    ).select(range(1024))["text"]

-model = GPTQModel.load(model_id, quant_config)
+    quant_config = QuantizeConfig(bits=4, group_size=128)

-# increase `batch_size` to match gpu/vram specs to speed up quantization
-model.quantize(calibration_dataset, batch_size=2)
+    model = GPTQModel.load(model_id, quant_config)

-model.save(quant_path)
-```
+    # Increase `batch_size` to match your GPU/VRAM capacity and speed up quantization
+    model.quantize(calibration_dataset, batch_size=2)
+
+    model.save(quant_path)
+    ```

 ## Running a quantized model with vLLM

@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \

 GPTQModel quantized models are also supported directly through the LLM entrypoint:

-```python
-from vllm import LLM, SamplingParams
+??? Code

-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
+    ```python
+    from vllm import LLM, SamplingParams

-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]

-# Create an LLM.
-llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
+    # Create an LLM.
+    llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")

-# Print the outputs.
-print("-"*50)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
    print("-"*50)
-```
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-"*50)
+    ```