[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 13:24:23 +08:00
parent 493c275352
commit f17aec0d63
50 changed files with 3455 additions and 3180 deletions
--- a/docs/features/quantization/auto_awq.md
+++ b/docs/features/quantization/auto_awq.md
@@ -15,29 +15,31 @@ pip install autoawq

 After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:

-```python
-from awq import AutoAWQForCausalLM
-from transformers import AutoTokenizer
+??? Code

-model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-quant_path = 'mistral-instruct-v0.2-awq'
-quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+    ```python
+    from awq import AutoAWQForCausalLM
+    from transformers import AutoTokenizer

-# Load model
-model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
-)
-tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
+    quant_path = 'mistral-instruct-v0.2-awq'
+    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

-# Quantize
-model.quantize(tokenizer, quant_config=quant_config)
+    # Load model
+    model = AutoAWQForCausalLM.from_pretrained(
+        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-# Save quantized model
-model.save_quantized(quant_path)
-tokenizer.save_pretrained(quant_path)
+    # Quantize
+    model.quantize(tokenizer, quant_config=quant_config)

-print(f'Model is quantized and saved at "{quant_path}"')
-```
+    # Save quantized model
+    model.save_quantized(quant_path)
+    tokenizer.save_pretrained(quant_path)
+
+    print(f'Model is quantized and saved at "{quant_path}"')
+    ```

 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:

@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \

 AWQ models are also supported directly through the LLM entrypoint:

-```python
-from vllm import LLM, SamplingParams
+??? Code

-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ```python
+    from vllm import LLM, SamplingParams

-# Create an LLM.
-llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    # Create an LLM.
+    llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
--- a/docs/features/quantization/bitblas.md
+++ b/docs/features/quantization/bitblas.md
@@ -43,17 +43,19 @@ llm = LLM(

 ## Read gptq format checkpoint

-```python
-from vllm import LLM
-import torch
+??? Code

-# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
-model_id = "hxbgsyxh/llama-13b-4bit-g-1"
-llm = LLM(
-    model=model_id,
-    dtype=torch.float16,
-    trust_remote_code=True,
-    quantization="bitblas",
-    max_model_len=1024
-)
-```
+    ```python
+    from vllm import LLM
+    import torch
+
+    # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
+    model_id = "hxbgsyxh/llama-13b-4bit-g-1"
+    llm = LLM(
+        model=model_id,
+        dtype=torch.float16,
+        trust_remote_code=True,
+        quantization="bitblas",
+        max_model_len=1024
+    )
+    ```
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r

 Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.

-```python
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import QuantizationModifier
+??? Code

-# Configure the simple PTQ quantization
-recipe = QuantizationModifier(
-  targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    ```python
+    from llmcompressor.transformers import oneshot
+    from llmcompressor.modifiers.quantization import QuantizationModifier

-# Apply the quantization algorithm.
-oneshot(model=model, recipe=recipe)
+    # Configure the simple PTQ quantization
+    recipe = QuantizationModifier(
+      targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

-# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-model.save_pretrained(SAVE_DIR)
-tokenizer.save_pretrained(SAVE_DIR)
-```
+    # Apply the quantization algorithm.
+    oneshot(model=model, recipe=recipe)
+
+    # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+    model.save_pretrained(SAVE_DIR)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```

 ### 3. Evaluating Accuracy

--- a/docs/features/quantization/gguf.md
+++ b/docs/features/quantization/gguf.md
@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \

 You can also use the GGUF model directly through the LLM entrypoint:

-```python
-from vllm import LLM, SamplingParams
+??? Code

-# In this script, we demonstrate how to pass input to the chat method:
-conversation = [
-   {
-      "role": "system",
-      "content": "You are a helpful assistant"
-   },
-   {
-      "role": "user",
-      "content": "Hello"
-   },
-   {
-      "role": "assistant",
-      "content": "Hello! How can I assist you today?"
-   },
-   {
-      "role": "user",
-      "content": "Write an essay about the importance of higher education.",
-   },
-]
+      ```python
+      from vllm import LLM, SamplingParams

-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+      # In this script, we demonstrate how to pass input to the chat method:
+      conversation = [
+         {
+            "role": "system",
+            "content": "You are a helpful assistant"
+         },
+         {
+            "role": "user",
+            "content": "Hello"
+         },
+         {
+            "role": "assistant",
+            "content": "Hello! How can I assist you today?"
+         },
+         {
+            "role": "user",
+            "content": "Write an essay about the importance of higher education.",
+         },
+      ]

-# Create an LLM.
-llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-         tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.chat(conversation, sampling_params)
+      # Create a sampling params object.
+      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-# Print the outputs.
-for output in outputs:
-   prompt = output.prompt
-   generated_text = output.outputs[0].text
-   print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
+      # Create an LLM.
+      llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+               tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+      # Generate texts from the prompts. The output is a list of RequestOutput objects
+      # that contain the prompt, generated text, and other information.
+      outputs = llm.chat(conversation, sampling_params)
+
+      # Print the outputs.
+      for output in outputs:
+         prompt = output.prompt
+         generated_text = output.outputs[0].text
+         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+      ```
--- a/docs/features/quantization/gptqmodel.md
+++ b/docs/features/quantization/gptqmodel.md
@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t

 Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:

-```python
-from datasets import load_dataset
-from gptqmodel import GPTQModel, QuantizeConfig
+??? Code

-model_id = "meta-llama/Llama-3.2-1B-Instruct"
-quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
+    ```python
+    from datasets import load_dataset
+    from gptqmodel import GPTQModel, QuantizeConfig

-calibration_dataset = load_dataset(
-    "allenai/c4",
-    data_files="en/c4-train.00001-of-01024.json.gz",
-    split="train"
-  ).select(range(1024))["text"]
+    model_id = "meta-llama/Llama-3.2-1B-Instruct"
+    quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"

-quant_config = QuantizeConfig(bits=4, group_size=128)
+    calibration_dataset = load_dataset(
+        "allenai/c4",
+        data_files="en/c4-train.00001-of-01024.json.gz",
+        split="train"
+    ).select(range(1024))["text"]

-model = GPTQModel.load(model_id, quant_config)
+    quant_config = QuantizeConfig(bits=4, group_size=128)

-# increase `batch_size` to match gpu/vram specs to speed up quantization
-model.quantize(calibration_dataset, batch_size=2)
+    model = GPTQModel.load(model_id, quant_config)

-model.save(quant_path)
-```
+    # increase `batch_size` to match gpu/vram specs to speed up quantization
+    model.quantize(calibration_dataset, batch_size=2)
+
+    model.save(quant_path)
+    ```

 ## Running a quantized model with vLLM

@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \

 GPTQModel quantized models are also supported directly through the LLM entrypoint:

-```python
-from vllm import LLM, SamplingParams
+??? Code

-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
+    ```python
+    from vllm import LLM, SamplingParams

-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]

-# Create an LLM.
-llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
+    # Create an LLM.
+    llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")

-# Print the outputs.
-print("-"*50)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
    print("-"*50)
-```
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-"*50)
+    ```
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
 It's best to use calibration data that closely matches your deployment data.
 For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:

-```python
-from datasets import load_dataset
+??? Code

-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
+    ```python
+    from datasets import load_dataset

-# Load and preprocess the dataset
-ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
-ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+    NUM_CALIBRATION_SAMPLES = 512
+    MAX_SEQUENCE_LENGTH = 2048

-def preprocess(example):
-    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
-ds = ds.map(preprocess)
+    # Load and preprocess the dataset
+    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

-def tokenize(sample):
-    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
-ds = ds.map(tokenize, remove_columns=ds.column_names)
-```
+    def preprocess(example):
+        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+    ds = ds.map(preprocess)
+
+    def tokenize(sample):
+        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ```

 ### 3. Applying Quantization

 Now, apply the quantization algorithms:

-```python
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+??? Code

-# Configure the quantization algorithms
-recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+    ```python
+    from llmcompressor.transformers import oneshot
+    from llmcompressor.modifiers.quantization import GPTQModifier
+    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

-# Apply quantization
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-)
+    # Configure the quantization algorithms
+    recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

-# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
-```
+    # Apply quantization
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    )
+
+    # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```

 This process creates a W4A16 model with weights quantized to 4-bit integers.

@@ -137,34 +141,36 @@ $ lm_eval --model vllm \

 The following is an example of an expanded quantization recipe you can tune to your own use case:

-```python
-from compressed_tensors.quantization import (
-    QuantizationArgs,
-    QuantizationScheme,
-    QuantizationStrategy,
-    QuantizationType,
-) 
-recipe = GPTQModifier(
-    targets="Linear",
-    config_groups={
-        "config_group": QuantizationScheme(
-            targets=["Linear"],
-            weights=QuantizationArgs(
-                num_bits=4,
-                type=QuantizationType.INT,
-                strategy=QuantizationStrategy.GROUP,
-                group_size=128,
-                symmetric=True,
-                dynamic=False,
-                actorder="weight",
+??? Code
+
+    ```python
+    from compressed_tensors.quantization import (
+        QuantizationArgs,
+        QuantizationScheme,
+        QuantizationStrategy,
+        QuantizationType,
+    ) 
+    recipe = GPTQModifier(
+        targets="Linear",
+        config_groups={
+            "config_group": QuantizationScheme(
+                targets=["Linear"],
+                weights=QuantizationArgs(
+                    num_bits=4,
+                    type=QuantizationType.INT,
+                    strategy=QuantizationStrategy.GROUP,
+                    group_size=128,
+                    symmetric=True,
+                    dynamic=False,
+                    actorder="weight",
+                ),
            ),
-        ),
-    },
-    ignore=["lm_head"],
-    update_size=NUM_CALIBRATION_SAMPLES,
-    dampening_frac=0.01
-)
-```
+        },
+        ignore=["lm_head"],
+        update_size=NUM_CALIBRATION_SAMPLES,
+        dampening_frac=0.01
+    )
+    ```

 ## Troubleshooting and Support

--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
 It's best to use calibration data that closely matches your deployment data.
 For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:

-```python
-from datasets import load_dataset
+??? Code

-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
+    ```python
+    from datasets import load_dataset

-# Load and preprocess the dataset
-ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
-ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+    NUM_CALIBRATION_SAMPLES = 512
+    MAX_SEQUENCE_LENGTH = 2048

-def preprocess(example):
-    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
-ds = ds.map(preprocess)
+    # Load and preprocess the dataset
+    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

-def tokenize(sample):
-    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
-ds = ds.map(tokenize, remove_columns=ds.column_names)
-```
+    def preprocess(example):
+        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+    ds = ds.map(preprocess)
+
+    def tokenize(sample):
+        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ```
+
+</details>

 ### 3. Applying Quantization

 Now, apply the quantization algorithms:

-```python
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+??? Code

-# Configure the quantization algorithms
-recipe = [
-    SmoothQuantModifier(smoothing_strength=0.8),
-    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
-]
+    ```python
+    from llmcompressor.transformers import oneshot
+    from llmcompressor.modifiers.quantization import GPTQModifier
+    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

-# Apply quantization
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-)
+    # Configure the quantization algorithms
+    recipe = [
+        SmoothQuantModifier(smoothing_strength=0.8),
+        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+    ]

-# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
-```
+    # Apply quantization
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    )
+
+    # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```

 This process creates a W8A8 model with weights and activations quantized to 8-bit integers.

--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te

 Below is an example showing how to quantize a model using modelopt's PTQ API:

-```python
-import modelopt.torch.quantization as mtq
-from transformers import AutoModelForCausalLM
+??? Code

-# Load the model from HuggingFace
-model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
+    ```python
+    import modelopt.torch.quantization as mtq
+    from transformers import AutoModelForCausalLM

-# Select the quantization config, for example, FP8
-config = mtq.FP8_DEFAULT_CFG
+    # Load the model from HuggingFace
+    model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")

-# Define a forward loop function for calibration
-def forward_loop(model):
-    for data in calib_set:
-        model(data)
+    # Select the quantization config, for example, FP8
+    config = mtq.FP8_DEFAULT_CFG

-# PTQ with in-place replacement of quantized modules
-model = mtq.quantize(model, config, forward_loop)
-```
+    # Define a forward loop function for calibration
+    def forward_loop(model):
+        for data in calib_set:
+            model(data)
+
+    # PTQ with in-place replacement of quantized modules
+    model = mtq.quantize(model, config, forward_loop)
+    ```

 After the model is quantized, you can export it to a quantized checkpoint using the export API:

@@ -48,31 +50,33 @@ with torch.inference_mode():

 The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:

-```python
-from vllm import LLM, SamplingParams
+??? Code

-def main():
+    ```python
+    from vllm import LLM, SamplingParams

-    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
-    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
-    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
+    def main():

-    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
+        model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
+        # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+        llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
+        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

-    outputs = llm.generate(prompts, sampling_params)
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]

-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        outputs = llm.generate(prompts, sampling_params)

-if __name__ == "__main__":
-    main()
-```
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    if __name__ == "__main__":
+        main()
+    ```
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades

 Here is an example of how to enable FP8 quantization:

-```python
-# To calculate kv cache scales on the fly enable the calculate_kv_scales
-# parameter
+??? Code

-from vllm import LLM, SamplingParams
+    ```python
+    # To calculate kv cache scales on the fly enable the calculate_kv_scales
+    # parameter

-sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-          kv_cache_dtype="fp8",
-          calculate_kv_scales=True)
-prompt = "London is the capital of"
-out = llm.generate(prompt, sampling_params)[0].outputs[0].text
-print(out)
-```
+    from vllm import LLM, SamplingParams
+
+    sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
+    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
+            kv_cache_dtype="fp8",
+            calculate_kv_scales=True)
+    prompt = "London is the capital of"
+    out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+    print(out)
+    ```

 The `kv_cache_dtype` argument specifies the data type for KV cache storage:
 - `"auto"`: Uses the model's default "unquantized" data type
@@ -71,67 +73,69 @@ pip install llmcompressor

 Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):

-```python
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from llmcompressor.transformers import oneshot
+??? Code

-# Select model and load it
-MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    ```python
+    from datasets import load_dataset
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from llmcompressor.transformers import oneshot

-# Select calibration dataset
-DATASET_ID = "HuggingFaceH4/ultrachat_200k"
-DATASET_SPLIT = "train_sft"
+    # Select model and load it
+    MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

-# Configure calibration parameters
-NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
-MAX_SEQUENCE_LENGTH = 2048
+    # Select calibration dataset
+    DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+    DATASET_SPLIT = "train_sft"

-# Load and preprocess dataset
-ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
-ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+    # Configure calibration parameters
+    NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
+    MAX_SEQUENCE_LENGTH = 2048

-def process_and_tokenize(example):
-    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
-    return tokenizer(
-        text,
-        padding=False,
-        max_length=MAX_SEQUENCE_LENGTH,
-        truncation=True,
-        add_special_tokens=False,
+    # Load and preprocess dataset
+    ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+    def process_and_tokenize(example):
+        text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
+        return tokenizer(
+            text,
+            padding=False,
+            max_length=MAX_SEQUENCE_LENGTH,
+            truncation=True,
+            add_special_tokens=False,
+        )
+
+    ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
+
+    # Configure quantization settings
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: 8
+                    type: float
+                    strategy: tensor
+                    dynamic: false
+                    symmetric: true
+    """
+
+    # Apply quantization
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

-ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
-
-# Configure quantization settings
-recipe = """
-quant_stage:
-    quant_modifiers:
-        QuantizationModifier:
-            kv_cache_scheme:
-                num_bits: 8
-                type: float
-                strategy: tensor
-                dynamic: false
-                symmetric: true
-"""
-
-# Apply quantization
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-)
-
-# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
-```
+    # Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```

 The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.

--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below:
 Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
 to fetch model and tokenizer.

-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
+??? Code

-MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
-MAX_SEQ_LEN = 512
+    ```python
+    from transformers import AutoTokenizer, AutoModelForCausalLM

-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
-)
-model.eval()
+    MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
+    MAX_SEQ_LEN = 512

-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
-tokenizer.pad_token = tokenizer.eos_token
-```
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID, device_map="auto", torch_dtype="auto",
+    )
+    model.eval()
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
+    tokenizer.pad_token = tokenizer.eos_token
+    ```

 ### 2. Prepare the Calibration Dataloader

@@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
 to load calibration data. For more details about how to use calibration datasets efficiently, please refer
 to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).

-```python
-from datasets import load_dataset
-from torch.utils.data import DataLoader
+??? Code

-BATCH_SIZE = 1
-NUM_CALIBRATION_DATA = 512
+    ```python
+    from datasets import load_dataset
+    from torch.utils.data import DataLoader

-# Load the dataset and get calibration data.
-dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
-text_data = dataset["text"][:NUM_CALIBRATION_DATA]
+    BATCH_SIZE = 1
+    NUM_CALIBRATION_DATA = 512

-tokenized_outputs = tokenizer(text_data, return_tensors="pt",
-    padding=True, truncation=True, max_length=MAX_SEQ_LEN)
-calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
-    batch_size=BATCH_SIZE, drop_last=True)
-```
+    # Load the dataset and get calibration data.
+    dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
+    text_data = dataset["text"][:NUM_CALIBRATION_DATA]
+
+    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
+        padding=True, truncation=True, max_length=MAX_SEQ_LEN)
+    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
+        batch_size=BATCH_SIZE, drop_last=True)
+    ```

 ### 3. Set the Quantization Configuration

@@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
    AutoSmoothQuant config file for Llama is
    `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.

-```python
-from quark.torch.quantization import (Config, QuantizationConfig,
-                                     FP8E4M3PerTensorSpec,
-                                     load_quant_algo_config_from_file)
+??? Code

-# Define fp8/per-tensor/static spec.
-FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-    is_dynamic=False).to_quantization_spec()
+    ```python
+    from quark.torch.quantization import (Config, QuantizationConfig,
+                                        FP8E4M3PerTensorSpec,
+                                        load_quant_algo_config_from_file)

-# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
-global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-    weight=FP8_PER_TENSOR_SPEC)
+    # Define fp8/per-tensor/static spec.
+    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
+        is_dynamic=False).to_quantization_spec()

-# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
-KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
-kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
-kv_cache_quant_config = {name :
-    QuantizationConfig(input_tensors=global_quant_config.input_tensors,
-                       weight=global_quant_config.weight,
-                       output_tensors=KV_CACHE_SPEC)
-    for name in kv_cache_layer_names_for_llama}
-layer_quant_config = kv_cache_quant_config.copy()
+    # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
+    global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
+        weight=FP8_PER_TENSOR_SPEC)

-# Define algorithm config by config file.
-LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
-    'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
-algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
+    # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
+    KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
+    kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
+    kv_cache_quant_config = {name :
+        QuantizationConfig(input_tensors=global_quant_config.input_tensors,
+                        weight=global_quant_config.weight,
+                        output_tensors=KV_CACHE_SPEC)
+        for name in kv_cache_layer_names_for_llama}
+    layer_quant_config = kv_cache_quant_config.copy()

-EXCLUDE_LAYERS = ["lm_head"]
-quant_config = Config(
-    global_quant_config=global_quant_config,
-    layer_quant_config=layer_quant_config,
-    kv_cache_quant_config=kv_cache_quant_config,
-    exclude=EXCLUDE_LAYERS,
-    algo_config=algo_config)
-```
+    # Define algorithm config by config file.
+    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
+        'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
+    algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
+
+    EXCLUDE_LAYERS = ["lm_head"]
+    quant_config = Config(
+        global_quant_config=global_quant_config,
+        layer_quant_config=layer_quant_config,
+        kv_cache_quant_config=kv_cache_quant_config,
+        exclude=EXCLUDE_LAYERS,
+        algo_config=algo_config)
+    ```

 ### 4. Quantize the Model and Export

@@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to
 [HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
 for more exporting format details.

-```python
-import torch
-from quark.torch import ModelQuantizer, ModelExporter
-from quark.torch.export import ExporterConfig, JsonExporterConfig
+??? Code

-# Apply quantization.
-quantizer = ModelQuantizer(quant_config)
-quant_model = quantizer.quantize_model(model, calib_dataloader)
+    ```python
+    import torch
+    from quark.torch import ModelQuantizer, ModelExporter
+    from quark.torch.export import ExporterConfig, JsonExporterConfig

-# Freeze quantized model to export.
-freezed_model = quantizer.freeze(model)
+    # Apply quantization.
+    quantizer = ModelQuantizer(quant_config)
+    quant_model = quantizer.quantize_model(model, calib_dataloader)

-# Define export config.
-LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
-export_config = ExporterConfig(json_export_config=JsonExporterConfig())
-export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
+    # Freeze quantized model to export.
+    freezed_model = quantizer.freeze(model)

-# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
-EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
-exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
-with torch.no_grad():
-    exporter.export_safetensors_model(freezed_model,
-        quant_config=quant_config, tokenizer=tokenizer)
-```
+    # Define export config.
+    LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
+    export_config = ExporterConfig(json_export_config=JsonExporterConfig())
+    export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
+
+    # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
+    EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
+    exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
+    with torch.no_grad():
+        exporter.export_safetensors_model(freezed_model,
+            quant_config=quant_config, tokenizer=tokenizer)
+    ```

 ### 5. Evaluation in vLLM

 Now, you can load and run the Quark quantized model directly through the LLM entrypoint:

-```python
-from vllm import LLM, SamplingParams
+??? Code

-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ```python
+    from vllm import LLM, SamplingParams

-# Create an LLM.
-llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-          kv_cache_dtype='fp8',quantization='quark')
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-print("\nGenerated Outputs:\n" + "-" * 60)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt:    {prompt!r}")
-    print(f"Output:    {generated_text!r}")
-    print("-" * 60)
-```
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    # Create an LLM.
+    llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
+            kv_cache_dtype='fp8',quantization='quark')
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt:    {prompt!r}")
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+    ```

 Or, you can use `lm_eval` to evaluate accuracy:

--- a/docs/features/quantization/torchao.md
+++ b/docs/features/quantization/torchao.md
@@ -15,26 +15,28 @@ pip install \
 ## Quantizing HuggingFace Models
 You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:

-```Python
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int8WeightOnlyConfig
+??? Code

-model_name = "meta-llama/Meta-Llama-3-8B"
-quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype="auto",
-    device_map="auto",
-    quantization_config=quantization_config
-)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+    ```Python
+    import torch
+    from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+    from torchao.quantization import Int8WeightOnlyConfig

-hub_repo = # YOUR HUB REPO ID
-tokenizer.push_to_hub(hub_repo)
-quantized_model.push_to_hub(hub_repo, safe_serialization=False)
-```
+    model_name = "meta-llama/Meta-Llama-3-8B"
+    quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
+    quantized_model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype="auto",
+        device_map="auto",
+        quantization_config=quantization_config
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    input_text = "What are we having for dinner?"
+    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+    hub_repo = # YOUR HUB REPO ID
+    tokenizer.push_to_hub(hub_repo)
+    quantized_model.push_to_hub(hub_repo, safe_serialization=False)
+    ```

 Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.