[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
|
||||
|
||||
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
from gptqmodel import GPTQModel, QuantizeConfig
|
||||
??? Code
|
||||
|
||||
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
from gptqmodel import GPTQModel, QuantizeConfig
|
||||
|
||||
calibration_dataset = load_dataset(
|
||||
"allenai/c4",
|
||||
data_files="en/c4-train.00001-of-01024.json.gz",
|
||||
split="train"
|
||||
).select(range(1024))["text"]
|
||||
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
|
||||
|
||||
quant_config = QuantizeConfig(bits=4, group_size=128)
|
||||
calibration_dataset = load_dataset(
|
||||
"allenai/c4",
|
||||
data_files="en/c4-train.00001-of-01024.json.gz",
|
||||
split="train"
|
||||
).select(range(1024))["text"]
|
||||
|
||||
model = GPTQModel.load(model_id, quant_config)
|
||||
quant_config = QuantizeConfig(bits=4, group_size=128)
|
||||
|
||||
# increase `batch_size` to match gpu/vram specs to speed up quantization
|
||||
model.quantize(calibration_dataset, batch_size=2)
|
||||
model = GPTQModel.load(model_id, quant_config)
|
||||
|
||||
model.save(quant_path)
|
||||
```
|
||||
# increase `batch_size` to match gpu/vram specs to speed up quantization
|
||||
model.quantize(calibration_dataset, batch_size=2)
|
||||
|
||||
model.save(quant_path)
|
||||
```
|
||||
|
||||
## Running a quantized model with vLLM
|
||||
|
||||
@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
|
||||
|
||||
GPTQModel quantized models are also supported directly through the LLM entrypoint:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
??? Code
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
|
||||
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Create an LLM.
|
||||
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
|
||||
|
||||
# Print the outputs.
|
||||
print("-"*50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
# Print the outputs.
|
||||
print("-"*50)
|
||||
```
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-"*50)
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user