[Docs] Add docs about OOT Quantization Plugins (#32035)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -67,3 +67,160 @@ th:not(:first-child) {
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.

For the most up-to-date information on hardware support and quantization methods, please refer to [vllm/model_executor/layers/quantization](../../../vllm/model_executor/layers/quantization) or consult with the vLLM development team.
## Out-of-Tree Quantization Plugins

vLLM supports registering custom, out-of-tree quantization methods using the `@register_quantization_config` decorator. This allows you to implement and use your own quantization schemes without modifying the vLLM codebase.

### Registering a Custom Quantization Method

To register a custom quantization method, create a class that inherits from `QuantizationConfig` and decorate it with `@register_quantization_config`. The `get_quant_method` method dispatches to the appropriate quantization method based on the layer type:
```python
import torch

from vllm.model_executor.layers.quantization import (
    register_quantization_config,
)
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
)
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.fused_moe import FusedMoE


@register_quantization_config("my_quant")
class MyQuantConfig(QuantizationConfig):
    """Custom quantization config."""

    def get_name(self) -> str:
        return "my_quant"

    def get_supported_act_dtypes(self) -> list:
        return [torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        # Minimum GPU compute capability, -1 for no restriction
        return -1

    @staticmethod
    def get_config_filenames() -> list[str]:
        # Config files to search for in model directory
        return []

    @classmethod
    def from_config(cls, config: dict) -> "MyQuantConfig":
        # Create config from model's quantization config
        return cls()

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> QuantizeMethodBase | None:
        # Dispatch based on layer type
        # NOTE: you only need to implement methods you care about
        if isinstance(layer, LinearBase):
            return MyQuantLinearMethod()
        elif isinstance(layer, FusedMoE):
            return MyQuantMoEMethod(layer.moe_config)
        return None
```
|
||||
|
||||
### Required QuantizationConfig Methods

Your custom `QuantizationConfig` subclass must implement these abstract methods:

| Method | Description |
|--------|-------------|
| `get_name()` | Returns the name of the quantization method |
| `get_supported_act_dtypes()` | Returns list of supported activation dtypes (e.g., `torch.float16`) |
| `get_min_capability()` | Returns minimum GPU compute capability (e.g., 80 for Ampere, -1 for no restriction) |
| `get_config_filenames()` | Returns list of config filenames to search for in model directory |
| `from_config(config)` | Class method to create config from model's quantization config dict |
| `get_quant_method(layer, prefix)` | Returns the quantization method for a given layer, or `None` to skip |
||||
### Implementing a Quantized Linear Method

For linear layers, return a `QuantizeMethodBase` subclass from `get_quant_method`. You can extend `UnquantizedLinearMethod` as a starting point:

```python
from vllm.model_executor.layers.linear import UnquantizedLinearMethod


class MyQuantLinearMethod(UnquantizedLinearMethod):
    """Custom quantization method for linear layers."""

    def create_weights(
        self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs
    ):
        # Create quantized weights for the layer
        ...

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Apply custom quantization logic here
        ...
```
|
||||
|
||||
### Implementing a Quantized MoE Method

For Mixture of Experts (MoE) models, return a `FusedMoEMethodBase` subclass from `get_quant_method`. You can use `UnquantizedFusedMoEMethod` to skip MoE quantization:

```python
from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
    FusedMoEMethodBase,
)
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig


class MyQuantMoEMethod(FusedMoEMethodBase):
    """Custom quantization method for MoE layers."""

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        # Create quantized weights for the MoE layer
        ...

    def apply(
        self,
        layer: torch.nn.Module,
        router: "FusedMoERouter",
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor:
        # Apply MoE computation with quantized weights
        ...

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        # Return the MoE quantization configuration
        ...
```
|
||||
|
||||
See existing implementations like `Fp8MoEMethod` in `vllm/model_executor/layers/quantization/fp8.py` for reference.

### Using the Plugin

Once registered, you can use your custom quantization method with vLLM:

```python
# Register your quantization method (import the module containing your config)
import my_quant_plugin

from vllm import LLM

# Use the custom quantization method
llm = LLM(model="your-model", quantization="my_quant")
```

For more information on the plugin system, see the [Plugin System documentation](../../design/plugin_system.md).
|
||||
|
||||
Reference in New Issue
Block a user