[Kernel] Optimize FP8 support for MoE kernel / Mixtral via static scales (#4343)

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Philipp Moritz
Date: 2024-04-26 21:49:59 -07:00
Committed by: GitHub
Parent: 258a2c58d0
Commit: 12628d3c78
7 changed files with 95 additions and 18 deletions


@@ -14,6 +14,12 @@ from vllm.model_executor.utils import set_weight_attrs
 class Fp8Config(QuantizationConfig):
     """Config class for FP8."""
 
+    def __init__(
+        self,
+        activation_scheme: str = "dynamic",
+    ) -> None:
+        self.activation_scheme = activation_scheme
+
     @classmethod
     def get_name(cls) -> str:
         return "fp8"
@@ -35,7 +41,8 @@ class Fp8Config(QuantizationConfig):
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "Fp8Config":
-        return cls()
+        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
+        return cls(activation_scheme)
 
     def get_quant_method(
             self, layer: torch.nn.Module) -> Optional["QuantizeMethodBase"]:
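
A sketch of how the updated from_config path would be exercised when loading an FP8 checkpoint whose quantization config pins static scales; the config dict here is a hypothetical minimal example, and real checkpoints may carry additional keys:

# Hypothetical minimal quantization config, as might be serialized in a
# checkpoint's config; only "activation_scheme" is read by this diff.
fp8_cfg = Fp8Config.from_config({"activation_scheme": "static"})
assert fp8_cfg.activation_scheme == "static"

# Previously from_config returned cls() unconditionally, discarding the
# serialized scheme. "dynamic" remains the constructor default when the
# config is built directly.
default_cfg = Fp8Config()
assert default_cfg.activation_scheme == "dynamic"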