[ROCm][Quantization] Support GPT-OSS model loading and emulation in amd-quark format (#29008)

Signed-off-by: xuebwang-amd <xuebwang@amd.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
xuebwang-amd
2026-02-10 23:08:05 +08:00
committed by GitHub
parent 599e4335a4
commit b129136c7a
13 changed files with 1094 additions and 213 deletions

View File

@@ -229,10 +229,15 @@ class Mxfp4Config(QuantizationConfig):
)
return None
def is_mxfp4_quant(self, prefix: str, layer: torch.nn.Module) -> bool:
    """Report whether the given layer should be MXFP4-quantized.

    For this config the answer is unconditionally ``True`` — neither
    *prefix* nor *layer* is inspected.
    """
    return True
class Mxfp4MoEMethod(FusedMoEMethodBase):
    # Fused-MoE quantization method whose weights use the "mxfp4" format.
    # NOTE(review): only the start of this class is visible in this chunk;
    # the rest of its methods are defined below this view.
    def __init__(self, moe: FusedMoEConfig):
        """Initialize the MXFP4 MoE method from a fused-MoE configuration.

        Args:
            moe: Fused-MoE configuration; ``moe.is_lora_enabled`` is read
                here to select the MXFP4 backend.
        """
        super().__init__(moe)
        # Tag the weight storage format for this method.
        self.weight_dtype = "mxfp4"
        # Backend selection depends on whether LoRA is enabled for this MoE.
        self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
        # Input dtype for the Marlin path; None here — presumably assigned
        # later (e.g. during weight processing), not visible in this chunk.
        self.marlin_input_dtype = None