[ROCm][Quantization] Support loading and emulating GPT_OSS models in amd-quark format (#29008)
Signed-off-by: xuebwang-amd <xuebwang@amd.com> Signed-off-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
@@ -229,10 +229,15 @@ class Mxfp4Config(QuantizationConfig):
|
||||
)
|
||||
return None
|
||||
|
||||
def is_mxfp4_quant(self, prefix: str, layer: torch.nn.Module) -> bool:
    """Report whether MXFP4 quantization applies to the given layer.

    This config quantizes everything with MXFP4, so the answer is
    unconditionally ``True`` regardless of ``prefix`` or ``layer``.
    """
    return True
|
||||
|
||||
|
||||
class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
def __init__(self, moe: FusedMoEConfig):
    """Initialize the MXFP4 MoE method for the given MoE configuration."""
    super().__init__(moe)
    # NOTE(review): presumably populated later, and only relevant when the
    # Marlin backend is selected — confirm against the rest of the class.
    self.marlin_input_dtype = None
    # Weights for this method are stored in the MXFP4 microscaling format.
    self.weight_dtype = "mxfp4"
    # Backend choice takes LoRA into account (some backends may not support it).
    self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
|
||||
|
||||
Reference in New Issue
Block a user