diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ff5ce20d1..5eadc6bd8 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -731,7 +731,7 @@ steps: - label: Quantization Test # 70min timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 535f02820..7fb38708c 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -644,6 +644,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): assert output +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform." +) @pytest.mark.parametrize( "args", [ @@ -762,7 +765,10 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner): input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op assert isinstance(input_quant_op, QuantFP8) - assert input_quant_op._forward_method == input_quant_op.forward_cuda + assert input_quant_op._forward_method in ( + input_quant_op.forward_cuda, + input_quant_op.forward_hip, + ) llm.apply_model(check_model) diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 797b565b9..40ca587bc 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -10,6 +10,7 @@ from dataclasses import dataclass import pytest from vllm.config import ModelConfig +from vllm.platforms import current_platform @dataclass @@ -23,20 +24,44 @@ MODEL_ARG_EXPTYPES = [ # AUTOGPTQ # compat: autogptq <=0.7.1 is_marlin_format: bool # Model Serialized in Exllama Format. - ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"), - ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"), + ( + "TheBloke/Llama-2-7B-Chat-GPTQ", + None, + "gptq_marlin" if current_platform.is_cuda() else "gptq", + ), + ( + "TheBloke/Llama-2-7B-Chat-GPTQ", + "marlin", + "gptq_marlin" if current_platform.is_cuda() else "ERROR", + ), ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"), # compat: autogptq >=0.8.0 use checkpoint_format: str # Model Serialized in Exllama Format. 
- ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"), + ( + "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", + None, + "gptq_marlin" if current_platform.is_cuda() else "gptq", + ), + ( + "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", + "marlin", + "gptq_marlin" if current_platform.is_cuda() else "ERROR", + ), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"), # AUTOAWQ - ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"), + ( + "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", + None, + "awq_marlin" if current_platform.is_cuda() else "awq", + ), ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"), - ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"), + ( + "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", + "marlin", + "awq_marlin" if current_platform.is_cuda() else "ERROR", + ), ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"), ] diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index 1591ce1c4..3b58614e5 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -66,7 +66,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch): monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto") # Test wNa16 compare_two_settings( - "nm-testing/tinyllama-oneshot-w4a16-channel-v2", + "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16", ["--enforce_eager"], ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7b7b6c1d0..e8abe0d41 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -36,7 +36,9 @@ MODELS = [ reason="FP8 is not supported on this GPU type.", ) @pytest.mark.parametrize("model_id", MODELS) -@pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "force_marlin", [False] if current_platform.is_rocm() else [False, True] +) @pytest.mark.parametrize( "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False] ) @@ -125,7 +127,9 @@ def test_kv_cache_model_load_and_run( reason="FP8 is not supported on this GPU type.", ) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -@pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "force_marlin", [False] if current_platform.is_rocm() else [False, True] +) @pytest.mark.parametrize( "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False] ) @@ -197,10 +201,10 @@ def test_scaled_fp8_quant(dtype) -> None: def quantize_ref(tensor, inv_scale): # The reference implementation that fully aligns to # the kernel being tested. 
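
Note: the force_marlin changes above follow the platform-gated parametrization pattern used throughout this patch: combinations that a platform cannot run are pruned at collection time instead of failing or being skipped at run time. A minimal sketch of the pattern, assuming only pytest and vLLM's current_platform helper (the test body is a placeholder, not part of this patch):

import pytest

from vllm.platforms import current_platform

# Marlin kernels are CUDA-only, so ROCm runs exercise just the non-Marlin path.
FORCE_MARLIN = [False] if current_platform.is_rocm() else [False, True]


@pytest.mark.skipif(
    not (current_platform.is_cuda() or current_platform.is_rocm()),
    reason="Example targets CUDA/ROCm only.",
)
@pytest.mark.parametrize("force_marlin", FORCE_MARLIN)
def test_example(force_marlin: bool) -> None:
    # A real test would load an FP8 checkpoint here; omitted in this sketch.
    assert force_marlin in (False, True)
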
- finfo = torch.finfo(torch.float8_e4m3fn) + finfo = torch.finfo(current_platform.fp8_dtype()) scale = inv_scale.reciprocal() qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max) - qweight = qweight.to(torch.float8_e4m3fn) + qweight = qweight.to(current_platform.fp8_dtype()) return qweight def per_tensor_dequantize(tensor, inv_scale, dtype): @@ -267,6 +271,10 @@ def test_scaled_fp8_quant(dtype) -> None: ) +@pytest.mark.skipif( + current_platform.is_fp8_fnuz(), + reason="FP8 e4m3fn weight reloading is not supported on e4m3fnuz platforms", +) @pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod]) # FP8 weight reloading does not support online quantization @pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True]) # skip False diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 37fe2dd32..f35e49094 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinear from vllm.model_executor.layers.quantization.utils.gptq_utils import ( get_dynamic_override, ) +from vllm.platforms import current_platform PROMPT = "On the surface of Mars, we found" @@ -21,7 +22,10 @@ PROMPT = "On the surface of Mars, we found" # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized MODEL_QUANT = [ - ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", True), + ( + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", + current_platform.is_cuda(), + ), ( "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", False, diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index 61efd2ce6..6858062b9 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -6,18 +6,12 @@ Run `pytest tests/quantization/test_ptpc_fp8.py --forked`. """ import pytest -import torch from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod from vllm.platforms import current_platform -UNSUPPORTED_STR = ( - "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only " - "support output dtype of bfloat16. torch.float16 is specified." 
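
Note: quantize_ref above now keys the FP8 range off current_platform.fp8_dtype(), which resolves to torch.float8_e4m3fnuz on platforms where current_platform.is_fp8_fnuz() is true and to torch.float8_e4m3fn otherwise. A standalone sketch of the reference round-trip the test exercises, with the dtype hard-coded here as an assumption in place of the platform lookup:

import torch

fp8_dtype = torch.float8_e4m3fn  # assumption; current_platform.fp8_dtype() in vLLM


def quantize_ref(tensor: torch.Tensor, inv_scale: torch.Tensor) -> torch.Tensor:
    # Scale into the FP8 range, clamp to the representable extremes, then cast.
    finfo = torch.finfo(fp8_dtype)
    scale = inv_scale.reciprocal()
    q = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
    return q.to(fp8_dtype)


def dequantize_ref(
    q: torch.Tensor, inv_scale: torch.Tensor, dtype: torch.dtype
) -> torch.Tensor:
    # Undo the scale in float32 before casting back to the requested dtype.
    return (q.to(torch.float32) * inv_scale).to(dtype)


x = torch.randn(16, 16)
inv_scale = x.abs().max() / torch.finfo(fp8_dtype).max
x_rt = dequantize_ref(quantize_ref(x, inv_scale), inv_scale, torch.float32)
assert (x - x_rt).abs().max() < 0.3  # FP8 is lossy; only a coarse round-trip check
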
-) - @pytest.fixture(scope="function", autouse=True) def enable_pickle(monkeypatch): @@ -30,24 +24,17 @@ def enable_pickle(monkeypatch): reason="PTPC FP8 is not supported on this GPU type.", ) @pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.") -@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"]) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: - try: - llm = vllm_runner( - "facebook/opt-125m", - dtype=dtype, - quantization="ptpc_fp8", - enforce_eager=True, - kv_cache_dtype=kv_cache_dtype, - ) - except AssertionError as e: - if str(e) == UNSUPPORTED_STR: - # If the error message matches, the test passes - return - else: - # If the error message does not match, re-raise the exception - raise + llm = vllm_runner( + "facebook/opt-125m", + dtype=dtype, + quantization="ptpc_fp8", + enforce_eager=True, + kv_cache_dtype=kv_cache_dtype, + allow_deprecated_quantization=True, + ) with llm: @@ -60,9 +47,9 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: assert attn._k_scale == 1.0 assert attn._v_scale == 1.0 + # For GPUs with hardware support, we keep weights in fp8 if current_platform.has_device_capability(94): - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fnuz + assert fc1.weight.dtype == current_platform.fp8_dtype() llm.apply_model(check_model) diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 20a425b72..cf3da37b0 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -10,6 +10,11 @@ def is_quant_method_supported(quant_method: str) -> bool: if not (current_platform.is_cuda() or current_platform.is_rocm()): return False + try: + current_platform.verify_quantization(quant_method) + except ValueError: + return False + capability = current_platform.get_device_capability() assert capability is not None diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 1c1587ebe..f0f8868c1 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -5,6 +5,7 @@ from typing import Literal, get_args from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -98,6 +99,9 @@ def register_quantization_config(quantization: str): ) else: QUANTIZATION_METHODS.append(quantization) + # Automatically assume the custom quantization config is supported + if sq := current_platform.supported_quantization: + sq.append(quantization) if not issubclass(quant_config_cls, QuantizationConfig): raise ValueError( diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index 760f1f7f7..e4286f91b 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -9,6 +9,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm triton_scaled_mm, ) from vllm.model_executor.layers.quantization.utils import replace_parameter +from 
vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, +) from vllm.platforms import current_platform from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig @@ -37,6 +40,20 @@ class TritonScaledMMLinearKernel(ScaledMMLinearKernel): torch.nn.Parameter(weight.t().data, requires_grad=False), ) + # WEIGHT SCALE + # Triton kernel supports only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. + is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths) + replace_parameter( + layer, + self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False), + ) + # INPUT SCALE if self.config.is_static_input_scheme: input_scale = getattr(layer, self.i_s_name) diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index ed8a2c7fa..80efc29de 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -103,21 +103,25 @@ class PTPCFp8LinearMethod(Fp8LinearMethod): ) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) - - assert layer.weight.data.dtype == torch.bfloat16, ( - f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501 - ) - # Quantize the weights. - qweight, weight_scale = ops.scaled_fp8_quant( - layer.weight, scale=None, use_per_token_if_dynamic=True + assert layer.weight.data.dtype not in (torch.float16, torch.float32), ( + "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support " + f"output dtype of bfloat16. {layer.weight.data.dtype} is specified." ) - # Update the layer with the new values. - layer.weight = Parameter( - qweight.t(), requires_grad=False - ) # Pretranspose the weight - layer.weight_scale = Parameter(weight_scale, requires_grad=False) + if layer.weight.data.dtype == torch.bfloat16: + # Quantize the weights. + qweight, weight_scale = ops.scaled_fp8_quant( + layer.weight, scale=None, use_per_token_if_dynamic=True + ) + + # Update the layer with the new values. + layer.weight = Parameter( + qweight.t(), requires_grad=False + ) # Pretranspose the weight + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + else: + assert layer.weight.data.dtype == current_platform.fp8_dtype() + assert getattr(layer, "weight_scale", None) is not None layer.input_scale = None def apply( diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 3a55dd36d..d5df0013a 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -170,7 +170,9 @@ class RocmPlatform(Platform): supported_quantization: list[str] = [ "awq", + "awq_marlin", # will be overwritten with awq "gptq", + "gptq_marlin", # will be overwritten with gptq "fp8", "compressed-tensors", "fbgemm_fp8",
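
Note: with awq_marlin/gptq_marlin accepted on ROCm (and custom configs auto-appended in register_quantization_config), the tests' is_quant_method_supported helper can lean on Platform.verify_quantization, which raises ValueError for methods not in the platform's supported_quantization list. A small sketch of that check, mirroring the tests/quantization/utils.py change above:

from vllm.platforms import current_platform


def quant_method_allowed(quant_method: str) -> bool:
    # verify_quantization raises ValueError when the method is not listed in the
    # platform's supported_quantization; translate that into a boolean.
    try:
        current_platform.verify_quantization(quant_method)
    except ValueError:
        return False
    return True


# On ROCm, "gptq_marlin"/"awq_marlin" now pass this check even though the config
# layer later rewrites them to the plain "gptq"/"awq" kernels.
if __name__ == "__main__":
    for method in ("gptq", "gptq_marlin", "awq_marlin"):
        print(method, quant_method_allowed(method))
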