diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 56c2f292e..db44439f3 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -2526,6 +2526,7 @@ steps:
     - pytest -v -s -x lora/test_llm_with_multi_loras.py
     - pytest -v -s -x lora/test_olmoe_tp.py
     - pytest -v -s -x lora/test_gptoss_tp.py
+    - pytest -v -s -x lora/test_qwen35_densemoel_lora.py
 
 - label: Weight Loading Multiple GPU # 7.5m
diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py
index 855b6b796..68dd87233 100644
--- a/tests/lora/test_gptoss_tp.py
+++ b/tests/lora/test_gptoss_tp.py
@@ -5,6 +5,7 @@ import pytest
 
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 from ..utils import multi_gpu_test
 
@@ -69,6 +70,16 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
         assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
 
 
+@pytest.mark.skipif(
+    not current_platform.is_cuda(),
+    reason=(
+        "Mxfp4 LoRA on ROCm is blocked by a spawn compatibility issue. "
+        "The fused_moe_lora Triton kernel crashes in spawned subprocesses, "
+        "and vLLM forces spawn mode when HIP is initialized before "
+        "multiprocessing. Fixing this requires either making the LoRA "
+        "Triton kernel spawn-safe or pre-warming the kernel cache."
+    ),
+)
 @pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
 @pytest.mark.parametrize("specialize_active_lora", [True, False])
 def test_gpt_oss_lora(
diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index 78876ef7c..a8bc559ce 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -109,8 +109,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
         else:
             # fall back to the default config
             get_config_func = functools.partial(
                 try_get_optimal_moe_lora_config,
-                w1_shape=layer.w13_weight.size(),
-                w2_shape=layer.w2_weight.size(),
+                w1_shape=layer.w13_weight.shape,
+                w2_shape=layer.w2_weight.shape,
                 rank=rank,
                 top_k=top_k,
                 dtype=config_dtype,
diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
index 7fc49d8d8..42f53b200 100644
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -379,7 +379,11 @@ def _fused_moe_lora_kernel(
         )
         a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
 
-        accumulator += tl.dot(a, b)
+        # Cast operands to matching dtype for tl.dot. On ROCm, Triton's
+        # compiler may infer different types for a and b when merging
+        # if/else branches (TMA desc path returns fp32, tl.load returns
+        # the pointer's element type).
+        accumulator += tl.dot(a.to(tl.bfloat16), b.to(tl.bfloat16))
 
     if MUL_ROUTED_WEIGHT:
         moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0.0)
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index f4e3ed8e0..34bd58832 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -229,6 +229,9 @@ class FusedMoEQuantConfig:
     _w1: FusedMoEQuantDesc
     _w2: FusedMoEQuantDesc
     is_nvfp4_scale_swizzled: bool = True
+    # CK MXFP4 (gfx950) padding info for rocm_aiter_ops.fused_moe()
+    hidden_pad: int = 0
+    intermediate_pad: int = 0
 
     def __post_init__(self):
         assert not self.per_act_token_quant or self.block_shape is None, (
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index 5862abe20..735a55a0f 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -257,7 +257,7 @@ def triton_kernel_moe_forward(
         # sparse_logits.indx contains global expert IDs – remap to local.
         topk_ids = expert_map[sparse_logits.indx.to(torch.long)]
         topk_weights = sparse_logits.vals
-    local_num_experts = w1.size(0)
+    local_num_experts = w1.shape[0]
     routing_data, gather_idx, scatter_idx = make_routing_data(
         topk_ids, topk_weights, local_num_experts
     )
@@ -604,8 +604,8 @@ class BaseOAITritonExperts(mk.FusedMoEExpertsModular):
         require a specialized implementation, like MarlinExperts, they are
         free to override this function.
         """
-        assert w1.dim() == 3 and w2.dim() == 3
-        E, _, N = w1.size()
+        assert len(w1.shape) == 3 and len(w2.shape) == 3
+        E, _, N = w1.shape
         K = a1.size(-1)
 
         assert a1.dim() == 2
@@ -683,7 +683,7 @@ class OAITritonExperts(BaseOAITritonExperts):
         if expert_map is not None:
             topk_ids = expert_map[topk_ids]
 
-        local_num_experts = w1.size(0)
+        local_num_experts = w1.shape[0]
         if global_num_experts == -1:
             global_num_experts = local_num_experts
 
@@ -781,7 +781,7 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts):
         if expert_map is not None:
             topk_ids = expert_map[topk_ids]
 
-        local_num_experts = w1.size(0)
+        local_num_experts = w1.shape[0]
         if global_num_experts == -1:
             global_num_experts = local_num_experts
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 85fd1813a..f1905fd28 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -567,6 +567,13 @@ class FusedMoE(CustomOp):
         # for heuristic purposes, so it must be initialized first.
         self.quant_method: FusedMoEMethodBase = _get_quant_method()
 
+        # Quant methods (e.g. Mxfp4MoEMethod) may round up hidden_dim
+        # and intermediate_size in moe_config during __init__. Sync
+        # self.hidden_size so downstream consumers (e.g. LoRA) see the
+        # padded value.
+        if self.moe_config.hidden_dim != self.hidden_size:
+            self.hidden_size = self.moe_config.hidden_dim
+
         if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike():
             raise NotImplementedError(
                 "is_act_and_mul=False is supported only for CUDA and ROCm for now"
@@ -586,7 +593,7 @@ class FusedMoE(CustomOp):
 
         moe_quant_params = {
             "num_experts": self.local_num_experts,
-            "hidden_size": hidden_size,
+            "hidden_size": self.hidden_size,
             "unpadded_hidden_size": unpadded_hidden_size,
             "intermediate_size_per_partition": self.intermediate_size_per_partition,
             "params_dtype": params_dtype,
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index a6b498834..ca3536bd9 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -768,8 +768,8 @@ class FusedMoEExpertsModular(FusedMoEExperts):
         require a specialized implementation, like MarlinExperts, they are
         free to override this function.
         """
-        assert w1.dim() == 3 and w2.dim() == 3
-        E, N, _ = w1.size()
+        assert len(w1.shape) == 3 and len(w2.shape) == 3
+        E, N, _ = w1.shape
         K = a1.size(-1)
 
         if a1.dim() == 2:
@@ -1349,7 +1349,7 @@ class FusedMoEKernelModularImpl:
         else:
             output = torch.empty_like(hidden_states)
 
-        local_num_experts = w1.size(0)
+        local_num_experts = w1.shape[0]
         if global_num_experts == -1:
             global_num_experts = local_num_experts
diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
index 9d1c8e27b..09725631d 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
@@ -212,7 +212,11 @@ def select_mxfp4_moe_backend(
     # LoRA: separate experts backend path
    if config.is_lora_enabled:
        if not current_platform.is_cuda():
-            raise NotImplementedError("Mxfp4 LoRA only supported on CUDA Platform.")
+            # ROCm: Triton mxfp4 LoRA hits GPU memory faults due to
+            # triton_kernels.tensor.Tensor / HIP read-only page issues
+            # during weight swizzle and LoRA forward. Needs work from
+            # the triton_kernels/aiter side.
+            raise NotImplementedError("Mxfp4 LoRA is currently only supported on CUDA.")
         if envs.VLLM_MXFP4_USE_MARLIN is False and triton_kernels_supported:
             logger.info_once("Using Triton backend for mxfp4 lora")
             return Mxfp4MoeBackend.TRITON_UNFUSED, backend_to_kernel_cls(
@@ -775,6 +779,8 @@ def make_mxfp4_moe_quant_config(
     w2_scale: Union[torch.Tensor, "PrecisionConfig"],
     w1_bias: torch.Tensor | None = None,
     w2_bias: torch.Tensor | None = None,
+    hidden_pad: int = 0,
+    intermediate_pad: int = 0,
 ) -> FusedMoEQuantConfig | None:
     """Create a FusedMoEQuantConfig for the given MXFP4 backend."""
     if mxfp4_backend in (
@@ -796,12 +802,16 @@ def make_mxfp4_moe_quant_config(
         Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
         Mxfp4MoeBackend.CK,
     ):
-        return mxfp4_w4a16_moe_quant_config(
+        config = mxfp4_w4a16_moe_quant_config(
             w1_bias=w1_bias,
             w2_bias=w2_bias,
             w1_scale=w1_scale,
             w2_scale=w2_scale,
         )
+        if mxfp4_backend == Mxfp4MoeBackend.CK:
+            config.hidden_pad = hidden_pad
+            config.intermediate_pad = intermediate_pad
+        return config
     else:
         return ocp_mx_moe_quant_config(
             quant_dtype="mxfp4",
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 98af53fce..98c98b7c5 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -292,6 +292,8 @@ def rocm_aiter_fused_experts(
         doweight_stage1=apply_router_weight_on_input,
         num_local_tokens=num_local_tokens,
         output_dtype=output_dtype,
+        hidden_pad=quant_config.hidden_pad,
+        intermediate_pad=quant_config.intermediate_pad,
         bias1=quant_config.w1_bias if quant_config.use_mxfp4_w4a16 else None,
         bias2=quant_config.w2_bias if quant_config.use_mxfp4_w4a16 else None,
     )
@@ -332,7 +334,15 @@ class AiterExperts(mk.FusedMoEExpertsModular):
             (kFp8StaticChannelSym, kFp8DynamicTokenSym),
             (kMxfp4Static, None),
         ]
-        return (weight_key, activation_key) in SUPPORTED_W_A
+        if (weight_key, activation_key) not in SUPPORTED_W_A:
+            return False
+        # CK MXFP4 MoE kernels are only supported on gfx950.
+        if weight_key == kMxfp4Static:
+            from vllm.platforms.rocm import on_gfx950
+
+            if not on_gfx950():
+                return False
+        return True
 
     @staticmethod
     def _supports_activation(activation: MoEActivation) -> bool:
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index b8b0e5f36..8ce5432fe 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -158,6 +158,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             intermediate_size_per_partition_after_pad
         )
 
+        # CK (gfx950) padding info for rocm_aiter_ops.fused_moe()
+        self.hidden_pad = extra_weight_attrs.get("hidden_pad", 0)
+        self.intermediate_pad = (
+            intermediate_size_per_partition_after_pad - intermediate_size_per_partition
+        )
+
         # Fused gate_up_proj (column parallel)
         w13_weight = torch.nn.Parameter(
             torch.zeros(
@@ -362,6 +368,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             w2_scale=w2_scale,
             w1_bias=w1_bias,
             w2_bias=w2_bias,
+            hidden_pad=self.hidden_pad,
+            intermediate_pad=self.intermediate_pad,
         )
 
     def select_gemm_impl(