# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for FusedMoE weight loading with padded hidden dimensions. When using DeepEP backends or NIXL EP with models like nemotron_h, hidden_size may be rounded up (e.g., 2688 -> 3072) for backend requirements. Weight parameters are created with the padded size, but checkpoint weights have the original unpadded size. These tests verify that weight loading correctly handles this mismatch. """ import pytest import torch from vllm.model_executor.layers.fused_moe.layer import FusedMoE class TestGetHiddenDim: """Unit tests for _get_hidden_dim.""" def test_2d_non_transposed_w2(self): # w2: shard_dim=1 (intermediate), hidden=0 assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=2) == 0 def test_2d_non_transposed_w13(self): # w1/w3: shard_dim=0 (intermediate), hidden=1 assert FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) == 1 def test_2d_transposed_w2(self): # transposed w2: shard_dim=0, hidden=1 assert FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) == 1 def test_2d_transposed_w13(self): # transposed w1/w3: shard_dim=1, hidden=0 assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=2) == 0 def test_3d_non_transposed_w2(self): # 3D w2: shard_dim=2, hidden=1 assert FusedMoE._get_hidden_dim(shard_dim=2, ndim=3) == 1 def test_3d_non_transposed_w13(self): # 3D w1/w3: shard_dim=1, hidden=2 assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=3) == 2 def test_3d_transposed_w2(self): # transposed 3D w2: shard_dim=1, hidden=2 assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=3) == 2 def test_3d_transposed_w13(self): # transposed 3D w1/w3: shard_dim=2, hidden=1 assert FusedMoE._get_hidden_dim(shard_dim=2, ndim=3) == 1 def test_1d_returns_zero(self): # 1D per-channel scales: always returns 0 assert FusedMoE._get_hidden_dim(shard_dim=0, ndim=1) == 0 assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=1) == 0 def test_invalid_shard_dim_raises(self): # shard_dim outside the data dimensions should raise with pytest.raises(ValueError, match="not a valid data dimension"): FusedMoE._get_hidden_dim(shard_dim=0, ndim=3) class TestNarrowExpertDataForPadding: """Unit tests for _narrow_expert_data_for_padding.""" def test_no_narrowing_when_shapes_match(self): expert_data = torch.zeros(1024, 1024) loaded_weight = torch.randn(1024, 1024) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) assert result.shape == loaded_weight.shape assert result.data_ptr() == expert_data.data_ptr() def test_narrow_w2_hidden_dim(self): # w2: (hidden_size, intermediate_size) - hidden_size padded at dim 0 expert_data = torch.zeros(3072, 1024) loaded_weight = torch.randn(2688, 1024) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) assert result.shape == (2688, 1024) def test_narrow_w13_hidden_dim(self): # w1/w3: (intermediate_size, hidden_size) - hidden_size padded at dim 1 expert_data = torch.zeros(2048, 3072) loaded_weight = torch.randn(2048, 2688) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=1 ) assert result.shape == (2048, 2688) def test_narrow_transposed_w2(self): # transposed w2: (intermediate_size, hidden_size) - hidden at dim 1 expert_data = torch.zeros(1024, 3072) loaded_weight = torch.randn(1024, 2688) hidden_dim = FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=hidden_dim ) assert result.shape == (1024, 2688) def test_narrow_3d_full_load(self): # 3D tensor for full_load path: w2 (num_experts, hidden_size, intermediate) expert_data = torch.zeros(8, 3072, 1024) loaded_weight = torch.randn(8, 2688, 1024) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=1 ) assert result.shape == (8, 2688, 1024) def test_narrow_1d_scale(self): # 1D scale tensor: per-channel w2 scale (hidden_size,) expert_data = torch.zeros(3072) loaded_weight = torch.randn(2688) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) assert result.shape == (2688,) def test_scalar_weight_no_op(self): # 0-dim tensor should be a no-op expert_data = torch.zeros(3072) loaded_weight = torch.tensor(1.0) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) # ndim == 0, so no narrowing assert result.shape == (3072,) def test_no_narrowing_when_loaded_weight_larger(self): # Guard: don't narrow if loaded_weight is larger than expert_data expert_data = torch.zeros(2688, 1024) loaded_weight = torch.randn(3072, 1024) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) assert result.shape == (2688, 1024) assert result.data_ptr() == expert_data.data_ptr() def test_negative_hidden_dim_is_noop(self): # Negative hidden_dim should be a safe no-op (0 <= check) expert_data = torch.zeros(3072, 1024) loaded_weight = torch.randn(2688, 1024) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=-1 ) # -1 fails the 0 <= check, so no narrowing assert result.shape == (3072, 1024) assert result.data_ptr() == expert_data.data_ptr() def test_only_narrows_hidden_dim(self): # Verify that only the specified hidden_dim is narrowed, # even when other dimensions also differ expert_data = torch.zeros(3072, 2048) loaded_weight = torch.randn(2688, 1024) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) # Only dim 0 (hidden) should be narrowed; dim 1 stays at 2048 assert result.shape == (2688, 2048) def test_narrowed_data_shares_storage(self): # Verify narrowing returns a view (writes go to original tensor) expert_data = torch.zeros(3072, 1024) loaded_weight = torch.randn(2688, 1024) result = FusedMoE._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) result.copy_(loaded_weight) # The first 2688 rows of expert_data should now have loaded_weight assert torch.equal(expert_data[:2688, :], loaded_weight) # Padded region should remain zero assert torch.equal(expert_data[2688:, :], torch.zeros(3072 - 2688, 1024)) class TestWeightLoadingWithPaddedHiddenSize: """Integration-style tests that simulate padded weight loading.""" def test_load_w2_with_padding(self): """Simulate loading w2 weights when hidden_size is padded.""" padded_hidden = 3072 original_hidden = 2688 intermediate = 1024 expert_data_full = torch.zeros(padded_hidden, intermediate) loaded_weight = torch.randn(original_hidden, intermediate) # w2 non-transposed: shard_dim=1, hidden_dim=0 hidden_dim = FusedMoE._get_hidden_dim(shard_dim=1, ndim=2) expert_data = FusedMoE._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim ) expert_data.copy_(loaded_weight) assert torch.equal(expert_data_full[:original_hidden, :], loaded_weight) assert torch.equal( expert_data_full[original_hidden:, :], torch.zeros(padded_hidden - original_hidden, intermediate), ) def test_load_w13_with_padding(self): """Simulate loading w1/w3 weights when hidden_size is padded.""" padded_hidden = 3072 original_hidden = 2688 intermediate = 1024 # w1/w3: (intermediate_size, hidden_size) expert_data_full = torch.zeros(intermediate, padded_hidden) loaded_weight = torch.randn(intermediate, original_hidden) # w1 non-transposed: shard_dim=0, hidden_dim=1 hidden_dim = FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) expert_data = FusedMoE._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim ) expert_data.copy_(loaded_weight) assert torch.equal(expert_data_full[:, :original_hidden], loaded_weight) assert torch.equal( expert_data_full[:, original_hidden:], torch.zeros(intermediate, padded_hidden - original_hidden), ) def test_load_transposed_w2_with_padding(self): """Simulate loading transposed w2 (GPTQ) with padded hidden_size.""" padded_hidden = 3072 original_hidden = 2688 intermediate = 1024 # transposed w2: (intermediate_size, hidden_size), shard_dim=0 expert_data_full = torch.zeros(intermediate, padded_hidden) loaded_weight = torch.randn(intermediate, original_hidden) hidden_dim = FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) expert_data = FusedMoE._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim ) expert_data.copy_(loaded_weight) assert torch.equal(expert_data_full[:, :original_hidden], loaded_weight) def test_no_padding_is_noop(self): """Verify that when sizes match, behavior is unchanged.""" hidden = 2048 intermediate = 1024 expert_data_full = torch.zeros(hidden, intermediate) loaded_weight = torch.randn(hidden, intermediate) hidden_dim = FusedMoE._get_hidden_dim(shard_dim=1, ndim=2) expert_data = FusedMoE._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim ) expert_data.copy_(loaded_weight) assert torch.equal(expert_data_full, loaded_weight) def test_bnb_shape_mismatch_raises(self): """BnB + padded hidden_size should raise via weight_loader.""" from unittest.mock import MagicMock num_experts = 1 padded_packed = 3072 # padded packed size original_packed = 2688 # original packed size # Build a param that looks like a BnB 4-bit MoE weight. param_data = torch.zeros(num_experts, padded_packed, 1, dtype=torch.uint8) param = torch.nn.Parameter(param_data, requires_grad=False) param.use_bitsandbytes_4bit = True loaded_weight = torch.randint(0, 255, (original_packed, 1), dtype=torch.uint8) # Minimal FusedMoE mock so weight_loader reaches the BnB path. moe = MagicMock(spec=FusedMoE) moe.quant_config = None moe.quant_method = MagicMock() moe.quant_method.__class__.__name__ = "BitsAndBytesMethod" moe._expert_map = None moe.tp_rank = 0 # Call the real weight_loader (unbound) with our mock as self. with pytest.raises(ValueError, match="BitsAndBytes"): FusedMoE.weight_loader( moe, param, loaded_weight, weight_name="w2", shard_id="w2", expert_id=0, )