diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py
index d20e1b5d..110dcb1f 100644
--- a/vllm/patches/deepseek_v4.py
+++ b/vllm/patches/deepseek_v4.py
@@ -1576,15 +1576,12 @@ class DeepseekV4Model(nn.Module):
         # wo_a: attention forward reads .weight and .weight_scale_inv directly
         # for fp8_einsum. Only layer that needs FP8 conversion.
         fp8_proj_names = {"wo_a"}
-        # Attention layers called via .forward() — need bf16
-        # cuBLAS BF16 is broken on Blackwell — nothing gets dequantized to BF16.
-        # Everything stays native NVFP4/FP8 via FlashInfer CUTLASS.
-        bf16_proj_names = set()
-        bf16_shared_names = set()
+        # No BF16 dequant paths active — cuBLAS BF16 is broken on Blackwell.
+        # wo_a goes NVFP4→FP8; compressor gets reconstructed from checkpoint;
+        # MoE experts stay native NVFP4 via CUTLASS kernel.
         
         fp8_converted = 0
         fp8_from_bf16 = 0
-        bf16_converted = 0
         compressor_converted = 0
 
         # Build shard index once for compressor reconstruction (avoids N×M full-shard loads)
@@ -1608,16 +1605,6 @@ class DeepseekV4Model(nn.Module):
                     self._convert_bf16_to_fp8(mod, FP8_MAX)
                     fp8_from_bf16 += 1
             
-            # BF16 conversion: attention layers via .forward()
-            for proj_name in bf16_proj_names:
-                if not hasattr(attn, proj_name):
-                    continue
-                mod = getattr(attn, proj_name)
-                if not hasattr(mod, "weight") or mod.weight.dtype not in (torch.uint8, torch.int8):
-                    continue
-                self._dequant_nvfp4_to_bf16(mod, E2M1_LUT)
-                bf16_converted += 1
-            
             # Compressor: fused_wkv_wgate used via direct torch.mm
             # Compressor weights were SKIPPED during loading (skip patterns)
             # because the stacking weight_loader corrupts NVFP4 uint8 data.
@@ -1639,25 +1626,12 @@ class DeepseekV4Model(nn.Module):
                     if idx_compressor is not None and hasattr(idx_compressor, "fused_wkv_wgate"):
                         compressor_converted += self._reconstruct_compressor_weight(
                             idx_compressor.fused_wkv_wgate, indexer, layer_idx, E2M1_LUT, sub_path=".indexer", _shard_index=_shard_index)
-            
-            # Shared experts: dequantize NVFP4 → BF16
-            ffn = layer.ffn
-            if hasattr(ffn, "shared_experts") and ffn.shared_experts is not None:
-                for proj_name in bf16_shared_names:
-                    if not hasattr(ffn.shared_experts, proj_name):
-                        continue
-                    mod = getattr(ffn.shared_experts, proj_name)
-                    if not hasattr(mod, "weight") or mod.weight.dtype not in (torch.uint8, torch.int8):
-                        continue
-                    self._dequant_nvfp4_to_bf16(mod, E2M1_LUT)
-                    bf16_converted += 1
         
         total_fp8 = fp8_converted + fp8_from_bf16
-        total_bf16 = bf16_converted + compressor_converted
+        total_bf16 = compressor_converted
         if int(os.environ.get('NVFP4_DEBUG', '0')) and (total_fp8 > 0 or total_bf16 > 0):
             print(f"NVFP4 post-load: {fp8_converted} NVFP4->FP8, "
                   f"{fp8_from_bf16} BF16->FP8, "
-                  f"{bf16_converted} attn/shared->BF16, "
                   f"{compressor_converted} compressor->BF16")
 
 
@@ -1914,7 +1888,6 @@ class DeepseekV4Model(nn.Module):
             if hasattr(fused_mod, attr):
                 delattr(fused_mod, attr)
         return 1
-        return 0
 
     def _convert_bf16_to_fp8(self, mod, fp8_max):
         """Convert BF16 weight to FP8 for fp8_einsum path.
diff --git a/vllm/patches/staging_kernel.py b/vllm/patches/staging_kernel.py
index 769a4b5f..c4a5e709 100644
--- a/vllm/patches/staging_kernel.py
+++ b/vllm/patches/staging_kernel.py
@@ -150,8 +150,11 @@ def _deepseek_v4_stage_mega_moe_inputs_kernel(
 
     # Pack E2M1 pairs into single bytes (2 per byte, low nibble first)
     # mxf4nvf4 reads FP4 packed from SMEM — must match kernel's TMA layout
-    e2m1_lo = e2m1_4bit[0::2]  # even indices → low nibble
-    e2m1_hi = e2m1_4bit[1::2]  # odd indices → high nibble
+    # Reshape to pairs instead of strided indexing (Triton doesn't support
+    # [0::2] on reshaped tensors — unsupported tensor index error)
+    e2m1_pairs = tl.reshape(e2m1_4bit, [BLOCK_K // 2, 2])
+    e2m1_lo = e2m1_pairs[:, 0]  # even indices → low nibble
+    e2m1_hi = e2m1_pairs[:, 1]  # odd indices → high nibble
     e2m1_packed = (e2m1_hi << 4 | e2m1_lo).to(tl.uint8)  # [BLOCK_K // 2]
 
     k_offsets_out = k_block_id * (BLOCK_K // 2) + tl.arange(0, BLOCK_K // 2)