# tests/lora/test_qwen3_unembed.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for Qwen3 unembed LoRA support.

This test creates synthetic LoRA weights that include lm_head (output embedding)
to verify that Qwen3 properly supports LoRA on the unembed/lm_head layer.
"""

import json
import os
import tempfile

import numpy as np
import torch
from safetensors.torch import save_file

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

MODEL_PATH = "Qwen/Qwen3-0.6B"
HIDDEN_SIZE = 1024
VOCAB_SIZE = 151936


def create_qwen3_lora_with_lm_head(save_dir: str, rank: int = 8) -> None:
    """Write a synthetic Qwen3 LoRA adapter that also targets ``lm_head``.

    Two files are written into ``save_dir`` (the directory is created when
    missing): ``adapter_config.json`` and ``adapter_model.safetensors``.
    The adapter covers ``q_proj`` and ``v_proj`` of layer 0 plus ``lm_head``.

    Every ``lora_A`` matrix is small float16 noise and every ``lora_B``
    matrix is all zeros, so under the usual LoRA formulation (delta = B @ A)
    the adapter should leave the model's outputs unchanged.

    Args:
        save_dir: Directory that receives the adapter files.
        rank: LoRA rank. Used for the tensor shapes, the ``r`` config entry
            and, doubled, for ``lora_alpha``.
    """

    def _noise_a() -> torch.Tensor:
        # Unseeded numpy draw of shape (rank, HIDDEN_SIZE), scaled by 0.01.
        return torch.from_numpy(
            np.random.randn(rank, HIDDEN_SIZE).astype(np.float16) * 0.01
        )

    def _zero_b(out_features: int) -> torch.Tensor:
        # Zero-filled float16 matrix of shape (out_features, rank).
        return torch.zeros(out_features, rank, dtype=torch.float16)

    tensors = {}

    # Attention projections of the first decoder layer.
    # NOTE(review): both projections use HIDDEN_SIZE as the lora_B output
    # width -- confirm this matches the q_proj / v_proj output dimensions of
    # the checkpoint named by MODEL_PATH.
    for proj in ("q_proj", "v_proj"):
        attn_key = "base_model.model.model.layers.0.self_attn." + proj
        tensors[attn_key + ".lora_A.weight"] = _noise_a()
        tensors[attn_key + ".lora_B.weight"] = _zero_b(HIDDEN_SIZE)

    # Output embedding (lm_head): lora_B spans the vocabulary dimension.
    head_key = "base_model.model.lm_head"
    tensors[head_key + ".lora_A.weight"] = _noise_a()
    tensors[head_key + ".lora_B.weight"] = _zero_b(VOCAB_SIZE)

    config = {
        "peft_type": "LORA",
        "base_model_name_or_path": MODEL_PATH,
        "task_type": "CAUSAL_LM",
        "inference_mode": True,
        "r": rank,
        "lora_alpha": rank * 2,
        "lora_dropout": 0.0,
        "bias": "none",
        "target_modules": ["q_proj", "v_proj", "lm_head"],
    }

    os.makedirs(save_dir, exist_ok=True)
    config_path = os.path.join(save_dir, "adapter_config.json")
    weights_path = os.path.join(save_dir, "adapter_model.safetensors")

    with open(config_path, "w") as config_file:
        json.dump(config, config_file)
    save_file(tensors, weights_path)


def test_qwen3_unembed_lora():
    """Check that Qwen3 loads an lm_head LoRA adapter and generates with it.

    The test builds an engine with LoRA enabled, writes a synthetic adapter
    into a temporary directory, registers it with the engine, and then
    generates once without and once with the adapter. It only asserts that
    the adapter id is listed and that each call yields one non-empty
    completion; it does not compare the two outputs.
    """

    def _check_single_nonempty(results) -> None:
        # Exactly one result for the one prompt, with non-empty text.
        assert len(results) == 1
        assert len(results[0].outputs[0].text) > 0

    engine_kwargs = dict(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=8,
        max_model_len=128,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    with tempfile.TemporaryDirectory() as adapter_dir:
        # The engine is deliberately built before any torch tensors are
        # created by the adapter helper below; keep this ordering.
        engine = LLM(**engine_kwargs)

        # Only now materialize the adapter files on disk.
        create_qwen3_lora_with_lm_head(adapter_dir, rank=8)

        adapter = LoRARequest("lm_head_lora", 1, adapter_dir)
        engine.llm_engine.add_lora(adapter)

        assert 1 in engine.llm_engine.list_loras(), "lm_head LoRA should be loaded"

        # temperature=0 with a short completion budget for both runs.
        greedy = SamplingParams(temperature=0, max_tokens=32)
        prompt_batch = ["Hello, my name is"]

        # Baseline run: no adapter attached to the request.
        plain_results = engine.generate(prompt_batch, greedy, use_tqdm=False)
        _check_single_nonempty(plain_results)

        # Same prompt, this time routed through the lm_head adapter.
        adapted_results = engine.generate(
            prompt_batch, greedy, lora_request=adapter, use_tqdm=False
        )
        _check_single_nonempty(adapted_results)
[Bugfix][Model] Support LoRA on Qwen3 Output Embedding (#29816) Signed-off-by: kurt <kurt@thinkingmachines.ai> 2026-02-06 04:25:31 -08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`"""`
			`Tests for Qwen3 unembed LoRA support.`

			`This test creates synthetic LoRA weights that include lm_head (output embedding)`
			`to verify that Qwen3 properly supports LoRA on the unembed/lm_head layer.`
			`"""`

			`import json`
			`import os`
			`import tempfile`

			`import numpy as np`
			`import torch`
			`from safetensors.torch import save_file`

			`from vllm import LLM, SamplingParams`
			`from vllm.lora.request import LoRARequest`

			`MODEL_PATH = "Qwen/Qwen3-0.6B"`
			`HIDDEN_SIZE = 1024`
			`VOCAB_SIZE = 151936`


			`def create_qwen3_lora_with_lm_head(save_dir: str, rank: int = 8) -> None:`
			`"""Create synthetic Qwen3 LoRA weights with lm_head."""`
			`lora_weights = {}`
			`for module in ["q_proj", "v_proj"]:`
			`lora_A = torch.from_numpy(`
			`np.random.randn(rank, HIDDEN_SIZE).astype(np.float16) * 0.01`
			`)`
			`lora_B = torch.zeros(HIDDEN_SIZE, rank, dtype=torch.float16)`
			`key_prefix = f"base_model.model.model.layers.0.self_attn.{module}"`
			`lora_weights[f"{key_prefix}.lora_A.weight"] = lora_A`
			`lora_weights[f"{key_prefix}.lora_B.weight"] = lora_B`

			`# lm_head LoRA weights`
			`lora_weights["base_model.model.lm_head.lora_A.weight"] = torch.from_numpy(`
			`np.random.randn(rank, HIDDEN_SIZE).astype(np.float16) * 0.01`
			`)`
			`lora_weights["base_model.model.lm_head.lora_B.weight"] = torch.zeros(`
			`VOCAB_SIZE, rank, dtype=torch.float16`
			`)`

			`adapter_config = {`
			`"peft_type": "LORA",`
			`"base_model_name_or_path": MODEL_PATH,`
			`"task_type": "CAUSAL_LM",`
			`"inference_mode": True,`
			`"r": rank,`
			`"lora_alpha": rank * 2,`
			`"lora_dropout": 0.0,`
			`"bias": "none",`
			`"target_modules": ["q_proj", "v_proj", "lm_head"],`
			`}`

			`os.makedirs(save_dir, exist_ok=True)`
			`with open(os.path.join(save_dir, "adapter_config.json"), "w") as f:`
			`json.dump(adapter_config, f)`
			`save_file(lora_weights, os.path.join(save_dir, "adapter_model.safetensors"))`


			`def test_qwen3_unembed_lora():`
			`"""Verify Qwen3 can load and generate with LoRA adapters with lm_head."""`
			`with tempfile.TemporaryDirectory() as tmpdir:`
			`# Initialize engine first (before creating torch tensors)`
			`llm = LLM(`
			`model=MODEL_PATH,`
			`enable_lora=True,`
			`max_loras=4,`
			`max_lora_rank=8,`
			`max_model_len=128,`
			`gpu_memory_utilization=0.8,`
			`enforce_eager=True,`
			`)`

			`# Create LoRA weights after engine init`
			`create_qwen3_lora_with_lm_head(tmpdir, rank=8)`

			`lora_request = LoRARequest("lm_head_lora", 1, tmpdir)`
			`llm.llm_engine.add_lora(lora_request)`

			`assert 1 in llm.llm_engine.list_loras(), "lm_head LoRA should be loaded"`

			`# Test generation`
			`sampling_params = SamplingParams(temperature=0, max_tokens=32)`
			`prompts = ["Hello, my name is"]`

			`# Generate with base model (no LoRA)`
			`base_outputs = llm.generate(prompts, sampling_params, use_tqdm=False)`
			`assert len(base_outputs) == 1`
			`assert len(base_outputs[0].outputs[0].text) > 0`

			`# Generate with lm_head LoRA`
			`lora_outputs = llm.generate(`
			`prompts, sampling_params, lora_request=lora_request, use_tqdm=False`
			`)`
			`assert len(lora_outputs) == 1`
			`assert len(lora_outputs[0].outputs[0].text) > 0`