[V1][Quantization] Add CUDA graph compatible v1 GGUF support (#18646)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
@@ -8,7 +8,6 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
-from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
|
||||
from vllm.platforms import current_platform
|
||||
@@ -176,12 +175,11 @@ def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype,
|
||||
|
||||
w2_dequant = torch.tensor(dequantize(w2.data, quant_type),
|
||||
device="cuda").to(dtype)
|
||||
-act = SiluAndMul()
|
||||
|
||||
output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"),
|
||||
torch.tensor(w2.data,
|
||||
device="cuda"), topk_weights,
|
||||
-topk_ids, quant_type, quant_type, act)
|
||||
+topk_ids, quant_type, quant_type, "silu")
|
||||
|
||||
ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights,
|
||||
topk_ids).reshape(output.shape)
|
||||
|
||||
Reference in New Issue
Block a user