[V1][Quantization] Add CUDA graph compatible v1 GGUF support (#18646)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-27 12:40:28 +08:00
parent 1f88dbd2bb
commit 1f1b1bc03b
5 changed files with 188 additions and 59 deletions
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -78,8 +78,12 @@ DOLPHIN_CONFIG = GGUFTestConfig(
 )

 MODELS = [
-    LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG,
-    DOLPHIN_CONFIG
+    LLAMA_CONFIG,
+    QWEN2_CONFIG,
+    PHI3_CONFIG,
+    GPT2_CONFIG,
+    # STABLELM_CONFIG,  # enable this when v1 supports head_size=80
+    DOLPHIN_CONFIG,
    # STARCODER_CONFIG, # broken
 ]