[BugFix] Fix GGUF tp>1 when vocab_size is not divisible by 64 (#12230)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -66,12 +66,20 @@ STARCODER_CONFIG = GGUFTestConfig(
|
||||
gguf_filename="starcoder2-3b.Q6_K.gguf",
|
||||
)
|
||||
|
||||
# GGUF test configuration for TinyDolphin, pairing the unquantized
# reference checkpoint with its Q6_K GGUF export.
DOLPHIN_CONFIG = GGUFTestConfig(
    # Test VocabParallelEmbedding sharding issue.
    # NOTE(review): per the commit title this covers tensor parallelism > 1
    # when vocab_size is not divisible by 64; presumably this model has such
    # a vocabulary -- confirm against the model config.
    original_model="cognitivecomputations/TinyDolphin-2.8-1.1b",
    gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF",
    gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf",
)
|
||||
|
||||
# GGUF test configurations collected for the model tests.
# NOTE(review): presumably consumed by the parametrization of `test_models`;
# the decorator is not visible in this diff -- confirm.
MODELS = [
    LLAMA_CONFIG,
    QWEN2_CONFIG,
    PHI3_CONFIG,
    GPT2_CONFIG,
    STABLELM_CONFIG,
    # Trailing comma keeps the list valid if the entry below is re-enabled.
    DOLPHIN_CONFIG,
    # STARCODER_CONFIG,  # broken
]
|
||||
|
||||
@@ -107,6 +115,7 @@ def test_models(
|
||||
|
||||
# Run unquantized model.
|
||||
with vllm_runner(model_name=model.original_model,
|
||||
enforce_eager=True, # faster tests
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tp_size) as original_model:
|
||||
@@ -115,6 +124,7 @@ def test_models(
|
||||
|
||||
# Run gguf model.
|
||||
with vllm_runner(model_name=model.gguf_model,
|
||||
enforce_eager=True,
|
||||
tokenizer_name=model.original_model,
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
|
||||
Reference in New Issue
Block a user