[V1] Support any head size for FlexAttention backend (#20467)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -33,9 +33,6 @@ if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
REQUIRES_V0_MODELS = [
|
||||
# V1 Test: no way to fall back for head_dim = 80
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
"qwen_vl",
|
||||
# V1 Test: not enough KV cache space in C1.
|
||||
"fuyu",
|
||||
]
|
||||
@@ -221,8 +218,7 @@ VLM_TEST_SETTINGS = {
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"blip2": VLMTestInfo(
|
||||
# TODO: Change back to 2.7b once head_dim = 80 is supported
|
||||
models=["Salesforce/blip2-opt-6.7b"],
|
||||
models=["Salesforce/blip2-opt-2.7b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
@@ -340,8 +336,7 @@ VLM_TEST_SETTINGS = {
|
||||
"h2ovl": VLMTestInfo(
|
||||
models = [
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
# TODO: Re-enable once head_dim = 80 is supported
|
||||
# "h2oai/h2ovl-mississippi-2b",
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
||||
|
||||
@@ -83,7 +83,7 @@ MODELS = [
|
||||
QWEN2_CONFIG,
|
||||
PHI3_CONFIG,
|
||||
GPT2_CONFIG,
|
||||
# STABLELM_CONFIG, # enable this when v1 support head_size=80
|
||||
STABLELM_CONFIG,
|
||||
DOLPHIN_CONFIG,
|
||||
# STARCODER_CONFIG, # broken
|
||||
]
|
||||
|
||||
@@ -240,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
|
||||
trust_remote_code=True),
|
||||
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
|
||||
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
|
||||
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
|
||||
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
|
||||
# Blocksparse attention not supported in V1 yet
|
||||
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
|
||||
trust_remote_code=True,
|
||||
v0_only=True),
|
||||
@@ -258,10 +259,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
|
||||
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
|
||||
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
|
||||
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
|
||||
v0_only=True),
|
||||
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
|
||||
v0_only=True),
|
||||
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501
|
||||
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
|
||||
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
|
||||
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
|
||||
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
|
||||
@@ -330,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
|
||||
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
|
||||
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
|
||||
extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501
|
||||
v0_only=True),
|
||||
extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
|
||||
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
|
||||
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
|
||||
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
|
||||
@@ -359,8 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
|
||||
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
|
||||
trust_remote_code=True,
|
||||
v0_only=True),
|
||||
trust_remote_code=True),
|
||||
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
||||
max_model_len=10240),
|
||||
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
|
||||
|
||||
@@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
# FIXME: Possible memory leak in the previous tests?
|
||||
if model_arch == "GraniteSpeechForConditionalGeneration":
|
||||
if model_arch in ("GraniteSpeechForConditionalGeneration",
|
||||
"KimiVLForConditionalGeneration"):
|
||||
pytest.skip("Avoid OOM")
|
||||
|
||||
# Avoid OOM and reduce initialization time by only using 1 layer
|
||||
|
||||
Reference in New Issue
Block a user