Compare commits

...

333 Commits

Author SHA1 Message Date
zhanqiuhu
bbe0574d8e [Bugfix] Disable TRTLLM attention when KV transfer is enabled (#33192)
Signed-off-by: Zhanqiu Hu <zh338@cornell.edu>
2026-02-05 00:49:18 +00:00
Luka Govedič
4d9513537d [CI][torch.compile] Reduce e2e fusion test time (#33293)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: ProExpertProg <luka.govedic@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-04 19:09:03 -05:00
Ilya Boytsov
439afa4eea feat: Add ColBERT late interaction model support (#33686)
Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com>
Signed-off-by: Ilya Boytsov <boytsovpanamera@mail.ru>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-05 08:05:13 +08:00
Nick Hill
fa4e0fb028 [Core] Don't schedule spec tokens with prefill chunks (#33652)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-04 23:40:22 +00:00
Sage Moore
ce498a6d61 Change the type signature of MixtureOfExperts.expert_weights to MutableSequence[Sequence[Tensor]] (#33573)
Signed-off-by: Sage Moore <sagmoore@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-04 17:02:46 -05:00
Richard Zou
9f14c9224d Revert "[torch.compile] Significantly speed up cold start times" (#33820)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-04 21:59:59 +00:00
Muhammad Hashmi
535de06cb1 [Model] Add transcription support for Qwen3-Omni (#29828)
Signed-off-by: Muhammad Hashmi <mhashmi@berkeley.edu>
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: NickLucche <nlucches@redhat.com>
2026-02-04 21:17:47 +00:00
Simon Danielsson
4292c90a2a [Bugfix] Support RotaryEmbedding CustomOp for gpt-oss (#33800)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
2026-02-04 20:17:41 +00:00
Taeksang Kim
6e98f6d8b6 Implement zero-copy GQA for multimodal and CPU (#33732)
Signed-off-by: Taeksang Kim <ts.kim@hyperaccel.ai>
2026-02-04 20:11:39 +00:00
kourosh hakhamaneshi
2f6d17cb2f [rocm][ray] Fix: Unify Ray device visibility handling across CUDA and ROCm (#33308)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2026-02-04 10:09:14 -08:00
Isotr0py
192ad4648b [Bugfix] Fix interns1-pro initialization and PP (#33793)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-04 17:54:45 +00:00
Lucas Wilkinson
0e92298622 [Misc] Delay deprecation of CommonAttentionMetadata properties (#33801)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-02-04 08:41:57 -08:00
jiangkuaixue123
87d9a26166 [Bugfix] Fix ubatch wrapper num_tokens calculate (#33694)
Signed-off-by: jiangkuaixue123 <jiangxiaozhou111@163.com>
2026-02-04 16:41:45 +00:00
Cyrus Leung
80f921ba4b [Bugfix] Fix normalize still being passed to PoolerConfig (#33794)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-04 23:56:02 +08:00
Wentao Ye
711edaf0d0 [Perf] Optimize spec decoding + async scheduling, 1.5% Throughput improvement (#33612)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2026-02-04 09:34:32 -05:00
Micah Williamson
1d367a738e [Bugfix][ROCm] Include float8_e4m3fnuz in NCCL Dtype Dispatching (#33713)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-02-04 05:36:29 -08:00
Cyrus Leung
32a02c7ca2 Apply #33621 to main (#33758)
Signed-off-by: Zachary Aristei <zaristei@nvidia.com>
Co-authored-by: zaristei2 <zaristei2@gmail.com>
Co-authored-by: Zachary Aristei <zaristei@nvidia.com>
2026-02-04 05:35:39 -08:00
Chauncey
f67ee8b859 [Perf] Optimize chat completion streaming performance (#33782)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-02-04 12:30:36 +00:00
Cyrus Leung
e57ef99b40 [Model] Apply #32631 for recent models (#33785)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-04 12:23:01 +00:00
Yueqian Lin
f8516a1ab9 [Bugfix][Model] Fix audio-in-video support for Qwen2.5-Omni and Qwen3-Omni (#33605)
Signed-off-by: linyueqian <linyueqian@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-02-04 12:15:29 +00:00
Vadim Gimpelson
824058076c [PERF] Change GDN Attention State Layout from [N, HV, K, V] to [N, HV, V, K] (#33291)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-02-04 11:20:52 +00:00
Or Ozeri
8e32690869 [KV Connector][BugFix] scheduler: Delay freeing blocks of aborted async loads (#32255)
Fixes a not-yet-reported case where it was possible for blocks to be
freed by an abort before an async transfer completed, resulting
in corrupted KV data.

Signed-off-by: Or Ozeri <oro@il.ibm.com>
2026-02-04 11:16:34 +00:00
Zhengxu Chen
a208439537 [compile] Remove runner type from ignored caching factor list. (#33712)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-02-04 10:56:45 +00:00
Zhengxu Chen
bcd2f74c0d [compile] Clean up AOT compile bypass on evaluate_guards. (#33578)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-02-04 02:12:53 -08:00
Kunshang Ji
f79f777803 [XPU][2/N] add support unquantized moe support for xpu (#33659)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-04 02:12:25 -08:00
Augusto Yao
4c8d1bf361 use ORJSONResponse when available to improve the efficiency of request process (#33548)
Signed-off-by: augusto.yjh <augusto.yjh@antgroup.com>
2026-02-04 10:04:11 +00:00
Kunshang Ji
061da6bcf7 [XPU] remove common path warning log (#33769)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-04 16:40:17 +08:00
zhanqiuhu
4403e3ed4c [Metrics] Add labeled prompt token metrics for P/D disaggregation (#33290)
Add labeled Prometheus metrics to distinguish where prompt tokens come
from in P/D disaggregated deployments.

In P/D disaggregation, decode instances receive KV cache from prefill instances.
Currently, decode reports inflated prompt throughput because it counts all
prompt tokens as "computed", even though most were transferred.

This PR adds labeled metrics so users can understand actual compute work vs
transferred work:

vllm:prompt_tokens_by_source_total{source="local_compute"}        # Tokens prefilled locally
vllm:prompt_tokens_by_source_total{source="external_kv_transfer"} # Tokens received via KV transfer  
vllm:prompt_tokens_by_source_total{source="local_cache_hit"}      # Tokens from local prefix cache
vllm:prompt_tokens_cached_total                                    # Total cached (local + external, -1 when all 

Signed-off-by: Zhanqiu Hu <zh338@cornell.edu>
2026-02-04 07:46:48 +00:00
Matt
08e094997e [Hardware][AMD][CI] Refactor AMD tests to properly use BuildKite parallelism (#32745)
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
2026-02-04 14:51:33 +08:00
Wentao Ye
d88a1df699 [Deprecation] Deprecate profiling envs (#33722)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-04 05:58:21 +00:00
Cyrus Leung
90d74ebaa4 [Deprecation] Remove _get_data_parser in MM processor (#33757)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-04 05:51:52 +00:00
Frank Wang
45f8fd6f97 [Feature] Enable TRITON_ATTN for Batch Invariance (#33688)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
2026-02-04 13:27:34 +08:00
Wentao Ye
5e1e0a0fbd [Refactor] Remove unused dead code (#33718)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-03 21:25:11 -08:00
Michael Goin
eb5ed20743 [Bugfix] Define router_logits_dtype for remaining MoE models (#33737)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-04 13:24:14 +08:00
Huy Do
2647163674 Save startup benchmark results as a list of values (#33629)
Signed-off-by: Huy Do <huydhn@gmail.com>
2026-02-03 20:37:51 -08:00
Shanshan Shen
9fb27dd3b3 [MM] Align the prefix of MMEncoderAttention with Attention (#33750)
Signed-off-by: shen-shanshan <467638484@qq.com>
2026-02-04 04:07:30 +00:00
R3hankhan
4dffc5e044 [CPU] Split attention dispatch by head_dim alignment (#32161)
Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com>
2026-02-03 19:37:15 -08:00
Andrew Xia
e1bf04b6c2 [1/N] Initial Implementation of Parser for ResponsesAPI (#32712)
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2026-02-04 10:59:03 +08:00
Isotr0py
02080179a3 [Bugfix] Fix torchrun PP broadcast deadlock with async scheduling (#33701)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-04 02:17:37 +00:00
wang.yuqi
1b8fe6f7c4 [Frontend][4/n] Make pooling entrypoints request schema consensus | ScoreRequest (#33060)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-04 01:48:40 +00:00
Nick Hill
52ee21021a [BugFix][Spec Decoding] Fix negative accepted tokens metric crash (#33729)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-03 23:34:41 +00:00
Wentao Ye
655efb3e69 [Dependency] Remove comments of ray in dependency files (#33351)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-03 15:30:47 -08:00
Matthew Bonanni
bd8da29a66 [Bugfix] Fix sparse MLA metadata building (#33579)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-03 15:29:48 -08:00
Michael Goin
2a99c5a6c8 [Bugfix] Disable TRTLLM FP8 MoE if router_logits_dtype==float32 and routing_method!=DeepSeekV3 (#33613)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-03 13:26:51 -08:00
Patrick von Platen
3f7662d650 [Voxtral Realtime] Change name (#33716)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2026-02-03 13:03:28 -08:00
Vadim Gimpelson
a372f3f40a [MISC] Fix Tensor Parallelism for Quantized Mamba Models with n_groups=1 (#33257)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-02-03 15:10:31 -05:00
Harry Mellor
61e632aea1 Turn @config into a dataclass_transform (#31541)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-03 17:40:59 +00:00
Richard Zou
b1bb18de8d [torch.compile] Significantly speed up cold start times (#33641)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-03 09:12:11 -08:00
Lucas Wilkinson
2267cb1cfd [Attention][FA3] Update FA3 to include new swizzle optimization (#23465)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-02-03 08:08:47 -08:00
dtc
0d6ccf68fa [P/D] rework mooncake connector and introduce its bootstrap server (#31034)
Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2026-02-03 08:08:25 -08:00
Cyrus Leung
18e7cbbb15 [Bugfix] Fix startup hang for Granite Speech (#33699)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-03 15:57:56 +00:00
Patrick von Platen
f0d5251715 [Voxtral models] Skip warm-up to skip confusing error message in warm-up (#33576)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-03 07:22:34 -08:00
Shanshan Shen
5c4f2dd6ef [MM] Pass prefix parameter to MMEncoderAttention (#33674)
Signed-off-by: shen-shanshan <467638484@qq.com>
2026-02-03 06:47:41 -08:00
wang.yuqi
f3d8a34671 [Bugfix] Do not add extra \n for image-only cases when constructing multimodal text prompts. (#33647)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-03 06:43:47 -08:00
shaharmor98
4bc913aeec Feat/add nemotron nano v3 tests (#33345) 2026-02-03 08:52:49 -05:00
Kuntai Du
fbb3cf6981 [Bugfix][Async][Connector] avoid vllm-side double free during async scheduling + request abort + async KV cache transfer (#33377)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2026-02-03 21:50:15 +08:00
Krish Gupta
2df2b3499d Document NixlConnector backend selection via kv_connector_extra_config (#33552)
Signed-off-by: KrxGu <krishom70@gmail.com>
2026-02-03 05:49:59 -08:00
Harry Mellor
2a8d84e66d Fix Gemma3n audio encoder for Transformers v5 (#33673)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-03 05:49:49 -08:00
zxy
a3acfa1071 [Models] Intern-S1-Pro (#33636)
Signed-off-by: zxy <zhou0493@e.ntu.edu.sg>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-03 05:49:45 -08:00
Harry Mellor
be8168ff88 Fix Gemma3 GGUF for Transformers v5 (#33683)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-03 12:36:53 +00:00
Harry Mellor
f6af34626d Fix offline test for Transformers v5 (#33682)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-03 12:07:24 +00:00
Song Zhixin
ceab70c89d [Bugfix] fix qwen3-asr response error (#33644)
Signed-off-by: jesse <szxfml@gmail.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-03 03:33:56 -08:00
Cyrus Leung
52683ccbe1 [Misc] Update default image format of encode_base64 (#33656)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-03 03:13:16 -08:00
Michael Goin
e346e2d056 [Bugfix] Disable RoutingMethodType.[Renormalize,RenormalizeNaive] TRTLLM per-tensor FP8 MoE (#33620)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-03 10:37:15 +00:00
Cyrus Leung
83449a5ff0 [Refactor] Clean up pooling serial utils (#33665)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-03 10:29:18 +00:00
Lucas Hänke de Cansino
dad2d6a590 [Bugfix][Model] Fix DeepSeek-OCR-2 chat template to include BOS token (#33642)
Signed-off-by: l4b4r4b4b4 <lucas.cansino@mail.de>
2026-02-03 00:35:58 -08:00
Isotr0py
32e84fa1ff [CI/Build] Investigate torchrun distributed tests hanging issue (#33650)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-03 15:49:17 +08:00
Richard Zou
fd9c83d0e0 [torch.compile] Document the workaround to standalone_compile failing (#33571)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-03 07:16:55 +00:00
杨朱 · Kiki
b95cc5014d [Misc] Remove deprecated VLLM_ALL2ALL_BACKEND environment variable (#33535)
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 15:01:59 +08:00
Nick Hill
61397891ce [Minor] Some code simplification in scheduler.py (#33597)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-03 15:00:00 +08:00
杨朱 · Kiki
ef248ff740 [Misc] Remove deprecated profiler environment variables (#33536)
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 14:58:44 +08:00
Kunshang Ji
e10604480b [XPU][1/N] Deprecate ipex and switch to vllm-xpu-kernels for xpu platform (#33379)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-02 22:46:10 -08:00
Chauncey
bf001da4bf [Bugfix] Interleaved thinking keeps compatibility with reasoning_content (#33635)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: Koushik Dutta <koushd@gmail.com>
2026-02-03 06:46:05 +00:00
杨朱 · Kiki
a0a984ac2e [CI/Build] Remove hardcoded America/Los_Angeles timezone from Dockerfiles (#33553)
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 22:32:39 -08:00
Shengliang Xu
f1cb9b5544 Fix quantized Falcon-H1 model loading issues (#32728)
Signed-off-by: Shengliang Xu <shengliangx@nvidia.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-02 22:31:27 -08:00
Daniel Mescheder
4c4b6f7a97 [Frontend] Add sampling parameters to Responses API (#32609)
Signed-off-by: Daniel Mescheder <dmesch@amazon.com>
Co-authored-by: Daniel Mescheder <dmesch@amazon.com>
2026-02-03 13:51:10 +08:00
Roger Wang
10546f925a [Bugfix] Fix mm budget setting for Qwen Omni models (#33634)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-03 04:56:25 +00:00
Radu Salavat
e69c990c21 [Feature][CPU Backend]: Optimize ARM vectorization backend (#30329)
Signed-off-by: Radu Salavat <radu.salavat@arm.com>
2026-02-02 20:17:56 -08:00
Richard Zou
5eac9a1b34 [torch.compile] Don't do the fast moe cold start optimization if there is speculative decoding (#33624)
Signed-off-by: Richard Zou <zou3519@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-03 03:38:49 +00:00
Nathan Weinberg
1b60b45d0d [CI/Build] add directions for CPU image upload to Docker Hub (#32032)
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
Signed-off-by: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2026-02-03 02:48:06 +00:00
Dezhan
4b3803d180 [BugFix] DPMetadata raises assert error for dense model (#32739)
Co-authored-by: Dezhan Tu <dztu@meta.com>
2026-02-03 00:56:44 +00:00
Patrick von Platen
5019c59dd2 [Voxtral Realtime] Introduce global log mel max (#33574)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-02 17:01:47 -05:00
Lain
089cd4f002 fix cutlass_3x_gemm_fp8_blockwise on sm103a (#32224)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
2026-02-02 11:47:46 -08:00
Vasiliy Kuznetsov
0130223bd9 fix memory for online fp8 quantization with streaming weight load (#31914)
Signed-off-by: vasiliy <vasiliy@fb.com>
2026-02-02 14:17:42 -05:00
Matthew Bonanni
5d1aef3004 [UX] Format attention backend log line (#33570)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-02 18:57:12 +00:00
yugong333
ffe1fc7a28 Reduce the kernel overhead when num of active loras is smaller than max loras. Multiple cuda graphs are captured for each num of active-loras. (#32005)
Signed-off-by: Yu Gong <yu3.gong@gmail.com>
2026-02-02 12:30:06 -05:00
Harry Mellor
8b7346d5f1 Update huggingface-hub again (#33567)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-02 09:20:54 -08:00
Harry Mellor
6141ebe0dd Remove incorrect tokenizer info test (#33565)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-02 17:11:44 +00:00
Yang Liu
199e3cb476 [Model] Use mm_position to compute mrope positions for GLM-4.xV (#33039)
Signed-off-by: Yang <lymailforjob@gmail.com>
2026-02-02 16:55:48 +00:00
Matthew Bonanni
9f8cb81b44 [CI] Add DeepSeek V3.2 nightly eval (#33566)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-02 16:10:02 +00:00
Cyrus Leung
d7e17aaacd [Refactor] Move profiling methods to MM budget (#33559)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-02 23:27:00 +08:00
Kebe
528e9b1490 [Feature][Core] Support Fabric detection to adapt the MNNVL protocol for the GB series (#33540)
Signed-off-by: Kebe <mail@kebe7jun.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Thomas Vegas <tvegas@nvidia.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2026-02-02 22:55:46 +08:00
shanjiaz
d95b4be47a move spec decode slow test to test_areas.yaml (#33365)
Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
2026-02-02 06:28:36 -08:00
Isotr0py
4061dcf4c5 [Bugfix] Enable Kimi k25 processor test (#33562)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-02 14:25:25 +00:00
danielafrimi
0aca8b8c62 [MoE] Enable Shared/Routed Overlap For Latent MoE (Nemotron-H) (#32790)
Signed-off-by: dafrimi <dafrimi@nvidia.com>
2026-02-02 09:18:50 -05:00
Rabi Mishra
9eb58f8cf1 fix[ROCm]: Remove unconditional aiter import (#32902)
Signed-off-by: rabi <ramishra@redhat.com>
2026-02-02 22:10:02 +08:00
Cyrus Leung
b10d05b8a8 [Model] Use explicit types in get_generation_prompt (#33551)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-02 12:38:49 +00:00
Borushiki
b398e5c819 Update get_expert_mapping to include self parameter (#33525)
Signed-off-by: Borushiki <38628261+Otsutsukii@users.noreply.github.com>
2026-02-02 20:29:07 +08:00
Grzegorz K. Karch
78061ef584 Fix accessing hidden_act from model config (#32686)
Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
2026-02-02 11:11:33 +00:00
Nicolò Lucchesi
528b3076af [CI][Bugfix] Fix flaky tests/v1/kv_connector/unit/test_multi_connector.py::test_multi_example_connector_consistency (#33555)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-02 03:01:29 -08:00
Cyrus Leung
a502831d36 [Chore] Remove redundant input parsing methods (#33542)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-02 10:50:47 +00:00
Komal Kumar Teru
ba871fb788 [Misc] support arbitrary MM datasets in spec dec bench (#33486)
Signed-off-by: kkt-cohere <komal@cohere.com>
Signed-off-by: Komal Kumar Teru <162363718+kkt-cohere@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-02 08:49:48 +00:00
R3hankhan
ab374786c7 [CPU][IBM Z][Dockerfile] Fix IBM Z builds (#33243)
Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com>
2026-02-01 23:41:29 -08:00
RED
808dd87b30 [Model] Support DeepSeek-OCR-2 (#33165)
Signed-off-by: liuli <ll407707@alibaba-inc.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: liuli <ll407707@alibaba-inc.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-02 06:24:10 +00:00
Andy Lo
beb8899482 Fix mistral sliding window parsing (#33521)
Signed-off-by: Andy Lo <andy@mistral.ai>
2026-02-02 05:08:04 +00:00
Sawyer Bowerman
ce88756b96 [Doc]: update paths for Offline/Online/Others example sections (#33494)
Signed-off-by: Sawyer Bowerman <sbowerma@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-02 03:56:53 +00:00
Paco Xu
a3154a6092 [Doc] add missing model entries in supported_models.md (#33220)
Signed-off-by: Paco Xu <paco.xu@daocloud.io>
2026-02-02 03:37:25 +00:00
jack
7c036432fc [Bugfix] GLM-4 tool parser: incremental string streaming (#33218)
Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
Co-authored-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
2026-02-02 11:13:31 +08:00
Robert Shaw
318b120766 [Nightly CI] Remove CT Model (#33530)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-02-01 19:09:09 -08:00
csy0225
c3b40dc3e7 [Models] Step-3.5-Flash (#33523)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: i-zhangmingming <i-zhangmingming@stepfun.com>
Co-authored-by: xiewuxun <xiewuxun@stepfun.com>
Co-authored-by: zetaohong <i-hongzetao@stepfun.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-02-02 10:21:18 +08:00
Yifan Qiao
a01ef3fa51 [Fix] prefix cache hit rate == 0 bug with gpt-oss style models (#33524)
Signed-off-by: Yifan Qiao <yifanqiao@berkeley.edu>
2026-02-02 01:59:58 +00:00
Runkai Tao
7320ca3942 Add unpermute-aware fused MoE LoRA path (#32655)
Signed-off-by: Runkai Tao <rt572@physics.rutgers.edu>
2026-02-02 09:46:09 +08:00
Nick Hill
cf0a99f84d [ModelRunner V2] Support spec decode with structured outputs (#33374)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-02 00:19:59 +00:00
Nick Hill
e535d90deb [ModelRunner V2] Misc minor simplifications and optimizations (#33467)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-01 22:17:14 +00:00
Komal Kumar Teru
0b225fb7b2 [Misc] skip target model mm emb in draft proposal step when draft is text-only (#33437)
Signed-off-by: kkt-cohere <komal@cohere.com>
2026-02-01 21:13:35 +00:00
will b.
46b4a02794 Fix DeepSeek V2 RoPE initialization error (#33501)
Signed-off-by: Eduardo Salinas <edus@microsoft.com>
Signed-off-by: catswe <212922539+catswe@users.noreply.github.com>
Co-authored-by: Eduardo Salinas <edus@microsoft.com>
2026-02-01 21:00:56 +00:00
shaharmor98
8869cd8ec1 Add MoE config for Super B200 TP2 (#33510) 2026-02-01 18:48:37 +00:00
JartX
cd86fff38f [BUGFIX] Fix hipErrorIllegalState in Qwen3-Omni during startup profiling allow inference Omni on ROCM (#33077)
Signed-off-by: JartX <sagformas@epdcenter.es>
2026-02-01 13:36:25 +00:00
Maral
b5f8c3092d [W8A8 Block Linear Refactor][1/N] Keep all quantization types into QuantFP8 class. (#33047)
Signed-off-by: maral <maralbahari.98@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-01 09:28:01 +00:00
Cyrus Leung
21997f45b1 [Redo] #33110 with threading limit (#33502)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: YunzhuLu <lucia.yunzhu@gmail.com>
2026-02-01 09:18:11 +00:00
Luka Govedič
672023877b Change defaults for vllm bench startup (#33489) 2026-01-31 23:46:01 -08:00
Zack Yu
754a8ca942 fix: only include Authorization header when OPENAI_API_KEY is set (#33488)
Signed-off-by: zack041 <zackyu041@gmail.com>
2026-01-31 23:35:09 -08:00
Eduardo Salinas
302ecf64ff [Models]: lfm2_siglip2 return intermediate encoder layers (#33370)
Signed-off-by: Eduardo Salinas <edus@microsoft.com>
2026-02-01 06:17:49 +00:00
Cyrus Leung
b6bb2842cf [Critical] Revert #33110 (#33500) 2026-01-31 21:06:42 -08:00
Cyrus Leung
79b6ec6aab [Bugfix] Fix inconsistent handling of cache reset (#33481)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-31 20:23:41 -08:00
Greg Pereira
d6416fdde9 pin LMCache to v0.3.9 or greater with vLLM v0.15.0 (#33440)
Signed-off-by: greg pereira <grpereir@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-01-31 20:50:38 -07:00
Andreas Karatzas
0fb3157267 [ROCm][CI] Update huggingface-hub pin (#33492)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-01 02:51:54 +00:00
Cyrus Leung
a358e4dffe [Refactor] Make Renderer an abstract class (#33479)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-01 10:36:30 +08:00
René Honig
079781177a fix: Add SM120 (RTX Blackwell) support for FlashInfer CUTLASS NVFP4 MoE kernels (#33417)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2026-01-31 14:06:42 -08:00
Roy Wang
63c0889416 [Misc] Fix flashinfer related tests (#33462)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
2026-01-31 16:10:24 -05:00
smashyalts
1e86c802d4 Fix grammar (#33121)
Signed-off-by: smashyalts <smashyalts@gmail.com>
2026-01-31 09:59:34 -08:00
linhaifeng
fedf64332e [Bugfix]: Fix display errors in TORCH_CHECK messages (#32942)
Signed-off-by: linhaifeng <1371675203@qq.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-01-31 09:48:48 -08:00
Xiao Yang
2238a12c13 [Misc] support collect_env for endpoint /server_info (#33246)
Signed-off-by: yang.xiao <yang.xiao@daocloud.io>
2026-02-01 01:42:59 +08:00
Harry Mellor
ce0afe2451 Update huggingface-hub pin for the last time before Transformers v5 (#33473)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-31 09:14:24 -08:00
Cyrus Leung
88c3e114d8 [Refactor] Move MM data parsing outside processor (#33408)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-31 16:46:14 +00:00
Cyrus Leung
92924b2ddd [Deprecation] Remove deprecated items related to pooling (#33477)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-31 08:44:40 -08:00
YunzhuLu
27cb2f678f [Bugfix] Early-reject requests with MM data longer than encode cache capacity (#33110)
Signed-off-by: YunzhuLu <lucia.yunzhu@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-01-31 08:41:13 -08:00
jma99_2333
22d9a056d5 Support clear mm and encoder cache (#33452)
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-01-31 15:22:25 +00:00
ℍ𝕠𝕝𝕝𝕠𝕨 𝕄𝕒𝕟
13b842f271 [BugFix][Router Replay] Capture Logical Experts with EPLB (#33013)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
2026-01-31 10:12:17 -05:00
Luka Govedič
15f40b20aa [fix][torch.compile] Fix cold-start compilation time increase by adding kv cache update to splitting ops (#33441)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Richard Zou <zou3519@gmail.com>
2026-01-31 06:48:34 -08:00
Cyrus Leung
793af538a3 [Doc] Update plugin deprecation notices (#33476)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-31 22:48:28 +08:00
cmunley1
6f5e7cda57 support return prompt token ids in responses (#33378) 2026-01-31 06:04:20 -08:00
Roy Wang
68feb76a6f [Misc] Replace deprecated interface seed_everything (#33474)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
2026-01-31 05:38:39 -08:00
Cyrus Leung
4cb59dea6a [Bugfix] Fix incompatibility between #33372 and #32863 (#33475)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-31 05:21:32 -08:00
Angela Yi
608b556507 [ez] Add structured torch.compile logs (#33213)
Signed-off-by: angelayi <yiangela7@gmail.com>
2026-01-31 21:00:54 +08:00
Cyrus Leung
f0a1c8453a [Frontend] Use new Renderer for Completions and Tokenize API (#32863)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-31 04:51:15 -08:00
caozuoba
8980001c93 [perf] v1/spec_decode: skip softmax for all-greedy rejection sampling (#32852)
Signed-off-by: hdj <1293066020@qq.com>
2026-01-31 09:51:26 +00:00
jennyyyyzhen
527bcd14d4 [ROCM] Enable aiter attn backend for qwen3-next model (#32492)
Signed-off-by: jennyyyyzhen <yzhen@hmc.edu>
2026-01-31 17:03:57 +08:00
Jinwu
f68e3ea4e1 [BugFix] Add synchronize in CutlassW4A8LinearKernel to ensure data is ready for use. (#33078)
Co-authored-by: jinwuguo <jinwuguo@tencent.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-01-31 08:14:54 +00:00
Yanan Cao
d5c41db35b [Kernel] [Helion] [3/N] Helion kernel registry (#33203)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
2026-01-31 15:38:46 +08:00
Fadi Arafeh
1618e25492 [CPU][Feat] Enable KleidiAI accelerated int4 dynamic quant with BF16 activations on Arm CPUs (#33122)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2026-01-31 07:16:22 +00:00
AutumnAurelium
f3888aca83 Add EAGLE3 support for AFMoE (#33111)
Signed-off-by: AutumnAurelium <88015631+AutumnAurelium@users.noreply.github.com>
2026-01-31 06:53:08 +00:00
Dimitrios Bariamis
f0bca83ee4 Add support for Mistral Large 3 inference with Flashinfer MoE (#33174)
Signed-off-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-01-30 22:48:27 -08:00
Matthias Gehre
73419abfae [Bugfix] Handle Asym W4A16 (ConchLinearKernel) for CT (#33200)
Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-01-31 06:21:51 +00:00
Nicolò Lucchesi
e77f162cf5 [Bugfix] Fix Qwen3ASR language asr tag in output (#33410)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-01-31 05:24:49 +00:00
Yanan Cao
8ecd213c0b [Kernel] [Helion] [2/N] Helion kernel wrapper (#32964)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
2026-01-31 12:53:01 +08:00
Francesco Fusco
5b55c0bea7 [Attention] Clarify comment explaining attn_logits +1 dimension (#33427)
Signed-off-by: Francesco Fusco <ffu@zurich.ibm.com>
2026-01-31 04:50:30 +00:00
Patrick von Platen
15e0bb9c42 [Streaming -> Realtime] Rename all voxtral related classes, fn, files (#33415)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2026-01-31 04:49:00 +00:00
Micah Williamson
6c64c41b4a [ROCm][CI] Force max_num_seqs=1 on ROCm In test_sharded_state_loader to reduce flakiness (#33277)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-01-31 12:28:29 +08:00
Russell Bryant
a2ef06e1b3 [Misc] offest -> offset in comments and variable names (#33444)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2026-01-30 20:19:22 -08:00
Lucas Wilkinson
0a3c71e7e5 [BugFix] Fix whisper FA2 + full cudagraphs (#33360)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-01-31 12:15:06 +08:00
Michael Goin
29fba76781 [UX] Use gguf repo_id:quant_type syntax for examples and docs (#33371)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-01-31 12:14:54 +08:00
Isotr0py
9df152bbf6 [Misc] Algin Qwen3-VL-embedding image example outputs with HF repo example (#33419)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-01-30 19:36:56 -08:00
Nick Hill
876a16f4fb [ModelRunner V2] Fix spec decoding + logprobs (#33391)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-01-31 03:33:26 +00:00
Matthew Bonanni
aaa901ad55 [Attention] Move MLA forward from backend to layer (#33284)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-01-30 19:30:00 -08:00
Wentao Ye
010ec0c30e [Deprecation] Deprecate seed_everything and scatter_mm_placeholders in v0.15 (#33362)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-01-31 02:54:16 +00:00
Alberto Ferrer
64a40a7ab4 [Bugfix] Fix typo in read_offset variable name (#33426)
Signed-off-by: Alberto Ferrer <albertof@barrahome.org>
2026-01-31 01:26:15 +00:00
Gregory Shtrasberg
31aedfe7d6 [Bugfix][ROCm] Fixing the skinny gemm dispatch logic from #32831 (#33366)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-01-30 19:05:23 -06:00
Michael Goin
67ebaff528 Refactor NVFP4 Linear utils for ModelOpt and CT (#33201)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-01-30 16:37:42 -08:00
Chendi.Xue
2b465570e6 [CI][HPU]accelerate hpu test by skip python re-install and clean container name (#33286)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2026-01-30 21:36:29 +00:00
Huy Do
9ca66ecc10 Indicate compile mode in the benchmark results (#32990)
Signed-off-by: Huy Do <huydhn@gmail.com>
2026-01-30 15:34:36 -05:00
Pavani Majety
c3a9752b0c [Hardware][SM100] Add TRTLLM Kernel for INT4 W4A16 Kernel. (#32437)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2026-01-30 10:30:46 -08:00
xuebwang-amd
f451b4558b [Quantization][ROCm] Fix MoE weight loading to be robust (Qwen3_MoE/Qwen3_next as example models) (#33173)
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
2026-01-30 17:50:23 +00:00
Vasiliy Kuznetsov
3f96fcf646 fix QERL attention import path (#33432)
Signed-off-by: vasiliy <vasiliy@fb.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-01-30 09:29:09 -08:00
Yanan Cao
6c1f9e4c18 [Kernel] [Helion] [1/N] Add Helion ConfigManager (#32740)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
2026-01-30 12:19:19 -05:00
Harry Mellor
67239c4c42 Fix encoder-decoder model disabling mm processor cache (#33236)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-30 16:30:10 +00:00
Nicolò Lucchesi
8ece60768f [CI] Qwen3-ASR transcriptios tests (#33414)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-01-30 16:17:56 +00:00
Michael Goin
fd0e377244 Support FP8 block quant for CompressedTensorsW8A16Fp8 (#33280)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-01-30 11:15:20 -05:00
Kyle Sayers
f857a03f6b [QeRL] Layerwise Reloading (#32133)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2026-01-30 08:50:05 -07:00
Danielle Robinson
74898a7015 [BugFix][LoRA] TritonExperts is ModularMoEPath for FP8 models (#33393)
Signed-off-by: Danielle Robinson <dmmaddix@amazon.com>
Co-authored-by: Danielle Robinson <dmmaddix@amazon.com>
2026-01-30 15:27:42 +00:00
Frank Wang
8f5d51203b Disable Cascade Attention for Batch Invariance (#32561)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-01-30 10:00:46 -05:00
Julien Denize
ae5b7aff2b Improve Mistral format checks. (#33253)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
Signed-off-by: juliendenize <julien.denize@mistral.ai>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-01-30 06:23:33 -08:00
Harry Mellor
a11bc12d53 Fix test_moe.py for Transformers v5 (#33413)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-30 14:03:25 +00:00
Nathan Weinberg
58cb55e4de [Doc] Enhance documentation around CPU container images (#32286)
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
2026-01-30 13:36:20 +00:00
杨朱 · Kiki
cf896ae0e3 [Misc] Clean up HIDDEN_DEPRECATED_METRICS after metric removal (#33323)
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 13:31:17 +00:00
Harry Mellor
c5113f60f2 Remove deprecated reasoning_content message field (#33402)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-30 11:48:15 +00:00
vllmellm
174f16700b [Doc] [ROCm] Update Documentation to reflect v0.15.0 release (#33388)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2026-01-30 19:06:08 +08:00
Julien Denize
8e2ad97ad0 [BUGFIX] Pixtral cannot be loaded with --limit-mm-per-prompt 0 (#33406)
Signed-off-by: juliendenize <julien.denize@mistral.ai>
2026-01-30 02:52:02 -08:00
Patrick von Platen
10152d2194 [Realtime API] Adds minimal realtime API based on websockets (#33187)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-01-30 18:41:29 +08:00
杨朱 · Kiki
1a7894dbdf [Misc] Replace Optional[X] with X | None syntax (#33332)
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 01:56:59 -08:00
Cyrus Leung
c87eac18f7 [Refactor] Move MM item count validation outside of processor (#33396)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-30 09:27:31 +00:00
tianshu-Michael-yu
f45870b53f fix: allow LFM2 MoE prefix caching (align) (#33376)
Signed-off-by: Tianshu Yu <tianshuyu.formal@gmail.com>
2026-01-30 08:23:14 +00:00
hujiaxin0
ba45bedfd1 [model] Add support for openPangu7B-VL (#32449)
Signed-off-by: hujiaxin <524446785@qq.com>
Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Co-authored-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
2026-01-30 15:54:27 +08:00
Harry Mellor
9432ed8c7e Explicitly set return_dict for apply_chat_template (#33372)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-30 07:27:04 +00:00
Lucas Kabela
726d89720c [CI] Enable mypy import following for vllm/spec_decode (#33282)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
2026-01-30 06:43:32 +00:00
Harry Mellor
d334dd26c4 Move decode context parallel validationn to ParallelConfig (#33239)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-30 06:18:41 +00:00
Ryan Rock
070c811d6f [CI][AMD] Skip 4 GPUs testgroup ray tests (#33305)
Signed-off-by: Ryan Rock <ryan.rock@amd.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2026-01-29 21:39:53 -08:00
Isotr0py
8bfc8d5600 [Models] Refactor Kimi-K2.5 weight loading (#33346)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-01-30 05:31:20 +00:00
Harry Huang
ec51831a22 [BugFix] Disable async scheduling for Mamba prefix caching (#33352)
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
2026-01-30 04:40:19 +00:00
Harry Mellor
80b918f2bd Fix tie_word_embeddings for multimodal models in Transformers v5 (#33359)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-30 03:37:39 +00:00
Wang Haoyu
c46b0cd0af [Model][Multimodal] Add explicit MusicFlamingo adapter (#32696)
Signed-off-by: WangHaoyuuu <mailwhaoyu@gmail.com>
2026-01-30 11:01:29 +08:00
Aidan Reilly
133765760b [Docs] Adding links and intro to Speculators and LLM Compressor (#32849)
Signed-off-by: Aidan Reilly <aireilly@redhat.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-29 14:12:35 -08:00
Michael Goin
bfb9bdaf3f [Bugfix] Enable Triton MoE for FP8 per-tensor dynamic (#33300)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-01-29 12:15:17 -08:00
Kevin H. Luu
2284461d02 [release] Minor fixes to release annotation and wheel upload (#33129)
Signed-off-by: khluu <khluu000@gmail.com>
2026-01-29 12:09:35 -08:00
danisereb
8e2a469b3b Add Triton fused MoE config for B200 (Nemotron Nano) (#32804) 2026-01-29 19:21:33 +00:00
CarstyYou
23591e631e [Bugfix][Kernel] Fix negative memory offset in GDN Triton kernel (#33326)
Signed-off-by: CarstyYou <186021327+CarstyYou@users.noreply.github.com>
2026-01-29 10:40:11 -08:00
Linda
0493d897c4 [NVIDIA] [feat] Integrate flashinfer Trtllmgen bf16 moe (#32954)
Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
2026-01-29 10:00:13 -08:00
Chendi.Xue
8c8ebeb941 [BUGFIX][XPU] fix memory check after XPU reuse GPU_worker (#33358)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2026-01-29 09:56:30 -08:00
Cyrus Leung
831453fcef [Chore] Move MediaConnector to vllm.multimodal.media (#33324)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-29 16:54:31 +00:00
Angela Yi
5a66c9cc76 [ez] Delete torch25_custom_graph_pass (#33287)
Signed-off-by: angelayi <yiangela7@gmail.com>
2026-01-29 16:47:05 +00:00
Isotr0py
5e73e4900c [Bugfix] Fix broken GLM-OCR initialization (#33350)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-01-29 07:56:05 -08:00
Cyrus Leung
c6e7404cc5 [Multimodal] Simplify MM input definitions (#33331)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-29 13:32:04 +00:00
sthWrong
17b17c0684 [Backport] [Kimi-K2.5] Replace torch.cuda with current_platform for d… (#33320) 2026-01-29 12:29:17 +00:00
Kunshang Ji
8bb6271c77 [Intel GPU] refine xpu worker (#32894)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-01-29 12:26:52 +00:00
Roger Wang
8b3f0a99dd [Models] Qwen3-ASR (#33312)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-01-29 19:27:15 +08:00
Li, Jiang
8311f083bd [Bugfix][CPU] Fix thread num for shared memory communication (#33317)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Li, Jiang <bigpyj64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-29 03:26:58 -08:00
Patrick von Platen
40c35038d2 [Voxtral] Streaming example (#33042)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-01-29 03:22:49 -08:00
zofia
a5aa4d5c0f [Quantization][Refactor] use platform dict to choose kernel (#33130)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
Signed-off-by: zofia <110436990+zufangzhu@users.noreply.github.com>
2026-01-29 10:44:58 +00:00
andrii.pasternak
615e8033e5 [Bug Fix] Handle variable-length tensors in MultiModalFlatField batching (#31751)
Signed-off-by: Andrii Pasternak <andriipasternak31@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 10:42:59 +00:00
Ilya Markov
d09135fbd0 [BugFix] Async Eplb fix potential race condition (#32881)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
2026-01-29 10:31:40 +00:00
daniel-salib
8688c3d460 [fix] tesdt mcp_tool_calling_streaming with a more complex math question (#32769)
Signed-off-by: Daniel Salib <danielsalib@meta.com>
2026-01-29 10:25:58 +00:00
Isotr0py
5400014d55 [Chore] Remove use_data_parallel kwargs from ViT implementation (#33310)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-01-29 10:20:52 +00:00
Isotr0py
3a92c6f3b5 [Misc] Cleanup Kimi-K2.5's vision chunk modality entrypoints (#33157)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-01-29 09:46:02 +00:00
amirkl94
e01ff5c070 Bugfix: Pass router logits dtype in nemotron shared experts (#32669)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
2026-01-29 09:36:34 +00:00
Harry Mellor
fb946a7f89 Make mypy opt-out instead of opt-in (#33205)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-29 09:12:26 +00:00
Lucas Wilkinson
a650ad1588 [Misc] Remove missed pad_for_cudagraph (#33283)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-01-29 09:12:05 +00:00
graftim
d697581a7c [Doc] Update outdated link to Ray documentation (#32660)
Signed-off-by: graftim <38649219+graftim@users.noreply.github.com>
2026-01-29 00:56:06 -08:00
shanjiaz
5eeba80c74 Adding optional speculator tests for larger models (#32943)
Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
2026-01-29 16:54:02 +08:00
whx
08b1195e62 [PluggableLayer][2/N] Apply PluggableLayer to linear layers (#33152)
Signed-off-by: whx-sjtu <2952154980@qq.com>
2026-01-29 16:53:15 +08:00
cmunley1
3bba2edb0f support returning tokenids in responses api (#33212)
Signed-off-by: Christian Munley <cmunley@nvidia.com>
2026-01-29 16:52:39 +08:00
Ilya Markov
53fc166402 [BugFix] Fix EPLB fail for MoeFP4 model with Marlin backend (#33262)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
2026-01-29 16:52:11 +08:00
Didier Durand
31b25f6516 [Doc]: fixing multiple typos in diverse files (#33256)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Signed-off-by: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-29 16:52:03 +08:00
wang.yuqi
abb34ac43a [Bugfix] Fix Qwen3-VL-Reranker load. (#33298)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-29 08:42:53 +00:00
Pengchao Wang
2515bbd027 [CI/Build][BugFix] fix cuda/compat loading order issue in docker build (#33116)
Signed-off-by: Pengchao Wang <wpc@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2026-01-29 00:19:05 -08:00
TJian
c487a8eef4 [Release] [ROCm] Remove old build step (#33316)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-01-28 23:35:51 -08:00
Kiersten Stokes
9e138cb01d [Misc][Build] Lazy load cv2 in nemotron_parse.py (#33189)
Signed-off-by: kiersten-stokes <kierstenstokes@gmail.com>
2026-01-29 06:55:50 +00:00
TJian
f9d03599ef [Release] [CI] Optim release pipeline (#33156)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-01-28 22:45:42 -08:00
wangln19
39037d258e Fix tool call indexing double-counting (#33141)
Signed-off-by: wanglinian <wanglinian@stu.pku.edu.cn>
2026-01-29 05:57:09 +00:00
Cyrus Leung
51550179fc [Refactor] Define MM data parser in processing info instead of processor itself (#33260)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-29 13:55:17 +08:00
Angela Yi
07ea184f00 [ez] Delete more torch version checks <= 2.8 (#33288)
Signed-off-by: angelayi <yiangela7@gmail.com>
2026-01-29 05:28:46 +00:00
Or Ozeri
a663b218ae [Misc] Add orozery to CODEOWNERS (core, kv_transfer, kv_offload) (#33227)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2026-01-29 04:24:20 +00:00
Michael Goin
1bd47d6e5a [Bugfix] Register fp8 cutlass_group_gemm as supported for only SM90+SM100 (#33285)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-01-28 18:40:59 -08:00
Michael Goin
141cd43967 [UX] Remove noisy CT UnquantizedLinearMethod warn (#33273)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-01-28 16:09:30 -08:00
Nick Hill
6bf3b46d78 [ModelRunner V2] Misc code simplification and cleanup (#33266)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-01-28 14:41:23 -08:00
Matthew Bonanni
77c4f45c6c [7/N][Attention][Docs] Add documentation for attention backends (#32477)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-01-28 17:20:22 -05:00
Michael Goin
ca1969186d [UX] Enable nested configs in config yaml files (#33193) 2026-01-28 16:54:25 -05:00
Gregory Shtrasberg
ab597c869a [Bugfix] Add missing encoder only guard for do_kv_cache_update (#33269)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-01-28 21:25:07 +00:00
Angela Yi
4197168ea5 [ez] Remove checks for torch version <= 2.8 (#33209)
Signed-off-by: angelayi <yiangela7@gmail.com>
2026-01-28 16:03:56 -05:00
Rohan Potdar
59bcc5b6f2 Use aiter triton fused_add_rmsnorm_pad for gpt-oss (#30976)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-01-28 20:47:47 +00:00
Wentao Ye
3e440786af [Feature] Fully support for async scheduling + PP, 30.8% E2E throughput improvement, 31.8% TPOT improvement (#32618)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-01-28 20:30:32 +00:00
Kevin H. Luu
8bdd3979d8 [CI] Change GPU key to device key for B200 test (#33275)
Signed-off-by: khluu <khluu000@gmail.com>
2026-01-28 19:14:29 +00:00
Wentao Ye
c4e744dbd4 [Perf] Optimize moe_permute for CUTLASS FP8 (#32892)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-01-28 10:15:24 -08:00
Nicolò Lucchesi
8ebf372e9d [CI] Whisper tests enforce_eager=False (#33098)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-01-28 09:36:56 -08:00
cwazai
f210f0b7b1 [lora/moe] Avoid extra intermediate buffer & Python slicing in expand phase when split_k == 1 (#32774)
Signed-off-by: 陈建华 <1647430658@qq.com>
2026-01-29 00:22:45 +08:00
Bin Bao
392c5af4fe [Benchmark] Add startup benchmarking to buildkite run (#33183)
Signed-off-by: Bin Bao <binbao@meta.com>
2026-01-28 16:03:07 +00:00
Robert Shaw
af9b69f977 [Quantization][Deprecation] Remove Marlin 24 (#32688)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-28 15:54:59 +00:00
Chauncey
8e5e40daf4 [Misc] Provide a DeepSeek ReasoningParser with thinking enabled by default (#33221)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-01-28 21:16:53 +08:00
Or Ozeri
2e8de86777 Revert "Enable Cross layers KV cache layout at NIXL Connector (#30207)" (#33241)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
2026-01-28 04:36:00 -08:00
Robert Shaw
247d1a32ea [Quantization][Deprecation] Remove BitBlas (#32683)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-01-28 11:06:22 +00:00
Kevin H. Luu
ecb4f82209 [CI] Update job dependency syntax for Intel and AMD jobs (#33240)
Signed-off-by: khluu <khluu000@gmail.com>
2026-01-28 01:33:59 -08:00
Kevin H. Luu
5914090765 [CI] Update job dependency for hardware and CPU jobs (#33237)
Signed-off-by: khluu <khluu000@gmail.com>
2026-01-28 01:10:05 -08:00
Harry Mellor
f1acbd68c5 [CI] Enable mypy import following for vllm/compilation (#33199)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-28 08:59:54 +00:00
Yan Ma
9581185d51 [XPU]disable test_acceptance_length UT (#33226) 2026-01-28 15:24:13 +08:00
Maryam Tahhan
2dd359f953 [Docs] Simplify CPU x86 Docker build documentation (#33071)
Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
2026-01-28 06:37:09 +00:00
Gregory Shtrasberg
22ad649501 [ROCm] Enabling forward_includes_kv_cache on ROCm MHA backends (#33106)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-01-28 14:36:14 +08:00
ramos
36d450e3b8 Adds FunAudioChat multimodal audio model support (#2) (#33058)
Signed-off-by: ramos <49182011+nemoramo@users.noreply.github.com>
Signed-off-by: mayufeng <mayufeng@example.com>
Co-authored-by: mayufeng <mayufeng@example.com>
2026-01-28 05:18:09 +00:00
22quinn
a2b877df6c [Bugfix] Lazy import NgramProposer in GPU model runner (#32821)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2026-01-27 21:07:16 -08:00
Harry Mellor
35fb0b8613 Don't use min_pixels/max_pixels from Qwen2VL's processor (#33208)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-28 05:02:08 +00:00
Harry Mellor
2eb673a088 Add flake8-implicit-str-concat rules to Ruff (#33191)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-28 04:56:10 +00:00
Jeffrey Wang
a97b5e206d Relax protobuf library version constraints (#33202)
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
2026-01-28 04:15:53 +00:00
Micah Williamson
911b51b69f [ROCm][CI] Add TORCH_NCCL_BLOCKING_WAIT For Distributed Tests (A100) (#32891)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-01-28 11:32:31 +08:00
Xinan Miao
604e3b87e8 [Feature]: Container image WORKDIR consistency (#33159)
Signed-off-by: SouthWest7 <am1ao@qq.com>
Co-authored-by: SouthWest7 <am1ao@qq.com>
2026-01-28 11:06:48 +08:00
Harry Mellor
706f123b23 [Docs] Use definition lists for CLI reference docs (#33186)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Ashwin Phadke <23502062+ashwin-phadke@users.noreply.github.com>
2026-01-28 02:22:48 +00:00
Angela Yi
fb7abfc1d0 [docs] Improve tlparse section (#33211)
Signed-off-by: angelayi <yiangela7@gmail.com>
2026-01-28 02:07:37 +00:00
Kevin H. Luu
5d3d6e44e8 [CI] minor fixes to pipeline generator and tests (#33151)
Signed-off-by: khluu <khluu000@gmail.com>
2026-01-27 17:04:02 -08:00
Woosuk Kwon
46ec6d71c7 [Model Runner V2] Use a different stream for grammar bitmask h2d copy (#33059)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Nick Hill <nhill@redhat.com>
2026-01-27 16:37:43 -08:00
Matthew Bonanni
e82fa448c4 Add attention benchmarking tools (#26835)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
2026-01-28 00:09:20 +00:00
Richard Zou
d9aa39a3bb [torch.compile] Speed up MOE handling in forward_context (#33184)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-01-27 15:17:54 -08:00
Wentao Ye
3a6d5cbefd [Perf] Optimize dcp allocate tensor (#33102)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-01-27 17:24:41 -05:00
linhaifeng
f5d7049cc1 [Bugfix] Fix display error (inconsistent with context) (#33020)
Signed-off-by: linhaifeng <1371675203@qq.com>
2026-01-27 20:33:29 +00:00
Alexei-V-Ivanov-AMD
3c3c547ce0 Enabling "2 node" distributed tests in the AMD CI pipeline. (#32719)
Signed-off-by: DCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com>
Co-authored-by: DCCS-4560 <alivanov@chi-mi325x-pod1-112.ord.vultr.cpe.ice.amd.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2026-01-27 19:13:21 +00:00
Matthew Bonanni
1cbccb6dba [Attention] Use has_flashinfer helper (#33177)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-01-27 18:33:17 +00:00
Iris
bd92089d33 feature: support eagle3 for HunyuanVL & Hunyuan (#33035)
Signed-off-by: irisliu10 <601012173@qq.com>
Signed-off-by: Iris <38269816+irisliu10@users.noreply.github.com>
2026-01-27 17:55:48 +00:00
Karan Bansal
a6760f1525 [Doc] Improve serve parameter documentation with meaningful defaults (#33082)
Signed-off-by: Karan Bansal <karanb192@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-27 09:19:37 -08:00
IriKa
66e601ef79 Support compress-tensors with nvfp4 or fp8 weights and modelopt with nvfp4 weights on Turing (#33076)
Signed-off-by: IriKa Qiu <qiujie.jq@gmail.com>
2026-01-27 11:04:05 -05:00
Nick Hill
0cd259b2d8 [BugFix] Fix P/D with non-MoE DP (#33037)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-01-27 08:03:47 -08:00
danielafrimi
83fb2d09e8 Support heterogeneous NemotronHPuzzle model (#32549)
Signed-off-by: <dafrimi@nvidia.com>
Signed-off-by: Daniel Afrimi <dafrimi@nvidia.com>
Signed-off-by: root <dafrimi@nvidia.com>
2026-01-27 10:55:54 -05:00
danisereb
f3a5ee705f [LoRA][Spec Decode] Support LoRA for Nemotron-H MTP models (#32265)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-01-27 07:53:26 -08:00
wang.yuqi
7cbbca9aaa [Frontend] Cleanup api server (#33158)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
2026-01-27 15:18:10 +00:00
omkhalil
5ec44056f7 [Metrics][MFU] Fix UnembedMetrics FLOP overcounting for prefill (#33045) (#33045)
Fix UnembedMetrics to correctly count FLOPs for the unembedding (LM head) layer.

The bug: UnembedMetrics used total_num_tokens() which counts all tokens in the
batch for projection flops, vocab projections are run on just the last token for the
autoregressive use case.

Co-authored-by: Omar Mohamed Khalil <omarkhalil@meta.com>
2026-01-27 15:16:49 +00:00
Nicolò Lucchesi
492a7983dd [Bugfix] Fix DeepseekV32 AssertionError: num_kv_heads == 1 (#33090)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-01-27 15:03:20 +00:00
Matthew Bonanni
a608b4c6c2 [5/N][Attention] Finish eliminating vllm/attention folder (#32064)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-01-27 10:02:51 -05:00
Nicolò Lucchesi
1f3a2c2944 [Bugfix] Disable CG for Whisper+FA2 (#33164)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-01-27 21:46:51 +08:00
omerpaz95
7227d06156 [Metrics] [KVConnector] Add Offloading Connector metrics (#27942)
Added queries and hits metrics for the Offloading Connector.

Also added timing metrics for store and load operations, which take the
average time it takes to load/store, per-token.

The metrics are available from Prometheus and from the StatLogger.

Signed-off-by: omerpaz95 <omerpaz95@gmail.com>
Co-authored-by: Omer Paz <Omer.Paz@ibm.com>
2026-01-27 13:34:49 +00:00
Harry Mellor
14385c80fc Fix weight mapping test for Transfomers v5 (#33162)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-27 12:30:14 +00:00
wang.yuqi
76139d0801 [Frontend] Frontend will only attach supported tasks corresponding entrypoints. (#33139)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-01-27 12:15:43 +00:00
Lifan Shen
da8d0c441a [AMD][QWEN3-NEXT] FP8 Tunings (#32042)
Signed-off-by: Lifan Shen <lifans@meta.com>
2026-01-27 09:34:13 +00:00
rasmith
58996f3589 [AMD][Kernel][BugFix] Use correct scale in concat_and_cache_ds_mla_kernel when on gfx942 (#32976)
Signed-off-by: Randall Smith <ransmith@amd.com>
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
2026-01-27 07:16:43 +00:00
Roger Wang
b539f988e1 [Models] Kimi-K2.5 (#33131)
Signed-off-by: wanglinian <wanglinian@stu.pku.edu.cn>
Signed-off-by: wangln19 <96399074+wangln19@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: wanglinian <wanglinian@stu.pku.edu.cn>
Co-authored-by: wangln19 <96399074+wangln19@users.noreply.github.com>
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-27 14:50:31 +08:00
Andreas Karatzas
6c00645712 [CI][Pooling] Stabilize ModernBERT test (#32909)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-01-27 05:26:48 +00:00
Ning Xie
b781eeaa15 [code clean] remove duplicate code (#33135)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2026-01-27 04:57:16 +00:00
Cyrus Leung
e0b005d9cf [Frontend] Cleanup serving engine (#33103)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-26 20:47:26 -08:00
Richard Zou
3b8f0fe59e [torch.compile] Stop assuming 32 bit indexing (#33113)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-01-27 04:25:02 +00:00
Cyrus Leung
c831911be2 [Frontend] Reduce mixin usage in serving pooling (#33101)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-27 11:50:37 +08:00
Paco Xu
157caf511b [Perf] avoid duplicate mem_get_info() call in get_current_memory_usage (#33064)
Signed-off-by: Paco Xu <paco.xu@daocloud.io>
2026-01-27 03:45:45 +00:00
Vincent Gimenes
0b53bec60b [DOC]: Add warning about max_num_batched_tokens and max_model_len when chunked prefill is disabled (#33109)
Signed-off-by: Vincent Gimenes <147169146+VincentG1234@users.noreply.github.com>
2026-01-27 03:05:02 +00:00
Strahinja Stamenkovic
c568581ff3 Fix IndexError with encoder-decoder models when using Custom Paged Attention (#33112)
Signed-off-by: sstamenk <strahinja.stamenkovic@amd.com>
2026-01-27 10:33:37 +08:00
wangln19
2d7053438a fix: preserve native tool call ID in multi-turn tool calling (#32768)
Signed-off-by: wanglinian <wanglinian@stu.pku.edu.cn>
Signed-off-by: wangln19 <96399074+wangln19@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Isotr0py <2037008807@qq.com>
2026-01-27 10:22:35 +08:00
Robert Shaw
5a93b9162b [MoE Refactor] Integrate Naive Prepare Finalize into MK (#32567)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: amirkl94 <203507526+amirkl94@users.noreply.github.com>
2026-01-27 01:28:02 +00:00
Woosuk Kwon
6d86fde09c [Model Runner V2] Remove UvaBufferPool for cpu->gpu copy (#33055)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Nick Hill <nhill@redhat.com>
2026-01-26 16:47:35 -08:00
XiongfeiWei
510ed1e8d3 [Bugfix][TPU] Return a Default fp8 MoE Backend (#32908)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-01-26 18:46:11 -05:00
Pengchao Wang
8caffd92df [Bugfix][MXFP4] Call trtllm_fp4_block_scale_moe with kwargs (#33104)
Signed-off-by: Pengchao Wang <wpc@fb.com>
2026-01-26 15:13:18 -08:00
dolpm
58a05b0ca1 [fix] CPUDNNLGEMMHandler pointer baked into inductor artifact (#32913)
Signed-off-by: dolpm <34420038+dolpm@users.noreply.github.com>
2026-01-26 16:59:44 -05:00
Jared Wen
6ee7f18f33 [Logging] add --disable-access-log-for-endpoints CLI option (#30011)
Add a new CLI option --disable-access-log-for-endpoints to suppress
uvicorn access logs for specified endpoints (e.g., /health, /metrics, /ping).

This addresses the need to reduce log noise in production environments
where health check endpoints are frequently polled by load balancers or
monitoring systems, generating excessive log entries that obscure
meaningful request logs.

Fixes #29982

Signed-off-by: JaredforReal <w13431838023@gmail.com>
2026-01-26 21:49:03 +00:00
Wentao Ye
8f987883cb [Refactor] Remove unused _moe_permute function (#33108)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-01-26 16:06:45 -05:00
Kevin H. Luu
ebe0ba91db [ci] Sync test areas with test-pipeline.yaml and enable new pipeline generator (#33080)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: khluu <khluu000@gmail.com>
Co-authored-by: Kevin Luu <khluu@Kevins-MacBook-Pro.local>
2026-01-26 12:28:20 -08:00
Robert Shaw
43a013c3a2 [Bugfix] Fix Dtypes for Pynccl Wrapper (#33030)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-01-26 20:09:32 +00:00
Cyrus Leung
c25dbee40d [Model] Bump transformers version for test registry (#33100)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-26 18:53:22 +00:00
Nicolò Lucchesi
19ab0f7ce5 [Bugfix] Fix Voxtral streaming slot_mapping (#33073)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-01-26 10:40:40 -08:00
danielafrimi
67fe677c53 [FIX] Always support TP > 4 for FP4 Gemm (#31099)
Signed-off-by: dafrimi <dafrimi@nvidia.com>
Co-authored-by: root <root@gpu-51.slurm-workers-slurm.slurm.svc.cluster.local>
2026-01-26 11:04:20 -07:00
Andy Lo
d56afd45fd Remove unused logic in models/mistral.py (#33095)
Signed-off-by: Andy Lo <andy@mistral.ai>
2026-01-26 09:01:52 -08:00
Chauncey
a2393ed496 [CI] Fix AssertionError: MCP tool call not found in output_messages (#33093)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-01-26 15:19:57 +00:00
Pleaplusone
be6931ee27 [ROCm][Bugfix] Fix ptpc scale load issue for fused shared expert path in deepseek mtp (#33018)
Signed-off-by: ganyi <ygan@amd.com>
2026-01-26 23:19:04 +08:00
Chauncey
9ef3b718d9 [Bugfix] Fix Can't instantiate abstract class DeepseekV32IndexerBackend (#33052)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-01-26 06:44:02 -08:00
Yuxuan Zhang
bb17e8f11c [GLM-OCR] GLM-OCR with MTP Support (#33005)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-01-26 06:24:43 -08:00
Cyrus Leung
dcd80206b7 [Chore] Update type annotation of input_ids in model forward (#33063)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-26 06:02:10 -08:00
danisereb
f4a0921c9c [Performance] Tune Mamba selective scan kernel for B200 (#32873)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-01-26 05:56:54 -08:00
VihaanThat
208c56256f [Feature] Add LoRA support for Gemma3 vision components (#32764) 2026-01-26 13:56:40 +00:00
Alex Brooks
9ac818a551 [Misc] HF Hub LoRA Resolver (#20320)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2026-01-26 13:56:32 +00:00
Itay Etelis
6ca2c91b96 [Model] Use mm_position to compute mrope positions for Qwen3-Omni (#33010)
Signed-off-by: Itay Etelis <itay.etelis@ibm.com>
Co-authored-by: Itay Etelis <itay.etelis@ibm.com>
2026-01-26 13:48:07 +00:00
cwazai
e33192b269 [lora/moe] Improve fused MoE‑LoRA kernel indexing and memory access (#32770)
Signed-off-by: 陈建华 <1647430658@qq.com>
Signed-off-by: Yanwen Lin <lyw1124278064@gmail.com>
Signed-off-by: kimheesu <wlskaka4@gmail.com>
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: ganyi <ygan@amd.com>
Signed-off-by: whx-sjtu <2952154980@qq.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Signed-off-by: Xin Yang <xyangx@amazon.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Signed-off-by: Ifta Khairul Alam Adil <ikaadil007@gmail.com>
Signed-off-by: Ifta khairul Alam Adil <25082512+ikaadil@users.noreply.github.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Signed-off-by: Huy Do <huydhn@gmail.com>
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: Kebe <mail@kebe7jun.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Alex Sun <alex.s@amd.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Liran Schour <lirans@il.ibm.com>
Signed-off-by: liranschour <liranschour@users.noreply.github.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Signed-off-by: Richard Zou <zou3519@gmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: AuYang <459461160@qq.com>
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: rickychen-infinirc <ricky.chen@infinirc.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
Signed-off-by: eldarkurtic <8884008+eldarkurtic@users.noreply.github.com>
Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: RishabhSaini <rishabhsaini01@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Karan Bansal <karanb192@gmail.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Li, Jiang <bigpyj64@gmail.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Tianshu Yu <tianshuyu.formal@gmail.com>
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: sangbumlikeagod <oironese@naver.com>
Signed-off-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
Signed-off-by: Matteo Fari <matteofari06@gmail.com>
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Orion Reblitz-Richardson <orionr@meta.com>
Signed-off-by: Orion Reblitz-Richardson <orionr@gmail.com>
Signed-off-by: marksverdhei <marksverdhei@hotmail.com>
Signed-off-by: Markus / Mark <46672778+marksverdhei@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Randall Smith <ransmith@amd.com>
Signed-off-by: jon <joninco@bullpoint.org>
Signed-off-by: dolpm <34420038+dolpm@users.noreply.github.com>
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: Luka Govedič <luka.govedic@gmail.com>
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: mohammad najafi <mohammad.najafi@amd.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: 7. Sun <jhao.sun@gmail.com>
Signed-off-by: esmeetu <jasonailu87@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Reagan <reaganjlee@gmail.com>
Signed-off-by: Reagan Lee <96998476+reaganjlee@users.noreply.github.com>
Signed-off-by: Hongjian Zhang <zhanghongjian@xiaohongshu.com>
Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
Signed-off-by: Hiroken. <105287758+HirokenOvo@users.noreply.github.com>
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
Signed-off-by: Joshua Deng <joshuakdeng@gmail.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: cwazai <38356712+cwazai@users.noreply.github.com>
Co-authored-by: Yanwen Lin <lyw1124278064@gmail.com>
Co-authored-by: Kim Hee Su <wlskaka4@gmail.com>
Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Pleaplusone <ygan@amd.com>
Co-authored-by: whx <56632993+whx-sjtu@users.noreply.github.com>
Co-authored-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: danisereb <daserebrenik@nvidia.com>
Co-authored-by: Yanan Cao <gmagogsfm@users.noreply.github.com>
Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Matt <156021403+mawong-amd@users.noreply.github.com>
Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Co-authored-by: Lucain <lucainp@gmail.com>
Co-authored-by: Ifta khairul Alam Adil <25082512+ikaadil@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Micah Williamson <micah.williamson@amd.com>
Co-authored-by: Andreas Karatzas <akaratza@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
Co-authored-by: Kebe <mail@kebe7jun.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Alex Sun <minchsun@amd.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: liranschour <liranschour@users.noreply.github.com>
Co-authored-by: Or Ozeri <or@ozery.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Shengqi Chen <harry-chen@outlook.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: Lucas Kabela <lucaskabela@meta.com>
Co-authored-by: Richard Zou <zou3519@users.noreply.github.com>
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com>
Co-authored-by: Xu Jinyang <72930776+AuYang261@users.noreply.github.com>
Co-authored-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: David Ramon Prados <davidramon3@hotmail.es>
Co-authored-by: RickyChen / 陳昭儒 <ricky.chen@infinirc.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Co-authored-by: Eldar Kurtić <8884008+eldarkurtic@users.noreply.github.com>
Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: Rishabh Saini <rishabhsaini01@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Karan Bansal <karanb192@users.noreply.github.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: tianshu-Michael-yu <101950379+tianshu-Michael-yu@users.noreply.github.com>
Co-authored-by: Raushan Turganbay <raushan@huggingface.co>
Co-authored-by: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
Co-authored-by: Matteo Fari <matteofari06@gmail.com>
Co-authored-by: Harry Huang <vastrockhuang162@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Orion Reblitz-Richardson <orionr@gmail.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
Co-authored-by: Markus / Mark <46672778+marksverdhei@users.noreply.github.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: rasmith <Randall.Smith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
Co-authored-by: joninco <joninco@bullpoint.org>
Co-authored-by: dolpm <34420038+dolpm@users.noreply.github.com>
Co-authored-by: ElizaWszola <ewszola@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: monajafi-amd <mohammad.najafi@amd.com>
Co-authored-by: ruizcrp <ruiz.crp@gmail.com>
Co-authored-by: Shengqi Chen <i@harrychen.xyz>
Co-authored-by: 7. Sun <jhao.sun@gmail.com>
Co-authored-by: Roy Wang <jasonailu87@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Reagan Lee <96998476+reaganjlee@users.noreply.github.com>
Co-authored-by: Hiroken. <105287758+HirokenOvo@users.noreply.github.com>
Co-authored-by: Xingran Wang <wangxingran123456@outlook.com>
Co-authored-by: david guan <102001211+Chenhao-Guan@users.noreply.github.com>
Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com>
Co-authored-by: Louie Tsai <louie.tsai@intel.com>
Co-authored-by: Maryam Tahhan <mtahhan@redhat.com>
Co-authored-by: Joshua Deng <91448271+joshuadeng@users.noreply.github.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-01-26 04:56:34 -08:00
Cyrus Leung
61274bdef5 [Doc] Further update multi-modal impl doc (#33065)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-26 10:54:20 +00:00
ltd0924
b40db4dfec [StepVL] add step vl offline example (#33054)
Signed-off-by: luotingdan <luotingdan@stepfun.com>
Co-authored-by: luotingdan <luotingdan@stepfun.com>
2026-01-26 01:00:32 -08:00
1038 changed files with 51348 additions and 21585 deletions

View File

@@ -1,7 +1,8 @@
name: vllm_ci
job_dirs:
- ".buildkite/test_areas"
- ".buildkite/image_build"
- ".buildkite/test_areas"
- ".buildkite/hardware_tests"
run_all_patterns:
- "docker/Dockerfile"
- "CMakeLists.txt"

View File

@@ -0,0 +1,29 @@
group: Hardware
steps:
- label: "AMD: :docker: build image"
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1

View File

@@ -0,0 +1,8 @@
group: Hardware
steps:
- label: "Arm CPU Test"
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -0,0 +1,10 @@
group: Hardware
depends_on: ~
steps:
- label: "Ascend NPU Test"
soft_fail: true
timeout_in_minutes: 20
no_plugin: true
device: ascend_npu
commands:
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh

View File

@@ -0,0 +1,10 @@
group: Hardware
steps:
- label: "GH200 Test"
soft_fail: true
device: gh200
no_plugin: true
optional: true
commands:
- nvidia-smi
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh

View File

@@ -0,0 +1,24 @@
group: Hardware
depends_on: ~
steps:
- label: "Intel CPU Test"
soft_fail: true
device: intel_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
- label: "Intel HPU Test"
soft_fail: true
device: intel_hpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
- label: "Intel GPU Test"
depends_on: []
soft_fail: true
device: intel_gpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh

View File

@@ -1,56 +1,256 @@
#!/bin/bash
set -e
set -euo pipefail
if [[ $# -lt 8 ]]; then
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
# replace invalid characters in Docker image tags and truncate to 128 chars
clean_docker_tag() {
local input="$1"
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
}
print_instance_info() {
echo ""
echo "=== Debug: Instance Information ==="
# Get IMDSv2 token
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
echo "AMI ID: ${AMI_ID}"
echo "Instance Type: ${INSTANCE_TYPE}"
echo "Instance ID: ${INSTANCE_ID}"
echo "AZ: ${AZ}"
else
echo "Not running on EC2 or IMDS not available"
fi
# Check for warm cache AMI (marker file baked into custom AMI)
if [[ -f /etc/vllm-ami-info ]]; then
echo "Cache: warm (custom vLLM AMI)"
cat /etc/vllm-ami-info
else
echo "Cache: cold (standard AMI)"
fi
echo "==================================="
echo ""
}
setup_buildx_builder() {
echo "--- :buildkite: Setting up buildx builder"
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
# Custom AMI with standalone buildkitd - use remote driver for warm cache
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
echo "Using remote driver to connect to buildkitd (warm cache available)"
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
echo "Using existing baked-vllm-builder"
docker buildx use baked-vllm-builder
else
echo "Creating baked-vllm-builder with remote driver"
docker buildx create \
--name baked-vllm-builder \
--driver remote \
--use \
"unix://${BUILDKIT_SOCKET}"
fi
docker buildx inspect --bootstrap
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
# Existing builder available
echo "Using existing builder: ${BUILDER_NAME}"
docker buildx use "${BUILDER_NAME}"
docker buildx inspect --bootstrap
else
# No local buildkitd, no existing builder - create new docker-container builder
echo "No local buildkitd found, using docker-container driver"
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
docker buildx inspect --bootstrap
fi
# builder info
echo "Active builder:"
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}
check_and_skip_if_image_exists() {
if [[ -n "${IMAGE_TAG:-}" ]]; then
echo "--- :mag: Checking if image exists"
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
echo "Image already exists: ${IMAGE_TAG}"
echo "Skipping build"
exit 0
fi
echo "Image not found, proceeding with build"
fi
}
ecr_login() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
}
prepare_cache_tags() {
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
cache="${MAIN_CACHE_ECR}:latest"
else
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
cache="${TEST_CACHE_ECR}:${clean_branch}"
fi
CACHE_TO="$cache"
CACHE_FROM="$cache"
CACHE_FROM_BASE_BRANCH="$cache"
else
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
else
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
fi
fi
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
resolve_parent_commit() {
if [[ -z "${PARENT_COMMIT:-}" ]]; then
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
if [[ -n "${PARENT_COMMIT}" ]]; then
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
export PARENT_COMMIT
else
echo "Could not determine parent commit (may be first commit in repo)"
fi
else
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
fi
}
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
}
#################################
# Main Script #
#################################
print_instance_info
if [[ $# -lt 7 ]]; then
print_usage_and_exit
fi
# input args
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
# build config
TARGET="test-ci"
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
prepare_cache_tags
ecr_login
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
# Environment info (for docs and human readers)
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
#
# Build configuration (exported as environment variables for bake):
export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_URL: ${CI_HCL_URL}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
check_and_skip_if_image_exists
echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "CI HCL path: ${CI_HCL_PATH}"
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
echo "Make sure you're running from the vLLM repository root"
exit 1
fi
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
merge_base_commit_build_args=""
echo "--- :arrow_down: Downloading ci.hcl"
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
echo "Downloaded to ${CI_HCL_PATH}"
if [[ ! -f "${CI_HCL_PATH}" ]]; then
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
exit 1
fi
# build
docker buildx build --file docker/Dockerfile \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
${merge_base_commit_build_args} \
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
--cache-to type=registry,ref=${CACHE_TO},mode=max \
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
--push \
--target test \
--progress plain .
setup_buildx_builder
resolve_parent_commit
export PARENT_COMMIT
print_bake_config
echo "--- :docker: Building ${TARGET}"
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
echo "--- :white_check_mark: Build complete"

View File

@@ -4,7 +4,8 @@ steps:
key: image-build
depends_on: []
commands:
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
retry:
automatic:
- exit_status: -1 # Agent was lost

View File

@@ -0,0 +1,15 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.695
- name: "exact_match,flexible-extract"
value: 0.447
limit: 1319
num_fewshot: 5
max_model_len: 262144
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

View File

@@ -0,0 +1,19 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.7142
- name: "exact_match,flexible-extract"
value: 0.4579
env_vars:
VLLM_USE_FLASHINFER_MOE_FP8: "1"
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
limit: 1319
num_fewshot: 5
max_model_len: 262144
kv_cache_dtype: fp8
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

View File

@@ -1 +1,2 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml

View File

@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml

View File

@@ -393,7 +393,7 @@ if __name__ == "__main__":
with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/performance-benchmarks/"
+ "performance-benchmarks-descriptions.md"
"performance-benchmarks-descriptions.md"
)
results = results.format(
latency_tests_markdown_table=latency_md_table,

View File

@@ -25,9 +25,9 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g arch_suffix=''
if command -v nvidia-smi; then
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
@@ -181,19 +181,20 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
run_latency_tests() {
# run latency tests using `vllm bench latency` command
# $1: a json file specifying latency test cases
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
# $2: a json file specifying test cases
local latency_test_file
latency_test_file=$1
local test_type=$1
local test_file=$2
# Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
# Iterate over tests
jq -c '.[]' "$test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"."
if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
exit 1
fi
@@ -204,15 +205,15 @@ run_latency_tests() {
fi
# get arguments
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
latency_envs=$(json2envs "$latency_environment_variables")
bench_params=$(echo "$params" | jq -r '.parameters')
bench_args=$(json2args "$bench_params")
bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
bench_envs=$(json2envs "$bench_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -225,97 +226,42 @@ run_latency_tests() {
fi
fi
latency_command=" $latency_envs vllm bench latency \
bench_command=" $bench_envs vllm bench $test_type \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
$bench_args"
echo "Running test case $test_name"
echo "Latency command: $latency_command"
echo "${test_type^} command: $bench_command"
# recoding benchmarking command ang GPU command
# recording benchmarking command and GPU command
jq_output=$(jq -n \
--arg latency "$latency_command" \
--arg command "$bench_command" \
--arg gpu "$gpu_type" \
--arg test_type "$test_type" \
'{
latency_command: $latency,
($test_type + "_command"): $command,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$latency_command"
eval "$bench_command"
kill_gpu_processes
done
}
run_latency_tests() {
run_benchmark_tests "latency" "$1"
}
run_startup_tests() {
run_benchmark_tests "startup" "$1"
}
run_throughput_tests() {
# run throughput tests using `vllm bench throughput`
# $1: a json file specifying throughput test cases
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
throughput_envs=$(json2envs "$throughput_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi
throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg command "$throughput_command" \
--arg gpu "$gpu_type" \
'{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes
done
run_benchmark_tests "throughput" "$1"
}
run_serving_tests() {
@@ -447,6 +393,11 @@ run_serving_tests() {
fi
fi
# save the compilation mode and optimization level on the serving results
# whenever they are set
compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
@@ -460,15 +411,15 @@ run_serving_tests() {
for max_concurrency in $max_concurrency_list; do
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name"
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
--metadata "tensor_parallel_size=$tp" \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args $client_remote_args "
echo "Running test case $test_name with qps $qps"
@@ -534,6 +485,7 @@ main() {
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
# postprocess benchmarking results

View File

@@ -176,23 +176,6 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image for x86_64 ROCm"
key: block-rocm-release-image-build
depends_on: ~
- label: "Build release image - x86_64 - ROCm"
depends_on: block-rocm-release-image-build
id: build-release-image-rocm
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# Build base image first
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
# Build vLLM ROCm image using the base
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
- group: "Publish release images"
key: "publish-release-images"
steps:
@@ -274,14 +257,14 @@ steps:
- input-release-version
- build-wheels
- label: "Upload release wheels to PyPI and GitHub"
- label: "Upload release wheels to PyPI"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels.sh"
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
@@ -476,7 +459,7 @@ steps:
S3_BUCKET: "vllm-wheels"
# ROCm Job 2: Build vLLM ROCm Wheel
- label: ":python: Build vLLM ROCm Wheel"
- label: ":python: Build vLLM ROCm Wheel - x86_64"
id: build-rocm-vllm-wheel
depends_on:
- step: build-rocm-base-wheels
@@ -638,9 +621,93 @@ steps:
depends_on:
- step: upload-rocm-wheels
allow_failure: true
- step: input-release-version
allow_failure: true
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-rocm-release.sh"
env:
S3_BUCKET: "vllm-wheels"
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
- block: "Generate Root Index for ROCm Wheels for Release"
key: block-generate-root-index-rocm-wheels
depends_on: upload-rocm-wheels
- label: ":package: Generate Root Index for ROCm Wheels for Release"
depends_on: block-generate-root-index-rocm-wheels
id: generate-root-index-rocm-wheels
agents:
queue: cpu_queue_postmerge
commands:
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env:
S3_BUCKET: "vllm-wheels"
VARIANT: "rocm700"
# ROCm Job 5: Build ROCm Release Docker Image
- label: ":docker: Build release image - x86_64 - ROCm"
id: build-rocm-release-image
depends_on:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 60
commands:
- |
set -euo pipefail
# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Download Docker image from S3 (set by build-rocm-base-wheels)
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
echo "ERROR: rocm-docker-image-s3-path metadata not found"
exit 1
fi
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
mkdir -p artifacts/rocm-docker-image
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
# Load base Docker image
echo "Loading base Docker image..."
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
echo "Loaded base image: $${BASE_IMAGE_TAG}"
# Tag and push the base image to ECR
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
# Get GPU architectures from meta-data
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
# Build vLLM ROCm release image using cached base
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
--target vllm-openai \
--progress plain \
-f docker/Dockerfile.rocm .
# Push to ECR
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"

View File

@@ -11,51 +11,102 @@ fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel (by commit):
\`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
(Optional) For CUDA 13.0:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
\`\`\`
To download the wheel (by version):
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
Tag and push images:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai:latest-rocm
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
# CPU images (vllm/vllm-openai-cpu)
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
\`\`\`
EOF
EOF

View File

@@ -3,25 +3,32 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Generate Buildkite annotation for ROCm wheel release
set -ex
# Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# TODO: Enable the nightly build for ROCm
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
if [ -z "${RELEASE_VERSION}" ]; then
RELEASE_VERSION="1.0.0.dev"
fi
# S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## :rocm: ROCm Wheel Release
## ROCm Wheel and Docker Image Releases
### Build Configuration
| Setting | Value |
|---------|-------|
@@ -34,41 +41,72 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
### :package: Installation
**Install from this build (by commit):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
# Example:
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
\`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
# Example for ROCm ${ROCM_VERSION}:
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
\`\`\`
**Install from nightly (if published):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
\`\`\`
### :floppy_disk: Download Wheels Directly
\`\`\`bash
# List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
# Download specific wheels
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
\`\`\`
### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton_rocm**: Triton built for ROCm
- **triton**: Triton
- **triton-kernels**: Triton kernels
- **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings
- **aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm
### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only
### :package: Docker Image Release
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
"""
Generate project list HTML content linking to each project & variant sub-directory.
Generate project list HTML content linking to each project & variant subdirectory.
"""
href_tags = []
for name in sorted(subdir_names):
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
as the default variant index, but the links are adjusted accordingly.
Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant sub-directory
cpu/ # cpu variant subdirectory
index.html
vllm/
index.html
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
vllm/
index.html
metadata.json
cu130/ # cu130 variant sub-directory
cu130/ # cu130 variant subdirectory
index.html
vllm/
index.html

View File

@@ -44,6 +44,17 @@ cleanup_docker() {
fi
}
cleanup_network() {
for node in $(seq 0 $((NUM_NODES-1))); do
if docker pr -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}"
fi
done
if docker network ls | grep docker-net; then
docker network rm docker-net
fi
}
# Call the cleanup docker function
cleanup_docker
@@ -76,7 +87,7 @@ mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
echo "Raw commands: $commands"
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
@@ -158,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
commands=$(echo "$commands" | sed 's/ \\ / /g')
echo "Final commands: $commands"
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
@@ -165,7 +179,6 @@ fi
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# Test that we're launching on the machine that has
@@ -176,53 +189,33 @@ if [[ -z "$render_gid" ]]; then
exit 1
fi
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
for pid in "${PIDS[@]}"; do
wait "${pid}"
STATUS+=($?)
done
at_least_one_shard_with_tests=0
for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
elif [[ ${st} -eq 5 ]]; then
echo "Shard exited with status 5 (no tests collected) - treating as success"
else # This means st is 0
at_least_one_shard_with_tests=1
fi
done
if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
echo "All shards reported no tests collected. Failing the build."
exit 1
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
myIFS=$IFS
IFS=','
read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${commands}"
composite_command=$(echo "${composite_command} && ${commands}")
done
/bin/bash -c "${composite_command}"
cleanup_network
else
echo "Failed to parse node commands! Exiting."
cleanup_network
exit 111
fi
else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"

View File

@@ -5,7 +5,9 @@
set -exuo pipefail
# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t ${image_name} -f - .
FROM gaudi-base-image:latest
COPY ./ /workspace/vllm
@@ -15,7 +17,8 @@ WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=empty pip install .
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
# install development dependencies (for testing)
@@ -36,15 +39,20 @@ EOF
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
remove_docker_containers() { docker rm -f ${container_name} || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers
echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
docker run --rm --runtime=habana --name=${container_name} --network=host \
-e HABANA_VISIBLE_DEVICES=all \
hpu-plugin-v1-test-env \
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
-e VLLM_SKIP_WARMUP=true \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
-e PT_HPU_LAZY_MODE=1 \
"${image_name}" \
/bin/bash -c '
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
'
EXITCODE=$?
if [ $EXITCODE -eq 0 ]; then

View File

@@ -38,15 +38,16 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'

View File

@@ -43,7 +43,6 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \
--trust-remote-code \
--max-model-len 2048 \
--all2all-backend $BACK \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT

View File

@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
if [ -z "$GIT_VERSION" ]; then
echo "Release version from Buildkite: $RELEASE_VERSION"
if [[ -z "$GIT_VERSION" ]]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
@@ -27,7 +29,7 @@ fi
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
if [[ -z "$PYPI_TOKEN" ]]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
@@ -35,41 +37,8 @@ else
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
# check github token
if [ -z "$GITHUB_TOKEN" ]; then
echo "[FATAL] GITHUB_TOKEN is not set."
exit 1
else
export GH_TOKEN="$GITHUB_TOKEN"
fi
set -x # avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
curl -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
if [ -z "$GH_BIN" ]; then
echo "[FATAL] Failed to find gh CLI executable"
exit 1
fi
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
@@ -89,16 +58,13 @@ echo "Wheels copied to local directory"
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"

View File

@@ -542,7 +542,7 @@ steps:
- label: LoRA Test %N # 20min each
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- vllm/lora
@@ -604,9 +604,11 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
# # Limit to no custom ops to reduce running time
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
- label: Cudagraph test
timeout_in_minutes: 20
@@ -636,12 +638,13 @@ steps:
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -650,7 +653,7 @@ steps:
- label: Kernels Quantization Test %N # 64min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/quantization/
@@ -663,7 +666,7 @@ steps:
- label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
@@ -741,7 +744,7 @@ steps:
- label: Benchmarks # 11min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
@@ -752,7 +755,7 @@ steps:
- label: Benchmarks CLI Test # 7min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- vllm/
@@ -826,7 +829,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -887,7 +890,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -908,7 +911,7 @@ steps:
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
agent_pool: mi325_8
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -1131,7 +1134,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1180,7 +1183,6 @@ steps:
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
@@ -1188,33 +1190,16 @@ steps:
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
@@ -1277,7 +1262,7 @@ steps:
- label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdmultinode]
agent_pool: mi325_4
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1291,15 +1276,15 @@ steps:
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed'
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
@@ -1508,6 +1493,9 @@ steps:
source_file_dependencies:
- vllm/
commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
@@ -1562,7 +1550,10 @@ steps:
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization

View File

@@ -362,7 +362,7 @@ steps:
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
@@ -537,9 +537,11 @@ steps:
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
# # Limit to no custom ops to reduce running time
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
- label: Cudagraph test
timeout_in_minutes: 20
@@ -568,8 +570,9 @@ steps:
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -1017,7 +1020,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1068,7 +1071,6 @@ steps:
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
@@ -1076,75 +1078,15 @@ steps:
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Hopper Fusion E2E Tests (H100) # 10min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
gpu: h100
optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# skip Llama-4 since it does not fit on this device
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
gpu: h100
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
@@ -1316,7 +1258,7 @@ steps:
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins
- label: Pipeline + Context Parallelism Test # 45min
timeout_in_minutes: 60
@@ -1419,6 +1361,20 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: Acceptance Length Test (Large Models) # optional
timeout_in_minutes: 120
gpu: h100
optional: true
num_gpus: 1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/model_executor/models/mlp_speculator.py
- tests/v1/spec_decode/test_acceptance_length.py
commands:
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
- label: LM Eval Large Models # optional
gpu: a100
optional: true

View File

@@ -4,7 +4,7 @@ depends_on:
steps:
- label: V1 attention (H100)
timeout_in_minutes: 30
gpu: h100
device: h100
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
@@ -15,7 +15,7 @@ steps:
- label: V1 attention (B200)
timeout_in_minutes: 30
gpu: b200
device: b200
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention

View File

@@ -2,56 +2,196 @@ group: Compile
depends_on:
- image-build
steps:
- label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40
- label: Sequence Parallel Tests (2 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
gpu: b200
num_devices: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/model_executor/layers/
- vllm/compilation/
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- tests/distributed/test_sequence_parallel.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/distributed/test_sequence_parallel.py
- label: Sequence Parallel Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/distributed/test_sequence_parallel.py
- label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_sequence_parallelism.py
- tests/compile/distributed/test_async_tp.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_async_tp.py
- label: Fusion and Compile Unit Tests (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/attention/attention.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
# this runner has 2 GPUs available even though num_devices=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
# TODO(luka) move to H100 once pass tests run on H100
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40
- label: Fusion E2E Quick (H100)
timeout_in_minutes: 15
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
num_gpus: 2
device: h100
num_devices: 1
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
- label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: h100
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
- label: Fusion E2E Config Sweep (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: b200
num_devices: 1
optional: true
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
# Run just llama3 (fp8 & fp4) for all config combinations
# -k "llama-3"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
- label: Fusion E2E TP2 (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

View File

@@ -5,7 +5,7 @@ steps:
- label: Distributed Comm Ops
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_devices: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
@@ -16,9 +16,9 @@ steps:
- pytest -v -s distributed/test_shm_storage.py
- label: Distributed (2 GPUs)
timeout_in_minutes: 90
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_devices: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
@@ -47,14 +47,13 @@ steps:
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_gpus: 4
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_utils
@@ -103,8 +102,8 @@ steps:
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
gpu: h100
num_gpus: 8
device: h100
num_devices: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
@@ -120,9 +119,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100)
gpu: a100
device: a100
optional: true
num_gpus: 4
num_devices: 4
source_file_dependencies:
- vllm/
commands:
@@ -133,26 +132,22 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: Distributed Tests (2 GPUs)(H200)
gpu: h200
- label: Distributed Tests (2 GPUs)(H100)
timeout_in_minutes: 15
device: h100
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
num_devices: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
gpu: b200
device: b200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
num_devices: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -161,8 +156,9 @@ steps:
- label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_devices: 2
num_nodes: 2
no_plugin: true
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
@@ -171,12 +167,12 @@ steps:
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
@@ -184,10 +180,21 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_gpus: 4
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
@@ -196,4 +203,4 @@ steps:
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- pytest -v -s distributed/test_pipeline_parallel.py

View File

@@ -4,27 +4,27 @@ depends_on:
steps:
- label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60
gpu: h100
device: h100
optional: true
num_gpus: 4
num_devices: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60
gpu: h100
device: h100
optional: true
num_gpus: 4
num_devices: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
device: b200
optional: true
num_gpus: 2
num_devices: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -33,10 +33,11 @@ steps:
timeout_in_minutes: 30
optional: true
soft_fail: true
num_gpus: 2
num_devices: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh

View File

@@ -23,4 +23,8 @@ steps:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
# Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py

View File

@@ -14,7 +14,7 @@ steps:
- label: EPLB Execution
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 4
num_devices: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py

View File

@@ -15,8 +15,9 @@ steps:
timeout_in_minutes: 35
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -57,8 +58,8 @@ steps:
- label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45
gpu: h100
num_gpus: 1
device: h100
num_devices: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
@@ -77,7 +78,7 @@ steps:
- label: Kernels (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
device: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
@@ -85,7 +86,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -114,4 +115,55 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
- label: Kernels Helion Test
timeout_in_minutes: 30
device: h100
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
device: h100
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
device: h100
num_devices: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py

View File

@@ -12,9 +12,9 @@ steps:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 GPUs)(A100)
gpu: a100
device: a100
optional: true
num_gpus: 4
num_devices: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
@@ -24,9 +24,9 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100)
gpu: h100
device: h100
optional: true
num_gpus: 4
num_devices: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
@@ -37,10 +37,39 @@ steps:
- label: LM Eval Small Models (B200)
timeout_in_minutes: 120
gpu: b200
device: b200
optional: true
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200
optional: true
num_devices: 8
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
device: h100
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt

View File

@@ -14,7 +14,7 @@ steps:
- label: LoRA TP (Distributed)
timeout_in_minutes: 30
num_gpus: 4
num_devices: 4
source_file_dependencies:
- vllm/lora
- tests/lora

View File

@@ -16,7 +16,7 @@ steps:
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
@@ -27,11 +27,12 @@ steps:
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Others (CPU)
depends_on: ~
depends_on:
- image-build-cpu
source_file_dependencies:
- vllm/
- tests/v1
no_gpu: true
device: cpu
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
@@ -82,7 +83,7 @@ steps:
- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20
num_gpus: 2
num_devices: 2
source_file_dependencies:
- vllm/
- tests/v1/tracing
@@ -114,7 +115,8 @@ steps:
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on: ~
depends_on:
- image-build-cpu
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
@@ -127,7 +129,7 @@ steps:
- tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
device: cpu
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
@@ -142,7 +144,7 @@ steps:
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
device: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
@@ -155,7 +157,7 @@ steps:
- label: Batch Invariance (H100)
timeout_in_minutes: 25
gpu: h100
device: h100
source_file_dependencies:
- vllm/v1/attention
- vllm/model_executor/layers
@@ -164,4 +166,18 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: Acceptance Length Test (Large Models) # optional
timeout_in_minutes: 25
gpu: h100
optional: true
num_gpus: 1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/model_executor/models/mlp_speculator.py
- tests/v1/spec_decode/test_acceptance_length.py
commands:
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test

View File

@@ -39,12 +39,14 @@ steps:
- pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
depends_on:
- image-build-cpu
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/models/test_utils.py
- tests/models/test_vision.py
no_gpu: true
device: cpu
commands:
- pytest -v -s models/test_utils.py models/test_vision.py

View File

@@ -5,7 +5,7 @@ steps:
- label: Distributed Model Tests (2 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_devices: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/

View File

@@ -14,11 +14,13 @@ steps:
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Processor Test (CPU)
depends_on:
- image-build-cpu
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
device: cpu
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

View File

@@ -5,7 +5,7 @@ steps:
- label: Plugin Tests (2 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_devices: 2
source_file_dependencies:
- vllm/plugins/
- tests/plugins/

View File

@@ -18,7 +18,7 @@ steps:
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 30
timeout_in_minutes: 35
source_file_dependencies:
- vllm/
- tests/compile
@@ -30,16 +30,13 @@ steps:
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
- label: PyTorch Fullgraph
timeout_in_minutes: 40
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some

View File

@@ -16,14 +16,14 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: Quantized MoE Test (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
device: b200
source_file_dependencies:
- tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py

View File

@@ -5,7 +5,7 @@ steps:
- label: Weight Loading Multiple GPU # 33min
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_devices: 2
optional: true
source_file_dependencies:
- vllm/
@@ -15,8 +15,8 @@ steps:
- label: Weight Loading Multiple GPU - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
num_devices: 2
device: a100
optional: true
source_file_dependencies:
- vllm/

16
.github/CODEOWNERS vendored
View File

@@ -2,8 +2,8 @@
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/attention @LucasWilkinson
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
@@ -16,7 +16,7 @@
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
@@ -30,12 +30,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC
/vllm/v1/kv_offload @ApostaC @orozery
/vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
# Model runner V2
/vllm/v1/worker/gpu @WoosukKwon
@@ -54,13 +56,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
/tests/v1/kv_connector @ApostaC @orozery
/tests/v1/kv_offload @ApostaC @orozery
/tests/v1/determinism @yewentao256
# Transformers modeling backend

View File

@@ -154,6 +154,10 @@ repos:
files: ^docker/(Dockerfile|versions\.json)$
pass_filenames: false
additional_dependencies: [dockerfile-parse]
- id: attention-backend-docs
name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python
# Keep `suggestion` last
- id: suggestion
name: Suggestion

View File

@@ -458,7 +458,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/marlin.cu"
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/marlin/gptq_marlin_repack.cu"

View File

@@ -0,0 +1,266 @@
# vLLM Attention Benchmarking Suite
Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.
## Quick Start
```bash
cd benchmarks/attention_benchmarks
# Run a pre-configured benchmark
python benchmark.py --config configs/mla_decode.yaml
python benchmark.py --config configs/mla_mixed_batch.yaml
python benchmark.py --config configs/speculative_decode.yaml
python benchmark.py --config configs/standard_attention.yaml
python benchmark.py --config configs/reorder_threshold.yaml
# Or run custom benchmarks
python benchmark.py \
--backends flash flashinfer \
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
--output-csv results.csv
```
## Simplified Batch Specification Grammar
Express workloads concisely using query length and sequence length:
```python
"q2k" # 2048-token prefill (q_len=2048, seq_len=2048)
"q1s1k" # Decode: 1 token with 1K sequence
"8q1s1k" # 8 decode requests
"q4s1k" # 4-token extend (e.g., spec decode)
"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes
"16q4s1k" # 16 spec decode (4 tokens each)
```
### Grammar Rule
```text
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k': Multiplies value by 1024
Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
```
**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
## Pre-configured Benchmarks
The suite includes several pre-configured YAML benchmark configurations:
### MLA Decode Benchmark
Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
```bash
python benchmark.py --config configs/mla_decode.yaml
```
### MLA Mixed Batch Benchmark
Tests chunked prefill performance with mixed prefill + decode batches.
```bash
python benchmark.py --config configs/mla_mixed_batch.yaml
```
### Speculative Decoding Benchmark
Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
```bash
python benchmark.py --config configs/speculative_decode.yaml
```
### Standard Attention Benchmark
Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
```bash
python benchmark.py --config configs/standard_attention.yaml
```
### Reorder Threshold Study
**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
```bash
python benchmark.py --config configs/reorder_threshold.yaml
```
---
## Universal Benchmark
The `benchmark.py` script handles **all** backends - both standard attention and MLA.
### Standard Attention (Flash/Triton/FlashInfer)
```bash
python benchmark.py \
--backends flash triton flashinfer \
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
--num-layers 10 \
--repeats 5 \
--output-csv results.csv
```
### MLA Backends
```bash
# Compare all MLA backends
python benchmark.py \
--backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
--batch-specs "64q1s1k" "64q1s4k" \
--output-csv mla_results.csv
```
### Parameter Sweeps
Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
#### CUTLASS MLA num-splits Optimization
**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
```bash
python benchmark.py \
--backend cutlass_mla \
--batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
--sweep-param num_kv_splits \
--sweep-values 1 2 4 8 16 \
--output-json optimal_splits.json
```
#### Reorder Batch Threshold Optimization
**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
```bash
python benchmark.py \
--backend flashmla \
--batch-specs "q4s1k" "q8s2k" \
--sweep-param reorder_batch_threshold \
--sweep-values 1 4 16 64 256 512 \
--output-csv threshold_sweep.csv
```
### All Command-Line Options
```text
--config CONFIG # Path to YAML config file (overrides other args)
--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla,
# flashinfer_mla, flashattn_mla, flashmla
--backend BACKEND # Single backend (alternative to --backends)
--batch-specs SPEC [SPEC ...] # Batch specifications using extended grammar
# Model configuration
--num-layers N # Number of layers
--head-dim N # Head dimension
--num-q-heads N # Query heads
--num-kv-heads N # KV heads
--block-size N # Block size
# Benchmark settings
--device DEVICE # Device (default: cuda:0)
--repeats N # Repetitions
--warmup-iters N # Warmup iterations
--profile-memory # Profile memory usage
# Parameter sweeps
--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits,
# reorder_batch_threshold)
--sweep-values N [N ...] # Values to sweep for the parameter
# Output
--output-csv FILE # Save to CSV
--output-json FILE # Save to JSON
```
## Hardware Requirements
| Backend | Hardware |
|---------|----------|
| Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) |
| FlashMLA | Hopper (SM90+) |
| FlashInfer-MLA | Any CUDA GPU |
## Using MLA Runner Directly
All MLA backends are available through `mla_runner.run_mla_benchmark()`:
```python
from mla_runner import run_mla_benchmark
from common import BenchmarkConfig
config = BenchmarkConfig(
backend="cutlass_mla",
batch_spec="64q1s4k",
num_layers=10,
head_dim=576,
num_q_heads=128,
num_kv_heads=1,
block_size=128,
device="cuda:0",
repeats=5,
warmup_iters=3,
)
# CUTLASS MLA with specific num_kv_splits
result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
print(f"Time: {result.mean_time:.6f}s")
# FlashInfer-MLA
result = run_mla_benchmark("flashinfer_mla", config)
# FlashAttn MLA (Hopper SM90+)
result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
# FlashMLA (Hopper SM90+)
result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
```
## Python API
```python
from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
# Parse batch specs
requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
print(format_batch_spec(requests))
# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
# Get batch statistics
stats = get_batch_stats(requests)
print(f"Total tokens: {stats['total_tokens']}")
print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
# Format results
formatter = ResultsFormatter()
formatter.save_csv(results, "output.csv")
formatter.save_json(results, "output.json")
```
## Tips
**1. Warmup matters** - Use `--warmup-iters 10` for stable results
**2. Multiple repeats** - Use `--repeats 20` for low variance
**3. Save results** - Always use `--output-csv` or `--output-json`
**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values

View File

@@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM Attention Benchmarking Suite."""
from .batch_spec import (
BatchRequest,
format_batch_spec,
get_batch_stats,
parse_batch_spec,
reorder_for_flashinfer,
split_by_type,
)
from .common import (
BenchmarkConfig,
BenchmarkResult,
MockLayer,
MockModelConfig,
ResultsFormatter,
get_attention_scale,
is_mla_backend,
setup_mla_dims,
)
__all__ = [
# Batch specification
"BatchRequest",
"parse_batch_spec",
"format_batch_spec",
"reorder_for_flashinfer",
"split_by_type",
"get_batch_stats",
# Benchmarking infrastructure
"BenchmarkConfig",
"BenchmarkResult",
"ResultsFormatter",
# Mock objects
"MockLayer",
"MockModelConfig",
# Utilities
"setup_mla_dims",
"get_attention_scale",
"is_mla_backend",
]

View File

@@ -0,0 +1,231 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simplified batch specification grammar for attention benchmarks.
Grammar (underscore-separated segments):
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k' suffix: Multiplies value by 1024
Common patterns:
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
Examples:
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
"""
from collections import Counter
from dataclasses import dataclass
import regex as re
@dataclass
class BatchRequest:
"""Represents a single request in a batch."""
q_len: int # Query length (number of new tokens)
kv_len: int # Total KV cache length
@property
def is_decode(self) -> bool:
"""True if this is a decode request (q_len == 1)."""
return self.q_len == 1
@property
def is_prefill(self) -> bool:
"""True if this is a pure prefill (q_len == kv_len)."""
return self.q_len == self.kv_len
@property
def is_extend(self) -> bool:
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
return self.q_len > 1 and self.kv_len > self.q_len
@property
def context_len(self) -> int:
"""Context length (KV cache - query)."""
return self.kv_len - self.q_len
def as_tuple(self) -> tuple[int, int]:
"""Return as (q_len, kv_len) tuple for compatibility."""
return (self.q_len, self.kv_len)
def _parse_size(size_str: str, k_suffix: str) -> int:
"""Parse size string with optional 'k' suffix."""
size = int(size_str)
return size * 1024 if k_suffix == "k" else size
def parse_batch_spec(spec: str) -> list[BatchRequest]:
"""
Parse batch specification string into list of BatchRequest objects.
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
Args:
spec: Batch specification string (see module docstring for grammar)
Returns:
List of BatchRequest objects
Raises:
ValueError: If spec format is invalid
"""
requests = []
for seg in spec.split("_"):
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
if m:
cnt = int(m.group(1)) if m.group(1) else 1
q_len = _parse_size(m.group(2), m.group(3))
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
continue
raise ValueError(f"Invalid batch spec segment: '{seg}'")
return requests
def format_batch_spec(requests: list[BatchRequest]) -> str:
"""
Format list of BatchRequest into human-readable string.
Groups requests by type and provides counts and sizes.
Args:
requests: List of BatchRequest objects
Returns:
Formatted string describing the batch
"""
kinds = {
"prefill": [],
"extend": [],
"decode": [],
}
for req in requests:
tup = (req.q_len, req.kv_len)
if req.is_prefill:
kinds["prefill"].append(tup)
elif req.is_extend:
kinds["extend"].append(tup)
elif req.is_decode:
kinds["decode"].append(tup)
parts = []
for kind in ["prefill", "extend", "decode"]:
lst = kinds[kind]
if not lst:
continue
cnt_total = len(lst)
ctr = Counter(lst)
inner = []
for (q, kv), cnt in ctr.items():
if kind == "prefill":
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
inner.append(f"{cnt}x{size}")
elif kind == "decode":
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}x{size}")
else: # extend
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}xq{qstr}kv{kstr}")
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
return ", ".join(parts)
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
"""
Reorder requests for FlashInfer: decode first, then prefill.
FlashInfer expects decode requests before prefill requests for
optimal performance.
Args:
requests: Original list of BatchRequest
Returns:
Reordered list with decode requests first
"""
decodes = [r for r in requests if r.is_decode]
non_decodes = [r for r in requests if not r.is_decode]
return decodes + non_decodes
def split_by_type(
requests: list[BatchRequest],
) -> dict[str, list[BatchRequest]]:
"""
Split requests by type for analysis.
Args:
requests: List of BatchRequest
Returns:
Dict with keys: 'decode', 'prefill', 'extend'
"""
result = {
"decode": [],
"prefill": [],
"extend": [],
}
for req in requests:
if req.is_decode:
result["decode"].append(req)
elif req.is_prefill:
result["prefill"].append(req)
elif req.is_extend:
result["extend"].append(req)
return result
def get_batch_stats(requests: list[BatchRequest]) -> dict:
"""
Compute statistics about a batch.
Args:
requests: List of BatchRequest
Returns:
Dict with batch statistics
"""
by_type = split_by_type(requests)
return {
"total_requests": len(requests),
"num_decode": len(by_type["decode"]),
"num_prefill": len(by_type["prefill"]),
"num_extend": len(by_type["extend"]),
"total_tokens": sum(r.q_len for r in requests),
"total_kv_cache": sum(r.kv_len for r in requests),
"max_q_len": max((r.q_len for r in requests), default=0),
"max_kv_len": max((r.kv_len for r in requests), default=0),
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
"avg_kv_len": (
sum(r.kv_len for r in requests) / len(requests) if requests else 0
),
}

View File

@@ -0,0 +1,886 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Universal vLLM Attention Benchmark
Benchmark any attention backend with the extended grammar.
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
Examples:
# Standard attention
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
# MLA backends
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
# Parameter sweep (CLI)
python benchmark.py --backend cutlass_mla \
--batch-specs "64q1s1k" \
--sweep-param num_kv_splits \
--sweep-values 1 4 8 16
# Parameter sweep (YAML config - recommended)
python benchmark.py --config configs/cutlass_numsplits.yaml
"""
import argparse
import sys
from dataclasses import replace
from pathlib import Path
import yaml
from rich.console import Console
from tqdm import tqdm
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from batch_spec import parse_batch_spec
from common import (
BenchmarkConfig,
BenchmarkResult,
ModelParameterSweep,
ParameterSweep,
ResultsFormatter,
is_mla_backend,
)
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
from runner import run_attention_benchmark
return run_attention_benchmark(config)
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(config.backend, config, **kwargs)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""
Run a single benchmark with proper backend selection.
Args:
config: BenchmarkConfig with backend, batch_spec, and model params
**kwargs: Additional arguments passed to MLA benchmarks
Returns:
BenchmarkResult (may have error field set on failure)
"""
try:
if is_mla_backend(config.backend):
return run_mla_benchmark(config, **kwargs)
else:
return run_standard_attention_benchmark(config)
except Exception as e:
return BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
def run_model_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ModelParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run model parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ModelParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
console.print(
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
)
total = len(backends) * len(batch_specs) * len(sweep.values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep.values:
# Create config with modified model parameter
config_args = base_config_args.copy()
config_args[sweep.param_name] = value
# Create config with original backend for running
clean_config = BenchmarkConfig(
backend=backend, batch_spec=spec, **config_args
)
# Run benchmark
result = run_benchmark(clean_config)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results - create separate table for each parameter value
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
formatter = ResultsFormatter(console)
# Group results by parameter value and extract backend mapping
by_param_value = {}
backend_mapping = {} # Maps labeled backend -> original backend
for r in all_results:
# Extract original backend and param value from labeled backend
# The label format is: {backend}_{param_name}_{value}
# We need to reverse engineer this
labeled_backend = r.config.backend
# Try each backend to find which one this result belongs to
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
backend_mapping[labeled_backend] = backend
param_value = str(value)
if param_value not in by_param_value:
by_param_value[param_value] = []
by_param_value[param_value].append(r)
break
# Create a table for each parameter value
sorted_param_values = sorted(
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
)
for param_value in sorted_param_values:
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
param_results = by_param_value[param_value]
# Create modified results with original backend names
modified_results = []
for r in param_results:
# Get the original backend name from our mapping
original_backend = backend_mapping[r.config.backend]
modified_config = replace(r.config, backend=original_backend)
modified_result = replace(r, config=modified_config)
modified_results.append(modified_result)
# Print table with original backend names
formatter.print_table(modified_results, backends, compare_to_fastest=True)
# Show optimal backend for each (param_value, batch_spec) combination
console.print(
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
)
# Group by (param_value, batch_spec)
by_param_and_spec = {}
for r in all_results:
if r.success:
# Find which (backend, value) this result corresponds to
labeled_backend = r.config.backend
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
param_value = str(value)
spec = r.config.batch_spec
key = (param_value, spec)
if key not in by_param_and_spec:
by_param_and_spec[key] = []
by_param_and_spec[key].append(r)
break
# Sort by param value then spec
sorted_keys = sorted(
by_param_and_spec.keys(),
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
)
current_param_value = None
for param_value, spec in sorted_keys:
# Print header when param value changes
if param_value != current_param_value:
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
current_param_value = param_value
results = by_param_and_spec[(param_value, spec)]
best = min(results, key=lambda r: r.mean_time)
# Extract original backend name using the mapping
backend_name = backend_mapping[best.config.backend]
# Show all backends' times for comparison
times_str = " | ".join(
[
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
for r in sorted(results, key=lambda r: r.mean_time)
]
)
console.print(
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
)
return all_results
def run_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
# Build list of values to sweep (including auto if requested)
sweep_values = list(sweep.values)
if sweep.include_auto:
sweep_values.append("auto")
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
total = len(backends) * len(batch_specs) * len(sweep_values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep_values:
# Create config with original backend for running
config = BenchmarkConfig(
backend=backend, batch_spec=spec, **base_config_args
)
# Prepare kwargs for benchmark runner
kwargs = {}
if value != "auto":
kwargs[sweep.param_name] = value
# Run benchmark
result = run_benchmark(config, **kwargs)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results
console.print("\n[bold green]Sweep Results:[/]")
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backend_labels)
# Show optimal values
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
by_spec = {}
for r in all_results:
if r.success:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = []
by_spec[spec].append(r)
for spec in sorted(by_spec.keys()):
results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time)
console.print(
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
)
return all_results
def load_config_from_yaml(config_path: str) -> dict:
"""Load configuration from YAML file."""
with open(config_path) as f:
return yaml.safe_load(f)
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
"""
Generate batch specs from range specifications.
Args:
ranges: List of range specifications, each containing:
- template: Batch spec template (e.g., "q{q_len}kv1k")
- q_len: Dict with start, stop, step, end_inclusive (optional)
- Other parameters can also be ranges
Returns:
List of generated batch spec strings
Example:
ranges = [
{
"template": "q{q_len}kv1k",
"q_len": {
"start": 1,
"stop": 16,
"step": 1,
"end_inclusive": true # Optional, defaults to true
}
}
]
Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
"""
all_specs = []
for range_spec in ranges:
template = range_spec.get("template")
if not template:
raise ValueError("Range specification must include 'template'")
# Extract all range parameters from the spec
range_params = {}
for key, value in range_spec.items():
if key == "template":
continue
if isinstance(value, dict) and "start" in value:
# This is a range specification
start = value["start"]
stop = value["stop"]
step = value.get("step", 1)
# Check if end should be inclusive (default: True)
end_inclusive = value.get("end_inclusive", True)
# Adjust stop based on end_inclusive
if end_inclusive:
range_params[key] = list(range(start, stop + 1, step))
else:
range_params[key] = list(range(start, stop, step))
else:
# This is a fixed value
range_params[key] = [value]
# Generate all combinations (Cartesian product)
if range_params:
import itertools
param_names = list(range_params.keys())
param_values = [range_params[name] for name in param_names]
for values in itertools.product(*param_values):
params = dict(zip(param_names, values))
spec = template.format(**params)
all_specs.append(spec)
else:
# No parameters, just use template as-is
all_specs.append(template)
return all_specs
def main():
parser = argparse.ArgumentParser(
description="Universal vLLM attention benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
# Config file
parser.add_argument(
"--config",
help="Path to YAML config file (overrides other args)",
)
# Backend selection
parser.add_argument(
"--backends",
nargs="+",
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
# Batch specifications
parser.add_argument(
"--batch-specs",
nargs="+",
default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar",
)
# Model config
parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
parser.add_argument("--block-size", type=int, default=16, help="Block size")
# Benchmark settings
parser.add_argument("--device", default="cuda:0", help="Device")
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
# Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument(
"--sweep-param",
help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
)
parser.add_argument(
"--sweep-values",
type=int,
nargs="+",
help="Values to sweep for the parameter",
)
# Output
parser.add_argument("--output-csv", help="Save to CSV")
parser.add_argument("--output-json", help="Save to JSON")
args = parser.parse_args()
console = Console()
console.print("[bold cyan]vLLM Attention Benchmark[/]")
# Load config from YAML if provided
if args.config:
console.print(f"[yellow]Loading config from: {args.config}[/]")
yaml_config = load_config_from_yaml(args.config)
# Show description if available
if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values
# (YAML takes precedence unless CLI arg was explicitly set)
# Backend(s)
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Check for special modes
if "mode" in yaml_config:
args.mode = yaml_config["mode"]
else:
args.mode = None
# Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges
if "batch_spec_ranges" in yaml_config:
# Generate batch specs from ranges
generated_specs = generate_batch_specs_from_ranges(
yaml_config["batch_spec_ranges"]
)
# Combine with any explicit batch_specs
if "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"] + generated_specs
else:
args.batch_specs = generated_specs
console.print(
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"]
else:
args.batch_sizes = None
# Model config
if "model" in yaml_config:
model = yaml_config["model"]
args.num_layers = model.get("num_layers", args.num_layers)
args.head_dim = model.get("head_dim", args.head_dim)
args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size)
# Benchmark settings
if "benchmark" in yaml_config:
bench = yaml_config["benchmark"]
args.device = bench.get("device", args.device)
args.repeats = bench.get("repeats", args.repeats)
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
args.profile_memory = bench.get("profile_memory", args.profile_memory)
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:
sweep_config = yaml_config["parameter_sweep"]
args.parameter_sweep = ParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
include_auto=sweep_config.get("include_auto", False),
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.parameter_sweep = None
# Model parameter sweep configuration
if "model_parameter_sweep" in yaml_config:
sweep_config = yaml_config["model_parameter_sweep"]
args.model_parameter_sweep = ModelParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.model_parameter_sweep = None
# Output
if "output" in yaml_config:
output = yaml_config["output"]
if "csv" in output and not args.output_csv:
args.output_csv = output["csv"]
if "json" in output and not args.output_json:
args.output_json = output["json"]
console.print()
# Handle CLI-based parameter sweep (if not from YAML)
if (
(not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
and args.sweep_param
and args.sweep_values
):
args.parameter_sweep = ParameterSweep(
param_name=args.sweep_param,
values=args.sweep_values,
include_auto=False,
label_format="{backend}_{param_name}_{value}",
)
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
console.print(f"Backends: {', '.join(backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
# Run benchmarks
all_results = []
# Handle special mode: decode_vs_prefill comparison
if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
console.print(
"[dim]For each query length, testing both decode and prefill pipelines[/]"
)
console.print("[dim]Using batched execution for optimal performance[/]")
# Extract batch sizes from config
batch_sizes = getattr(args, "batch_sizes", [1])
backend = backends[0] # Use first backend (should only be one)
# Calculate total benchmarks
total = len(batch_sizes)
with tqdm(total=total, desc="Benchmarking") as pbar:
for batch_size in batch_sizes:
# Prepare all configs for this batch size
configs_with_thresholds = []
for spec in args.batch_specs:
# Parse the batch spec to get query length
requests = parse_batch_spec(spec)
if not requests:
console.print(
f"[red]Error: Could not parse batch spec '{spec}'[/]"
)
continue
# Get query length from first request
query_length = requests[0].q_len
# Create batch spec for this batch size
# For batch_size > 1, we need to prepend the count
batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
# Create base config (without backend name)
base_config = BenchmarkConfig(
backend=backend, # Will be overridden later
batch_spec=batch_spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
# Add decode pipeline config
decode_threshold = query_length
config_decode = replace(
base_config,
backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
)
configs_with_thresholds.append((config_decode, decode_threshold))
# Add prefill pipeline config if query_length > 1
if query_length > 1:
prefill_threshold = query_length - 1
config_prefill = replace(
base_config,
backend=f"{backend}_prefill_qlen{query_length}"
f"_bs{batch_size}",
)
configs_with_thresholds.append(
(config_prefill, prefill_threshold)
)
# Run all benchmarks for this batch size in one go (batched mode)
try:
from mla_runner import run_mla_benchmark as run_mla
# Use batched API: pass list of (config, threshold) tuples
timing_results = run_mla(backend, configs_with_thresholds)
# Create BenchmarkResult objects from timing results
for (config, _), timing in zip(
configs_with_thresholds, timing_results
):
result = BenchmarkResult(
config=config,
mean_time=timing["mean"],
std_time=timing["std"],
min_time=timing["min"],
max_time=timing["max"],
throughput_tokens_per_sec=timing.get("throughput", None),
)
all_results.append(result)
except Exception as e:
import traceback
console.print(
f"[red]Error running batched benchmarks for "
f"batch_size={batch_size}: {e}[/]"
)
console.print("[red]Traceback:[/]")
traceback.print_exc()
# Add error results for all configs
for config, _ in configs_with_thresholds:
result = BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
all_results.append(result)
pbar.update(1)
# Display decode vs prefill results
console.print("\n[bold green]Decode vs Prefill Results:[/]")
# Group by batch size
by_batch_size = {}
for r in all_results:
if r.success:
# Extract batch size from backend name
parts = r.config.backend.split("_")
bs_part = [p for p in parts if p.startswith("bs")]
if bs_part:
bs = int(bs_part[0][2:])
if bs not in by_batch_size:
by_batch_size[bs] = []
by_batch_size[bs].append(r)
# For each batch size, analyze crossover point
for bs in sorted(by_batch_size.keys()):
console.print(f"\n[bold cyan]Batch size: {bs}[/]")
results = by_batch_size[bs]
# Group by query length
by_qlen = {}
for r in results:
parts = r.config.backend.split("_")
qlen_part = [p for p in parts if p.startswith("qlen")]
if qlen_part:
qlen = int(qlen_part[0][4:])
if qlen not in by_qlen:
by_qlen[qlen] = {}
pipeline = "decode" if "decode" in r.config.backend else "prefill"
by_qlen[qlen][pipeline] = r
# Find crossover point
last_decode_faster = None
for qlen in sorted(by_qlen.keys()):
pipelines = by_qlen[qlen]
if "decode" in pipelines and "prefill" in pipelines:
decode_time = pipelines["decode"].mean_time
prefill_time = pipelines["prefill"].mean_time
faster = "decode" if decode_time < prefill_time else "prefill"
speedup = (
prefill_time / decode_time
if decode_time < prefill_time
else decode_time / prefill_time
)
console.print(
f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
f"prefill={prefill_time:.6f}s -> "
f"[bold]{faster}[/] ({speedup:.2f}x)"
)
if faster == "decode":
last_decode_faster = qlen
if last_decode_faster is not None:
optimal_threshold = last_decode_faster
console.print(
f"\n [bold green]Optimal threshold for batch_size={bs}: "
f"{optimal_threshold}[/]"
)
console.print(
f" [dim](Use decode pipeline for query_length <= "
f"{optimal_threshold})[/]"
)
else:
console.print(
f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
)
# Handle model parameter sweep mode
elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
# Model parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_model_parameter_sweep(
backends,
args.batch_specs,
base_config_args,
args.model_parameter_sweep,
console,
)
# Handle parameter sweep mode (unified)
elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
# Unified parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
)
else:
# Normal mode: compare backends
total = len(backends) * len(args.batch_specs)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
result = run_benchmark(config)
all_results.append(result)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
pbar.update(1)
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
# Save results
if all_results:
formatter = ResultsFormatter(console)
if args.output_csv:
formatter.save_csv(all_results, args.output_csv)
if args.output_json:
formatter.save_json(all_results, args.output_json)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,503 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Common utilities for attention benchmarking."""
import csv
import json
import math
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
import numpy as np
import torch
from rich.console import Console
from rich.table import Table
# Mock classes for vLLM attention infrastructure
class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict):
self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
self.model_type = "deepseek_v2"
self.is_encoder_decoder = False
self.kv_lora_rank = mla_dims["kv_lora_rank"]
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
def get_text_config(self):
return self
# Import AttentionLayerBase at module level to avoid circular dependencies
try:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
_HAS_ATTENTION_LAYER_BASE = True
except ImportError:
_HAS_ATTENTION_LAYER_BASE = False
AttentionLayerBase = object # Fallback
class MockKVBProj:
"""Mock KV projection layer for MLA prefill mode.
Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
"""
def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
self.num_heads = num_heads
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
Project kv_c_normed to output space.
Args:
x: Input tensor [num_tokens, kv_lora_rank]
Returns:
Tuple containing output tensor
[num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
"""
num_tokens = x.shape[0]
result = torch.randn(
num_tokens,
self.num_heads,
self.out_dim,
device=x.device,
dtype=x.dtype,
)
return (result,) # Return as tuple to match ColumnParallelLinear API
class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl.
Inherits from AttentionLayerBase so it passes isinstance checks
in get_layers_from_vllm_config when FlashInfer prefill is enabled.
"""
def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
# Don't call super().__init__() as AttentionLayerBase doesn't have __init__
self._k_scale = torch.tensor(1.0, device=device)
self._v_scale = torch.tensor(1.0, device=device)
self._q_scale = torch.tensor(1.0, device=device)
# Scalar floats for kernels that need them
self._k_scale_float = float(self._k_scale.item())
self._v_scale_float = float(self._v_scale.item())
self._q_scale_float = float(self._q_scale.item())
# AttentionImpl for metadata builders to query
self.impl = impl
# KV cache spec for get_kv_cache_spec
self._kv_cache_spec = kv_cache_spec
def get_attn_backend(self):
"""Get the attention backend class (required by AttentionLayerBase)."""
# Return None as this is just a mock layer for benchmarking
return None
def get_kv_cache_spec(self):
"""Get the KV cache spec (required by AttentionLayerBase)."""
return self._kv_cache_spec
class MockModelConfig:
"""Mock model configuration."""
def __init__(
self,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype = torch.float16,
max_model_len: int = 32768,
):
self._n_q = num_q_heads
self._n_kv = num_kv_heads
self._d = head_dim
self.dtype = dtype
self.max_model_len = max_model_len
def get_num_attention_heads(self, _=None) -> int:
return self._n_q
def get_num_kv_heads(self, _=None) -> int:
return self._n_kv
def get_head_size(self) -> int:
return self._d
def get_num_layers(self) -> int:
"""Mock method for layer count queries."""
return 1
def get_sliding_window_for_layer(self, _layer_idx: int):
"""Mock method for sliding window queries."""
return None
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
"""Mock method for logits soft cap queries."""
return None
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
"""Mock method for SM scale queries."""
return 1.0 / (self.get_head_size() ** 0.5)
class MockParallelConfig:
"""Mock parallel configuration."""
pass
class MockCompilationConfig:
"""Mock compilation configuration."""
def __init__(self):
self.full_cuda_graph = False
self.static_forward_context = {}
class MockVLLMConfig:
"""Mock VLLM configuration."""
def __init__(self):
self.compilation_config = MockCompilationConfig()
class MockRunner:
"""Mock GPU runner for metadata builders."""
def __init__(
self,
seq_lens: np.ndarray,
query_start_locs: np.ndarray,
device: torch.device,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype,
):
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
self.parallel_config = MockParallelConfig()
self.vllm_config = MockVLLMConfig()
self.seq_lens_np = seq_lens
self.query_start_loc_np = query_start_locs
self.device = device
self.attention_chunk_size = None
self.num_query_heads = num_q_heads
self.num_kv_heads = num_kv_heads
self.dtype = dtype
@dataclass
class ParameterSweep:
"""Configuration for sweeping a backend parameter."""
param_name: str # Name of the backend parameter to sweep
values: list[Any] # List of values to test
include_auto: bool = False # Also test with param unset (auto mode)
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class ModelParameterSweep:
"""Configuration for sweeping a model configuration parameter."""
param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads")
values: list[Any] # List of values to test
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class BenchmarkConfig:
"""Configuration for a single benchmark run."""
backend: str
batch_spec: str
num_layers: int
head_dim: int
num_q_heads: int
num_kv_heads: int
block_size: int
device: str
dtype: torch.dtype = torch.float16
repeats: int = 1
warmup_iters: int = 3
profile_memory: bool = False
use_cuda_graphs: bool = False
# MLA-specific
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None
v_head_dim: int | None = None
# Backend-specific tuning
num_kv_splits: int | None = None # CUTLASS MLA
reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA
@dataclass
class BenchmarkResult:
"""Results from a single benchmark run."""
config: BenchmarkConfig
mean_time: float # seconds
std_time: float # seconds
min_time: float # seconds
max_time: float # seconds
throughput_tokens_per_sec: float | None = None
memory_allocated_mb: float | None = None
memory_reserved_mb: float | None = None
error: str | None = None
@property
def success(self) -> bool:
"""Whether benchmark completed successfully."""
return self.error is None
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"config": asdict(self.config),
"mean_time": self.mean_time,
"std_time": self.std_time,
"min_time": self.min_time,
"max_time": self.max_time,
"throughput_tokens_per_sec": self.throughput_tokens_per_sec,
"memory_allocated_mb": self.memory_allocated_mb,
"memory_reserved_mb": self.memory_reserved_mb,
"error": self.error,
}
class ResultsFormatter:
"""Format and display benchmark results."""
def __init__(self, console: Console | None = None):
self.console = console or Console()
def print_table(
self,
results: list[BenchmarkResult],
backends: list[str],
compare_to_fastest: bool = True,
):
"""
Print results as a rich table.
Args:
results: List of BenchmarkResult
backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest
"""
# Group by batch spec
by_spec = {}
for r in results:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = {}
by_spec[spec][r.config.backend] = r
# Create shortened backend names for display
def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display."""
# Remove common prefixes
name = name.replace("flashattn_mla", "famla")
name = name.replace("flashinfer_mla", "fimla")
name = name.replace("flashmla", "fmla")
name = name.replace("cutlass_mla", "cmla")
name = name.replace("numsplits", "ns")
return name
table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True)
multi = len(backends) > 1
for backend in backends:
short_name = shorten_backend_name(backend)
# Time column
col_time = f"{short_name}\nTime (s)"
table.add_column(col_time, justify="right", no_wrap=False)
if multi and compare_to_fastest:
# Relative performance column
col_rel = f"{short_name}\nvs Best"
table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows
for spec in sorted(by_spec.keys()):
spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0
row = [spec]
for backend in backends:
if backend in spec_results:
r = spec_results[backend]
if r.success:
row.append(f"{r.mean_time:.6f}")
if multi and compare_to_fastest:
pct = (
(r.mean_time / best_time * 100) if best_time > 0 else 0
)
pct_str = f"{pct:.1f}%"
if r.mean_time == best_time:
pct_str = f"[bold green]{pct_str}[/]"
row.append(pct_str)
else:
row.append("[red]ERROR[/]")
if multi and compare_to_fastest:
row.append("-")
else:
row.append("-")
if multi and compare_to_fastest:
row.append("-")
table.add_row(*row)
self.console.print(table)
def save_csv(self, results: list[BenchmarkResult], path: str):
"""Save results to CSV file."""
if not results:
return
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", newline="") as f:
writer = csv.DictWriter(
f,
fieldnames=[
"backend",
"batch_spec",
"num_layers",
"mean_time",
"std_time",
"throughput",
"memory_mb",
],
)
writer.writeheader()
for r in results:
writer.writerow(
{
"backend": r.config.backend,
"batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers,
"mean_time": r.mean_time,
"std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0,
"memory_mb": r.memory_allocated_mb or 0,
}
)
self.console.print(f"[green]Saved CSV results to {path}[/]")
def save_json(self, results: list[BenchmarkResult], path: str):
"""Save results to JSON file."""
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
data = [r.to_dict() for r in results]
with open(path, "w") as f:
json.dump(data, f, indent=2, default=str)
self.console.print(f"[green]Saved JSON results to {path}[/]")
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
"""
Get MLA dimensions for known models.
Args:
model_name: Model identifier
Returns:
Dict with MLA dimension configuration
"""
configs = {
"deepseek-v2": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v3": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v2-lite": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 16,
"num_kv_heads": 1,
"head_dim": 576,
},
}
if model_name not in configs:
raise ValueError(
f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
)
return configs[model_name]
def get_attention_scale(head_dim: int) -> float:
"""Compute attention scale factor (1/sqrt(d))."""
return 1.0 / math.sqrt(head_dim)
def is_mla_backend(backend: str) -> bool:
"""
Check if backend is an MLA backend using the backend's is_mla() property.
Args:
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
Returns:
True if the backend is an MLA backend, False otherwise
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_class = AttentionBackendEnum[backend.upper()].get_class()
return backend_class.is_mla()
except (KeyError, ValueError, ImportError):
return False

View File

@@ -0,0 +1,61 @@
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Backend-specific tuning
cutlass_mla:
num_kv_splits: auto # or specific value like 4, 8, 16
flashattn_mla:
reorder_batch_threshold: 512
flashmla:
reorder_batch_threshold: 1

View File

@@ -0,0 +1,60 @@
# MLA mixed batch benchmark (prefill + decode)
# Tests chunked prefill performance
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
batch_specs:
# Small prefill + decode
- "1q1k_8q1s1k" # 1 prefill + 8 decode
- "2q2k_16q1s1k" # 2 prefill + 16 decode
- "4q1k_32q1s2k" # 4 prefill + 32 decode
# Medium prefill + decode
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
- "8q2k_64q1s4k" # 8 prefill + 64 decode
# Large prefill + decode (chunked prefill stress test)
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint
- "q16k" # 16k prefill with chunking hint
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
# High decode ratio (realistic serving)
- "1q2k_63q1s1k" # 1 prefill + 63 decode
- "2q2k_62q1s2k" # 2 prefill + 62 decode
- "4q4k_60q1s4k" # 4 prefill + 60 decode
backends:
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Analyze chunked prefill workspace size impact
chunked_prefill:
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]

View File

@@ -0,0 +1,88 @@
# Study 4: What is optimal reorder_batch_threshold for MLA backends supporting query length > 1?
# Question: At what query length does prefill pipeline become faster than decode pipeline?
# Methodology: For each query length, compare decode vs prefill performance to find crossover point
# Applies to: FlashAttn MLA, FlashMLA
description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA
backend: flashattn_mla
# Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines
mode: "decode_vs_prefill"
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
# Each query length will be tested with BOTH decode and prefill pipelines:
# - decode: threshold >= query_length (forces decode pipeline)
# - prefill: threshold < query_length (forces prefill pipeline)
#
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
# This tests different query lengths with fixed sequence length context
#
# Using batch_spec_ranges for automatic generation:
batch_spec_ranges:
- template: "q{q_len}s1k"
q_len:
start: 1
stop: 16
step: 1
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 16
stop: 64
step: 2
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 64
stop: 1024
step: 4
end_inclusive: true
# Batch sizes to test (from old script)
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
# Model configuration (DeepSeek V2/V3 defaults)
model:
num_layers: 10
head_dim: 576
num_q_heads: 128
num_kv_heads: 1
block_size: 128
# Benchmark settings
benchmark:
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
# Output
output:
csv: "reorder_threshold_results.csv"
json: "reorder_threshold_results.json"
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
# - For each batch size, find the crossover point where prefill becomes faster than decode
# - Show decode vs prefill performance across all query lengths
# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
# - Understand how crossover point varies with batch size
# - Provide data-driven guidance for default threshold value
#
# Methodology (from old script):
# - Each query length tested with BOTH pipelines:
# * decode: threshold >= query_length (forces decode pipeline)
# * prefill: threshold < query_length (forces prefill pipeline)
# - Compare which is faster to find crossover point
#

View File

@@ -0,0 +1,62 @@
# Speculative decoding benchmark configuration
# Tests reorder_batch_threshold optimization
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
batch_specs:
# Pure speculative decode (K-token verification)
- "q2s1k" # 2-token spec, 1k KV
- "q4s1k" # 4-token spec, 1k KV
- "q8s1k" # 8-token spec, 1k KV
- "q16s1k" # 16-token spec, 1k KV
# Speculative with different context lengths
- "q4s2k" # 4-token spec, 2k KV
- "q4s4k" # 4-token spec, 4k KV
- "q8s2k" # 8-token spec, 2k KV
- "q8s4k" # 8-token spec, 4k KV
# Mixed: speculative + regular decode
- "32q4s1k" # 32 spec requests
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
# Mixed: speculative + prefill + decode
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
# Large batches with speculation
- "64q4s1k" # 64 spec requests
- "32q8s2k" # 32 spec (8-token)
- "16q16s4k" # 16 spec (16-token)
# Backends that support query length > 1
backends:
- flashattn_mla # reorder_batch_threshold = 512
- flashmla # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
# - flashinfer_mla
# Benchmark settings
benchmark:
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep:
param_name: "reorder_batch_threshold"
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
include_auto: false
label_format: "{backend}_threshold_{value}"

View File

@@ -0,0 +1,40 @@
# Standard attention backend benchmark configuration
model:
num_layers: 32
num_q_heads: 32
num_kv_heads: 8 # GQA with 4:1 ratio
head_dim: 128
block_size: 16
batch_specs:
# Pure prefill
- "q512" # Small prefill (512 tokens)
- "q2k" # Medium prefill (2048 tokens)
- "q4k" # Large prefill (4096 tokens)
- "q8k" # Very large prefill (8192 tokens)
# Pure decode
- "8q1s1k" # 8 requests, 1k KV cache each
- "16q1s2k" # 16 requests, 2k KV cache each
- "32q1s1k" # 32 requests, 1k KV cache each
- "64q1s4k" # 64 requests, 4k KV cache each
# Mixed prefill/decode
- "2q2k_8q1s1k" # 2 prefill + 8 decode
- "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Context extension
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
backends:
- flash
- triton
- flashinfer
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: false

View File

@@ -0,0 +1,836 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MLA benchmark runner - shared utilities for MLA benchmarks.
This module provides helpers for running MLA backends without
needing full VllmConfig integration.
"""
import importlib
import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
BenchmarkResult,
MockHfConfig,
MockKVBProj,
MockLayer,
setup_mla_dims,
)
from vllm.config import (
CacheConfig,
CompilationConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
# ============================================================================
# VllmConfig Creation
# ============================================================================
def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
"""
Add mock methods for layer-specific queries to ModelConfig.
These methods are needed by metadata builders but aren't normally
present on ModelConfig when used in benchmark contexts.
"""
import types
model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config
)
def create_minimal_vllm_config(
model_name: str = "deepseek-v3",
block_size: int = 128,
max_num_seqs: int = 256,
mla_dims: dict | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
Args:
model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
provided
block_size: KV cache block size
max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name)
Returns:
VllmConfig for benchmarking
"""
# Get MLA dimensions - use provided or load from model name
if mla_dims is None:
mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims)
# Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access
import json
import os
import shutil
import tempfile
minimal_config = {
"architectures": ["DeepseekV2ForCausalLM"],
"model_type": "deepseek_v2",
"num_attention_heads": mla_dims["num_q_heads"],
"num_key_value_heads": mla_dims["num_kv_heads"],
"hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
"torch_dtype": "bfloat16",
"max_position_embeddings": 163840, # DeepSeek V3 default
"rope_theta": 10000.0,
"vocab_size": 128256,
}
# Create temporary directory with config.json
temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
config_path = os.path.join(temp_dir, "config.json")
with open(config_path, "w") as f:
json.dump(minimal_config, f)
try:
# Create model config using local path - no HF downloads
model_config = ModelConfig(
model=temp_dir, # Use local temp directory
tokenizer=None,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
seed=0,
max_model_len=32768,
quantization=None,
quantization_param_path=None,
enforce_eager=False,
max_context_len_to_capture=None,
max_seq_len_to_capture=8192,
max_logprobs=20,
disable_sliding_window=False,
skip_tokenizer_init=True,
served_model_name=None,
limit_mm_per_prompt=None,
use_async_output_proc=True,
config_format="auto",
)
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
# Override with our mock config
model_config.hf_config = mock_hf_config
model_config.hf_text_config = mock_hf_config
# Add mock methods for layer-specific queries
_add_mock_methods_to_model_config(model_config)
# Create sub-configs
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=8192,
max_model_len=32768,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
parallel_config = ParallelConfig(
tensor_parallel_size=1,
)
compilation_config = CompilationConfig()
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Configuration
# ============================================================================
# Backend name to class name prefix mapping
_BACKEND_NAME_MAP = {
"flashattn_mla": "FlashAttnMLA",
"flashmla": "FlashMLA",
"flashinfer_mla": "FlashInferMLA",
"cutlass_mla": "CutlassMLA",
}
# Special properties that differ from defaults
_BACKEND_PROPERTIES = {
"flashmla": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
"block_size": 64, # FlashMLA uses fixed block size
},
"flashinfer_mla": {
"block_size": 64, # FlashInfer MLA only supports 32 or 64
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration using naming conventions.
All MLA backends follow the pattern:
- Module: vllm.v1.attention.backends.mla.{backend}
- Impl: {Name}Impl
- Metadata: {Name}Metadata (or MLACommonMetadata)
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
- MetadataBuilder: {Name}MetadataBuilder
"""
if backend not in _BACKEND_NAME_MAP:
raise ValueError(f"Unknown backend: {backend}")
name = _BACKEND_NAME_MAP[backend]
props = _BACKEND_PROPERTIES.get(backend, {})
# Check if backend uses common metadata (FlashInfer, CUTLASS)
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
return {
"module": f"vllm.v1.attention.backends.mla.{backend}",
"impl_class": f"{name}Impl",
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
"decode_metadata_class": "MLACommonDecodeMetadata"
if uses_common
else f"{name}DecodeMetadata",
"builder_class": f"{name}MetadataBuilder",
"query_format": props.get("query_format", "tuple"),
"block_size": props.get("block_size", None),
}
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_attention_metadata(
requests: list,
block_size: int,
device: torch.device,
builder_instance,
) -> tuple:
"""
Build attention metadata from batch requests.
Args:
requests: List of BatchRequest objects
block_size: KV cache block size
device: Target device
builder_instance: Metadata builder instance
Returns:
Tuple of (metadata, kv_cache_num_blocks)
"""
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
# Build query start locations
q_start_cpu = torch.tensor(
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))],
dtype=torch.int32,
)
q_start_gpu = q_start_cpu.to(device)
# Build sequence lengths
seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
seq_lens_gpu = seq_lens_cpu.to(device)
# Build num_computed_tokens (context length for each request)
context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
# Build block table
num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
max_num_blocks = max(num_blocks_per_req)
block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
current_block = 0
for i, num_blocks in enumerate(num_blocks_per_req):
for j in range(num_blocks):
block_table_cpu[i, j] = current_block
current_block += 1
block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
# Build slot mapping
slot_mapping_list = []
for i, (q_len, kv_len, num_blocks) in enumerate(
zip(q_lens, kv_lens, num_blocks_per_req)
):
context_len = kv_len - q_len
for j in range(q_len):
token_kv_idx = context_len + j
block_idx = token_kv_idx // block_size
offset_in_block = token_kv_idx % block_size
global_block_id = block_table_cpu[i, block_idx]
slot_id = global_block_id * block_size + offset_in_block
slot_mapping_list.append(slot_id)
slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device)
# Create CommonAttentionMetadata
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
common_attn_metadata = CommonAttentionMetadata(
num_reqs=len(requests),
max_query_len=max(q_lens),
max_seq_len=max_kv,
num_actual_tokens=total_q,
query_start_loc=q_start_gpu,
query_start_loc_cpu=q_start_cpu,
seq_lens=seq_lens_gpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping,
block_table_tensor=block_table_gpu,
dcp_local_seq_lens=None,
)
# Use the production build() method
metadata = builder_instance.build(
common_prefix_len=0,
common_attn_metadata=common_attn_metadata,
fast_build=False,
)
return metadata, current_block
def _create_input_tensors(
total_q: int,
mla_dims: dict,
query_format: str,
device: torch.device,
dtype: torch.dtype,
):
"""
Create input tensors for both decode and prefill modes.
MLA requires different tensor formats for decode vs prefill:
- Decode: Uses kv_lora_rank (512) dimension
- Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit
Args:
total_q: Total number of query tokens
mla_dims: MLA dimension configuration
query_format: Either "tuple" or "concat"
device: Target device
dtype: Tensor dtype
Returns:
Tuple of (decode_inputs, prefill_inputs)
- decode_inputs: Query tensor(s) for decode mode
- prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill
"""
if query_format == "tuple":
# Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim
q_nope_decode = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
q_pe = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
decode_inputs = (q_nope_decode, q_pe)
# For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank
q_nope_prefill = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"],
device=device,
dtype=dtype,
)
prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
else: # concat
decode_inputs = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# For prefill with concat format
prefill_q = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# Create additional inputs needed for prefill forward
k_c_normed = torch.randn(
total_q,
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
k_pe = torch.randn(
total_q,
1, # Single head for MLA
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
k_scale = torch.ones(1, device=device, dtype=torch.float32)
output = torch.zeros(
total_q,
mla_dims["num_q_heads"] * mla_dims["v_head_dim"],
device=device,
dtype=dtype,
)
prefill_inputs = {
"q": prefill_q,
"k_c_normed": k_c_normed,
"k_pe": k_pe,
"k_scale": k_scale,
"output": output,
}
return decode_inputs, prefill_inputs
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
mla_dims: dict,
vllm_config: VllmConfig,
device: torch.device,
):
"""
Create backend implementation instance.
Args:
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance
device: Target device
Returns:
Tuple of (impl, layer, builder_instance)
"""
# Import backend classes
backend_module = importlib.import_module(backend_cfg["module"])
impl_class = getattr(backend_module, backend_cfg["impl_class"])
# Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
# Create mock kv_b_proj layer for prefill mode
mock_kv_b_proj = MockKVBProj(
num_heads=mla_dims["num_q_heads"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
)
# Create impl
impl = impl_class(
num_heads=mla_dims["num_q_heads"],
head_size=mla_dims["head_dim"],
scale=scale,
num_kv_heads=mla_dims["num_kv_heads"],
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
logits_soft_cap=None,
attn_type="decoder",
kv_sharing_target_layer_name=None,
q_lora_rank=None,
kv_lora_rank=mla_dims["kv_lora_rank"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
kv_b_proj=mock_kv_b_proj,
)
# Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
impl.dcp_world_size = 1
impl.dcp_rank = 0
# Create KV cache spec for MockLayer
from vllm.v1.kv_cache_interface import FullAttentionSpec
kv_cache_spec = FullAttentionSpec(
block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
num_kv_heads=1, # MLA uses 1 KV head
head_size=576, # MLA head dim
dtype=torch.bfloat16,
)
# Create mock layer
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
# Create builder instance if needed
builder_instance = None
if backend_cfg["builder_class"]:
builder_class = getattr(backend_module, backend_cfg["builder_class"])
# Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
builder_instance = builder_class(
kv_cache_spec=kv_cache_spec,
layer_names=["placeholder"],
vllm_config=vllm_config,
device=device,
)
return impl, layer, builder_instance
# ============================================================================
# Config Helpers
# ============================================================================
def _extract_mla_dims_from_config(config) -> dict | None:
"""
Extract MLA dimensions from BenchmarkConfig if all required fields are present.
Args:
config: BenchmarkConfig instance
Returns:
Dict with MLA dimensions if all fields are provided, None otherwise
"""
# Check if all MLA-specific fields are provided
if all(
[
config.kv_lora_rank is not None,
config.qk_nope_head_dim is not None,
config.qk_rope_head_dim is not None,
config.v_head_dim is not None,
]
):
return {
"kv_lora_rank": config.kv_lora_rank,
"qk_nope_head_dim": config.qk_nope_head_dim,
"qk_rope_head_dim": config.qk_rope_head_dim,
"v_head_dim": config.v_head_dim,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
# Fallback: if MLA fields not fully specified, try to construct from basic fields
elif config.head_dim == 576:
# This looks like a DeepSeek MLA config, use standard dimensions with custom
# head count
return {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
return None
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg: dict,
mla_dims: dict,
device: torch.device,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
Args:
config: BenchmarkConfig instance
impl: Backend implementation instance
layer: MockLayer instance
builder_instance: Metadata builder instance
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
device: Target device
Returns:
BenchmarkResult with timing statistics
"""
# Parse batch spec
requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests]
total_q = sum(q_lens)
# Determine block size
block_size = backend_cfg["block_size"] or config.block_size
# Build metadata
metadata, num_blocks = _build_attention_metadata(
requests, block_size, device, builder_instance
)
# Create KV cache
kv_cache = torch.zeros(
num_blocks,
block_size,
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=torch.bfloat16,
)
# Create input tensors for both decode and prefill modes
decode_inputs, prefill_inputs = _create_input_tensors(
total_q,
mla_dims,
backend_cfg["query_format"],
device,
torch.bfloat16,
)
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else:
raise RuntimeError("Metadata has neither decode nor prefill metadata")
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(config.num_layers):
forward_fn()
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)
mean_time = float(np.mean(times))
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=float(np.std(times)),
min_time=float(np.min(times)),
max_time=float(np.max(times)),
throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
)
def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead.
Args:
backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
Returns:
List of BenchmarkResult objects
"""
if not configs_with_params:
return []
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
block_size = backend_cfg["block_size"] or config_block_size
# Extract MLA dimensions from the first config
first_config = configs_with_params[0][0]
mla_dims = _extract_mla_dims_from_config(first_config)
# If config didn't provide MLA dims, fall back to default model
if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3")
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
)
results = []
with set_current_vllm_config(vllm_config):
# Create backend impl, layer, and builder (reused across benchmarks)
impl, layer, builder_instance = _create_backend_impl(
backend_cfg, mla_dims, vllm_config, device
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
original_threshold = None
if threshold is not None and builder_instance:
original_threshold = builder_instance.reorder_batch_threshold
builder_instance.reorder_batch_threshold = threshold
# Set num_splits for CUTLASS
original_num_splits = None
if num_splits is not None and hasattr(impl, "_num_kv_splits"):
original_num_splits = impl._num_kv_splits
impl._num_kv_splits = num_splits
try:
result = _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg,
mla_dims,
device,
)
results.append(result)
finally:
# Restore original threshold
if original_threshold is not None:
builder_instance.reorder_batch_threshold = original_threshold
# Restore original num_splits
if original_num_splits is not None:
impl._num_kv_splits = original_num_splits
return results
# ============================================================================
# Public API
# ============================================================================
def run_mla_benchmark(
backend: str,
config,
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
Always uses batched execution internally for optimal performance.
Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
"""
# Normalize to batched mode: (config, threshold, num_splits)
if isinstance(config, list):
# Already in batched format
if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla"):
configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla or flashinfer_mla
configs_with_params = [(cfg, None, param) for cfg, param in config]
else:
# Format: [cfg, ...] - just configs
configs_with_params = [(cfg, None, None) for cfg in config]
return_single = False
else:
# Single config: convert to batched format
configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params)
# Return single result or list based on input
return results[0] if return_single else results

View File

@@ -0,0 +1,481 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
"""
import types
import numpy as np
import torch
from batch_spec import parse_batch_spec, reorder_for_flashinfer
from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
from vllm.config import (
CacheConfig,
CompilationConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
)
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
# Backend Configuration
# ============================================================================
_BACKEND_CONFIG = {
"flash": {
"module": "vllm.v1.attention.backends.flash_attn",
"backend_class": "FlashAttentionBackend",
"dtype": torch.float16,
"cache_layout": "standard",
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
},
"triton": {
"module": "vllm.v1.attention.backends.triton_attn",
"backend_class": "TritonAttentionBackend",
"dtype": torch.float32,
"cache_layout": "standard",
},
"flashinfer": {
"module": "vllm.v1.attention.backends.flashinfer",
"backend_class": "FlashInferBackend",
"dtype": torch.float16,
"cache_layout": "flashinfer",
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
},
}
def _get_backend_config(backend: str) -> dict:
if backend not in _BACKEND_CONFIG:
raise ValueError(
f"Unknown backend: {backend}. "
f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
)
return _BACKEND_CONFIG[backend]
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_common_attn_metadata(
q_lens: list[int],
kv_lens: list[int],
block_size: int,
device: torch.device,
) -> CommonAttentionMetadata:
"""Build CommonAttentionMetadata from query/kv lengths."""
batch_size = len(q_lens)
total_tokens = sum(q_lens)
query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum(
0
)
query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
seq_lens_cpu = seq_lens.cpu()
max_seq_len = int(seq_lens_cpu.max())
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks
block_table_tensor = torch.arange(
num_blocks, dtype=torch.int32, device=device
).view(batch_size, max_blocks)
slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device)
max_query_len = max(q_lens)
return CommonAttentionMetadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=batch_size,
num_actual_tokens=total_tokens,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
causal=True,
)
def _create_vllm_config(
config: BenchmarkConfig,
dtype: torch.dtype,
max_num_blocks: int,
) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods."""
model_config = ModelConfig(
model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False,
dtype=dtype,
seed=0,
max_model_len=1024,
)
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
parallel_config = ParallelConfig(tensor_parallel_size=1)
scheduler_config = SchedulerConfig(
max_num_seqs=256,
max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
device_config = DeviceConfig()
load_config = LoadConfig()
compilation_config = CompilationConfig()
# Add mock methods for benchmark config values
model_config.get_num_layers = types.MethodType(
lambda self: config.num_layers, model_config
)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, i: 0.0, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, i: 1.0 / config.head_dim**0.5, model_config
)
model_config.get_num_attention_heads = types.MethodType(
lambda self, parallel_config=None: config.num_q_heads, model_config
)
model_config.get_num_kv_heads = types.MethodType(
lambda self, parallel_config=None: config.num_kv_heads, model_config
)
model_config.get_head_size = types.MethodType(
lambda self: config.head_dim, model_config
)
model_config.get_sliding_window = types.MethodType(lambda self: None, model_config)
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
load_config=load_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
config: BenchmarkConfig,
device: torch.device,
):
"""Create backend implementation instance."""
import importlib
backend_module = importlib.import_module(backend_cfg["module"])
backend_class = getattr(backend_module, backend_cfg["backend_class"])
scale = get_attention_scale(config.head_dim)
dtype = backend_cfg["dtype"]
impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads,
head_size=config.head_dim,
scale=scale,
num_kv_heads=config.num_kv_heads,
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer, dtype
def _create_metadata_builder(
backend_class,
kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig,
device: torch.device,
):
"""Create metadata builder instance."""
return backend_class.get_builder_cls()(
kv_cache_spec=kv_cache_spec,
layer_names=["layer_0"],
vllm_config=vllm_config,
device=device,
)
# ============================================================================
# Tensor Creation Helpers
# ============================================================================
def _create_input_tensors(
config: BenchmarkConfig,
total_q: int,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Create Q, K, V input tensors for all layers."""
q_list = [
torch.randn(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
k_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
v_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
return q_list, k_list, v_list
def _create_kv_cache(
config: BenchmarkConfig,
max_num_blocks: int,
cache_layout: str,
device: torch.device,
dtype: torch.dtype,
) -> list:
"""Create KV cache tensors for all layers."""
if cache_layout == "flashinfer":
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
max_num_blocks,
2,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
else:
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
2,
max_num_blocks,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
return cache_list
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config: BenchmarkConfig,
impl,
layer,
q_list: list,
k_list: list,
v_list: list,
cache_list: list,
attn_metadata,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Run single benchmark iteration with warmup and timing loop."""
total_q = q_list[0].shape[0]
out = torch.empty(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
# Warmup
for _ in range(config.warmup_iters):
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
}
return times, mem_stats
# ============================================================================
# Public API
# ============================================================================
def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""
Run standard attention benchmark with real kernels.
Supports: flash, triton, flashinfer
Args:
config: Benchmark configuration
Returns:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.cuda.set_device(device)
backend_cfg = _get_backend_config(config.backend)
requests = parse_batch_spec(config.batch_spec)
if config.backend == "flashinfer":
requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
backend_class, impl, layer, dtype = _create_backend_impl(
backend_cfg, config, device
)
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
mean_time = np.mean(times)
throughput = total_q / mean_time if mean_time > 0 else 0
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=np.std(times),
min_time=np.min(times),
max_time=np.max(times),
throughput_tokens_per_sec=throughput,
memory_allocated_mb=mem_stats.get("allocated_mb"),
memory_reserved_mb=mem_stats.get("reserved_mb"),
)

View File

@@ -1,244 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from packaging import version
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
MINIMUM_BITBLAS_VERSION,
)
try:
import bitblas
if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
raise ImportError(
"bitblas version is wrong. Please "
f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
)
except ImportError as e:
bitblas_import_exception = e
raise ValueError(
"Trying to use the bitblas backend, but could not import"
f"with the following error: {bitblas_import_exception}. "
"Please install bitblas through the following command: "
f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
) from bitblas_import_exception
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
from vllm.utils.argparse_utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target."
)
# Add arguments to the parser
parser.add_argument(
"--target",
type=str,
default=auto_detect_nvidia_target(),
help="Specify the target device for benchmarking.",
)
parser.add_argument(
"--group_size", type=int, default=None, help="Group size for grouped quantization."
)
parser.add_argument(
"--A_dtype",
type=str,
default="float16",
choices=["float16", "float32", "float64", "int32", "int8"],
help="Data type of activation A.",
)
parser.add_argument(
"--W_dtype",
type=str,
default="int4",
choices=[
"float16",
"float32",
"float64",
"int32",
"int8",
"int4",
"int2",
"int1",
"nf4",
"fp4_e2m1",
],
help="Data type of weight W.",
)
parser.add_argument(
"--accum_dtype",
type=str,
default="float16",
choices=["float16", "int32"],
help="Data type for accumulation.",
)
parser.add_argument(
"--out_dtype",
type=str,
default="float16",
choices=["float16", "float32", "int32", "int8"],
help="Data type for output.",
)
parser.add_argument(
"--layout",
type=str,
default="nt",
choices=["nt", "nn"],
help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument(
"--with_bias", action="store_true", help="Include bias in the benchmark."
)
parser.add_argument(
"--with_scaling",
action="store_true",
help="Include scaling factor in the quantization.",
)
parser.add_argument(
"--with_zeros", action="store_true", help="Include zeros in the quantization."
)
parser.add_argument(
"--zeros_mode",
type=str,
default=None,
choices=["original", "rescale", "quantized"],
help="Specify the mode for calculating zeros.",
)
# Parse the arguments
args = parser.parse_args()
# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode
# Define a list of shared arguments that repeat in every config
shared_args = [
A_dtype,
W_dtype,
out_dtype,
accum_dtype,
layout,
with_bias,
group_size,
with_scaling,
with_zeros,
zeros_mode,
]
# Define just the (M, K, N) shapes in a more compact list
shapes = [
# square test
(1, 16384, 16384),
# BLOOM-176B
(1, 43008, 14336),
(1, 14336, 14336),
(1, 57344, 14336),
(1, 14336, 57344),
# OPT-65B
(1, 9216, 9216),
(1, 36864, 9216),
(1, 9216, 36864),
(1, 22016, 8192),
# LLAMA-70B/65B
(1, 8192, 22016),
(1, 8192, 8192),
(1, 28672, 8192),
(1, 8192, 28672),
# square test
(16384, 16384, 16384),
# BLOOM-176B
(8192, 43008, 14336),
(8192, 14336, 14336),
(8192, 57344, 14336),
(8192, 14336, 57344),
# OPT-65B
(8192, 9216, 9216),
(8192, 36864, 9216),
(8192, 9216, 36864),
(8192, 22016, 8192),
# LLAMA-70B/65B
(8192, 8192, 22016),
(8192, 8192, 8192),
(8192, 28672, 8192),
(8192, 8192, 28672),
]
# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes]
benchmark_sets = []
benchmark_sets.extend(test_shapes)
benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
config = config_class(*input_args)
matmul = operator(config, target=target, enable_tuning=True)
kernel_latency = matmul.profile_latency()
print("Time cost is: {:.3f} ms".format(kernel_latency))
profile_config = {
f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
"BitBLAS_top20_latency": kernel_latency,
}
}
benchmark_results.update(profile_config)
# Define headers for the table
headers = [
"PrimFunc",
"Input Arguments",
"BitBLAS Top20 Latency",
]
# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2)
col_widths[2] = max(
col_widths[2],
len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
len(headers[2]) + 2,
)
# break only if you want to measure widths from a single example;
# otherwise, let it loop over all items.
# Print header
for i, header in enumerate(headers):
headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))
# Print rows
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
row = [
func_name,
input_args_str,
f"{values['BitBLAS_top20_latency']:.3f} ms",
]
row_str = "".join(
[str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]
)
print(row_str)

View File

@@ -197,7 +197,7 @@ def bench_run(
)
kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp4(
make_dummy_moe_config(),
quant_config=quant_config,
@@ -242,7 +242,7 @@ def bench_run(
)
kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp4(
make_dummy_moe_config(),
quant_config=quant_config,

View File

@@ -842,6 +842,7 @@ class BenchmarkTensors:
"sorted_token_ids": sorted_token_ids,
"expert_ids": expert_ids,
"num_tokens_post_padded": num_tokens_post_padded,
"token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
"top_k_num": ctx.top_k_num,
"device": self.input.device,
"N": lora_rank,
@@ -915,6 +916,7 @@ class BenchmarkTensors:
"sorted_token_ids": sorted_token_ids,
"expert_ids": expert_ids,
"num_tokens_post_padded": num_tokens_post_padded,
"token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
"top_k_num": ctx.top_k_num,
"device": self.input.device,
"N": lora_rank,

View File

@@ -6,12 +6,6 @@ import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL,
GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
)
from vllm.model_executor.layers.quantization.utils.allspark_utils import (
ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
ALLSPARK_SUPPORTED_QUANT_TYPES,
@@ -34,9 +28,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
awq_marlin_quantize,
marlin_quantize,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack,
gptq_quantize_weights,
@@ -78,14 +69,7 @@ def bench_run(
if size_k % group_size != 0:
return
marlin_24_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
repack_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
)
repack_supported = group_size in MARLIN_SUPPORTED_GROUP_SIZES
allspark_supported = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1
@@ -126,14 +110,6 @@ def bench_run(
marlin_sort_indices,
)
def gen_marlin_24_params():
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
if marlin_24_supported:
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
)
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
def gen_repack_params():
q_w_gptq = None
repack_sort_indices = None
@@ -188,9 +164,6 @@ def bench_run(
marlin_g_idx,
marlin_sort_indices,
) = gen_marlin_params()
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
gen_marlin_24_params()
)
q_w_gptq, repack_sort_indices = gen_repack_params()
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
gen_allspark_params()
@@ -200,9 +173,6 @@ def bench_run(
marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
)
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
globals = {
# Gen params
@@ -222,12 +192,6 @@ def bench_run(
"marlin_sort_indices": marlin_sort_indices,
"marlin_workspace": marlin_workspace,
"is_k_full": is_k_full,
# Marlin_24 params
"marlin_24_w_ref": marlin_24_w_ref,
"marlin_24_q_w_comp": marlin_24_q_w_comp,
"marlin_24_meta": marlin_24_meta,
"marlin_24_s": marlin_24_s,
"marlin_24_workspace": marlin_24_workspace,
# GPTQ params
"q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices,
@@ -240,7 +204,6 @@ def bench_run(
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
# Kernels
"marlin_gemm": ops.marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
"gptq_marlin_repack": ops.gptq_marlin_repack,
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
}
@@ -281,17 +244,6 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time)
)
if marlin_24_supported:
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_24_gemm",
).blocked_autorange(min_run_time=min_run_time)
)
if repack_supported:
results.append(
benchmark.Timer(

View File

@@ -27,7 +27,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -482,6 +481,8 @@ class BenchmarkWorker:
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
# local import to allow serialization by ray
set_random_seed(self.seed)
dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@@ -535,6 +536,9 @@ class BenchmarkWorker:
block_quant_shape: list[int],
use_deep_gemm: bool,
) -> dict[str, int]:
# local import to allow serialization by ray
from vllm.platforms import current_platform
best_config = None
best_time = float("inf")
if current_platform.is_rocm():
@@ -646,20 +650,28 @@ def save_configs(
f.write("\n")
def get_compressed_tensors_block_structure(config, default_value=None):
config_groups = config.get("config_groups", {})
if len(config_groups) != 1:
return default_value
group = next(iter(config_groups.values()))
weights = group.get("weights", {})
block_structure = weights.get("block_structure", default_value)
return block_structure
def get_weight_block_size_safety(config, default_value=None):
quantization_config = getattr(config, "quantization_config", {})
if isinstance(quantization_config, dict):
return quantization_config.get("weight_block_size", default_value)
if "weight_block_size" in quantization_config:
return quantization_config["weight_block_size"]
return get_compressed_tensors_block_structure(
quantization_config, default_value
)
return default_value
def main(args: argparse.Namespace):
print(args)
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
if args.model_prefix:
config = getattr(config, args.model_prefix)
def get_model_params(config):
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
@@ -677,6 +689,7 @@ def main(args: argparse.Namespace):
"Glm4MoeForCausalLM",
"Glm4MoeLiteForCausalLM",
"NemotronHForCausalLM",
"MistralLarge3ForCausalLM",
):
E = config.n_routed_experts
topk = config.num_experts_per_tok
@@ -697,16 +710,20 @@ def main(args: argparse.Namespace):
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
E = config.thinker_config.text_config.num_experts
topk = config.thinker_config.text_config.num_experts_per_tok
intermediate_size = config.thinker_config.text_config.moe_intermediate_size
hidden_size = config.thinker_config.text_config.hidden_size
elif config.architectures[0] == "PixtralForConditionalGeneration":
# Pixtral can contain different LLM architectures,
# recurse to get their parameters
return get_model_params(config.get_text_config())
else:
# Support for llama4
config = config.get_text_config()
@@ -715,6 +732,16 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
return E, topk, intermediate_size, hidden_size
def main(args: argparse.Namespace):
print(args)
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
if args.model_prefix:
config = getattr(config, args.model_prefix)
E, topk, intermediate_size, hidden_size = get_model_params(config)
enable_ep = bool(args.enable_expert_parallel)
if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts")

View File

@@ -10,8 +10,6 @@ from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
_moe_permute,
_moe_unpermute_and_reduce,
moe_permute,
moe_unpermute,
)
@@ -41,7 +39,6 @@ def benchmark_permute(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
use_customized_permute: bool = False,
) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
@@ -64,29 +61,14 @@ def benchmark_permute(
input_gating.copy_(gating_output[i])
def run():
if use_customized_permute:
(
permuted_hidden_states,
a1q_scale,
first_token_off,
inv_perm_idx,
m_indices,
) = moe_permute(
qhidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
else:
(
permuted_hidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = _moe_permute(qhidden_states, None, topk_ids, num_experts, None, 16)
moe_permute(
qhidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
# JIT compilation & warmup
run()
@@ -131,11 +113,9 @@ def benchmark_unpermute(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
use_customized_permute: bool = False,
) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
output_hidden_states = torch.empty_like(hidden_states)
if use_fp8_w8a8:
align_block_size = 128 # deepgemm needs 128 m aligned block
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
@@ -150,78 +130,37 @@ def benchmark_unpermute(
)
def prepare():
if use_customized_permute:
(
permuted_hidden_states,
a1q_scale,
first_token_off,
inv_perm_idx,
m_indices,
) = moe_permute(
qhidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
# convert to fp16/bf16 as gemm output
return (
permuted_hidden_states.to(dtype),
first_token_off,
inv_perm_idx,
m_indices,
)
else:
(
permuted_qhidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = _moe_permute(
qhidden_states, None, topk_ids, num_experts, None, block_m=16
)
# convert to fp16/bf16 as gemm output
return (
permuted_qhidden_states.to(dtype),
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
)
(
permuted_hidden_states,
_,
first_token_off,
inv_perm_idx,
_,
) = moe_permute(
qhidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
# convert to fp16/bf16 as gemm output
return (
permuted_hidden_states.to(dtype),
first_token_off,
inv_perm_idx,
)
def run(input: tuple):
if use_customized_permute:
(
permuted_hidden_states,
first_token_off,
inv_perm_idx,
m_indices,
) = input
output = torch.empty_like(hidden_states)
moe_unpermute(
output,
permuted_hidden_states,
topk_weights,
inv_perm_idx,
first_token_off,
)
else:
(
permuted_hidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = input
_moe_unpermute_and_reduce(
output_hidden_states,
permuted_hidden_states,
inv_perm,
topk_weights,
True,
)
(permuted_hidden_states, first_token_off, inv_perm_idx) = input
output = torch.empty_like(hidden_states)
moe_unpermute(
output,
permuted_hidden_states,
topk_weights,
inv_perm_idx,
first_token_off,
)
# JIT compilation & warmup
input = prepare()
@@ -276,8 +215,7 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_customized_permute: bool = False,
) -> tuple[dict[str, int], float]:
) -> tuple[float, float]:
set_random_seed(self.seed)
permute_time = benchmark_permute(
@@ -289,7 +227,6 @@ class BenchmarkWorker:
use_fp8_w8a8,
use_int8_w8a16,
num_iters=100,
use_customized_permute=use_customized_permute,
)
unpermute_time = benchmark_unpermute(
num_tokens,
@@ -300,7 +237,6 @@ class BenchmarkWorker:
use_fp8_w8a8,
use_int8_w8a16,
num_iters=100,
use_customized_permute=use_customized_permute,
)
return permute_time, unpermute_time
@@ -347,7 +283,6 @@ def main(args: argparse.Namespace):
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute
if args.batch_size is None:
batch_sizes = [
@@ -399,7 +334,6 @@ def main(args: argparse.Namespace):
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_customized_permute,
)
for batch_size in batch_sizes
],
@@ -419,7 +353,6 @@ if __name__ == "__main__":
parser.add_argument(
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
)
parser.add_argument("--use-customized-permute", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--trust-remote-code", action="store_true")

View File

@@ -22,8 +22,8 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)
assert current_platform.is_cuda(), (
"Only support tune w8a8 block fp8 kernel on CUDA device."
assert current_platform.is_cuda() or current_platform.is_rocm(), (
"Only support tune w8a8 block fp8 kernel on CUDA/ROCm device."
)
DTYPE_MAP = {

View File

@@ -14,7 +14,7 @@ from vllm._custom_ops import (
)
from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
@@ -58,7 +58,7 @@ def main(
seed: int = 0,
iters: int = 20,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]

View File

@@ -7,8 +7,8 @@ import time
import numpy as np
import torch
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import set_random_seed
# Check if CPU MoE operations are available
try:
@@ -41,7 +41,7 @@ def main(
seed: int = 0,
iters: int = 20,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
# up_dim = 2 * intermediate_size for gate + up projection
up_dim = 2 * intermediate_size

View File

@@ -359,6 +359,19 @@ else()
add_compile_definitions(-DVLLM_NUMA_DISABLED)
endif()
#
# Generate CPU attention dispatch header
#
message(STATUS "Generating CPU attention dispatch header")
execute_process(
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
RESULT_VARIABLE GEN_RESULT
)
if(NOT GEN_RESULT EQUAL 0)
message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
endif()
#
# _C extension
#

View File

@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
GIT_TAG 2adfc8c2177c5b0e8ddeedfd5a8990d80eb496ff
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -24,6 +24,12 @@
typedef __hip_bfloat16 __nv_bfloat16;
#endif
#if defined(__gfx942__)
constexpr float kFp8ScaleDivisor = 224.f;
#else
constexpr float kFp8ScaleDivisor = 448.f;
#endif
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
int64_t block_size_in_bytes,
const torch::Tensor& block_mapping) {
@@ -401,8 +407,7 @@ __global__ void concat_and_cache_ds_mla_kernel(
}
// Compute the scale for the tile
float tile_scale = max_abs / 448.f;
tile_scale = fmaxf(tile_scale, FLT_MIN);
float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN);
// The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) {
@@ -471,11 +476,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
#endif
}
#if defined(__gfx942__)
float scale = fmaxf(amax, 1e-4) / 224.0f;
#else
float scale = fmaxf(amax, 1e-4) / 448.0f;
#endif
float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor;
if (use_ue8m0) {
scale = exp2f(ceilf(log2f(scale)));
}

View File

@@ -1,79 +1,4 @@
#include "cpu_attn_vec.hpp"
#include "cpu_attn_vec16.hpp"
#ifdef CPU_CAPABILITY_AMXBF16
#include "cpu_attn_amx.hpp"
#define AMX_DISPATCH(...) \
case cpu_attention::ISA::AMX: { \
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX, \
scalar_t, head_dim>; \
return __VA_ARGS__(); \
}
#else
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
#endif
#ifdef __aarch64__
#include "cpu_attn_neon.hpp"
// NEON requires head_dim to be a multiple of 32
#define NEON_DISPATCH(...) \
case cpu_attention::ISA::NEON: { \
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
scalar_t, head_dim>; \
return __VA_ARGS__(); \
}
#else
#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
#endif // #ifdef __aarch64__
#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
case HEAD_DIM: { \
constexpr size_t head_dim = HEAD_DIM; \
return __VA_ARGS__(); \
}
#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...) \
[&] { \
switch (HEAD_DIM) { \
CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__) \
default: { \
TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \
std::to_string(HEAD_DIM)); \
} \
} \
}()
#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...) \
[&] { \
switch (ISA_TYPE) { \
AMX_DISPATCH(__VA_ARGS__) \
NEON_DISPATCH(__VA_ARGS__) \
case cpu_attention::ISA::VEC: { \
using attn_impl = \
cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t, \
head_dim>; \
return __VA_ARGS__(); \
} \
case cpu_attention::ISA::VEC16: { \
using attn_impl = \
cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t, \
head_dim>; \
return __VA_ARGS__(); \
} \
default: { \
TORCH_CHECK(false, "Invalid CPU attention ISA type."); \
} \
} \
}()
#include "cpu_attn_dispatch_generated.h"
torch::Tensor get_scheduler_metadata(
const int64_t num_req, const int64_t num_heads_q,
@@ -122,16 +47,14 @@ torch::Tensor get_scheduler_metadata(
input.enable_kv_split = enable_kv_split;
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
input.elem_size = sizeof(scalar_t);
input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
input.output_buffer_elem_size =
sizeof(attn_impl::partial_output_buffer_t);
input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
input.kv_block_alignment = attn_impl::BlockSizeAlignment;
});
CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
input.elem_size = sizeof(scalar_t);
input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
input.output_buffer_elem_size =
sizeof(attn_impl::partial_output_buffer_t);
input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
input.kv_block_alignment = attn_impl::BlockSizeAlignment;
});
});
@@ -184,18 +107,14 @@ void cpu_attn_reshape_and_cache(
VLLM_DISPATCH_FLOATING_TYPES(
key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
attn_impl::reshape_and_cache(
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(),
value_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), token_num,
key_token_num_stride, value_token_num_stride, head_num,
key_head_num_stride, value_head_num_stride, num_blocks,
num_blocks_stride, cache_head_num_stride, block_size,
block_size_stride);
});
CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
attn_impl::reshape_and_cache(
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
value_token_num_stride, head_num, key_head_num_stride,
value_head_num_stride, num_blocks, num_blocks_stride,
cache_head_num_stride, block_size, block_size_stride);
});
});
}
@@ -257,12 +176,10 @@ void cpu_attention_with_kv_cache(
VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
cpu_attention::AttentionMainLoop<attn_impl> mainloop;
mainloop(&input);
});
CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
cpu_attention::AttentionMainLoop<attn_impl> mainloop;
mainloop(&input);
});
});
}

View File

@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
const int32_t q_heads_per_kv, const int64_t q_num_stride,
const int64_t q_head_stride, const float scale) {
constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
// static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
constexpr int64_t head_elem_num_pre_block =
AMX_TILE_ROW_BYTES / sizeof(scalar_t);

View File

@@ -816,14 +816,10 @@ struct VecTypeTrait<float> {
using vec_t = vec_op::FP32Vec16;
};
// ARM only supports BF16 with ARMv8.6-A extension
#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
#else
template <>
struct VecTypeTrait<c10::BFloat16> {
using vec_t = vec_op::BF16Vec16;
};
#endif
#if !defined(__powerpc__) && !defined(__s390x__)
template <>
@@ -1585,17 +1581,10 @@ class AttentionMainLoop {
if (use_sink) {
alignas(64) float s_aux_fp32[16];
#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
// ARM without native BF16 support: manual conversion
for (int i = 0; i < 16; ++i) {
s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
}
#else
// All other platforms have BF16Vec16 available
vec_op::BF16Vec16 vec_bf16(curr_s_aux);
vec_op::FP32Vec16 vec_fp32(vec_bf16);
vec_fp32.save(s_aux_fp32);
#endif
float* __restrict__ curr_sum_buffer = sum_buffer;
float* __restrict__ curr_max_buffer = max_buffer;

View File

@@ -264,7 +264,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
constexpr static ISA ISAType = ISA::NEON;
constexpr static bool scale_on_logits = false; // apply scale on q_buffer
// static_assert(HeadDim % HeadDimAlignment == 0);
static_assert(HeadDim % HeadDimAlignment == 0);
// the gemm micro kernel is Mx8
static_assert(HeadDimAlignment % 8 == 0);
static_assert(BlockSizeAlignment % 8 == 0);

File diff suppressed because it is too large Load Diff

View File

@@ -116,7 +116,7 @@ class Dequantizer4b {
scalar_vec_t output_vec_0(wb_0);
scalar_vec_t output_vec_1(wb_1);
// AMX needs to interlave K elements to pack as 32 bits
// AMX needs to interleave K elements to pack as 32 bits
if constexpr (isa == ISA::AMX) {
vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight);
} else {

View File

@@ -14,13 +14,11 @@ struct KernelVecType<float> {
using cvt_vec_type = vec_op::FP32Vec16;
};
#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
template <>
struct KernelVecType<c10::BFloat16> {
using load_vec_type = vec_op::BF16Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
#endif
template <>
struct KernelVecType<c10::Half> {
@@ -360,13 +358,14 @@ void onednn_scaled_mm(
const std::optional<torch::Tensor>& azp, // [M] or [1]
const std::optional<torch::Tensor>& azp_adj, // [M] or [1]
const std::optional<torch::Tensor>& bias, // [N]
int64_t handler) {
const torch::Tensor& handler_tensor) {
CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.is_contiguous());
TORCH_CHECK(c.is_contiguous());
W8A8MatMulPrimitiveHandler* ptr =
reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
reinterpret_cast<W8A8MatMulPrimitiveHandler*>(
handler_tensor.item<int64_t>());
const int32_t* azp_ptr = nullptr;
if (azp.has_value()) {
azp_ptr = azp->data_ptr<int32_t>();
@@ -519,13 +518,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
void onednn_mm(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& a, // [M, IC], row-major
const std::optional<torch::Tensor>& bias, int64_t handler) {
const std::optional<torch::Tensor>& bias,
const torch::Tensor& handler_tensor) {
CPU_KERNEL_GUARD_IN(onednn_mm)
TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.stride(-1) == 1);
TORCH_CHECK(c.stride(-1) == 1);
MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>());
// ACL matmuls expect contiguous source tensors
#ifdef VLLM_USE_ACL

View File

@@ -0,0 +1,203 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Generate CPU attention dispatch switch cases and kernel instantiations.
"""
import os
# Head dimensions divisible by 32 (support all ISAs)
HEAD_DIMS_32 = [32, 64, 96, 128, 160, 192, 224, 256]
# Head dimensions divisible by 16 but not 32 (VEC16 only)
HEAD_DIMS_16 = [80, 112]
# ISA types
ISA_TYPES = {
"AMX": 0,
"VEC": 1,
"VEC16": 2,
"NEON": 3,
}
# ISAs supported for head_dims divisible by 32
ISA_FOR_32 = ["AMX", "NEON", "VEC", "VEC16"]
# ISAs supported for head_dims divisible by 16 only
ISA_FOR_16 = ["VEC16"]
def encode_params(head_dim: int, isa_type: str) -> int:
"""Encode head_dim and ISA type into a single int64_t."""
isa_val = ISA_TYPES[isa_type]
# Encoding: (head_dim << 8) | isa_type
# This allows head_dim up to 2^56 - 1 and 256 ISA types
return (head_dim << 8) | isa_val
def generate_cases_for_isa_group(isa_list: list[str]) -> str:
"""Generate switch cases for a specific ISA group."""
cases = []
# Generate cases for head_dims divisible by 32
for head_dim in HEAD_DIMS_32:
for isa in isa_list:
if isa not in ISA_FOR_32:
continue
encoded = encode_params(head_dim, isa)
case_str = (
f""" case {encoded}LL: {{ """
f"""/* head_dim={head_dim}, isa={isa} */ \\"""
f"""
constexpr size_t head_dim = {head_dim}; \\"""
f"""
using attn_impl = cpu_attention::AttentionImpl<"""
f"""cpu_attention::ISA::{isa}, \\"""
f"""
"""
f"""scalar_t, head_dim>; \\"""
f"""
return __VA_ARGS__(); \\"""
f"""
}} \\"""
)
cases.append(case_str)
# Generate cases for head_dims divisible by 16 only
for head_dim in HEAD_DIMS_16:
for isa in isa_list:
encoded = encode_params(head_dim, isa)
case_str = (
f""" case {encoded}LL: {{ """
f"""/* head_dim={head_dim}, isa={isa} """
f"""(using VEC16) */ \\"""
f"""
constexpr size_t head_dim = {head_dim}; \\"""
f"""
using attn_impl = cpu_attention::AttentionImpl<"""
f"""cpu_attention::ISA::VEC16, \\"""
f"""
"""
f"""scalar_t, head_dim>; \\"""
f"""
return __VA_ARGS__(); \\"""
f"""
}} \\"""
)
cases.append(case_str)
return "\n".join(cases)
def generate_helper_function() -> str:
"""Generate helper function to encode parameters."""
return """
inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa) {
return (head_dim << 8) | static_cast<int64_t>(isa);
}
"""
def generate_header_file() -> str:
"""Generate the complete header file content."""
header = """// auto generated by generate_cpu_attn_dispatch.py
// clang-format off
#ifndef CPU_ATTN_DISPATCH_GENERATED_H
#define CPU_ATTN_DISPATCH_GENERATED_H
#include "cpu_attn_vec.hpp"
#include "cpu_attn_vec16.hpp"
#ifdef CPU_CAPABILITY_AMXBF16
#include "cpu_attn_amx.hpp"
#endif
#ifdef __aarch64__
#include "cpu_attn_neon.hpp"
#endif
"""
header += generate_helper_function()
# Generate dispatch macro with conditional compilation for different ISA sets
header += """
// Dispatch macro using encoded parameters
"""
# x86_64 with AMX
header += """#if defined(CPU_CAPABILITY_AMXBF16)
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
[&] { \\
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
switch (encoded_params) { \\
"""
header += generate_cases_for_isa_group(["AMX", "VEC", "VEC16"])
header += """
default: { \\
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
std::to_string(HEAD_DIM) + " isa=" + \\
std::to_string(static_cast<int>(ISA_TYPE))); \\
} \\
} \\
}()
"""
# ARM64 with NEON
header += """#elif defined(__aarch64__)
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
[&] { \\
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
switch (encoded_params) { \\
"""
header += generate_cases_for_isa_group(["NEON", "VEC", "VEC16"])
header += """
default: { \\
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
std::to_string(HEAD_DIM) + " isa=" + \\
std::to_string(static_cast<int>(ISA_TYPE))); \\
} \\
} \\
}()
"""
# Fallback: VEC and VEC16 only
header += """#else
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
[&] { \\
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
switch (encoded_params) { \\
"""
header += generate_cases_for_isa_group(["VEC", "VEC16"])
header += """
default: { \\
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
std::to_string(HEAD_DIM) + " isa=" + \\
std::to_string(static_cast<int>(ISA_TYPE))); \\
} \\
} \\
}()
#endif /* CPU_CAPABILITY_AMXBF16 / __aarch64__ */
#endif // CPU_ATTN_DISPATCH_GENERATED_H
"""
return header
def main():
output_path = os.path.join(
os.path.dirname(__file__), "cpu_attn_dispatch_generated.h"
)
with open(output_path, "w") as f:
f.write(generate_header_file())
if __name__ == "__main__":
main()

View File

@@ -38,9 +38,7 @@ struct KernelVecType<c10::BFloat16> {
using qk_vec_type = vec_op::BF16Vec32;
using v_load_vec_type = vec_op::BF16Vec16;
};
#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
// pass
#else
#elif defined(__aarch64__)
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec16;

View File

@@ -265,7 +265,7 @@ void tinygemm_kernel(
// mb_size = 4
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
}
}
}

View File

@@ -324,7 +324,7 @@ void tinygemm_kernel(
case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break;
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
}
}
}

View File

@@ -180,7 +180,7 @@ void tinygemm_kernel(
// mb_size = 4
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
}
}
}

View File

@@ -398,7 +398,7 @@ void tinygemm_kernel(
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
// mb_size = 4
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
}
}
}
@@ -511,7 +511,7 @@ void tinygemm_kernel(
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN2(3, 32); break;
// mb_size = 4
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN2(4, 32); break;
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
}
}
}

View File

@@ -271,7 +271,7 @@ void tinygemm_kernel(
case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI(2, 32); break;
case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI(3, 32); break;
case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI(4, 32); break;
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
}
}
}
@@ -401,7 +401,7 @@ void tinygemm_kernel(
case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI2(2, 32); break;
case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI2(3, 32); break;
case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI2(4, 32); break;
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
}
}
}

View File

@@ -237,10 +237,10 @@ struct ThreadSHMContext {
class SHMManager {
public:
explicit SHMManager(const std::string& name, const int rank,
const int group_size)
const int group_size, const int thread_num)
: _rank(rank),
_group_size(group_size),
_thread_num(omp_get_max_threads()),
_thread_num(thread_num),
_shm_names({""}),
_shared_mem_ptrs({nullptr}),
_shm_ctx(nullptr) {
@@ -282,11 +282,11 @@ class SHMManager {
}
static int64_t create_singleton_instance(const std::string& name,
const int group_size,
const int rank) {
const int group_size, const int rank,
const int thread_num) {
std::lock_guard<std::mutex> guard(SingletonInstancesLock);
SingletonInstances.emplace_back(
std::make_unique<SHMManager>(name, rank, group_size));
std::make_unique<SHMManager>(name, rank, group_size, thread_num));
return static_cast<int64_t>(SingletonInstances.size() - 1);
}
@@ -854,8 +854,9 @@ std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
}
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank) {
return SHMManager::create_singleton_instance(name, group_size, rank);
const int64_t rank, const int64_t thread_num) {
return SHMManager::create_singleton_instance(name, group_size, rank,
thread_num);
}
std::string join_shm_manager(int64_t handle, const std::string& name) {

View File

@@ -19,13 +19,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& azp_adj,
const std::optional<torch::Tensor>& bias,
int64_t handler);
const torch::Tensor& handler_tensor);
int64_t create_onednn_mm_handler(const torch::Tensor& b,
int64_t primitive_cache_size);
void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& bias, int64_t handler);
const std::optional<torch::Tensor>& bias,
const torch::Tensor& handler_tensor);
bool is_onednn_acl_supported();
@@ -34,7 +35,7 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& block_tables, torch::Tensor& seq_lens);
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank);
const int64_t rank, const int64_t thread_num);
std::string join_shm_manager(int64_t handle, const std::string& name);
@@ -196,7 +197,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// oneDNN GEMM
ops.def(
"onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
"int handler) -> ()");
"Tensor handler_tensor) -> ()");
ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
// Check if oneDNN was built with ACL backend
@@ -212,7 +213,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
ops.def(
"onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
"Tensor? azp_adj, Tensor? bias, int handler) -> ()");
"Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()");
ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);
// Compute int8 quantized tensor for given scaling factor.
@@ -231,8 +232,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// SHM CCL
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
&init_shm_manager);
ops.def(
"init_shm_manager(str name, int group_size, int rank, int thread_num) -> "
"int",
&init_shm_manager);
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
@@ -291,7 +294,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
"seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
"float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
"float softcap, Tensor scheduler_metadata, Tensor? s_aux) -> ()",
&cpu_attention_with_kv_cache);
// placeholders

View File

@@ -30,12 +30,10 @@ struct VecTypeTrait<float> {
using vec_t = vec_op::FP32Vec16;
};
#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
template <>
struct VecTypeTrait<c10::BFloat16> {
using vec_t = vec_op::BF16Vec16;
};
#endif
#if !defined(__powerpc__)
template <>

View File

@@ -115,11 +115,28 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
if (flag) { // support GPUDirect RDMA if possible
prop.allocFlags.gpuDirectRDMACapable = 1;
}
int fab_flag = 0;
CUDA_CHECK(cuDeviceGetAttribute(
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
if (fab_flag) { // support fabric handle if possible
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
}
#endif
#ifndef USE_ROCM
// Allocate memory using cuMemCreate
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
CUresult ret = (CUresult)cuMemCreate(p_memHandle, size, &prop, 0);
if (ret) {
if (fab_flag &&
(ret == CUDA_ERROR_NOT_PERMITTED || ret == CUDA_ERROR_NOT_SUPPORTED)) {
// Fabric allocation may fail without multi-node nvlink,
// fallback to POSIX file descriptor
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
} else {
CUDA_CHECK(ret);
}
}
if (error_code != 0) {
return;
}

View File

@@ -3,7 +3,8 @@
#include "cutlass/cutlass.h"
#include <climits>
#include "cuda_runtime.h"
#include <iostream>
#include <cstdio>
#include <cstdlib>
/**
* Helper function for checking CUTLASS errors
@@ -31,12 +32,63 @@ int32_t get_sm_version_num();
* __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
* into code that will be executed on the device where it is defined.
*/
template <typename Kernel>
struct enable_sm75_to_sm80 : Kernel {
template <typename... Args>
CUTLASS_DEVICE static void invoke(Args&&... args) {
#if defined __CUDA_ARCH__
#if __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
Kernel::invoke(std::forward<Args>(args)...);
#else
printf("This kernel only supports sm[75, 80).\n");
asm("trap;");
#endif
#endif
}
};
template <typename Kernel>
struct enable_sm80_to_sm89 : Kernel {
template <typename... Args>
CUTLASS_DEVICE static void invoke(Args&&... args) {
#if defined __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
Kernel::invoke(std::forward<Args>(args)...);
#else
printf("This kernel only supports sm[80, 89).\n");
asm("trap;");
#endif
#endif
}
};
template <typename Kernel>
struct enable_sm89_to_sm90 : Kernel {
template <typename... Args>
CUTLASS_DEVICE static void invoke(Args&&... args) {
#if defined __CUDA_ARCH__
#if __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
Kernel::invoke(std::forward<Args>(args)...);
#else
printf("This kernel only supports sm[89, 90).\n");
asm("trap;");
#endif
#endif
}
};
template <typename Kernel>
struct enable_sm90_or_later : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
#if defined __CUDA_ARCH__
#if __CUDA_ARCH__ >= 900
Kernel::operator()(std::forward<Args>(args)...);
#else
printf("This kernel only supports sm >= 90.\n");
asm("trap;");
#endif
#endif
}
};
@@ -45,18 +97,43 @@ template <typename Kernel>
struct enable_sm90_only : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
#if defined __CUDA_ARCH__
#if __CUDA_ARCH__ == 900
Kernel::operator()(std::forward<Args>(args)...);
#else
printf("This kernel only supports sm90.\n");
asm("trap;");
#endif
#endif
}
};
template <typename Kernel>
struct enable_sm100_only : Kernel {
struct enable_sm100f_only : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1000
#if defined __CUDA_ARCH__
#if __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030
Kernel::operator()(std::forward<Args>(args)...);
#else
printf("This kernel only supports sm100f.\n");
asm("trap;");
#endif
#endif
}
};
template <typename Kernel>
struct enable_sm100a_only : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__
#if __CUDA_ARCH__ == 1000
Kernel::operator()(std::forward<Args>(args)...);
#else
printf("This kernel only supports sm100a.\n");
asm("trap;");
#endif
#endif
}
};
@@ -65,8 +142,13 @@ template <typename Kernel>
struct enable_sm120_only : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200
#if defined __CUDA_ARCH__
#if __CUDA_ARCH__ == 1200
Kernel::operator()(std::forward<Args>(args)...);
#else
printf("This kernel only supports sm120.\n");
asm("trap;");
#endif
#endif
}
};

View File

@@ -770,7 +770,7 @@ torch::Tensor moe_wna16_marlin_gemm(
b_bias = b_bias_or_none.value();
TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(1) != size_n");
TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
} else {
b_bias = torch::empty({0}, options);

View File

@@ -73,25 +73,40 @@ void moe_permute(
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
// DeepGEMM: use getMIndices kernel to compute
// 1) align_expert_first_token_offset (aligned prefix offsets)
// 2) m_indices (expert id for each aligned row)
// eg. expert0: 3, expert1: 5, expert2: 2 tokens respectively
// expert_first_token_offset = [0, 3, 8, 10], align_block_size = 4
// expert0: 3->4, expert1: 5->8, expert2: 2->4
// align_expert_first_token_offset = [0, 4, 12, 16]
// so m_indices = [0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2]
torch::Tensor align_expert_first_token_offset;
const int64_t* aligned_expert_first_token_offset_ptr = nullptr;
if (align_block_size.has_value()) {
align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
getMIndices(get_ptr<int64_t>(expert_first_token_offset),
get_ptr<int64_t>(align_expert_first_token_offset),
get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
stream);
aligned_expert_first_token_offset_ptr =
get_ptr<int64_t>(align_expert_first_token_offset);
}
// dispatch expandInputRowsKernelLauncher
MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, align_block_size_value, stream);
get_ptr<int64_t>(expert_first_token_offset),
aligned_expert_first_token_offset_ptr, n_token, valid_num_ptr, n_hidden,
topk, n_local_expert, align_block_size_value, stream);
});
// get m_indices and update expert_first_token_offset with align block
// this is only required for DeepGemm and not required for CUTLASS group gemm
if (align_block_size.has_value()) {
auto align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
getMIndices(get_ptr<int64_t>(expert_first_token_offset),
get_ptr<int64_t>(align_expert_first_token_offset),
get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
stream);
expert_first_token_offset.copy_(align_expert_first_token_offset);
}
}

View File

@@ -60,7 +60,8 @@ void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* expert_first_token_offset,
int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream);

View File

@@ -5,7 +5,8 @@ __global__ void expandInputRowsKernel(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* expert_first_token_offset,
int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
int num_local_experts, int align_block_size) {
// Reverse permutation map.
@@ -18,35 +19,22 @@ __global__ void expandInputRowsKernel(
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
int expert_id = sorted_experts[expanded_dest_row];
extern __shared__ int64_t smem_expert_first_token_offset[];
if constexpr (ALIGN_BLOCK_SIZE) {
// load g2s
for (int idx = threadIdx.x; idx < num_local_experts + 1;
idx += blockDim.x) {
smem_expert_first_token_offset[idx] =
__ldg(expert_first_token_offset + idx);
// convert (unaligned) expanded_dest_row -> aligned expanded_dest_row.
// aligned_expert_first_token_offset[e] provides the aligned prefix start
// for expert e. For non-local experts we map to the end (total aligned M).
int64_t aligned_base = 0;
int64_t token_offset_in_expert = 0;
if (expert_id >= num_local_experts) {
aligned_base =
__ldg(aligned_expert_first_token_offset + num_local_experts);
token_offset_in_expert = 0;
} else {
aligned_base = __ldg(aligned_expert_first_token_offset + expert_id);
token_offset_in_expert =
expanded_dest_row - __ldg(expert_first_token_offset + expert_id);
}
__syncthreads();
int lane_idx = threadIdx.x & 31;
if (lane_idx == 0) {
// set token_offset_in_expert = 0 if this expert is not local expert
int token_offset_in_expert =
expert_id >= num_local_experts
? 0
: expanded_dest_row - smem_expert_first_token_offset[expert_id];
int64_t accumulate_align_offset = 0;
#pragma unroll 1
for (int eidx = 1; eidx <= min(expert_id, num_local_experts); eidx++) {
auto n_token_in_expert = smem_expert_first_token_offset[eidx] -
smem_expert_first_token_offset[eidx - 1];
accumulate_align_offset += (n_token_in_expert + align_block_size - 1) /
align_block_size * align_block_size;
}
expanded_dest_row = accumulate_align_offset + token_offset_in_expert;
}
// lane0 shuffle broadcast align_expanded_dest_row
expanded_dest_row = __shfl_sync(0xffffffff, expanded_dest_row, 0);
expanded_dest_row = aligned_base + token_offset_in_expert;
}
if (threadIdx.x == 0) {
@@ -88,7 +76,8 @@ void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* expert_first_token_offset,
int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
int64_t const blocks = num_rows * k;
@@ -104,14 +93,12 @@ void expandInputRowsKernelLauncher(
bool is_align_block_size = align_block_size != -1;
auto func = func_map[is_check_skip][is_align_block_size];
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
func<<<blocks, threads, smem_size, stream>>>(
func<<<blocks, threads, 0, stream>>>(
unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, permuted_idx,
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts, align_block_size);
expert_first_token_offset, aligned_expert_first_token_offset, num_rows,
num_valid_tokens_ptr, cols, k, num_local_experts, align_block_size);
}
template <class T, class U>

View File

@@ -288,8 +288,8 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
)
cluster_shape = (
f"{schedule_config.cluster_shape_mnk[0]}"
+ f"x{schedule_config.cluster_shape_mnk[1]}"
+ f"x{schedule_config.cluster_shape_mnk[2]}"
f"x{schedule_config.cluster_shape_mnk[1]}"
f"x{schedule_config.cluster_shape_mnk[2]}"
)
kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
"::"
@@ -301,7 +301,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
return (
f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
+ f"_{epilogue_schedule}_{tile_scheduler}"
f"_{epilogue_schedule}_{tile_scheduler}"
)

View File

@@ -1,203 +0,0 @@
Contains code from https://github.com/IST-DASLab/Sparse-Marlin/
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -1,51 +0,0 @@
/*
* Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
* Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
namespace marlin_24 {
constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
// Instances of `Vec` are used to organize groups of >>registers<<, as needed
// for instance as inputs to tensor core operations. Consequently, all
// corresponding index accesses must be compile-time constants, which is why we
// extensively use `#pragma unroll` throughout the kernel code to guarantee
// this.
template <typename T, int n>
struct Vec {
T elems[n];
__device__ T& operator[](int i) { return elems[i]; }
};
template <int M_, int N_, int K_>
struct ShapeBase {
static constexpr int M = M_, N = N_, K = K_;
};
using I4 = Vec<int, 4>;
// Matrix fragments for tensor core instructions; their precise layout is
// documented here:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
using FragA = Vec<half2, 4>;
using FragB = Vec<half2, 2>;
using FragM = Vec<uint, 1>;
using FragC = Vec<float, 4>;
using FragS = Vec<half2, 1>; // quantization scales
} // namespace marlin_24

View File

@@ -1,136 +0,0 @@
/*
* Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
* Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "base.h"
namespace marlin_24 {
// Predicated asynchronous global->shared copy; used for inputs A where we apply
// predication to handle batchsizes that are not multiples of 16.
__device__ inline void cp_async4_pred_zfill(void* smem_ptr,
const void* glob_ptr,
bool pred = true,
const bool zfill = false) {
const int BYTES = 16;
int src_in_bytes = (zfill ? 0 : BYTES);
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.cg.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES), "r"(src_in_bytes));
}
__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.cg.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
// Asynchronous global->shared copy
__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" cp.async.cg.shared.global [%0], [%1], %2;\n"
"}\n" ::"r"(smem),
"l"(glob_ptr), "n"(BYTES));
}
// Async copy fence.
__device__ inline void cp_async_fence() {
asm volatile("cp.async.commit_group;\n" ::);
}
// Wait until at most `n` async copy stages are still pending.
template <int n>
__device__ inline void cp_async_wait() {
asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in tensor core layout.
__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
: "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
: "r"(smem));
}
__device__ inline void ldsm4_m(FragM& frag_m, const void* smem_ptr) {
uint32_t* a = reinterpret_cast<uint32_t*>(&frag_m);
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n"
: "=r"(a[0]), "=r"(a[1])
: "r"(smem));
}
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in tensor core layout.
__device__ inline void ldsm4_t(FragA& frag_a, const void* smem_ptr) {
uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n"
: "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
: "r"(smem));
}
// Wait until barrier reaches `count`, then lock for current threadblock.
__device__ inline void barrier_acquire(int* lock, int count) {
if (threadIdx.x == 0) {
int state = -1;
do
// Guarantee that subsequent writes by this threadblock will be visible
// globally.
asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
: "=r"(state)
: "l"(lock));
while (state != count);
}
__syncthreads();
}
// Release barrier and increment visitation count.
__device__ inline void barrier_release(int* lock, bool reset = false) {
__syncthreads();
if (threadIdx.x == 0) {
if (reset) {
lock[0] = 0;
return;
}
int val = 1;
// Make sure that all writes since acquiring this barrier are visible
// globally, while releasing the barrier.
asm volatile("fence.acq_rel.gpu;\n");
asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
:
: "l"(lock), "r"(val));
}
}
} // namespace marlin_24

View File

@@ -1,191 +0,0 @@
/*
* Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
* Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "base.h"
#include <cudaTypedefs.h>
namespace marlin_24 {
// On CUDA earlier than 12.5, the ordered_metadata version of this instruction
// is not supported. On later versions of CUDA the version without ordered
// metadata results in the following warning:
// | Advisory: Modifier .sp::ordered_metadata should be used on instruction
// | mma instead of modifier .sp as it is expected to have substantially
// | reduced performance on some future architectures
#if defined CUDA_VERSION && CUDA_VERSION >= 12050
#define MMA_SP_INST \
"mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
#else
#define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
#endif
// m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32
// output/accumulation.
__device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
const FragA& frag_b, FragC& frag_c, FragM& frag_m,
const int psel) {
const uint32_t* a0 = reinterpret_cast<const uint32_t*>(&a_frag0);
const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1);
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m);
float* c = reinterpret_cast<float*>(&frag_c);
if (psel == 0) {
asm volatile(MMA_SP_INST
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
"{%12,%13,%14,%15}, %16, 0x0;\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
"r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
"f"(c[2]), "f"(c[3]), "r"(e[0]));
asm volatile(MMA_SP_INST
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
"{%12,%13,%14,%15}, %16, 0x0;\n"
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
"r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
"f"(c[6]), "f"(c[7]), "r"(e[0]));
} else {
asm volatile(MMA_SP_INST
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
"{%12,%13,%14,%15}, %16, 0x1;\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
"r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
"f"(c[2]), "f"(c[3]), "r"(e[0]));
asm volatile(MMA_SP_INST
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
"{%12,%13,%14,%15}, %16, 0x1;\n"
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
"r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
"f"(c[6]), "f"(c[7]), "r"(e[0]));
}
}
// Lookup-table based 3-input logical operation; explicitly used for
// dequantization as the compiler does not seem to automatically recognize it in
// all cases.
template <int lut>
__device__ inline int lop3(int a, int b, int c) {
int res;
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
: "=r"(res)
: "r"(a), "r"(b), "r"(c), "n"(lut));
return res;
}
__device__ __forceinline__ uint2 to_half4(float c0, float c1, float c2,
float c3) {
uint2 r;
asm("{\n\t"
".reg .f16 a, b, c, d; \n\t"
"cvt.rn.f16.f32 a, %2; \n\t"
"cvt.rn.f16.f32 b, %3; \n\t"
"cvt.rn.f16.f32 c, %4; \n\t"
"cvt.rn.f16.f32 d, %5; \n\t"
"mov.b32 %0, {a, b}; \n\t"
"mov.b32 %1, {c, d}; \n\t"
"}"
: "=r"(r.x), "=r"(r.y)
: "f"(c0), "f"(c1), "f"(c2), "f"(c3));
return r;
}
// Constructs destination register by taking bytes from 2 sources (based on
// mask)
template <int start_byte, int mask>
__device__ inline uint32_t prmt(uint32_t a) {
uint32_t res;
asm volatile("prmt.b32 %0, %1, %2, %3;\n"
: "=r"(res)
: "r"(a), "n"(start_byte), "n"(mask));
return res;
}
// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
// values. We mostly follow the strategy in the link below, with some small
// changes:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
__device__ inline FragB dequant_4bit(int q) {
const int LO = 0x000f000f;
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
const int MUL = 0x2c002c00;
const int ADD = 0xd480d480;
FragB frag_b;
frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
*reinterpret_cast<const half2*>(&SUB));
frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
*reinterpret_cast<const half2*>(&MUL),
*reinterpret_cast<const half2*>(&ADD));
return frag_b;
}
// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
// values. We mostly follow the strategy in the link below, with some small
// changes:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
__device__ inline FragB dequant_8bit(int q) {
static constexpr uint32_t mask_for_elt_01 = 0x5250;
static constexpr uint32_t mask_for_elt_23 = 0x5351;
static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
FragB frag_b;
frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
*reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
*reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
return frag_b;
}
// Multiply dequantized values by the corresponding quantization scale; used
// only for grouped quantization.
__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]);
frag_b[0] = __hmul2(frag_b[0], s);
frag_b[1] = __hmul2(frag_b[1], s);
}
__device__ inline void scale_floats(float* c0, float* c1, float* c2, float* c3,
FragS& s0, float* c4, float* c5, float* c6,
float* c7, FragS& s1) {
*c0 = __fmul_rn(*c0, __half2float(s0[0].x));
*c1 = __fmul_rn(*c1, __half2float(s0[0].y));
*c2 = __fmul_rn(*c2, __half2float(s0[1].x));
*c3 = __fmul_rn(*c3, __half2float(s0[1].y));
*c4 = __fmul_rn(*c4, __half2float(s1[0].x));
*c5 = __fmul_rn(*c5, __half2float(s1[0].y));
*c6 = __fmul_rn(*c6, __half2float(s1[1].x));
*c7 = __fmul_rn(*c7, __half2float(s1[1].y));
}
} // namespace marlin_24

File diff suppressed because it is too large Load Diff

View File

@@ -141,8 +141,8 @@ struct cutlass_3x_gemm_sm100 {
sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
};
template <typename ElementAB_, typename ElementD_,
@@ -202,8 +202,8 @@ struct cutlass_3x_gemm_sm120 {
sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
using GemmKernel = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
};
} // namespace vllm

Some files were not shown because too many files have changed in this diff Show More