Compare commits

...

623 Commits

Author SHA1 Message Date
Krish Gupta
3827c8c55a [Test] Add tests for n parameter in chat completions API (#35283)
Signed-off-by: KrxGu <krishom70@gmail.com>
2026-02-26 09:14:07 +00:00
Kevin McKay
ade81f17fe [Bugfix][Hardware][AMD] Gate FP4 ops on gfx950 to prevent MI300X crash (#35250)
Signed-off-by: c0de128 <kevin.mckay@outlook.com>
2026-02-26 16:11:07 +08:00
Gregory Shtrasberg
6042e66cd5 [ROCm] Add extra step in config initialization to populate custom ops before compilation config init (#34848)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-02-26 16:05:40 +08:00
Chaojun Zhang
9f9a675b23 [XPU][8/N] Fix kernel bugs in XPU LoRA and MOE LORA (#34115)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-26 15:46:44 +08:00
Ofir Zafrir
a07c4c5939 [BugFix][XPU] Fix speculative decoding on Intel XPU due to bug with IGC_ForceOCLSIMDWidth=16 (#35298)
Signed-off-by: Ofir Zafrir <ofir.zafrir@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-26 07:15:16 +00:00
Cyrus Leung
d3a51da92a [Benchmark] Simplify SLA scan (#35306)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-25 22:35:41 -08:00
Flora Feng
186ea22efe [Misc][Harmony] Move Responses API only harmony utils to responses/harmony.py (#35339)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-02-26 14:35:16 +08:00
Daniele
4a9c07a0a2 [BugFix] anthropic/serving_messages: fix tool call arguments streaming (#34887)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-02-26 05:39:48 +00:00
Jason Li
9d37941017 [torch.compile] Sequence Parallelism threshold compile ranges (#28672)
Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
Signed-off-by: Jason Li <jasonlizhengjian@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-26 05:00:12 +00:00
Fadi Arafeh
4171ff6dd9 [CPU][Feat] Enable KleidiAI INT8_W4A8 for all input dtypes (#34890)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2026-02-26 05:00:10 +00:00
Woosuk Kwon
13025e71e8 [Model Runner V2] Add coding style guide (#35325)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-25 20:42:40 -08:00
Hanjie Qiu
71dfce6aa6 [Kernel] Refactor FlashInfer allreduce for mnnvl backend (#34109)
Signed-off-by: hjjq <50634613+hjjq@users.noreply.github.com>
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Wei Zhao <51183510+wzhao18@users.noreply.github.com>
2026-02-26 03:17:20 +00:00
hujiaxin0
2aa4140402 openpangu-vl support video input (#34134)
Signed-off-by: hujiaxin <524446785@qq.com>
Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Co-authored-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-26 03:08:09 +00:00
Roberto L. Castro
86c3b5a808 [BugFix] Fix fp4 quant kernel on CUDA 12.8 (#35210)
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
2026-02-25 18:32:50 -08:00
Seungmin Kim
160424a937 [Bugfix] Fix CUDA compatibility path setting for both datacenter and consumer NVIDIA GPUs (#33992)
Signed-off-by: Seungmin Kim <8457324+ehfd@users.noreply.github.com>
Signed-off-by: Andrew Mello <19512127+88plug@users.noreply.github.com>
Co-authored-by: 88plug <19512127+88plug@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-25 18:15:51 -08:00
Lucas Wilkinson
9511a3f8ee [Bugfix] Fix AttributeError in SMControlContextManager (#35338)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-02-25 18:01:10 -08:00
Michael Goin
de527e1cec [UX] Add --moe-backend arg for explicit kernel selection (#33807)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-25 17:44:44 -08:00
Yongye Zhu
1976356ee6 [MoE Refactor] MXFP4 Cutlass Experts to MK (#34542)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2026-02-25 17:32:39 -08:00
Michael Goin
cbf8f7028c [UX] Add --performance-mode {balanced,interactivity,throughput} (#34936)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-25 17:28:31 -08:00
Ming Yang
6831650c40 [offloader] v2: Hide weight onloading latency via prefetching (#29941)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-25 17:20:59 -08:00
Andreas Karatzas
ed42507f6d [ROCm][CI] Amending deletion of AMD mirror (#35322)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-25 14:17:56 -08:00
Andreas Karatzas
9571e99945 [ROCm][CI] Extending attention backend coverage for Eagle spec decode tests (#35265)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-25 14:16:18 -08:00
Elizabeth Thomas
c97234c08b fix(mxfp4): Disable monolithic path for TRITON backend with EP (#34270)
Signed-off-by: Elizabeth Thomas <email2eliza@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-25 13:33:42 -08:00
rasmith
b188bab441 [CI][AMD][BugFix] Add torch.cuda.set_device to test_punica_ops so punica kernels execute on same device as tensor (#34985)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-02-25 19:18:00 +00:00
Lucas Wilkinson
15d76f74e2 Revert "[Misc] Enable weights loading tracking for quantized models" (#35309) 2026-02-25 09:20:15 -08:00
Andreas Karatzas
8fd6975479 [ROCm][CI] Disable skinny GEMMs in multimodal tests to fix non-deterministic results (#35049)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-25 16:48:37 +00:00
pushkar
5d18bf8b32 [Bugfix] Fix Harmony preamble visibility in Responses API (#32114)
Signed-off-by: Pushkar Patel <git@thepushkarp.com>
Signed-off-by: pupa <pupa@users.noreply.github.com>
2026-02-25 08:08:16 -08:00
haosdent
0788ff0a15 [Bugfix] Gracefully disable AllReduceFusionPass on GPUs without multicast support (#35085)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-02-25 07:31:45 -08:00
Chendi.Xue
d72b0be33c [XPU]Fix for Qwen-OMNI crash (#35249)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2026-02-25 07:31:07 -08:00
Bhoomit
42489e43c2 [Misc][LoRA] Increase max vocab size limit to 258048 in logits processor (#34773)
Signed-off-by: Bhoomit Vasani <vbhoomit@amazon.com>
2026-02-25 23:30:55 +08:00
Mario Hong
af5e6afa0a [Bugfix] Fix step3p5 reasoning with interleaved thinking (#34211)
Signed-off-by: mariohong <mariohong128@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2026-02-25 15:13:01 +00:00
Benjamin Chislett
ee59a7c615 [Tests] Add GSM8k check to SpecDec E2E tests (#34772)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2026-02-25 07:51:14 -05:00
Joao Gante
709eadbb0b Doc link typo (#35281)
Signed-off-by: Joao Gante <joaofranciscocardosogante@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-25 03:00:31 -08:00
Harry Mellor
90fc7f9109 Fix custom processors that use deleted behaviour for Transformers v5 (#35107)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-25 02:36:21 -08:00
Yanwen Lin
675ec59aa9 [Bugfix][CPU] Fix basic unit tests failing in CPU platforms (#34677)
Signed-off-by: Yanwen Lin <lyw1124278064@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-25 08:36:15 +00:00
Yanwen Lin
80e60a6133 [Doc] Suggest "--managed-python" flag when installing python using uv (#33069)
Signed-off-by: Yanwen Lin <lyw1124278064@gmail.com>
2026-02-25 08:19:43 +00:00
jonoillar
26e722f906 [DOC][BugFix] Specfiy build dependency installation (#34513)
Signed-off-by: Jon OILLARBURU <jon.oillarburu@multiversecomputing.com>
Co-authored-by: Jon OILLARBURU <jon.oillarburu@multiversecomputing.com>
2026-02-25 08:04:06 +00:00
lichuang
2c619e5e3f [Docs]Fix documentation formatting in architecture overview (#34679)
Signed-off-by: codedump <lichuang1982@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-25 08:00:15 +00:00
Simon Mo
8a685be8d9 docs: document committer proposal process in governance (#35225)
Signed-off-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-25 07:58:48 +00:00
Laura Wang
2465071510 [Perf] Add opt-in SM100 Oink RMSNorm custom-op path (#31828)
Signed-off-by: Laura Wang <3700467+Laurawly@users.noreply.github.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-02-24 23:01:53 -08:00
wenshuai
cd43673668 [Perf] Optimize FP8 gemm of sm120. (#34424)
Signed-off-by: wenshuai <wenshuai@xiaomi.com>
2026-02-24 22:25:24 -08:00
Xinyu Chen
35d44b4557 [XPU]Support CUDAGraph on XPU Platform (#34482)
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
Co-authored-by: chzhang <chaojun.zhang@intel.com>
Co-authored-by: zhenwei-intel <zhenwei.liu@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-24 22:22:52 -08:00
Kunshang Ji
8ad54a991b [Platform] Add current_platform.num_compute_units interface (#35042)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: Kunshang Ji <jikunshang95@gmail.com>
2026-02-24 22:22:49 -08:00
Kunshang Ji
92510edc32 remove cuda check in top_k_top_p_triton kernel (#35011)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-24 22:22:31 -08:00
Isotr0py
a6c137521c [Misc] Add shard_id validation for MergedColumnLinear (#35055)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-24 22:12:28 -08:00
Isotr0py
4572a06afe [Misc] Enable weights loading tracking for quantized models (#35074)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-24 22:11:03 -08:00
Zhengxu Chen
5cc29cfb8b [compile] Improve error message during artifacts load failure. (#35115)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-02-24 22:01:09 -08:00
Chen Zhang
8fae54faff [Linear Attention] fix bug for linear attention + prefix caching + reset_prefix_cache (#35157)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2026-02-24 22:00:19 -08:00
Harry Mellor
f7967577f5 Remove requirement to use --hf-overrides for DeepseekVLV2ForCausalLM (#35203)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-24 22:00:06 -08:00
pks
af770b8e7b [Bugfix] Fix AttributeError when passing StructuredOutputsParams to CompletionRequest (#35237)
Signed-off-by: Patrick Simianer <patrick@lilt.com>
2026-02-24 22:00:03 -08:00
Andreas Karatzas
2ff3e436ad [Responses][CI] Filter negative token IDs in schema fuzz test to avoid 500 errors (#35231)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-25 05:52:44 +00:00
Jhao-Ting Chen
c2c4c4611a [FIX] fused moe with lora shared expert dual stream (1.07x otps) (#34933)
Signed-off-by: Jhao-Ting Chen <jhaotingc@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-25 04:40:45 +00:00
Rohan Potdar
f38f8c9742 [ROCm]: Enable customop and rope+kvcache fusion for AITER RoPE (#35180)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-02-25 04:36:40 +00:00
Flora Feng
ec1d30c0f6 [Responses] Decouple SSE event helpers from Harmony context (#35148)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-02-24 20:05:25 -08:00
Pooya Davoodi
e3b2324ec4 [Frontend] Use init_app_state and FrontendArgs in run_batch (#32967)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-24 19:40:39 -08:00
Nick Hill
dbf0da817a [Core] Cleanup engine pause/sleep logic (#34528)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-24 19:33:34 -08:00
Xin Yang
3bbb2046ff [Bugfix] Fix expert_ids padding values in moe_align_block_size kernel (#35161)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-02-24 17:14:24 -08:00
yugong333
576fe50333 Adding Nemotron fp8 Triton MoE Config (#34674)
Signed-off-by: Yu Gong <yu3.gong@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-24 15:56:38 -08:00
Hashem Hashemi
a0e50a4260 Convert wvSplitKQ to 16x16 MFMA in prep for mi4xx. (#34100)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
2026-02-24 23:35:21 +00:00
Benjamin Chislett
9fa5b25a23 [Bug][DSV3.2] Always prepare metadata for DeepGEMM Sparse Attention (#35075)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2026-02-24 14:55:22 -08:00
Robert Shaw
ea97750414 [CI] Fix Distributed Tests (#35236)
Signed-off-by: Robert Shaw <robertgshaw2@gmail.com>
2026-02-24 22:31:56 +00:00
Andreas Karatzas
067c5d9ad1 [ROCm][CI] Added MI325 mirrors (#34923)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-24 13:37:15 -08:00
Benjamin Chislett
f5972a872f [Model][Spec Decode] Nemotron-H MTP and Mamba Speculative Decoding Support (#33726)
Signed-off-by: Shahar Mor <smor@nvidia.com>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Shahar Mor <smor@nvidia.com>
Co-authored-by: Roi Koren <roik@nvidia.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-02-24 09:49:56 -08:00
Matthew Bonanni
a9e15e040d Add @MatthewBonanni to CODEOWNERS (#35207)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-24 10:45:10 -07:00
Lucas Wilkinson
542ca66357 Revert "[CI/Build] Remove redundant OpenTelemetry pip install from CI configs" (#35211) 2026-02-24 09:26:42 -08:00
Cyrus Leung
fc8456c336 [CI/Build] Fix kernels test location (#35205)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-24 09:20:34 -08:00
Wentao Ye
9ce8fad2a9 [Perf] Optimize Python Slice for Structured Output using islice instead of [:] (#33593)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-24 09:02:36 -08:00
Harry Mellor
c38b8d5a31 Remove padding_index from models that don't use it for better Transformers v5 compatibility (#35189)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-24 08:04:46 -08:00
Robert Shaw
60da0e1544 [CI] Remove Duplicated Tests (#35199)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-02-24 23:53:30 +08:00
danisereb
9609b1f18d Integrate flashinfer mm_mxfp8 in ModelOpt MXFP8 (#35053)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-02-24 08:45:13 -07:00
danisereb
a0c7081695 Fix fallback to default tactic (flashinfer autotuner) with trtllm_fp4_block_scale_moe (#35088)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-02-24 07:25:44 -08:00
R3hankhan
34ce0ffd1f [CPU][Perf] Accelerate Attention head for s390x using vector intrinsics (#34434)
Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2026-02-24 07:25:39 -08:00
Robin Nabel
0de5333989 Fix GLM4 parser tests (#34905)
Signed-off-by: Robin Nabel <opensource@nabel.co>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2026-02-24 22:27:42 +08:00
Eldar Kurtić
a87cc50859 [Attn,KV-cache] Use per-head scales in the attention selector (#34281)
Signed-off-by: Your Name <you@example.com>
Signed-off-by: Eldar Kurtic <research@neuralmagic.com>
Co-authored-by: Eldar Kurtic <research@neuralmagic.com>
Co-authored-by: Your Name <you@example.com>
2026-02-24 09:02:43 -05:00
Cyrus Leung
761e63e541 [Frontend] Always pass supported_tasks to validation (#35186)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-24 04:16:33 -08:00
Isotr0py
d12d201409 [Bugfix] Fix failing FunASR processor test (#35111)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-24 04:13:45 -08:00
eustlb
b3ad37c5db [glm-asr] change defaults dummy audio size (#35108)
Signed-off-by: Eustache Le Bihan <eulebihan@gmail.com>
2026-02-24 04:13:33 -08:00
Wentao Ye
14561fabfd [Perf] Optimize pooling model redundant copy, 1.8% throughput improvement (#35127)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-24 04:13:11 -08:00
Zhengxu Chen
c77f3e1207 [compile] Save aot compile artifacts atomically. (#35117)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-02-24 04:11:01 -08:00
Dor Huri
012dee9233 [Feature] Add LoRA tower/connector support for Llama 4 Vision (mllama4) (#35147)
Signed-off-by: dorhuri123 <dor.huri1@live.biu.ac.il>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-02-24 04:10:32 -08:00
Tugsbayasgalan Manlaibaatar
f1c664545b Make voxtral compile friendly (#33959)
Signed-off-by: Tugsbayasgalan Manlaibaatar <tmanlaibaatar@fb.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-02-24 09:33:35 +01:00
Xin Yang
c870eb9e0f [LoRA] Update LoRA expand kernel block_n calculation (#32621)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-02-23 23:17:53 -08:00
BadrBasowid
6af03f2394 [Refactor] [1/N] Reorganize kernel abstraction directory (#34055)
Signed-off-by: BadrBasowid <badr.basowid@gmail.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2026-02-24 06:47:22 +00:00
Vlad Tiberiu Mihailescu
1a6cf39dec [CI/Build] Remove redundant OpenTelemetry pip install from CI configs (#35032)
Signed-off-by: Vlad Mihailescu <vtmihailescu@gmail.com>
2026-02-23 22:24:11 -08:00
Nicolò Lucchesi
f91808ae0d [MM] Allow audio chunking for offline LLM (#34628)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-23 21:04:28 -08:00
Vadim Gimpelson
33a0d43c71 [BUGFIX][Qwen3.5] Hardcode mlp.gate as not quantizable (#35156)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-02-23 19:42:24 -08:00
pschlan-amd
80d93fd6da gpu_model_runner: Cache is_encoder_decoder from model config (#35099)
Signed-off-by: Patrick Schlangen <pschlan@amd.com>
2026-02-23 19:08:34 -08:00
Jia Guo
ec85340531 [Quantization] Support FP8 MoE bias for models like GPT-OSS (#34906)
Signed-off-by: jasperjiaguo <jasperg662@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-23 19:07:47 -08:00
Rohan Potdar
2ff4e51152 [ROCm] AITER fused RoPE+KVCache (#33443)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
Signed-off-by: charlifu <charlifu@amd.com>
Signed-off-by: Rohan Potdar <66227218+Rohan138@users.noreply.github.com>
Co-authored-by: charlifu <charlifu@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com>
2026-02-23 19:06:00 -08:00
Asaf Gardin
95642441d0 [Mamba1] - Change supports_update_block_table to True (#35054)
Signed-off-by: Josephasafg <ajgard7@gmail.com>
2026-02-23 19:05:57 -08:00
Xin Yang
a7c9f7b7ec [Bugfix] Fix lora_ids in FusedMoE LoRA test (#35135)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-02-23 21:49:25 -05:00
Michael Goin
a4bd661fb3 [Perf] Enable FlashInfer DeepGEMM swapAB on SM90 by default (#34924)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-23 17:34:41 -08:00
Michael Goin
3ef9fd0f98 [Bugfix] Fix DSV3 kernels breaking _C and _moe_C on unsupported arches (#35123)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-23 17:11:27 -08:00
Michael Goin
22a97e6613 [Perf] Improve default triton fused moe configs (#34846)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-23 16:01:28 -08:00
Aaron Hao
596ed1f02e [RL] Validation for pause_mode='keep' (#34992)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2026-02-23 16:30:56 -05:00
Nicolò Lucchesi
b8d8b7e934 [Misc] Monitor interface changes (#35113)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-23 17:14:51 +00:00
Harry Mellor
28c5e69ba0 Enforce that model is the first positional arg when --served-model-name is used (#34973)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-23 08:38:05 -08:00
Harry Mellor
864167d376 Fix custom processors that use deleted import for Transformers v5 (#35101)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-23 08:38:00 -08:00
haosdent
a2ba6a5244 [Bugfix] Fix prefix caching for Mamba 'all' mode (Nemotron models) (#34874)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-02-23 17:31:51 +01:00
Harry Mellor
c4f38696f7 Use Xet high performance mode for Transformers v5 (#35098)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-23 08:19:30 -08:00
haosdent
a7f341c323 [Bugfix] Fix MRotaryEmbedding missing truncate attr with YaRN scaling (#35080)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-02-23 16:05:52 +00:00
Robert Shaw
d13ece38d7 [CI] Skip Responses API (#34990)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-02-23 07:46:45 -08:00
Mark McLoughlin
5cc7c4452e [Metrics] Add Prometheus counters for Model FLOPs Utilization (MFU) (#30950)
Export the existing Model FLOPs Utilization (MFU) metrics via Prometheus.

`--enable-mfu-metrics` is required for these to be exposed.

Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2026-02-23 15:01:07 +00:00
Eldar Kurtić
b95bb6927f [kv-cache, ct] Use compressed-tensors as a source of ground-truth for quant strategies (#34254)
Signed-off-by: Your Name <you@example.com>
Co-authored-by: Your Name <you@example.com>
2026-02-23 07:37:55 -07:00
Cyrus Leung
392645454b [Refactor] Decouple TimingContext from InputProcessingContext (#35083)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-23 14:15:50 +00:00
Eldar Kurtić
1e8438a89a [Llama4,CI] Bring back Llama-4 bug fixes, and also fix Maverick tests (#35033)
Signed-off-by: Eldar Kurtic <you@example.com>
Co-authored-by: Eldar Kurtic <you@example.com>
2026-02-23 09:04:34 -05:00
Robert Shaw
8435b2e049 [ModelBash][DSV3] Add TRTLLM DSV3 Router GEMM kernel (6% B1 Speedup) (#34302)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-02-23 14:02:26 +00:00
Yan Ma
b1b5e045df [XPU] allow TORCH_SDPA/TRITON_ATTN as XPU vit Backend (#35010)
Signed-off-by: Yan Ma <yan.ma@intel.com>
2026-02-23 05:06:44 -08:00
Andreas Karatzas
5f68464f92 [ROCm][CI] Fix spec decode profile assertion and logprob test determinism (#35043)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-23 05:05:54 -08:00
Vincent Gimenes
aa08a30fc9 [CLEANING] Remove unused disable_by_batch_size from SpeculativeConfig (#35060)
Signed-off-by: Vincent Gimenes <vincent.gimenes@gmail.com>
2026-02-23 05:05:36 -08:00
Wentao Ye
7f40e9e516 [Refactor] Remove dead private func _fp8_perm and _extract_mask_for_item (#35068)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-23 05:05:20 -08:00
Harry Mellor
103e614b14 Fix pipeline parallel with embed scaling in the Transformers modelling backend (#35094)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-23 05:04:47 -08:00
Neil Schemenauer
54e2f83d0a [Feature] Lazy import for the "mistral" tokenizer module. (#34651)
Signed-off-by: Neil Schemenauer <nas@arctrix.com>
2026-02-23 00:43:01 -08:00
Gabe Goodhart
e631f8e78e fix: Apply embedding_multiplier to inputs_embeds (#34813)
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-23 00:42:46 -08:00
Martin Hickey
e97c46a92d [BugFix]: Fix local mypy issues (#34739)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-23 00:40:29 -08:00
Jee Jee Li
7291d1b288 [Bugfix] Fix kernel benchmark (#33752)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2026-02-22 21:18:08 -08:00
Cyrus Leung
987506bca6 [Refactor] Simplify dummy data generation (#35025)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-22 20:55:27 -08:00
Woosuk Kwon
c645e9a214 [Model Runner V2] Remove propose_draft method (#35070)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-22 18:27:12 -08:00
Nick Hill
944ffb5968 [Model Runner V2][Minor] Remove redundant do_spec_decode field (#35039)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-22 16:18:04 -08:00
qizixi
2bcf71b9c0 [Spec Decode] Reduce TP communication for speculative decoding draft token generation (#34049)
Signed-off-by: qizixi <qizixi@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-02-22 14:59:16 -08:00
tacos8me
b7892a3bef [Model] Add NVFP4 quantization support for Step3.5-Flash (#34478)
Signed-off-by: tacos8me <ian@cloudhabit.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-22 12:30:46 -07:00
Benjamin Chislett
682566b18e [Bug] Refactor max_num_batched_tokens to account for drafting (#34898)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2026-02-22 11:18:46 -05:00
qizixi
b9c2a565cc [Spec Decode] Defer clearing KV connector metadata for EAGLE3 speculative decode + prefill / decode disagg setup (#34529)
Signed-off-by: qizixi <qizixi@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-02-22 08:08:32 -08:00
Andreas Karatzas
dd8c3a7fb2 [ROCm][CI] Fix realtime test timeouts caused by aiter JIT compilation delays (#35052)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-22 10:07:18 +00:00
Andreas Karatzas
a8a47c17b6 [ROCm][CI] Fix flaky embedding chat test by using tolerance-based comparison (#35050)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-22 09:03:44 +00:00
Roger Wang
40f88d8318 [Bugfix] Fix Qwen3/Qwen3.5 Reasoning Parser (#34779)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-21 23:15:35 -08:00
Woosuk Kwon
2cbf9656ce [Model Runner V2] Enable CUDA graph for Eagle3 (#35040)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-21 21:42:50 -08:00
Xiao Li
30132cd144 Fix apply_top_k_top_p_triton called by non-cuda logits Tensor (#35030)
Signed-off-by: Xiao Li <ilx@meta.com>
2026-02-21 21:11:54 -08:00
Cyrus Leung
cbd95a2dd1 [Benchmark] Use sns.relplot for plotting (#35027)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-21 20:26:48 -08:00
Athrael Soju
970861ac0c [New Model] Add ColModernVBERT (#34558)
Signed-off-by: Athrael Soju <athrael.soju@gmail.com>
Signed-off-by: athrael-soju <athrael-soju@users.noreply.github.com>
2026-02-22 12:23:41 +08:00
Wentao Ye
d24bdd7c4b [CI] Bump mteb version to mteb[bm25s]>=2, <3 for pooling model unit tests (#34961)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-21 20:23:24 -08:00
Andreas Karatzas
d403c1da1c [CI] Stabilizing ROCm amd-ci signal and minor name fix in upstream (#35008)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-22 04:01:10 +00:00
Woosuk Kwon
b71fbd06e2 [Model Runner V2] Support attention group (#35036)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-21 16:42:53 -08:00
Vadim Gimpelson
74d90b1ce4 [Model Bash][DSR1] Add selective dynamic shape marking for CustomOp (#34900)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-02-21 19:28:01 -05:00
Woosuk Kwon
a4047d4ea9 [Model Runner V2] Support Eagle3 (no CUDA graph) (#35029)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-21 12:55:24 -08:00
Cyrus Leung
965fe45935 [CI/Build] Fix gRPC version mismatch (#35013)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-21 12:14:41 -07:00
Roman
98b0205c3c [Frontend] Add automatic language detection for Whisper transcription (#34342)
Signed-off-by: space_check <roman.vuskov@rwth-aachen.de>
Signed-off-by: Roman <45857014+spacecheck@users.noreply.github.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-02-21 04:49:41 -08:00
Huy Do
272b535ab3 [Bugfix] Gate 256-bit instructions to CUDA 12.9+ (#34791)
Signed-off-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-21 04:48:14 -08:00
Cyrus Leung
f74f1572ca [Benchmark] Improve benchmarks (#35012)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-21 10:31:58 +00:00
petrpechman
bebfe55b1c [Doc] Fix example of eagle3 (#34960)
Signed-off-by: Petr Pechman <petr.pechman@firma.seznam.cz>
Co-authored-by: Petr Pechman <petr.pechman@firma.seznam.cz>
2026-02-21 09:57:53 +00:00
Nick Hill
820d7815eb [Core] Minor structured-output related scheduler optimization (#34765)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-21 01:38:28 -08:00
Nicolò Lucchesi
ab6f3487a6 [PD] Change kv_load_failure_policy Default from "recompute" to "fail" (#34896)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-21 01:34:57 -08:00
BADAOUI Abdennacer
8dc8a99b56 [ROCm] Enable bitsandbytes quantization support on ROCm (#34688)
Signed-off-by: badaoui <abdennacerbadaoui0@gmail.com>
2026-02-21 00:34:55 -08:00
jennyyyyzhen
2aab2bb543 [ROCM] Optimize ROCM_AITER_FA spec decode eagle performance (#34541)
Signed-off-by: jennyyyyzhen <yzhen@hmc.edu>
2026-02-20 20:32:05 -08:00
Andreas Karatzas
54254f7a61 [ROCm][CI] Fix spec decode logprobs flakiness and parametrize tree attention backends (#34599)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-20 20:25:23 -08:00
Andreas Karatzas
cf93c1a128 [ROCm][AITER] Fix aiter paged_attention_v1 decode for sliding window and head_size < 64 (#34570)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-20 20:25:07 -08:00
Andreas Karatzas
89358f0d35 [CI] Fix ColBERT HF comparison tests on AMD CI + refactor (#34567)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-20 20:12:05 -08:00
zhongdaor-nv
a0fe7ea2f0 [feat] Add per-block extra_keys to KV events (#33304)
Signed-off-by: zhongdaor-nv <zhongdaor@nvidia.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-20 20:11:40 -08:00
Andreas Karatzas
991d6bff38 [CI][MCP][Harmony] Heavy refactoring Harmony & MCP response tests and stabilizing with deterministic test infrastructure (#33949)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-20 20:03:32 -08:00
Kata Coder
5719a4e4e6 [Frontend] Support multimodal inputs for late-interaction scoring (ColQwen3) + NewModel: nvidia/nemotron-colembed (#34574)
Signed-off-by: craftsangjae <craftsangjae@gmail.com>
2026-02-20 20:01:40 -08:00
pougetat
11be2c74dc [Realtime] Add Qwen3-ASR realtime streaming support (#34613)
Signed-off-by: Thomas Pouget-Abadie <thomaspou@microsoft.com>
Co-authored-by: Thomas Pouget-Abadie <thomaspou@microsoft.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-02-20 19:59:42 -08:00
Xin Yang
7a5adad480 [Kernel] Optimize sample_recovered_tokens_kernel (#34974)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-02-20 19:59:06 -08:00
Li
59c6233297 Support prompt_embeds for pooling requests in output processor (#34904)
Signed-off-by: Li Zhang <lzhanga@amazon.com>
Co-authored-by: Li Zhang <lzhanga@amazon.com>
2026-02-20 19:57:38 -08:00
Taneem Ibrahim
d38cd3dde5 [Misc] Fix mypy errors in vllm/profiler and remove from exclude list (#34959)
Signed-off-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
2026-02-20 19:56:33 -08:00
Rohan Potdar
ded333fb9b [ROCm][Bugfix]: Only save unpadded sizes for shared_experts in MoERunner to fix rmsnorm pad fusion (#34636)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-02-20 19:56:16 -08:00
Yanan Cao
9d7577b2bd [Kernel] [Helion] [9/N] Canonicalize GPU variant names to base model names (#34928)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 19:55:51 -08:00
Vlad Tiberiu Mihailescu
e739c29ea4 [CI/Build] Add opentelemetry libs in default vllm build (requirements/common.txt) (#34466)
Signed-off-by: Vlad Mihailescu <vtmihailescu@gmail.com>
2026-02-20 19:54:55 -08:00
yugong333
a55caf6ae9 [LoRA] Support Quantized Adapters (#30286)
Signed-off-by: Yu Gong <yu3.gong@gmail.com>
Signed-off-by: wz1qqx <ziqi.wang@novita.ai>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: wz1qqx <55830058+wz1qqx@users.noreply.github.com>
Co-authored-by: wz1qqx <ziqi.wang@novita.ai>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-20 19:54:35 -08:00
Lucas Wilkinson
0e22cd618b Revert "[Llama4,Quantization] Simplify and generalize logic for Q/K permutations in quantized self-attn layers " (#34997) 2026-02-20 17:19:19 -08:00
Wei Zhao
ea5f903f80 Bump Flashinfer Version and Re-enable DeepSeek NVFP4 AR+Norm Fusion (#34899)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-20 13:37:31 -08:00
Ryan Rock
0632ed8778 [AMD][CI] Fix test_custom_allreduce for A100 testgroup (#34735)
Signed-off-by: Ryan Rock <ryan.rock@amd.com>
2026-02-20 21:33:04 +00:00
Lucas Wilkinson
aaefc58ee0 [CI] Revert PRs 34818 and 33600 (#34979) 2026-02-20 13:25:50 -08:00
Wei Zhao
f24b2de3d3 [Test] Add FP8 KV Cache Testing for MLA Backends (#34473)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
2026-02-20 18:51:58 +00:00
Michael Goin
fac1507f03 [CI] Remove failing prime-rl integration test (#34843)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2026-02-20 10:17:42 -08:00
Zhengxu Chen
f863994084 [compile] Fix torch.compile time discrepancy in logging. (#34912)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-20 08:47:14 -08:00
Zhengxu Chen
e4a5d8c653 [compile] Move torch_aot_compile directory under torch_compile_cache (#34831)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-02-20 08:46:45 -08:00
Yanan Cao
a6d0299c75 [Kernel] [Helion] [6/N] Add num_tokens dimension to silu_mul autotuning and dispatching (#34185)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
2026-02-20 08:36:51 -08:00
Harry Mellor
6ce80f7071 Ensure that MkDocs v2 does not get installed (#34958)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-20 15:38:11 +00:00
Huamin Li
1fe462168c [perf] Avoid dtype promotion sync in mamba_get_block_table_tensor (#34870)
Signed-off-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-20 06:21:56 -08:00
Flora Feng
ed31a020ee [Refactor] Extract Harmony streaming SSE event builders into streaming_events.py (#34909)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-20 06:20:46 -08:00
Cyrus Leung
f9ac19204f [V0 Deprecation] Remove unused MM placeholders in request output (#34944)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-20 06:19:23 -08:00
Vadim Gimpelson
59965affbd [BUGFIX] Fix _dummy_run missing prepare_inputs_event synchronization (#34866)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-02-20 05:54:27 -08:00
Xin Yang
b1c4f0b265 [Kernel] Optimize grouped topk kernel (#34206)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-02-20 01:34:45 -08:00
Kevin McKay
8de7c636cc [Bugfix][Hardware][AMD] Fix ROCM_AITER_FA speculative decoding support (#32877)
Signed-off-by: c0de128 <kevin.mckay@outlook.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-02-19 22:25:46 -08:00
Frank Wang
059779231f [Minor] Add logging when using MXFP4 MXFP8 TRTLLM backend (#34916)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-02-19 22:07:57 -08:00
tianshu-Michael-yu
ea37530b47 [Models] LFM2: Support LoRA (#34921)
Co-authored-by: Piotr Mazurek <piotr635@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-19 22:07:23 -08:00
Micah Williamson
f5432e35a3 [ROCm][CI] Loosen RemoteOpenAIServer Startup Timeout (#34922)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-02-20 05:37:49 +00:00
杨朱 · Kiki
07cab212f0 [Misc] Add deprecated environment variable utilities (#33677)
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-19 21:33:25 -08:00
rasmith
0c1dc42748 [CI][AMD][BugFix][P/D] Add default_vllm_config to test_moriio_connector.py so tests pass (#33739)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-02-19 21:32:40 -08:00
Varun Chawla
676f82ae81 Add validation to reject non-text content in system messages (#34072)
Signed-off-by: Varun Chawla <varun_6april@hotmail.com>
2026-02-19 21:30:33 -08:00
Elizabeth Thomas
81bfc21a6a [Model Bash]: Improve FP8 Oracle for Config Specific Kernel Selection (#34260)
Signed-off-by: Elizabeth Thomas <email2eliza@gmail.com>
Signed-off-by: Robert Shaw <robertgshaw2-redhat@h100-02.nemg-001.lab.rdu2.dc.redhat.com>
Signed-off-by: Robert Shaw <robertgshaw2@gmail.com>
Co-authored-by: Robert Shaw <robertgshaw2-redhat@h100-02.nemg-001.lab.rdu2.dc.redhat.com>
Co-authored-by: Robert Shaw <robertgshaw2@gmail.com>
2026-02-19 21:29:08 -08:00
Matthias Gehre
4e2c7caf2d [Bugfix] Add regression test for MoE quant_config under torch.compile (#34335)
Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
2026-02-20 13:27:26 +08:00
Bowen Bao
d9e62c03eb [Quark] Fix MoE fp8 activation scale handling on mi300 (#34386)
Signed-off-by: Bowen Bao <bowenbao@amd.com>
2026-02-19 21:27:14 -08:00
Kevin H. Luu
a1a2d79442 [ci] Use the right tag for CPU arm64 image (#34915)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-02-19 19:59:15 -08:00
Cyrus Leung
ac900c89bb [Refactor] Implement output type check in LLM (#34794)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-19 19:57:55 -08:00
Mark McLoughlin
76df6072ff [Core] Fix state names in pause_scheduler() (#34840)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2026-02-19 17:21:46 -08:00
Michael Goin
16f24e8797 [CI] Add GPT-OSS Eval job for H100 (#34359)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2026-02-19 17:14:54 -08:00
Nick Hill
40b2f1c3d9 [Model Runner V2] Minor CPU optimizations (#34856)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-19 16:05:37 -08:00
Mayank Ketkar
648951a9c3 [Bugfix] Fix benchmark_fused_collective crash on CustomOp init (#34665)
Signed-off-by: Mayank Ketkar <mketkar@zoox.com>
Signed-off-by: Mayank Ketkar <mayket04@gmail.com>
Co-authored-by: Mayank Ketkar <mketkar@zoox.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-19 19:01:00 -05:00
Michael Goin
f72061a19a [UX] More descriptive reasons in is_supported_config for MoE (#34908)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-19 15:20:52 -08:00
Matthew Bonanni
662205d34e [Bugfix] Fix Basic Models Test (#34818)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-19 14:49:07 -08:00
Roger Wang
4fb8beefaa [Bugfix] Fix cutlass fp8 kernel on hopper for Qwen3.5 (#34914)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-19 13:34:55 -08:00
Alexei-V-Ivanov-AMD
304319c4ed Change targets for AMD build in the "CI" pipeline (#34918)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2026-02-19 21:26:53 +00:00
Wentao Ye
c683d11c94 [Refactor] Deprecate head_first for chunk_gated_delta_rule (#34263)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-19 13:23:49 -05:00
roikoren755
3eff45d793 Revert "[NemotronH] Do not force router to run in fp32 (#34582)" (#34808)
Signed-off-by: Roi Koren <roik@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-19 09:47:05 -08:00
Robert Shaw
4685a630a2 [Model Bash][DeepSeekR1] Remove Shared Expert Clone (#34344)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-02-19 07:56:14 -08:00
Eldar Kurtić
ee1d25f199 [Llama4,Quantization] Simplify and generalize logic for Q/K permutations in quantized self-attn layers (#34471)
Signed-off-by: Your Name <you@example.com>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-19 07:55:41 -08:00
Linda
6fff24f30f [Bugfix] Qwen3.5 kv-scale weight remapping (#34719)
Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
2026-02-19 04:13:37 -08:00
Cyrus Leung
23210a911e [CI/Build] Try to make beam search test less flaky (#34885)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-19 19:16:58 +08:00
Cyrus Leung
1391378861 [Bugfix] Fix edge case in UUID data parsing (#34884)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-19 02:24:30 -08:00
Andreas Karatzas
f6220f9877 [ROCm][Test] Fix beam search determinism failures from batch-size-dependent FP divergence and removed wrong marker (#34878)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-19 08:25:26 +00:00
Andreas Karatzas
2df2bb27b0 [ROCm][CI] Removing all blocking labels from MI355 until stable infra (#34879)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-19 07:53:08 +00:00
Tal Nir
f75b61a9e9 [Voxtral Realtime] Fix engine crash on empty multimodal embeddings (#34862)
Signed-off-by: Tal Nir <tal@nervexneurotech.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 23:21:47 -08:00
Wei Zhao
7f51e93864 [Bug] Fix DeepSeek V3 weight loading caused by incorrect prefix (#34876)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
2026-02-18 23:20:30 -08:00
Alex Brooks
4611af1663 [Bugfix] Add Quant Config to Llava Next Projector (#34847)
Signed-off-by: Alex Brooks <albrooks@redhat.com>
2026-02-18 23:18:23 -08:00
Manrique Vargas
ad5aa6bd9f fix(docs): fix typos in comments and docstrings (#34836)
Signed-off-by: machov <mv1742@nyu.edu>
2026-02-18 23:17:41 -08:00
Jaeyeon Kim(김재연)
9681068cf9 [Frontend] Fix reasoning_tokens for text-based parsers in Responses API (#33513)
Signed-off-by: Jaeyeon Kim <anencore94@gmail.com>
2026-02-18 23:16:41 -08:00
Kevin H. Luu
b6101d384d Deprecate test-pipeline.yaml (#34864)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-02-19 02:15:27 +00:00
Woosuk Kwon
5fcb0cdd68 [Model Runner V2] Use FP32 for Gumbel Noise (#34854)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-18 17:07:37 -08:00
Woosuk Kwon
c878b43b64 [Model Runner V2] Remove unnecessary copies in PW CUDA graph capture (#34849)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-18 15:52:50 -08:00
rasmith
2b84ac669c [CI][AMD][BugFix] Use torch.testing.assert_close instead of assert torch.allclose in test_rocm_skinny_gemms.py (#34181)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-02-18 23:10:19 +00:00
zhrrr
11d3976b88 [Model Runner V2] support piecewise & mixed cudagraph (#32771)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
2026-02-18 15:03:17 -08:00
Yongye Zhu
40da9625a1 [MoE Refactor] Convert mxfp4 marlin into modular kernel format (#34588)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-18 14:37:14 -08:00
Flora Feng
8d9babd4de Fix empty tool_call_id in Anthropic messages API tool result conversion (#34745)
Signed-off-by: <>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Co-authored-by: Flora Feng <sfeng33@h100-01.nemg-001.lab.rdu2.dc.redhat.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-02-18 14:31:59 -08:00
Aaron Hao
e99ba957ec [BUG] Fixing Weight Sync unit test (#34841)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2026-02-18 17:20:10 -05:00
Kyle Sayers
64ac1395e8 [Docs] Clean up speculators docs (#34065)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2026-02-18 13:48:11 -08:00
Cyrus Leung
61cf087680 [Bugfix] Fix lora tests (#34834)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-18 13:22:31 -08:00
Wenlong Wang
847a57cd12 [Bugfix][MoE Kernel] Fix incorrect routing selection for models without expert groups (e.g., MiniMax-M2.1) (#34673)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-18 13:03:24 -08:00
rasmith
fcd6ac97ed [CI][AMD][BugFix] Skip tests in test_unquantized_backend_selection that should not run on ROCm (#34655)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-02-18 15:00:40 -05:00
Woosuk Kwon
95be2a7f22 [Model Runner V2] Minor simplification for DCP (#34786)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-18 11:04:53 -08:00
Jaden Mathias
0e60c925cf [Bugfix] Remove assert causing hipErrorStreamCaptureUnsupported (#34455)
Signed-off-by: Jaden Mathias <jaden.mathias@amd.com>
2026-02-18 18:54:54 +00:00
Teng Ma
d7ff22204a [Misc] Add mooncake-transfer-engine to kv_connectors requirements (#34826)
Signed-off-by: Teng Ma <teng-ma@linux.alibaba.com>
2026-02-18 18:26:24 +00:00
Isotr0py
c0bd8b13da [Bugfix] Redo Qwen3.5/Qwen3-Next GDN projector fusion (#34697)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
2026-02-18 09:46:53 -08:00
Michael Goin
caeb887bf6 [Bugfix] Fix NVFP4 TRTLLM MoE non-gated support; add gsm8k for Nemotron-3-Nano FP8+NVFP4 (#34725)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-18 09:39:22 -08:00
Ilya Markov
6b3166a7c7 [CI][Bugfix] Fix multinode test script (#34820)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
2026-02-18 11:45:10 -05:00
Robert Shaw
25e2e136ef [CI] temporarily disable multi-node tests (#34825)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-02-18 11:32:44 -05:00
Robert Shaw
6874638bc4 [Model Bash] DeepSeek R1 BF16 Min Latency QKV A GEMM (0.5% E2E Speedup) (#34758)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-02-18 07:42:36 -08:00
Burkhard Ringlein
e24663c5a9 Add unit tests for fp8 output fusion of triton_attn (#34228)
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-18 06:22:49 -05:00
Nick Hill
c50e105a88 [Model Runner V2] Avoid prepare prefill kernel launch overhead (#34780)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-18 00:49:21 -08:00
Cyrus Leung
a766b30349 [Renderer] Deprecate code paths for old input processing (#34775)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-18 00:35:04 -08:00
Asaf Joseph Gardin
1faa8cb73c [Quantization] - Added uses_meta_device_weights to quant config (#34645)
Signed-off-by: Josephasafg <ajgard7@gmail.com>
2026-02-17 23:43:44 -08:00
Marek Michalowski
e89a91d927 [Bugfix] fix activation in cpu_fused_moe_torch call (#34696)
Signed-off-by: Marek Michalowski <marek.michalowski@arm.com>
2026-02-17 23:39:46 -08:00
Michael Goin
909b147197 [Bugfix] Fix prefix creation for Qwen3.5 (#34723)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-17 23:39:15 -08:00
ElizaWszola
a88b3be7c4 [Bugfix] Fix quant RMS norm fusion for quantization with TMA-aligned scales (#33255)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-17 23:35:04 -08:00
Nick Hill
a49ea5a58f [Model Runner V2] A bit more PP simplification (#34766)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-17 21:39:07 -08:00
Cyrus Leung
30ebe0dc3c [CI/Build] Remove use of skip_v1 (#34699)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-18 12:19:11 +08:00
Andreas Karatzas
cef65f0715 [ROCm][CI] Removed hard-coded attn backend requirement for Qwen VL (#34753)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-18 03:59:53 +00:00
Russell Bryant
6f3b2047ab [Core] Fix SSRF bypass via backslash-@ URL parsing inconsistency (#34743)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: isotr0py <2037008807@qq.com>
2026-02-18 03:53:35 +00:00
Luka Govedič
02e8f26cea [torch.compile] Turn on silu+fp4 quant fusion by default for O1+ (#34718)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2026-02-18 03:29:15 +00:00
Hongxia Yang
4a00a511bb [BugFix] [Build] fix string literals comparison in indexer_k_quant_and_cache calling site (#34653)
Signed-off-by: Hongxia Yang <hongxiay.yang@amd.com>
Co-authored-by: Hongxia Yang <hongxiay.yang@amd.com>
2026-02-17 19:19:41 -08:00
Cyrus Leung
a0d8d944e2 [Renderer] Move MM Hash parsing into Renderer (#34711)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-17 19:18:55 -08:00
Amr Mahdi
df3f537a66 [CI] Remove unused precompiled wheel args from image build (#34767)
Signed-off-by: Amr Mahdi <amrmahdi@meta.com>
2026-02-17 18:58:18 -08:00
Matthew Bonanni
7743152957 [Attention] Refactor check_and_update_config (#33600)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-17 17:06:54 -08:00
Wentao Ye
ab33d2a629 [Feature] Decode Context Parallel support for GPU model runner v2 (#34179)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-17 16:27:15 -08:00
Woosuk Kwon
be3af2d29e [Model Runner V2] Further simplification for PP (#34724)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-17 15:18:18 -08:00
Jongseok Park
c656ba3b4d [Kernel] Triton-based Top-k and Top-p sampler kernels (#33538)
Signed-off-by: js_park <cakeng@naver.com>
Signed-off-by: Jongseok Park <37990712+cakeng@users.noreply.github.com>
Signed-off-by: Sunga Kim <sunga.kim@berkeley.edu>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Sunga Kim <sunga.kim@berkeley.edu>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-02-17 23:14:30 +00:00
Matthew Bonanni
dc5fa77a4e [Bugfix][MTP][Sparse MLA] Allow sparse MLA with MTP to run with FULL cudagraphs (#34457)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-02-17 14:01:27 -05:00
Flora Feng
1e4a084c8e [CI] Fix flaky test_parsable_context (#34717)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-02-17 18:42:52 +00:00
Richard Zou
7967e854da [BugFix] Fix sp tests (#34716)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-17 17:07:56 +00:00
almayne
6bd6d0c3c1 Fixed whisper CPU test that does not spawn properly. (#34324)
Signed-off-by: Anna Mayne <anna.mayne@arm.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-17 06:46:23 -08:00
Nicolò Lucchesi
8e962fef5f [CI][Nixl] Add CrossLayer KV layout tests (#34615)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-17 21:35:40 +08:00
Cyrus Leung
574fe75245 [Renderer] Move InputPreprocessor into Renderer (2/2) (#34560)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-17 05:29:01 -08:00
junuxyz
c61a98f529 [CI][BugFix] ShellCheck cleanup to remove baseline and preserve runtime behavior (#34514)
Signed-off-by: junuxyz <216036880+junuxyz@users.noreply.github.com>
2026-02-17 12:22:56 +00:00
Harry Mellor
28bffe9466 Fix docs build warning (#34686)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-17 02:31:40 -08:00
ChenqianCao
ad65177a19 [Bugfix] Fix 'remove_instance_endpoint' method logic in disagg_proxy_demo (#32922)
Signed-off-by: ChenqianCao <39755070+ChenqianCao@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-17 10:06:53 +00:00
Tim Dettmers
d44a5b6c47 Remove dead bitsandbytes CxB code from 8-bit inference path (#34633)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 01:49:14 -08:00
Jiangyun Zhu
1d65283e95 Revert "[Models] Fuse Qwen3.5 GDN's qkvz_proj and ba_proj" (#34683) 2026-02-17 01:29:27 -08:00
kourosh hakhamaneshi
c464b57374 [Ray] Propagate third-party env vars to Ray workers via prefix matching (#34383)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-17 01:08:42 -08:00
Amr Mahdi
c5c38e152a [CI] Fix bake config artifact path for AMI rebuild pipeline (#34656)
Signed-off-by: Amr Mahdi <amrmahdi@meta.com>
2026-02-17 06:39:44 +00:00
Woosuk Kwon
d00df624f3 [Model Runner V2] Minor refactoring for penalties (#34662)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-16 21:43:00 -08:00
Woosuk Kwon
9752da9d9c [Model Runner V2] Minor simplification for BadWordsState (#34669)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-16 21:27:24 -08:00
Woosuk Kwon
04925b2202 [Model Runner V2] Minor cleanup for PP (#34666)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-16 19:15:31 -08:00
Woosuk Kwon
d74278fb67 [Model Runner V2] Fix unintended CPU-GPU sync in make_dummy (#34667)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-16 19:00:29 -08:00
haosdent
b68fd899d1 [Bugfix] Fix fused MoE int32 overflow in stride*offset without perf regression (#34507)
Signed-off-by: haosdent <haosdent@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-16 17:58:49 -08:00
Aneesh Puttur
0b5f9b7204 [CI] Enable mypy import following for vllm/v1/kv_offload (#34639)
Signed-off-by: Aneesh Puttur <aneeshputtur@gmail.com>
2026-02-17 09:58:15 +08:00
zhanqiuhu
9a8853f781 [Core] Pipeline Parallel support for Model Runner V2 (#33960)
Signed-off-by: Zhanqiu Hu <zh338@cornell.edu>
2026-02-16 17:48:16 -08:00
zhrrr
387a1898d9 [Model Runner V2] support bad_words sampling param (#33433)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
Co-authored-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-16 16:36:06 -08:00
roikoren755
3b30e61507 [NemotronH] Do not force router to run in fp32 (#34582)
Signed-off-by: Roi Koren <roik@nvidia.com>
2026-02-16 10:15:32 -08:00
Alexei-V-Ivanov-AMD
824f9e8f3c Targeting the MI355 agent pool with all existing tests (#34629)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2026-02-16 17:02:27 +00:00
Nicolò Lucchesi
6cc403e67d [Bugfix][CI] Fix flaky entrypoints/openai/test_response_api_with_harmony.py::test_function_calling[openai/gpt-oss-20b] (#34624)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-16 16:11:07 +00:00
Almog Tavor
72d5951d02 [Bugfix] Treat generation_config max_tokens as default not ceiling (#34063)
Signed-off-by: almogtavor <almogtavor@gmail.com>
2026-02-16 07:58:24 -08:00
Lucas Kabela
a3205beffb [CI] Enable mypy coverage for individual excluded files (#34292)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-16 07:34:29 -08:00
Christian Pinto
6930becd45 (bugfix): Fixed encode in LLM entrypoint for IOProcessr plugin prompts (#34618)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
2026-02-16 07:33:55 -08:00
Andreas Karatzas
03a8770a6d [ROCm][CI] Fix plugins test group; updating terratorch and dependencies (#34589)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-16 07:33:42 -08:00
Yiqi Xue
bc56a1d56e [Bugfix] Fix ARC touch KeyError for non-ready T1 blocks in kv offload (#34576)
Signed-off-by: Yiqi Xue <xuey666@gmail.com>
2026-02-16 07:33:19 -08:00
danisereb
ec7d9e6745 Fix call to moe_mk in modelopt MoE modules (required for LoRA) (#34575)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-02-16 07:33:09 -08:00
Isotr0py
3bb4e4311c [Models] Fuse Qwen3.5 GDN's qkvz_proj and ba_proj (#34492)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-16 07:32:51 -08:00
Amr Mahdi
08f8c198ae [CI] Disable precompiled wheel path in CI image builds (#34606)
Signed-off-by: Amr Mahdi <amrmahdi@meta.com>
2026-02-16 15:14:43 +00:00
Harry Mellor
a21cedf4ff Bump lm-eval version for Transformers v5 compatibility (#33994)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-16 05:24:35 -08:00
emricksini-h
3ef74cde5d [CI][Tracing] Fix race condition by adding server readiness check (#34364)
Attempt to resolve #34284: "Metrics Tracing (2GPU)" fails with a
segmentation fault.

Signed-off-by: emricksini-h <emrick.birivoutin@hcompany.ai>
2026-02-16 12:57:39 +00:00
Ekagra Ranjan
cd81cdb399 [Scheduler][ASR] Fix CrossAttn blocks per-request for Variable length encoder inputs (#31058)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-02-16 11:08:44 +00:00
Andreas Karatzas
1e828573b4 [CI][Metrics] Stabilize tests with polling and subprocess guards (#34566)
test_abort_metrics_reset is flaky due to hardware-dependent
fixed sleeps: replace fixed sleeps with polling.

test_metrics_exist_run_batch passes even when the engine crashes
on startup (false positive): add subprocess lifecycle guards.

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-16 10:52:02 +00:00
Samu Tamminen
a5ccc85c8c [Bugfix] Fix Dynamo unexpected keyword argument (#34320)
Signed-off-by: Samu Tamminen <stammine@amd.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-02-16 01:32:30 -08:00
Roger Wang
b5475d0534 Revert "[Misc] fix qwen3.5 config" (#34610) 2026-02-16 01:06:05 -08:00
JJJYmmm
9521002f0a [Misc] fix qwen3.5 config (#34604) 2026-02-16 00:25:38 -08:00
Cyrus Leung
ec17bdd894 [Renderer] Move InputPreprocessor into Renderer (1.5/2) (#34598)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-15 23:46:33 -08:00
Amr Mahdi
bb59c90248 [CI] Write bake config to temp directory instead of repo root (#34569)
Signed-off-by: Amr Mahdi <amrmahdi@meta.com>
2026-02-15 22:15:47 -08:00
bnellnm
5bff999d12 [Bugfix] Add method to swap quant_method on FusedMoE to fix LoRA issues (#34453)
Signed-off-by: Bill Nell <bnell@redhat.com>
2026-02-15 20:10:50 -08:00
Lucas Wilkinson
bb85929aa6 [BugFix] Fix Python 3.13 FlashMLA import error (#34548)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-02-15 20:09:18 -08:00
Parth Bansal
5653021094 [Doc] Add Mistral-7b-v0.3 model to the batch invariance validated model (#34584)
Signed-off-by: Parth Bansal <parthbansal127@gmail.com>
2026-02-16 12:09:00 +08:00
Andreas Karatzas
974d829b05 [CI][Frontend] Return 422 instead of 500 for invalid Anthropic tool_choice (#34590)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-15 20:06:48 -08:00
Isotr0py
91ac5d9bfd [CI/Build] Enable tests for recent day-0 new models (#34585)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-15 18:17:04 -08:00
Luka Govedič
23d825aba1 [torch.compile] Disable ar-rms fusion for ds3-fp4 & DP, fix CI test (#34392)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-15 06:33:57 -08:00
Maryam Tahhan
f07a128413 [CPU][ARM] Add ARM BF16 cross-compilation support and improve documen… (#33079)
Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2026-02-15 06:33:08 -08:00
Isotr0py
71cd89264f [MM Encoder] Add Triton ViT attention backend (#32183)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-15 06:32:47 -08:00
Isotr0py
19fab44152 [Doc] Update Encoder-Decoder models support doc with Florence-2 (#34581)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-15 04:18:57 -08:00
Seiji Eicher
79c7e09235 [KV Connector] Add temporary, off-by-default VLLM_DISABLE_REQUEST_ID_RANDOMIZATION workaround (#34415)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2026-02-14 23:26:10 -08:00
haosdent
79f3fab05a [Bugfix] Handle num_expert_group=None in flashinfer block-scale FP8 MoE (#34494)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-02-14 23:25:46 -08:00
Vadim Gimpelson
604b9eaec5 [BUGFIX] Fix accuracy regression for NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 with TP>1 (#34476)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-02-14 23:25:17 -08:00
Stanislav Kirillov
50dbd6c9e6 [bugfix] Fix critical bug when reporting for all paths where handler.create_error_response is used (#34516)
Signed-off-by: Stanislav Kirillov <stas@nebius.com>
Co-authored-by: Stanislav Kirillov <stas@nebius.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-14 23:24:25 -08:00
Andreas Karatzas
98bcc6ca59 [CI][Entrypoints] Validate detokenize token IDs to prevent int64 overflow causing 500 (#34468)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-14 23:08:38 -08:00
Andreas Karatzas
f13e86d8dd [Kernels] Fix Helion GPU utils to use platform-agnostic device name API (#34537)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-14 20:29:23 -08:00
Woosuk Kwon
9ca768c740 [Model Runner V2] Minor cleanup for Sampler (#34563)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-14 18:29:03 -08:00
Thomas Parnell
d5fe3f702c [Hybrid] Enable mamba prefix cache "align" mode with async scheduling (#33997)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2026-02-14 13:15:56 -08:00
Cyrus Leung
73391a1baa [Renderer] Move InputPreprocessor into Renderer (1/2) (#34510)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-14 10:14:21 -08:00
Andreas Karatzas
b3c14229b0 [ROCm][CI] Guard sparse MLA backend imports for ROCm compatibility in tests (#34538)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-14 07:32:09 -08:00
Roger Wang
2f186635cb [Bugfix] Fix Qwen3.5 config loading (#34554)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-14 03:56:11 -08:00
Christian Pinto
342a7cda2d [Misc] Update tests and examples for Prithvi/Terratorch models (#34416)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-13 23:03:51 -08:00
Kata Coder
d1ea65d0a1 [new model] add COLQwen3 code & Inference (#34398)
Signed-off-by: craftsangjae <craftsangjae@gmail.com>
Signed-off-by: katacoder <craftsangjae@gmail.com>
2026-02-14 12:15:19 +08:00
Andreas Karatzas
de42abb366 [CI] Heavy refactoring of Voxtral multimodal audio model tests (#34294)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-13 20:04:29 -08:00
Julien Denize
60ca7981bc Add explicit validation error for tool calls. (#34438)
Signed-off-by: juliendenize <julien.denize@mistral.ai>
2026-02-13 20:04:01 -08:00
Christian S. Perone
0ef5b9147b fix: use __annotations__ instead of get_type_hints() for dynamic kwargs detection (#34527)
Signed-off-by: Christian S. Perone <christian.perone@gmail.com>
Signed-off-by: Christian S. Perone <perone@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-13 20:03:37 -08:00
Shiyan Deng
ed242652d7 [bug] Make sure get_modality_with_max_tokens is deterministic (#34533)
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
2026-02-13 20:02:59 -08:00
Wei Zhao
b37b679770 [Feature][Perf] Support Selective CPU Weight Offloading (#34535)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
2026-02-13 20:02:24 -08:00
Andreas Karatzas
a0638d052d [Bugfix] Fix ROCm UVA CPU weight offloading broken by #32993 (#34543)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-13 20:01:42 -08:00
Harry Huang
c027541eaf [Hybrid] Enable spec decoding in mamba cache align mode (#33705)
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
2026-02-13 13:02:28 -08:00
Ben Browning
fd267bc7b7 [Bugfix]: Fix structured output in multi-turn gpt-oss (#34454)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-13 11:12:48 -08:00
Michael Goin
bfaa559305 Revert "[Bugfix] Fix fused MoE IMA (sans chunking) by using int64 for strides" (#34530) 2026-02-13 10:35:29 -08:00
Richard Zou
87789c8364 [Misc] vLLM's --enforce-eager should turn off compile and cudagraphs only (#34523)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-13 09:52:20 -08:00
Pushpinder Singh
bcd65c1f6a [Bugfix] Replace c10::optional with std::optional in topk kernel (#34467)
Signed-off-by: Pushpinder Singh <pushpindersingh135@gmail.com>
2026-02-13 08:30:23 -08:00
Wei Zhao
59d53066d8 [Feature] Support CPU Offloading without Pytorch Pinned Memory that leads to doubled allocation (#32993)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-13 08:11:26 -08:00
LoganJane
4a9952ec1b [Bugfix] Add quant_config in ViT of Kimi-K2.5 (#34501)
Signed-off-by: LoganJane <LoganJane73@hotmail.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-13 16:05:34 +00:00
Roger Wang
1dae7b7843 [Bugfix] Exclude language_model_only key from MM AOT compile hash but include in model one (#34508)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-13 13:59:00 +00:00
Roger Wang
5885e330ef [Misc] Port Qwen3.5 Configs (#34512)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-13 05:24:25 -08:00
Ilya Boytsov
071d863e20 Extend ColBERT support to non-standard BERT backbones (#34170)
Signed-off-by: Ilya Boytsov <ilya.boytsov@aleph-alpha.com>
2026-02-13 09:53:09 +00:00
Woosuk Kwon
0916e7960b [GDN] Use CPU tensors to build GDN metadata (#34498)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-13 01:24:45 -08:00
Wentao Ye
3d2a026fd0 [Feature] Pipeline Parallel Async send/recv, 2.9% E2E throughput improvement (#33368)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2026-02-13 16:38:16 +08:00
Aaron Hao
dddbff4624 [Core] Move pause and resume functions into engine (#34125)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Aaron Hao <ahao@anyscale.com>
Signed-off-by: hao-aaron <ahao@anyscale.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-02-13 00:15:10 -08:00
Martin Hickey
47e9b63e1a [KVConnector] Clean up redundant code in KV connectors (#34147)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
2026-02-13 00:14:30 -08:00
Matthias Gehre
934acddef9 [Perf] fused_moe: add int4_w4a16 benchmark support and tuning config (#34130)
Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2026-02-13 00:14:27 -08:00
Marek Michalowski
742d214d6e [Bugfix] fix the import path in moe test utils.py (#34245)
Signed-off-by: Marek Michalowski <marek.michalowski@arm.com>
2026-02-13 00:13:45 -08:00
haosdent
4137c5dfa7 [Bug Fix] Fix MambaManager.cache_blocks() crash on null blocks in align mode (#34418)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-02-13 00:13:22 -08:00
Harry Huang
7a8a46ddcb [BugFix] Fix and optimize max_num_blocks_per_req calculation for MambaSpec (#34440)
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
2026-02-13 00:13:14 -08:00
myselvess
bcf0731aa0 [New Model] support new model ovis2.6 (#34426)
Signed-off-by: myselvess <23743269+myselvess@users.noreply.github.com>
2026-02-13 00:12:45 -08:00
Cyrus Leung
ec090c2429 [Refactor] Call renderer for online IO processor request (#34490)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-12 22:48:45 -08:00
Roger Wang
eea3024f43 [Bugfix] Fix mamba state dtype setting for Qwen3-Next and Qwen3.5 (#34489)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-12 22:48:42 -08:00
Cyrus Leung
2f308214c0 [Refactor] Pass full VllmConfig to Renderer (#34485)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-12 22:48:38 -08:00
Cyrus Leung
1b4e8e53f8 [CI/Build] Fix CUDA re-initialization error in distributed model tests (#34491)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-13 06:43:53 +00:00
haosdent
dcf6ee8592 [Bugfix] Fix encoder cache underestimation for GLM-4V/GLM-OCR single image (#34483)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-02-12 21:04:06 -08:00
Cyrus Leung
372b2e762a [Bugfix] Standardize getting number of image patches/tokens (#34358)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-12 20:47:01 -08:00
Andreas Karatzas
6afa587d31 [ROCm][CI] Fix serving tokens test failures (#34047)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-13 11:27:53 +08:00
Cyrus Leung
94ed6cf6ea Add new sections to CODEOWNERS (#34309)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-12 18:39:28 -08:00
Harry Huang
bf37812ca7 [Hybrid] Fix and optimize block-aligned splitting in mamba cache align mode (#33706)
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
2026-02-12 18:21:52 -08:00
Frank Wang
b86bf4417e [Bugfix] Fix Random Dataset Prefix Length Inaccuracy (#33907)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-02-12 18:21:19 -08:00
Yanan Cao
de13dd781f [Kernel] [Helion] [5/N] Add Helion Autotuning infrastructure (#34025)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
2026-02-12 18:21:05 -08:00
LoganJane
62788f99a4 [Bugfix] Delete unused redundant code in Kimi-K2.5 (#34427)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-12 18:18:42 -08:00
Cyrus Leung
ea5ff3a1f6 [Refactor] Simplify BOS/EOS token handling (#34435)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-12 18:18:24 -08:00
bnellnm
04ea31baab [Bugfix] Remove assert that's no longer valid (#34443)
Signed-off-by: Bill Nell <bnell@redhat.com>
2026-02-12 18:18:15 -08:00
Harry Huang
6f019e6e0a [BugFix] Add block_size validation for mamba cache align mode (#34445)
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
2026-02-12 18:18:07 -08:00
Zhuohan Li
d707678dfb Fix num_logprobs parameter description in sampler.py (#34451)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2026-02-12 18:18:03 -08:00
Cyrus Leung
fc22cae4ac [CI/Build] Update video URLs for testing (#34446)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-12 18:15:36 -08:00
Yanan Cao
96161fe978 [Kernel] [Helion] [4/N] Add silu_mul_fp8 Helion kernel (#33373)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
2026-02-12 18:13:12 -08:00
Jaewon
4453ba8d9e [Core] Profiler improvements and lazy initialization (#33198)
Signed-off-by: Jaewon Lee <jaewon@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-02-12 16:16:38 -08:00
Jaewon
aa181c923b [Core] Add sleep level 0 mode with enqueue/wait pattern (#33195)
Signed-off-by: Jaewon Lee <jaewon@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-02-12 16:16:25 -08:00
Alec S
be7370daf3 [Frontend] Enable generic structured_outputs for responses API (#33709)
Signed-off-by: Alec Solder <alecs@fb.com>
Co-authored-by: Alec Solder <alecs@fb.com>
2026-02-12 16:15:48 -08:00
Mengtao (Martin) Yuan
9ea1f598ce Use paged_attention_v1 for sliding window decode in rocm_aiter_fa (#34378)
Signed-off-by: Martin Yuan <myuan@meta.com>
Co-authored-by: Martin Yuan <myuan@meta.com>
2026-02-12 16:14:43 -08:00
amitz-nv
f120bd42d3 [Kernel] Support Flashinfer trtllm fused MoE non gated FP8 & NVFP4 (#33506)
Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
2026-02-12 13:06:58 -08:00
Hashem Hashemi
fac4e96940 small adjustment to wvSplitKrc (#34410)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
2026-02-12 20:26:36 +00:00
Michael Goin
6d4e27ce29 [Bugfix] Enforce DeepGEMM when using sparse_attn_indexer on CUDA (#34374)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-12 12:08:06 -08:00
Andreas Karatzas
4c078fa546 [ROCm][CI] Pin TorchCodec to v0.10.0 for ROCm compatibility (#34447)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-12 18:47:34 +00:00
Patrick von Platen
6c0baee610 [Voxtral Realtime] Refactor & Improve buffering logic (#34428)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-12 09:46:43 -08:00
Patrick von Platen
1100a97621 [Voxstral Realtime] Enable tests (#33803)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2026-02-12 09:43:24 -08:00
xuebwang-amd
766e167821 [ROCm][quantization] improve OCP weight quant parser robust (#34431)
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2026-02-12 09:40:19 -08:00
Isotr0py
becbe24808 [Bugfix] Remove broken raw url GGUF model loading support (#34433)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-12 09:40:01 -08:00
Harry Mellor
679ca5d8d3 Fix MoE for the Transformers modelling backend (#34436)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-12 09:29:42 -08:00
Matthew Bonanni
f2c47886fd [Attention] Add FlashInfer Sparse MLA backend (#33451)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
2026-02-12 17:21:54 +00:00
Nicolò Lucchesi
334c715e0f [Docs] Spec decoding docs warning removal (#34439)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-12 09:01:51 -08:00
Aaron Hao
7b5a8b4a9d [BUG] Reset running requests when clearing cache for pause/resume (#34382)
Signed-off-by: hao-aaron <ahao@anyscale.com>
2026-02-12 16:19:13 +00:00
danisereb
dea63512bb Add config file for fused MoE for Nemotron (TP4, B200) (#34411)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-02-12 06:09:55 -08:00
Douglas Lehr
8a798be929 [ROCm] Enable MXFP4 MoE weight pre-shuffling on gfx950 and update aiter (#34192)
Signed-off-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: tjtanaavllm <tunjian.tan@amd.com>
2026-02-12 05:06:33 -08:00
Cyrus Leung
fb455ed547 [V0 Deprecation] Remove code related to per-request logits processors (#34400)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-12 20:44:28 +08:00
baonudesifeizhai
f5897613fb Fix Mistral config remap to accept compressed-tensors quantization #34028 (#34104)
Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
2026-02-12 08:22:06 +00:00
Louie Tsai
55a1a9563a Vllm CPU benchmark suite improvement (#34128)
Signed-off-by: louie-tsai <louie.tsai@intel.com>
2026-02-12 16:04:44 +08:00
AllenDou
386bfe5d08 [bugfix] refactor FunASR's _get_data_parser (#34397)
Signed-off-by: zixiao <shunli.dsl@alibaba-inc.com>
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
2026-02-12 07:26:49 +00:00
Kyle Sayers
e9cd691132 [Bugfix] Fix Sparse24 Compressed Tensors models (#33446)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-11 23:15:16 -08:00
Yichuan Wang
80f2ba6ea6 Fix DeepSeek-OCR tensor validation for all size variants (#34085)
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-11 22:50:23 -08:00
Lucas Wilkinson
136b0bfa59 [BugFix] Fix DP chunking (#34379)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Bill Nell <bnell@redhat.com>
2026-02-12 06:44:03 +00:00
Cyrus Leung
b96f7314b4 [Refactor] Pass Renderer to Input Processor (#34329)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-11 19:38:11 -08:00
Cyrus Leung
ced2a92f40 [Refactor] Move validation to params definitions (#34362)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-11 19:33:15 -08:00
Runkai Tao
e1d97c38f8 [Bug Fix] Fix naive_block_assignment always defaulting to False due to arg misalignment (#33848)
Signed-off-by: Runkai Tao <rt572@physics.rutgers.edu>
2026-02-12 11:30:57 +08:00
Michael Goin
ec12d39d44 [Bugfix] Fix MTP accuracy for GLM-5 (#34385)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-12 11:08:19 +08:00
Michael Goin
ff1f83b056 [Refactor] Replace activation: str with MoEActivation enum (#33843)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2026-02-11 17:29:32 -08:00
Kevin H. Luu
83b47f67b1 [ci] Integrate AMD tests into CI (#33626)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: khluu <khluu000@gmail.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2026-02-12 08:54:17 +08:00
Micah Williamson
fb7b30c716 [ROCm][CI] Revert Test Groups From mi325_8 to mi325_1 Agent Pool In AMD CI (#34384)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-02-11 15:52:34 -08:00
bnellnm
31d992d215 [Bugfix] Fix some issues with MoERunner PR #32344 (#34371)
Signed-off-by: Bill Nell <bnell@redhat.com>
2026-02-11 14:33:14 -08:00
Wei Zhao
5aff2699bd Fix CI failure - Flashinfer Kernel tests (#34316)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
2026-02-11 14:17:16 -08:00
Raushan Turganbay
527ca32197 [Bugfix] Fix more multimodal tests for transformers V5 (#34334)
Signed-off-by: raushan <raushan@huggingface.co>
2026-02-11 22:02:05 +01:00
Junseo Park
5458eb835d [Bugfix] send None sentinel on final commit so server properly sends transcription.done (#33963)
Signed-off-by: pjs102793 <pjs102793@naver.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-02-11 21:01:53 +00:00
Tomas Ruiz
144d9b7cc8 [Benchmarks] Reduce ready checker log verbosity (#34349)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
2026-02-11 20:57:57 +00:00
elvischenv
83e26c834e [GPT-OSS] Remove unnecessary contiguous (#34337)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2026-02-11 15:29:29 -05:00
TJian
5001211369 [ROCm] [CI] fix test_unrecognized_env (#34350)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-02-11 18:50:44 +00:00
Eldar Kurtić
11c7ace340 [Bugfix] Enable attn quantization of Llama-4 by correctly permuting scales for rope (int8, fp8) (#34243)
Signed-off-by: Your Name <you@example.com>
Co-authored-by: Your Name <you@example.com>
2026-02-11 13:24:22 -05:00
Xinyu Dong
be7f3d5d20 [Bugfix] fix default is_neox_style is True for deepseek (#34353)
Signed-off-by: dongxinyu03 <dongxinyu03@baidu.com>
2026-02-11 18:20:45 +00:00
Isotr0py
0ab06100f4 [Multimodal] Expose mm_processor_kwargs for DummyInputsBuilder (#34330)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-11 09:37:40 -08:00
Xinyu Chen
ffb3d553cc [Model Runner V2] Init cuda graph pool when necessary (#33217)
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
2026-02-11 09:12:13 -08:00
junuxyz
fa7e0bfacf [CI][BugFix] Fix silent failure in shellcheck hook and baseline exist… (#32458)
Signed-off-by: junuxyz <216036880+junuxyz@users.noreply.github.com>
2026-02-11 17:03:48 +00:00
SorenDreano
48134a2c22 [Docs] Fix typo ("defult") and double spacing (#34348)
Signed-off-by: SorenDreano <71752785+SorenDreano@users.noreply.github.com>
Co-authored-by: Soren Dreano <soren@numind.ai>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-11 09:02:27 -08:00
kliuae
64f570ab56 [ROCm] [aiter] Split KV cache update for AiterFlashAttention (#33681)
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
2026-02-11 16:26:44 +00:00
Rohan Potdar
fd618871b4 [Bugfix]: Fix ROCm fusion attn test; use AttentionBackend utils to create kv cache (#33948)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-02-11 11:12:05 -05:00
Harry Mellor
67a42b5a44 Don't try and run GLM-ASR with remote code (#34352)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-11 08:09:40 -08:00
Lucas Wilkinson
c7914d30f9 Reapply [Attention][FA3] Update FA3 to include new swizzle optimization (#34043)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-02-11 07:07:56 -08:00
Adam Binford
1b8756562e Responses harmony system message structured (#34268)
Signed-off-by: Adam Binford <adamq43@gmail.com>
2026-02-11 05:14:28 -08:00
Linda
275e0d2a99 [NVIDIA][test] Tests for flashinfer TRTLLM BF16 MoE (#33715)
Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
2026-02-11 12:38:11 +00:00
Harry Mellor
0f5e55e7a8 Make JAIS compatible with Transformers v5 (#34264)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-11 12:30:37 +00:00
Harry Mellor
1e9204bff3 Make Qwen3VL compatible with Transformers v5 (#34262)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-02-11 04:13:23 -08:00
Li, Jiang
05339a7b20 [Bugfix][CPU] Fix llama4 inference on CPU (#34321)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2026-02-11 19:07:23 +08:00
Harry Mellor
40b8f55358 [Docs] Reduce time spent generating API docs (#34255)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-11 02:56:02 -08:00
Seiji Eicher
5045d5c983 Patch protobuf for CVE-2026-0994 (#34253)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
2026-02-11 02:25:04 -08:00
Nick Hill
e09546cf05 [Frontend] Exploit tokenizers "new stream" in FastIncrementalDetokenizer (#34217)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-11 11:03:24 +01:00
Tianqi Ren
786806dd44 [Doc] Update Marlin support matrix for Turing (#34319)
Signed-off-by: Tianqi Ren <tianqi.r@outlook.com>
2026-02-11 09:03:41 +00:00
Nick Hill
79504027ef [Misc] Bump fastsafetensors version for latest fixes (#34273)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-11 00:30:09 -08:00
Luka Govedič
addac0e653 [torch.compile] Enable AR+rms fusion by default available for -O2 (#34299)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2026-02-11 00:30:00 -08:00
Cyrus Leung
675a22ed66 [Chore] Move BaseRenderer to base.py (#34308)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-11 00:29:51 -08:00
Kunshang Ji
cb9574eb85 [XPU][9/N] clean up existing ipex code/doc (#34111)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-11 00:27:15 -08:00
AllenDou
21dfb842d7 [model] support FunASR model (#33247)
Signed-off-by: zixiao <shunli.dsl@alibaba-inc.com>
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
2026-02-11 07:37:09 +00:00
R3hankhan
d1b837f0ae [CPU] Enable FP16 (Half dtype) support for s390x (#34116)
Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com>
2026-02-11 14:41:42 +08:00
Roger Wang
0b20469c62 [Bugfix] Fix weight naming in Qwen3.5 (#34313)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-10 21:37:14 -08:00
Tyler Michael Smith
d7982daff5 [Bugfix] Fix fused MoE IMA (sans chunking) by using int64 for strides (#34279)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 05:15:52 +00:00
Robert Shaw
9b17c57460 [ModelBash][DSR1 NVFp4] Removed Bf16 Bias Cast (#34298)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-02-11 05:00:00 +00:00
Hashem Hashemi
1b3540e6c6 Threshold fix wvSplitk for occasional CI fails (#34013)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
2026-02-11 03:59:14 +00:00
Matthias Gehre
7a048ee65f [Bugfix] Fix benchmark_moe.py inplace assertion with torch >= 2.9 (#34149)
Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
2026-02-11 03:58:56 +00:00
Cyrus Leung
c9a1923bb4 [Plugin] Simplify IO Processor Plugin interface (#34236)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-10 19:47:39 -08:00
zofia
b482f71e9f [XPU][7/N] enable xpu fp8 moe (#34202)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
2026-02-11 03:33:59 +00:00
Дзержи́нский
1485396abb [Kernel] Apply 256bit LDG/STG To Activation Kernels (#33022)
Signed-off-by: Dzerzhinsky <256908701+AstroVoyager7@users.noreply.github.com>
Signed-off-by: Дзержи́нский <256908701+AstroVoyager7@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-02-10 19:31:51 -08:00
Kebe
5ee5c86eeb [Bugfix][DeepSeek-V3.2] fix fp8 kvcache type cast (#33884)
Signed-off-by: Kebe <mail@kebe7jun.com>
2026-02-10 19:31:36 -08:00
Cyrus Leung
b5dcb372e4 [Misc] Clean up validation logic in input processor (#34144)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-10 19:29:29 -08:00
Tyler Michael Smith
066c6da6a0 [WideEP] Fix nvfp4 DeepEP High Throughput All2All backend (#33738)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-10 19:15:43 -08:00
Richard Zou
e30cedd44b [torch.compile] Stop doing unnecessary FakeTensorProp in PiecewiseCompileInterpreter (#34093)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-10 19:15:40 -08:00
Cyrus Leung
3bcd494ef4 [Redo] Add --trust-remote-code to dataset bench args (#34251)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-11 11:10:12 +08:00
tianshu-Michael-yu
0e725a7d22 [Bugfix] Fix Worker.load_model context-manager composition for sleep mode (#34021)
Signed-off-by: tianshu.yu <tianshuyu.formal@gmail.com>
2026-02-11 11:07:51 +08:00
Lucas Wilkinson
ba0511fd80 [Misc] Add run one batch script that supports profiling (#32968)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-02-10 18:29:49 -08:00
Micah Williamson
4a1550d22d [ROCm][CI] Fix test_sequence_parallel.py location in AMD CI pipeline (#34280)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-02-11 01:08:11 +00:00
bnellnm
d1481ba783 [MoE Refactor] Introduce MoERunner abstraction and move execution logic from FusedMoE to DefaultMoERunner (#32344)
Signed-off-by: Bill Nell <bnell@redhat.com>
2026-02-10 19:51:07 -05:00
7. Sun
dc6de33c3d [CI] Add pip caching to cleanup_pr_body workflow (#32979)
Signed-off-by: 7. Sun <jhao.sun@gmail.com>
2026-02-11 00:45:28 +00:00
Tyler Michael Smith
c4b9e6778f [Misc] Add pre-commit hook to catch boolean ops in with-statements (#34271)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 15:13:20 -08:00
Richard Zou
341eed3d30 [torch.compile] Disable recursive pre_grad_passes (#34092)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-10 18:02:31 -05:00
Zhengkai Zhang
6f2f59f2b3 [Misc][Spec Decode] support different load config for draft model (#34022)
Signed-off-by: zzhengkai <zzhengkai@devgpu049.ldc1.facebook.com>
Co-authored-by: zzhengkai <zzhengkai@devgpu049.ldc1.facebook.com>
2026-02-10 14:52:43 -08:00
Ilya Markov
bb2fc8b5e7 [BugFix] Fix async EPLB hang with DeepEP LL all2all backend (#32860)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
2026-02-10 22:34:47 +00:00
Ilya Markov
67132945bb [Perf] Move eplb rebalance algo to async thread (#30888)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2026-02-10 22:19:10 +00:00
Gregory Shtrasberg
f0ca0671c7 [Feature] Warn about unrecognized environment variables (#33581)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-02-10 15:45:38 -06:00
Pavani Majety
578977bb5e [SM100] Resubmit FMHA FP8 prefill for MLA (#31195)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2026-02-10 16:18:43 -05:00
Roger Wang
9615575afc [Bugfix] Fix mamba cache dtype for Qwen3.5 (#34200)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-10 13:12:31 -08:00
Matthew Bonanni
4293c00b84 [Benchmarks] Fix attention benchmark smoke test (#34269)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-10 16:04:07 -05:00
J Seppänen
506ad7d7c1 [Bugfix] Fix weights offloading for sleep mode (#32947)
Signed-off-by: Jarno Seppänen <jseppanen@nvidia.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2026-02-10 20:38:17 +00:00
Reagan Lee
fdd6f2ad58 Convert online APIs to use Renderer (#34084)
Signed-off-by: Reagan Lee <“reaganjlee@gmail.com”>
Co-authored-by: Reagan Lee <“reaganjlee@gmail.com”>
2026-02-10 19:44:31 +00:00
Qi Wang
33bcd3dc3b [Misc] Introduce ec_both role EC (encoder cache) connector (#34182)
Signed-off-by: Qi Wang <qiwa@nvidia.com>
2026-02-10 18:55:35 +00:00
Michael Goin
1f5febb4b8 [UX nit] Fix non-default api_server_count message (#34152)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-10 10:35:58 -08:00
Andy Lo
ae871ca923 Minor cleanup for Voxtral (#34247)
Signed-off-by: Andy Lo <andy@mistral.ai>
2026-02-10 18:18:30 +00:00
Woosuk Kwon
a2443de5fa [Model Runner V2] Use pinned memory for write_contents (#34222)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-10 08:55:22 -08:00
Harry Mellor
f84a2a8f31 [Docs] Speed up build environment set-up (#34240)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-10 16:34:43 +00:00
Vadim Gimpelson
000214c4bb [BUGFIX] Fix accuracy bugs in Qwen3-Next MTP (#34077)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-02-10 10:57:11 -05:00
junuxyz
c5a66d1697 [Core][BugFix] Fix PP KV cache sharding memory validation (#33698)
Signed-off-by: junuxyz <216036880+junuxyz@users.noreply.github.com>
2026-02-10 10:46:24 -05:00
Roberto L. Castro
afdce12c89 [Perf][Kernel] Add faster topKperRow decode kernel for DeepSeek-V3.2 sparse attention (#33680)
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-10 10:29:52 -05:00
Zhengxu Chen
82e11973cc [compile] Enable AOT compile with 2.10 in trunk. (#34155)
Signed-off-by: Zhengxu Chen <zhxchen17@meta.com>
2026-02-10 23:24:42 +08:00
xuebwang-amd
b129136c7a [ROCm][Quantization] GPT_OSS in amd-quark format model loading and emulations (#29008)
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-10 10:08:05 -05:00
mgazz
599e4335a4 Support benchmarking of Geospatial models (#33922)
Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com>
2026-02-10 07:04:16 -08:00
Fan Yang
a1946570d8 add --insecure arg to the vllm bench to skip TLS (#34026)
Signed-off-by: Fan Yang <yan9fan@meta.com>
Co-authored-by: Fan Yang <yan9fan@meta.com>
2026-02-10 22:23:52 +08:00
Harry Mellor
d0bc520569 Bump mamba-ssm version in CI for Transformers v5 compatibility (#34233)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-10 14:46:01 +01:00
Krish Gupta
748625cdaf [V1][BugFix] Fix EAGLE3 encoder cache miss with disable_chunked_mm_input (#34220)
Signed-off-by: KrxGu <krishom70@gmail.com>
2026-02-10 13:05:32 +00:00
Harry Mellor
61413973e8 Stop testing for slow tokenizers as they will not exist soon (#34235)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-10 12:08:20 +00:00
Phúc H. Lê Khắc
94de871546 [Misc] allow specify is_mm_prefix_lm in hf_config (#34215) 2026-02-10 11:16:21 +00:00
tc-mb
e042d7e685 Add flagos in MiniCPM-o (#34126)
Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Signed-off-by: Vincent-Xiao <vincent.xiao.me@gmail.com>
Co-authored-by: Vincent-Xiao <vincent.xiao.me@gmail.com>
2026-02-10 02:51:48 -08:00
Roger Wang
ae4e280602 [Bugfix] Fix FI kernelchunk_gated_delta_rule output shape for Qwen3.5 (#34219)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-10 10:41:24 +00:00
zzaebok
cbea11c9f0 [Docs] Fix format error in KV load failure recovery doc (#34137)
Signed-off-by: Jaebok Lee <jaebok9541@naver.com>
2026-02-10 02:16:26 -08:00
Cyrus Leung
2c32558a3c [Bugfix] Fix --trust-remote-code conflict (#34218)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-10 00:29:10 -08:00
Zetong Li
5f970120f0 [Bugfix] Fix memory inconsistency in cross-process shared memory (#32022)
Signed-off-by: Zetong Li <slippersss@126.com>
2026-02-10 08:22:03 +00:00
Cyrus Leung
998e2d91f8 Revert #34208 (#34216) 2026-02-09 23:59:04 -08:00
Wentao Ye
e1060a71a1 [Perf] Optimize detokenizer python logic (#32975)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2026-02-09 23:54:41 -08:00
Chen Zhang
97fa8f6590 [BugFix] Avoid prefix cache hit in the same schedule step for mamba layers (#29387)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2026-02-10 07:41:16 +00:00
wang.yuqi
dab1de9f38 [Frontend][CI] Consolidate instrumentator entrypoints (#34123)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-10 07:30:19 +00:00
Balaxxe
8d48d0a9d9 [Bugfix] Sort hf_weights_files in fastsafetensors_weights_iterator to match #33491 (#34190)
Signed-off-by: Balaxxe <136368465+jaim12005@users.noreply.github.com>
2026-02-09 23:06:30 -08:00
Andrew Xia
9608844f96 [responsesAPI] fix simpleContext streaming output_messages (#34188)
Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2026-02-09 22:53:07 -08:00
Cyrus Leung
f69b903b4c [Bugfix] Add --trust-remote-code to dataset bench args (#34208)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-09 22:37:50 -08:00
Lucas Wilkinson
81e217fe6b [Bugfix] Fix DP Attention Padding in Dummy Run (#34187)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
2026-02-10 05:29:39 +00:00
Cyrus Leung
ab97bcf662 [CI/Build] Relax test_mcp_tool_call (#34204)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-10 05:18:57 +00:00
Cyrus Leung
25e48a3aae [Doc] Update usage of --limit-mm-per-prompt (#34148)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-09 21:12:13 -08:00
Roger Wang
8a5e0e2b2b [Bugfix][Core] Fix CPU memory leak from Request reference cycle in prefix caching (#34183)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-10 13:03:32 +08:00
Andreas Karatzas
4cde2e0159 [ROCm][Bugfix] Resolve Dynamo tracing crash from amdsmi calls in on_gfx* arch detection (#34108)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-09 20:50:20 -08:00
Roger Wang
047a457fa4 [Bugfix] Adopt ChunkGatedDeltaRule for Qwen3.5 (#34198)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-10 03:47:54 +00:00
Yuwei An
e94ec59733 [LMCache] Token Base IPC API (#34175)
Signed-off-by: Oasis-Git <ayw.sirius19@gmail.com>
2026-02-10 01:18:42 +00:00
Ning Xie
13397841ab [structured output] validate unsupported json features first (#33233)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2026-02-09 23:49:09 +00:00
Gregory Shtrasberg
c60f8e3b49 [Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-02-09 17:38:54 -06:00
Michael Goin
5e75a14a66 [Doc] Add DCP support to attention backend doc (#33936) 2026-02-09 18:33:43 -05:00
Nick Hill
e7e52781ff [ModelRunner V2][BugFix] Fix max_query_len calculation (#34167)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-09 21:47:17 +00:00
Charlie Fu
bb9f97308d [torch.compile][Fusion] Fix attention fusion pass removing kv_udpate op. (#33945)
Signed-off-by: charlifu <charlifu@amd.com>
2026-02-09 16:15:43 -05:00
Hongxia Yang
4d39650961 [ROCm] update triton branch to support gpt-oss models for gfx11xx devices (#34032)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
2026-02-09 19:36:30 +00:00
Artus Krohn-Grimberghe
8fd31f6245 [Bugfix] Voxtral prompt/audio placeholder alignment (#34140)
Signed-off-by: Artus KG <artuskg@gmail.com>
2026-02-09 19:30:38 +00:00
Artus Krohn-Grimberghe
eadb4e868b [Bugfix] Avoid duplicate k-proj weight emission in helper (#34142)
Signed-off-by: Artus KG <artuskg@gmail.com>
2026-02-09 19:17:44 +00:00
Jiangyun Zhu
285bab4752 [Kernel] use flashinfer for gdn prefill (#32846)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2026-02-09 12:17:25 -05:00
TomerBN-Nvidia
995bbf38f1 [Bugfix] Fix shared expert input for latent MoE in EP+DP (Nemotron-H) (#34087)
Signed-off-by: Tomer Natan <tbarnatan@nvidia.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-09 16:44:18 +00:00
Mohammad Miadh Angkad
d4f123cc48 [Kernel] FlashInfer: switch allreduce fusion to unified API (#33985)
Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com>
2026-02-09 15:43:24 +00:00
ZhengHongming888
cb62e86f83 Add NUMA Core binding in nixl_connector for CPU xPyD (#32365)
Signed-off-by: Hongming Zheng <hongming.zheng@intel.com>
Signed-off-by: ZhengHongming888 <hongming.zheng@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-09 15:39:12 +00:00
Luka Govedič
781ddf7868 [CI][torch.compile] Fix incorrect filtering for E2E fusion tests on B200 (#34031)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2026-02-09 10:05:14 -05:00
Roger Wang
64a9c2528b [UX] Add --language-model-only for hybrid models (#34120)
Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-09 14:57:33 +00:00
Lucas Wilkinson
d0d97e2974 [Misc] Fix up attention benchmarks (#33810)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-09 09:42:03 -05:00
JJJYmmm
9562912cea [MODEL] Adding Support for Qwen3.5 Models (#34110)
Signed-off-by: JJJYmmm <1650675829@qq.com>
Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: wulipc <wulipc@users.noreply.github.com>
Co-authored-by: ywang96 <ywang96@users.noreply.github.com>
Co-authored-by: Isotr0py <Isotr0py@users.noreply.github.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-02-09 21:12:58 +08:00
zofia
9bdb06b436 [XPU][6/N] add xpu scaled_mm kernel (#34117)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
2026-02-09 20:17:35 +08:00
Nikhil Gupta
caad9f1e01 [Fix] [CPU Backend] : Prepack weights for w8a8 oneDNN matmul (#33901)
Signed-off-by: nikhil-arm <nikhil.gupta2@arm.com>
2026-02-09 18:04:41 +08:00
Ekagra Ranjan
1d5922fade [ASR] Fix audio benchmark and add RTFx metric (#32300)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2026-02-09 10:02:37 +00:00
Andreas Karatzas
3025b3cebb [CI] Remove empty image_size_factors for fuyu, glm4_1v, glm_ocr (#34107)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-09 17:37:04 +08:00
Jee Jee Li
978a37c823 [Model] GLM adaptation (#34124) 2026-02-09 17:32:52 +08:00
ihb2032
5a5c43511a fix(cpu): fix mla_decode compilation on x86 without AVX512 (#34052)
Signed-off-by: ihb2032 <hebome@foxmail.com>
Co-authored-by: root <root@LAPTOP-FKNHV411.localdomain>
2026-02-09 08:55:41 +00:00
Nick Hill
d9bede0314 [BugFix] Fix fastsafetensors TP all procs using all GPUs (#34070)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-02-09 15:15:46 +08:00
wang.yuqi
22b64948f6 [Frontend][last/5] Make pooling entrypoints request schema consensus. (#31127)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-09 06:42:38 +00:00
Reagan Lee
7c233dbb36 [Tiny] Rename encoder budget file to more specific name (#34103)
Signed-off-by: Reagan Lee <“reaganjlee@gmail.com”>
Co-authored-by: Reagan Lee <“reaganjlee@gmail.com”>
2026-02-09 03:48:19 +00:00
kourosh hakhamaneshi
a75a5b54c7 [bug-fix] supported_tasks is breaking backward compatibility at init_app_state (#34027)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Signed-off-by: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-09 09:46:46 +08:00
Andrey Talman
f97ca67176 [Release 2.10] Update to Torch 2.10 - final release (#30525) 2026-02-08 13:51:09 -08:00
danisereb
084aa19f02 Add support for ModelOpt MXFP8 dense models (#33786)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-02-08 11:16:48 -08:00
navmarri14
1ecfabe525 glm 4.6 fused tuned inference config for B200 (#32958) 2026-02-08 18:55:47 +00:00
Richard Zou
4df841fe75 [torch.compile] Add an option to force-enable the MOE cold start optimization (#33735)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-08 18:42:56 +00:00
TomerBN-Nvidia
a263aa6140 [BugFix] Change support no act and mul for marlin (#34088)
Signed-off-by: Tomer Natan <tbarnatan@computelab-frontend-8.nvidia.com>
Co-authored-by: Tomer Natan <tbarnatan@computelab-frontend-8.nvidia.com>
2026-02-08 17:18:22 +00:00
aabbccddwasd
179ae7da8f [Revert] Fix performance regression for GLM-4.7-GPTQ decode and MTP acceptance rate (#33771)
Signed-off-by: aabbccddwasd <aabbccddwasd@qq.com>
2026-02-08 08:13:24 -08:00
Reagan Lee
c4df59ad43 Add embedding input functionality for disabled modalities [remake] (#32493)
Signed-off-by: Reagan Lee <“reaganjlee@gmail.com”>
Signed-off-by: Reagan Lee <reaganjlee@gmail.com>
Signed-off-by: Reagan Lee <96998476+reaganjlee@users.noreply.github.com>
Co-authored-by: Reagan Lee <“reaganjlee@gmail.com”>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-08 04:57:16 -08:00
TJian
785cf28fff [ROCm] [CI] Reduce Resource of two test groups (#34059)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-02-08 15:17:26 +08:00
Nick Hill
a96197f564 [Perf] Simplify DeepseekV32 tokenizer, ensure fast detokenization used (#33855)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-08 07:16:34 +00:00
Andreas Karatzas
ab10d79855 [ROCm][Bugfix] fix act_quant_fusion module import error (#34069)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-07 19:21:12 -08:00
Cyrus Leung
7fcb705b80 [CI/Build] Skip GCS test (#34057)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 08:52:38 -08:00
Cyrus Leung
b956cdf818 [Doc] Fix run_batch docs (#34056)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 06:18:16 -08:00
Hashem Hashemi
ed17f54c8b Perf tuning and expansion of cases covered for wvSplitKrc (#33493)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
2026-02-07 05:33:11 -08:00
Jiang Wu
860981d8d8 Make directory exist ok for ray spinning up multiple replicas on a single instance (#33604)
Signed-off-by: Jiang Wu <jwu@cclgroup.com>
2026-02-07 05:30:49 -08:00
zifeitong
52181baaea Update DeepGEMM version pin in Dockerfile to match #32479 (#33935)
Signed-off-by: Zifei Tong <zifeitong@gmail.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-02-07 05:30:22 -08:00
Rohan Potdar
de3869bb4d move checks out of unified_kv_cache_update custom op (#33943)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-02-07 05:30:09 -08:00
whx
ce9b3cd3e9 [PluggableLayer][3/N] Apply PluggableLayer to mamba layers. (#33660)
Signed-off-by: whx-sjtu <2952154980@qq.com>
2026-02-07 05:26:05 -08:00
Jee Jee Li
db4ede9743 [Model] Enable Step3p5ForCausalLM testing (#33755)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2026-02-07 05:25:24 -08:00
Pooya Davoodi
2cb2340f7a [Frontend]Add support for transcriptions and translations to run_batch (#33934)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-07 05:24:57 -08:00
TundeAtSN
4df44c16ba Enable Eagle3 speculative decoding for Mistral3ForConditionalGeneration to support eagle3 (#33939)
Signed-off-by: Akintunde Oladipo <akintunde.oladipo@servicenow.com>
Signed-off-by: TundeAtSN <akintunde.oladipo@servicenow.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-07 05:24:52 -08:00
Richard Zou
81fe69cae5 [torch.compile] Stop compiling identical artifacts (#34003)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-07 05:24:48 -08:00
Mohammad Miadh Angkad
dd6a6e1190 [Kernel] Add KernelConfig flag to enable/disable FlashInfer autotune (#34006)
Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-07 05:24:44 -08:00
Cyrus Leung
edb359cce4 [Renderer] Define render_cmpl and render_chat (#34039)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 05:24:40 -08:00
wang.yuqi
6ed5eda300 [CI][Build] Pin grpcio-tools==1.78.0 (#34048)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-07 05:24:35 -08:00
Cyrus Leung
11a4c9d30d [Misc] Simplify get_max_tokens (#34036)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 00:59:49 -08:00
lukec
15a0b9e570 Fix spelling errors (#33978) 2026-02-06 23:58:50 -08:00
Andreas Karatzas
c490d8cc73 [ROCm][CI] Pinning lm-eval version to resolve multi-modal small eval bug (#34038)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-06 22:21:08 -08:00
Cyrus Leung
48312e579a [Misc] Make PlaceholderRange.get_num_embeds a method (#34035)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 05:30:17 +00:00
Vel
bc32444b23 [Kernel] Add enable_sm120_or_later for SM121 (DGX Spark) CUTLASS support (#33517)
Signed-off-by: code4me2 <velvetmoon222999@gmail.com>
2026-02-06 20:28:01 -08:00
Wentao Ye
18e8545297 [Revert] Add util handle_deprecated back (#33998)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-07 04:14:45 +00:00
果冻虾仁
6f7adc533a fix description in plugin_system.md (#33999) 2026-02-06 19:37:02 -08:00
Nick Hill
40218a82ba [ModelRunner V2] Revert token rank comparison difference for now (#34017)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-07 11:11:05 +08:00
kourosh hakhamaneshi
1c3b22058f [Misc] Add backward-compatible import aliases for renamed translations module (#34015)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-07 11:01:41 +08:00
Xin Yang
3920cafdd6 [Bugfix] Fix _fused_moe_lora_expand signature mismatch (#33821)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-02-07 10:45:59 +08:00
rasmith
ec28784fdc [CI][AMD]Bugfix] Check that model_config is not None in enable_norm_pad_fusion (#34007)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-02-07 02:43:25 +00:00
Nicolò Lucchesi
55aeec04f5 [Bugfix] Fix Whisper tokenization (#34011)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-07 10:42:52 +08:00
Ikenna
906077181b [Bugfix] Fix QK Norm+RoPE fusion pattern matching on B200+FP8 (#33967)
Signed-off-by: Ikenna <ikennachifo@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-07 02:27:33 +00:00
Aaron Hao
89a385d79f [Feat][RL] Pause and Resume with keep requests for single engine (#32351)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Aaron Hao <ahao@anyscale.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-07 00:08:58 +00:00
kourosh hakhamaneshi
4a2d00eafd [bugfix] [ROCm] Fix premature CUDA initialization in platform detection (#33941)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2026-02-06 16:17:55 -06:00
Dimitrios Bariamis
207c3a0c20 Fix RoutingMethodType logic (#33919)
Signed-off-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2026-02-06 14:03:34 -08:00
Sumanth R Hegde
ae2e93f89b [Fix] Fix logprobs=0 handling for /inference/v1/generate endpoint (#34010)
Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
2026-02-06 20:33:40 +00:00
xuebwang-amd
9e9acce577 [Bugfix] Fix no attribute error of SharedFusedMoE (DeepSeek-V3.1 as test model) (#33993)
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
2026-02-06 19:11:32 +00:00
Charlie Fu
fe5438200b [Rocm][Bugfix] Fix dtype not same for gemm_a4w4 op (#33734)
Signed-off-by: charlifu <charlifu@amd.com>
2026-02-06 19:09:59 +00:00
Wentao Ye
77c09e1130 [Refactor] Remove align block size logic in moe_permute (#33449)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-06 10:57:06 -08:00
zhrrr
16786da735 [Model Runner V2] support apply penalty for spec decode (#33251)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
2026-02-06 10:56:48 -08:00
vllmellm
aaa2efbe98 [DOC] [ROCm] Update docker deployment doc (#33971)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 10:05:35 -08:00
Seiji Eicher
aca5967416 [KV Connector] Add missing method overrides to MultiConnector (#33292)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2026-02-06 12:58:21 -05:00
Wentao Ye
67a746e87f [Log] Optimize duplicate startup log (#33944)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-06 17:49:56 +00:00
Chauncey
7bec435130 [Bugfix] Fix the issue where tool calling does not work when using fast detokenization with dsv32 (#33964)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-02-06 09:23:44 -08:00
Eldar Kurtić
5c52644b10 [Docs] Update link to Benchmark CLI documentation (#33254)
Signed-off-by: Eldar Kurtić <8884008+eldarkurtic@users.noreply.github.com>
2026-02-06 16:00:59 +00:00
zofia
2ce9fe4ad0 [XPU][5/N] add wna16 xpu kernel (#33973)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
2026-02-06 15:59:53 +00:00
Cyrus Leung
cd8b405bd0 [Refactor] Consolidate sequence normalization and enc-dec parsing (#33928)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-06 15:43:47 +00:00
tc-mb
4707f7ebb4 [Model] Support MiniCPM-o 4.5 (#33431)
Signed-off-by: caitianchi <caitianchi@modelbest.cn>
Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Co-authored-by: mslv <mslv@baai.ac.cn>
2026-02-06 15:29:10 +00:00
Michael Goin
c39ee9ee2b [Docs] Add sections on process architecture and minimum CPU resources (#33940)
It seems users can be confused about vLLM's performance when running
with very small amounts of CPU cores available. We are missing a clear
overview of what vLLM's process architecture is, so I added this along with
some diagrams in arch_overview.md, and included a section on CPU resource
recommendations in optimization.md

Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-06 15:26:43 +00:00
Andreas Karatzas
350ca72c04 [ROCm][AITER] Fix AITER import regression for explicit backend selection (#33749)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-06 15:08:16 +00:00
FredericOdermatt
1fb0495a72 [FIX] guidance: use max(vocab_size, len(tokenizer)) for n_vocab (#33509)
Signed-off-by: Frederic Odermatt <frederic.odermatt@44ai.ch>
2026-02-06 14:23:03 +00:00
Raushan Turganbay
85ee1d962b [Bugfix] Fix models and tests for transformers v5 (#33977)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 21:47:41 +08:00
Harry Mellor
51a7bda625 Update WeightTransferConfig to be more standard like the others (#33989)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 13:15:00 +00:00
SorenDreano
6e7b1c4b59 [Docs] Improve documentation (#33799)
Co-authored-by: Soren Dreano <soren@numind.ai>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-02-06 12:57:09 +00:00
Kurt Shuster
2991dd3d22 [Bugfix][Model] Support LoRA on Qwen3 Output Embedding (#29816)
Signed-off-by: kurt <kurt@thinkingmachines.ai>
2026-02-06 20:25:31 +08:00
Luka Govedič
ac32e66cf9 [torch.compile] Reorganize vllm/compilation and tests/compile (0/N for vLLM IR) (#33731)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: ProExpertProg <luka.govedic@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-06 04:19:49 -08:00
Fadi Arafeh
f79d9dce16 [CPU][BugFix] Fix loading of w8a8int models with bias (#33582)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2026-02-06 11:59:20 +00:00
Harry Mellor
ba5cbbf107 Bump HF Hub client to get bug fix (#33984)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 11:25:33 +00:00
zhang-prog
233b26ab35 [PaddleOCR-VL] Add BC for transformers 5.0 config (#33976)
Signed-off-by: zhangyue66 <zhangyue66@baidu.com>
2026-02-06 10:33:49 +00:00
Harry Mellor
791a94bed0 Consolidate and fix forbidden import pre-commit checks (#33982)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 01:47:41 -08:00
Xinyu Chen
e969a169ef support view_from_cpu_tensor on XPU (#33868)
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
2026-02-06 08:34:20 +00:00
Harry Mellor
6d8d34be6d Fix main pre-commit (#33975)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 00:08:05 -08:00
Gassan Salama
1363e3d6d5 [cpu][performance] CPU Paged Attention NEON BFMMLA BF16 Implementation (#32263)
Signed-off-by: Gassan <gassan.salama@arm.com>
2026-02-06 15:01:48 +08:00
chengchengpei
965525667b Onboard voyage-4-nano (#33720)
Signed-off-by: Chengcheng Pei <chengchengpei@outlook.com>
Signed-off-by: chengchengpei <5881383+chengchengpei@users.noreply.github.com>
Co-authored-by: chengchengpei <5881383+chengchengpei@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-06 06:23:34 +00:00
sihao_li
6550815c3a [XPU]Replace pip in docker.xpu with uv pip (#31112)
Signed-off-by: sihao.li <sihao.li@intel.com>
2026-02-06 14:02:33 +08:00
Kunshang Ji
7439e4f41b [XPU][4/N] add mxfp4 moe model support (#33679)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-06 13:03:59 +08:00
R3hankhan
ac04dd374f [CPU] Add BF16 Kernel type for s390x (#33788)
Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com>
2026-02-06 04:57:02 +00:00
Cyrus Leung
035a6cb09a [Misc] Update code for encoder-decoder models (#33900)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-06 11:38:39 +08:00
Mingliang Li
a32cb49b60 feat(frontend): early-fail tokenization guard for user requests (#31366)
Signed-off-by: limingliang <limingliang@stepfun.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: limingliang <limingliang@stepfun.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-05 19:38:02 -08:00
Rabi Mishra
20d7454c9b fix(ROCm): Make flash_attn import optional in MLA attention (#33511)
Signed-off-by: rabi <ramishra@redhat.com>
2026-02-06 02:22:53 +00:00
Simon Mo
5819ca8944 [Docs] Add reo analytics (#33957)
Signed-off-by: simon-mo <simon.mo@hey.com>
2026-02-05 17:42:22 -08:00
Xin Yang
79028d4388 [Perf] Disable clean_logits in deepgemm fp8_mqa_logits kernel (#33568) 2026-02-05 20:34:00 -05:00
emricksini-h
325ab6b0a8 [Feature] OTEL tracing during loading (#31162) 2026-02-05 16:59:28 -08:00
Wei Zhao
91a07ff618 [Bugfix] Fix DeepSeek v3.2 tokenizer outputting None issue (#33832)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
2026-02-05 23:50:49 +00:00
Hashem Hashemi
d5c4800112 Adds padding and perf improvements to wvSplitK_fp8 (#33527)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
2026-02-05 22:16:02 +00:00
Lumosis
42d5d705f9 [Minor] Sort safetensors files to ensure deterministic loading order (#33491)
Signed-off-by: Lihao Ran <imlihao.ran@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2026-02-05 17:05:09 -05:00
Cyrus Leung
116880a5a0 [Bugfix] Make MM batching more robust (#33817)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-05 20:40:58 +00:00
Matthew Bonanni
4145e50d85 [Bugfix] Fix DSV3.2 NVFP4 (#33932)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-05 19:22:19 +00:00
Nicolò Lucchesi
20f5d185a6 [Misc] Rename translations to speech_to_text for OAI serving component (#33904)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-05 19:16:52 +00:00
Harry Mellor
1887acca9e Fix tokenizer test for renamed attr on Transformers v5 (#33902)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-05 19:16:20 +00:00
Tsukasa OI
92e7562a99 [Bugfix] Suppress non-TTY color output on the process name part of the log (#29714)
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
2026-02-05 18:47:09 +00:00
Isotr0py
87d0d17ab5 [Models] Consolidate Deepseek-OCR2 processor (#33909)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-05 18:29:20 +00:00
bnellnm
a57c8228ff [Moe Refactor] Make Inplace Flag for FusedMoEModularKernel part of the constructor (#33375)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-05 18:07:18 +00:00
zackyoray
1ee95841bd [Bugfix] Fix swapped engine_ids in NIXL Llama 4 local attention path (#33795)
Signed-off-by: Yoray Zack <yorayz@nvidia.com>
2026-02-05 17:51:58 +00:00
Nicolò Lucchesi
7d8c6804e2 [Misc] Add debug logs (#33931)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-05 09:42:40 -08:00
Benjamin Chislett
af3162d3aa [Spec Decode] Unified Parallel Drafting (#32887)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2026-02-05 12:37:18 -05:00
danisereb
5b2a9422f0 [BugFix] Fix LoRA Fp8 (#33879)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-02-05 17:25:55 +00:00
Aaron Hao
c1858b7ec8 [Feat][RL][1/2] Native Weight Syncing API: NCCL (#31943)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Aaron Hao <ahao@anyscale.com>
Co-authored-by: SumanthRH <sumanthrh99@gmail.com>
2026-02-05 12:13:23 -05:00
Mario Hong
82914d2ae8 [Bugfix] Fix step3p5 parser when using mtp (#33690)
Signed-off-by: mariohong <mariohong128@gmail.com>
2026-02-05 16:04:04 +00:00
Nicolò Lucchesi
81a90e5277 [Docs] Add bart-plugin to docs (#33905)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-05 12:20:25 +00:00
wang.yuqi
1c3a221d3b [Bugfix] Fix corner case of sparse embedding (#33886)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-05 02:51:22 -08:00
Cyrus Leung
7bd42e609d [Refactor] Clean up input preprocessing (#33687)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-05 18:43:42 +08:00
Isotr0py
a2522839d8 [Bugfix] Fix Kimi-K2.5 NVFP4 checkpoints weight loading (#33876)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-05 10:29:54 +00:00
jiahanc
59a5cb387a [perf] Integrate flashinfer concat_mla_k (#31171) 2026-02-05 05:23:11 -05:00
liranschour
8322d4e47f Enable Cross layers KV cache layout at NIXL Connector V2 (#33339)
Signed-off-by: Liran Schour <lirans@il.ibm.com>
Signed-off-by: liranschour <liranschour@users.noreply.github.com>
Co-authored-by: Or Ozeri <or@ozery.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-02-05 02:17:02 -08:00
Andreas Karatzas
3e472e81f9 [ROCm][Bugfix][CI] Fix hybrid models and their tests (Mamba/Jamba/Bamba) (#32710)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
2026-02-05 10:01:23 +00:00
Cyrus Leung
038914b7c8 [Refactor] Move task outside of PoolingParams.verify (#33796)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-05 09:33:11 +00:00
Pavani Majety
d2f4a71cd5 [Bugfix] Kimi-K2 grouped_topk usage for Flashinfer monolithic kernels. (#33858)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2026-02-05 09:32:10 +00:00
Mark McLoughlin
2abd97592f [KV Connector][Metrics] Do not count local prefix cache hits in connector queries (#30522)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2026-02-05 09:57:27 +02:00
Chauncey
6abb0454ad [Perf] Optimize the performance of structured output + reasoning (#33557)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-02-05 15:45:29 +08:00
Li, Jiang
db6f71d4c9 [CI/Build] Fix CPU CI test case title (#33870)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2026-02-05 15:07:14 +08:00
Fadi Arafeh
fd03538bf9 [CPU][BugFix] Allow w8a8 oneDNN quantized matmul to support 3D inputs (#33727)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2026-02-05 06:26:09 +00:00
Andreas Karatzas
1f70313e59 [Bugfix] Fix ScoreMultiModalParam multi-document scoring returning single result (#33837)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-05 06:17:00 +00:00
Li, Jiang
07daee132b [CI/Build] Parallelize CPU CI tests (#33778)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2026-02-05 13:53:48 +08:00
Andrew Xia
9595afda18 [2/N] move responses/serving _make_response_output_items logic to parser (#33281)
Signed-off-by: Andrew Xia <axia@fb.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2026-02-05 13:46:15 +08:00
rasmith
c1395f72cd [CI][AMD][BugFix] Ensure VLLM_ROCM_USE_AITER is set so test_rocm_aiter_topk.py can run correctly (#33840)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-02-05 05:05:48 +00:00
rinbaro
007b183d74 [docs] fix unintentional misspellings (#33863)
Signed-off-by: rinbaro <ilgomishra@gmail.com>
2026-02-04 20:50:59 -08:00
Nick Hill
add9f1fbd9 [Minor] Include StreamingInput in inputs package (#33856)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-05 04:38:20 +00:00
Luka Govedič
e3bf79ffa0 Revert "[Attention][FA3] Update FA3 to include new swizzle optimization" (#33841) 2026-02-04 19:54:27 -08:00
Andreas Karatzas
fb1270f1f8 [CI][Bugfix]: return McpCall for built-in MCP tools in non-streaming mode (#32762)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-05 11:14:06 +08:00
Kevin H. Luu
72bb24e2db [release] Minor fixes to release annotation (#33849)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-02-05 02:07:35 +00:00
Chauncey
a7be77beef [Bugfix] fix DeepSeek R1 with CUTLASS MLA Broken on B200 (#33637)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-02-05 01:28:36 +00:00
1260 changed files with 101937 additions and 27281 deletions

View File

@@ -1,6 +1,7 @@
group: Hardware group: Hardware - AMD Build
steps: steps:
- label: "AMD: :docker: build image" - label: "AMD: :docker: build image"
key: image-build-amd
depends_on: [] depends_on: []
device: amd_cpu device: amd_cpu
no_plugin: true no_plugin: true
@@ -9,7 +10,7 @@ steps:
docker build docker build
--build-arg max_jobs=16 --build-arg max_jobs=16
--build-arg REMOTE_VLLM=1 --build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}" --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm -f docker/Dockerfile.rocm

View File

@@ -1,8 +0,0 @@
group: Hardware
steps:
- label: "Arm CPU Test"
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -0,0 +1,100 @@
group: CPU
depends_on: []
steps:
- label: CPU-Kernel Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- cmake/cpu_extension.cmake
- CMakeLists.txt
- vllm/_custom_ops.py
- tests/kernels/attention/test_cpu_attn.py
- tests/kernels/moe/test_cpu_fused_moe.py
- tests/kernels/test_onednn.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Language Generation and Pooling Model Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- vllm/
- tests/models/language/generation/
- tests/models/language/pooling/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model"
- label: CPU-Quantization Model Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- vllm/model_executor/layers/quantization/cpu_wna16.py
- vllm/model_executor/layers/quantization/gptq_marlin.py
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
- tests/quantization/test_compressed_tensors.py
- tests/quantization/test_cpu_wna16.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
- label: CPU-Distributed Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/shm.cpp
- vllm/v1/worker/cpu_worker.py
- vllm/v1/worker/gpu_worker.py
- vllm/v1/worker/cpu_model_runner.py
- vllm/v1/worker/gpu_model_runner.py
- vllm/platforms/cpu.py
- vllm/distributed/parallel_state.py
- vllm/distributed/device_communicators/cpu_communicator.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
- label: CPU-Multi-Modal Model Tests %N
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
# - vllm/
- vllm/model_executor/layers/rotary_embedding
- tests/models/multimodal/generation/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
parallelism: 2
- label: "Arm CPU Test"
depends_on: []
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -1,13 +1,6 @@
group: Hardware group: Hardware
depends_on: ~ depends_on: ~
steps: steps:
- label: "Intel CPU Test"
soft_fail: true
device: intel_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
- label: "Intel HPU Test" - label: "Intel HPU Test"
soft_fail: true soft_fail: true
device: intel_hpu device: intel_hpu

View File

@@ -8,7 +8,7 @@ clean_docker_tag() {
} }
print_usage_and_exit() { print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>" echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
exit 1 exit 1
} }
@@ -142,11 +142,16 @@ resolve_parent_commit() {
print_bake_config() { print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration" echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json" # Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local bake_tmp
bake_tmp="$(mktemp -d)"
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}" echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite" echo "--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}" (cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
} }
################################# #################################
@@ -154,7 +159,7 @@ print_bake_config() {
################################# #################################
print_instance_info print_instance_info
if [[ $# -lt 7 ]]; then if [[ $# -lt 5 ]]; then
print_usage_and_exit print_usage_and_exit
fi fi
@@ -163,10 +168,8 @@ REGISTRY=$1
REPO=$2 REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
BRANCH=$4 BRANCH=$4
VLLM_USE_PRECOMPILED=$5 IMAGE_TAG=$5
VLLM_MERGE_BASE_COMMIT=$6 IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
# build config # build config
TARGET="test-ci" TARGET="test-ci"
@@ -193,8 +196,6 @@ export CACHE_FROM
export CACHE_FROM_BASE_BRANCH export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN export CACHE_FROM_MAIN
export CACHE_TO export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args # print args
echo "--- :mag: Arguments" echo "--- :mag: Arguments"
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}" echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}" echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}" echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}" echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}" echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"

View File

@@ -3,9 +3,9 @@ steps:
- label: ":docker: Build image" - label: ":docker: Build image"
key: image-build key: image-build
depends_on: [] depends_on: []
timeout_in_minutes: 600
commands: commands:
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
retry: retry:
automatic: automatic:
- exit_status: -1 # Agent was lost - exit_status: -1 # Agent was lost

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
# authenticate with AWS ECR # authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists # skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
echo "Image not found, proceeding with build..." echo "Image not found, proceeding with build..."
else else
echo "Image found" echo "Image found"
@@ -24,13 +24,13 @@ fi
# build # build
docker build --file docker/Dockerfile.cpu \ docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \ --build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg VLLM_CPU_AVX512BF16=true \ --build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \ --build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \ --build-arg VLLM_CPU_AMXBF16=true \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--target vllm-test \ --target vllm-test \
--progress plain . --progress plain .
# push # push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
# authenticate with AWS ECR # authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists # skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
echo "Image not found, proceeding with build..." echo "Image not found, proceeding with build..."
else else
echo "Image found" echo "Image found"
@@ -24,10 +24,10 @@ fi
# build # build
docker build --file docker/Dockerfile.cpu \ docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \ --build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
--target vllm-test \ --target vllm-test \
--progress plain . --progress plain .
# push # push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
# authenticate with AWS ECR # authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists # skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
echo "Image not found, proceeding with build..." echo "Image not found, proceeding with build..."
else else
echo "Image found" echo "Image found"
@@ -25,10 +25,10 @@ fi
docker build \ docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \ --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \ --build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
--progress plain \ --progress plain \
https://github.com/vllm-project/vllm-gaudi.git https://github.com/vllm-project/vllm-gaudi.git
# push # push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm. # We can use this script to compute baseline accuracy on chartqa for vllm.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2" # pip install "lm-eval[api]>=0.4.11"
usage() { usage() {
echo`` echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \ --tasks chartqa \
--batch_size auto \ --batch_size auto \
--apply_chat_template \ --apply_chat_template \
--limit $LIMIT --limit "$LIMIT"

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2" # pip install "lm-eval[api]>=0.4.11"
usage() { usage() {
echo`` echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2" # pip install "lm-eval[api]>=0.4.11"
usage() { usage() {
echo`` echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2" # pip install "lm-eval[api]>=0.4.11"
usage() { usage() {
echo`` echo``
@@ -20,14 +20,11 @@ usage() {
echo echo
} }
while getopts "m:b:l:f:t:" OPT; do while getopts "m:l:f:t:" OPT; do
case ${OPT} in case ${OPT} in
m ) m )
MODEL="$OPTARG" MODEL="$OPTARG"
;; ;;
b )
BATCH_SIZE="$OPTARG"
;;
l ) l )
LIMIT="$OPTARG" LIMIT="$OPTARG"
;; ;;

View File

@@ -9,8 +9,10 @@ import json
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from importlib import util from importlib import util
from pathlib import Path
import pandas as pd import pandas as pd
import regex as re
pd.options.display.float_format = "{:.2f}".format pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None plotly_found = util.find_spec("plotly.express") is not None
@@ -275,6 +277,131 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="") return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
"""
name = "sheet" if name is None else str(name)
name = re.sub(r"[:\\/?*\[\]]", "_", name)
name = name.strip().strip("'")
name = re.sub(r"\s+", " ", name)
if not name:
name = "sheet"
return name[:31]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
    """Derive the base worksheet name for one report group.

    ``group_cols`` and ``gkey_tuple`` are zipped into a column -> value
    mapping. The name is the last ``/``-separated component of the "Model"
    value (``"model"`` when absent), followed by
    ``_<Input Len>x<Output Len>`` when both lengths are present, and is then
    passed through ``_sanitize_sheet_name``.
    """
    key_by_col = dict(zip(group_cols, gkey_tuple))
    short_model = str(key_by_col.get("Model", "model")).split("/")[-1]
    in_len = key_by_col.get("Input Len", "")
    out_len = key_by_col.get("Output Len", "")
    if in_len != "" and out_len != "":
        suffix = f"_{in_len}x{out_len}"
    else:
        suffix = ""
    return _sanitize_sheet_name(f"{short_model}{suffix}")
def _write_tables_to_excel_sheet(
    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
    """Stack titled tables vertically on a single worksheet.

    For every ``(title, table)`` pair the title is written alone on one row
    and the table (header included, index omitted) starts on the row below.
    Two empty rows are left before the next title.
    """
    row = 0
    for title, table in blocks:
        title_cell = pd.DataFrame([[title]])
        title_cell.to_excel(
            writer, sheet_name=sheet, index=False, header=False, startrow=row
        )
        table.to_excel(writer, sheet_name=sheet, index=False, startrow=row + 1)
        # 1 title row + 1 header row + len(table) data rows + 2 spacer rows.
        row += len(table) + 4
def _safe_filename(s: str) -> str:
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
return s[:180] if len(s) > 180 else s
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
section = "General"
rows: list[dict] = []
def set_section(s: str):
nonlocal section
s = (s or "").strip()
if s:
section = s
for raw in lines:
stripped = raw.strip()
if not stripped:
continue
# divider lines like =====
if set(stripped) <= {"="}:
continue
# section header heuristic: short standalone line
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
if stripped.lower().startswith("collecting environment information"):
continue
set_section(stripped)
continue
# env var style: KEY=VALUE (and not a URL with :)
if "=" in stripped and ":" not in stripped:
k, v = stripped.split("=", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
# key: value
if ":" in stripped:
k, v = stripped.split(":", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# ----------------------------- # -----------------------------
# Valid max concurrency summary helpers # Valid max concurrency summary helpers
# ----------------------------- # -----------------------------
@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows) summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns: for c in summary_df.columns:
if c == "Configuration": if c == "Configuration":
continue continue
@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)" both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {} formatters = {}
for c in summary_df.columns: for c in summary_df.columns:
if c == "Configuration": if c == "Configuration":
continue continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}" formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters) styler = summary_df.style.format(formatters)
@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"') return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# ----------------------------- # -----------------------------
# Plot helper # Plot helper
# ----------------------------- # -----------------------------
@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0, default=100.0,
help="Reference limit for TPOT plots (ms)", help="Reference limit for TPOT plots (ms)",
) )
# ---- NEW: export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser return parser
@@ -657,7 +885,6 @@ def maybe_write_plot(
markers=True, markers=True,
) )
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>") fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f") fig.update_yaxes(tickformat=".2f")
@@ -730,6 +957,27 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items() for metric_label, (df, _) in metric_cache.items()
} }
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
if csv_dir:
csv_dir.mkdir(parents=True, exist_ok=True)
excel_path = args.excel_out or "perf_comparison.xlsx"
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n') main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys: for gkey in group_keys:
@@ -744,6 +992,16 @@ def write_report_group_first(
) )
main_fh.write(group_header) main_fh.write(group_header)
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
dedup_i = 1
while sheet in xw.sheets:
dedup_i += 1
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
excel_blocks: list[tuple[str, pd.DataFrame]] = []
with open(sub_path, "w", encoding="utf-8") as sub_fh: with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n') sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header) sub_fh.write(group_header)
@@ -765,7 +1023,6 @@ def write_report_group_first(
f"{_html.escape(metric_label)} — missing for this group" f"{_html.escape(metric_label)} — missing for this group"
"</div>\n" "</div>\n"
) )
main_fh.write(missing) main_fh.write(missing)
sub_fh.write(missing) sub_fh.write(missing)
continue continue
@@ -801,6 +1058,17 @@ def write_report_group_first(
args=args, args=args,
) )
excel_blocks.append(
(metric_label, display_group.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html( summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df, tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df, ttft_group_df=ttft_group_df,
@@ -812,6 +1080,29 @@ def write_report_group_first(
main_fh.write(summary_html) main_fh.write(summary_html)
sub_fh.write(summary_html) sub_fh.write(summary_html)
summary_df = build_valid_max_concurrency_summary_df(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")
def main(): def main():
args = build_parser().parse_args() args = build_parser().parse_args()

View File

@@ -1,6 +1,4 @@
#!/bin/bash #!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory # This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/ # Benchmarking results will be available inside vllm/benchmarks/results/
@@ -9,14 +7,19 @@
set -x set -x
set -o pipefail set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
check_gpus() { check_gpus() {
if command -v nvidia-smi; then if command -v nvidia-smi; then
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
elif command -v amd-smi; then elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
elif command -v hl-smi; then elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l) declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
fi fi
if [[ $gpu_count -gt 0 ]]; then if [[ $gpu_count -gt 0 ]]; then
@@ -44,7 +47,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then if [[ $numa_count -gt 0 ]]; then
echo "NUMA found." echo "NUMA found."
echo $numa_count echo "$numa_count"
else else
echo "Need at least 1 NUMA to run benchmarking." echo "Need at least 1 NUMA to run benchmarking."
exit 1 exit 1
@@ -112,13 +115,12 @@ json2envs() {
} }
wait_for_server() { wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200" local timeout_val="1200"
timeout "$timeout_val" bash -c ' timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do until curl -sf http://localhost:8000/v1/models >/dev/null; do
sleep 1 sleep 1
done' && return 0 || return 1 done
'
} }
kill_processes_launched_by_current_bash() { kill_processes_launched_by_current_bash() {
@@ -252,37 +254,16 @@ run_benchmark_tests() {
done done
} }
run_latency_tests() { run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_benchmark_tests "latency" "$1" run_startup_tests() { run_benchmark_tests "startup" "$1"; }
} run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
run_startup_tests() { merge_serving_tests_stream() {
run_benchmark_tests "startup" "$1" # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
} # This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1"
run_throughput_tests() { # shellcheck disable=SC2016
run_benchmark_tests "throughput" "$1" local merged='
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '
if type == "array" then if type == "array" then
# Plain format: test cases array # Plain format: test cases array
.[] .[]
@@ -304,7 +285,50 @@ run_serving_tests() {
else else
error("Unsupported serving test file format: must be array or object with .tests") error("Unsupported serving test file format: must be array or object with .tests")
end end
' "$serving_test_file" | while read -r params; do '
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -373,7 +397,7 @@ run_serving_tests() {
echo "Server command: $server_command" echo "Server command: $server_command"
# support remote vllm server # support remote vllm server
client_remote_args="" client_remote_args=""
if [[ -z "${REMOTE_HOST}" ]]; then if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$server_command" & bash -c "$server_command" &
server_pid=$! server_pid=$!
# wait until the server is alive # wait until the server is alive
@@ -384,6 +408,9 @@ run_serving_tests() {
echo "" echo ""
echo "vLLM failed to start within the timeout period." echo "vLLM failed to start within the timeout period."
fi fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then if [[ ${REMOTE_PORT} ]]; then
@@ -402,14 +429,12 @@ run_serving_tests() {
for qps in $qps_list; do for qps in $qps_list; do
# remove the surrounding single quote from qps # remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf" qps="inf"
echo "now qps is $qps"
fi fi
# iterate over different max_concurrency # iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do for max_concurrency in $max_concurrency_list; do
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
echo " new test name $new_test_name" echo " new test name $new_test_name"
# pass the tensor parallel size, the compilation mode, and the optimization # pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard # level to the client so that they can be used on the benchmark dashboard
@@ -425,7 +450,9 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command" bash -c "$client_command"
fi
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
@@ -443,12 +470,15 @@ run_serving_tests() {
done done
# clean up # clean up
kill -9 $server_pid if [[ "${DRY_RUN:-0}" != "1" ]]; then
kill -9 "$server_pid"
kill_gpu_processes kill_gpu_processes
fi
done done
} }
main() { main() {
local ARCH local ARCH
ARCH='' ARCH=''
if [[ "$ON_CPU" == "1" ]]; then if [[ "$ON_CPU" == "1" ]]; then
@@ -458,7 +488,13 @@ main() {
check_gpus check_gpus
ARCH="$arch_suffix" ARCH="$arch_suffix"
fi fi
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
# dependencies # dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -479,11 +515,16 @@ main() {
# dump vllm info via vllm collect-env # dump vllm info via vllm collect-env
env_output=$(vllm collect-env) env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking # benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}" run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

View File

@@ -0,0 +1,41 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -0,0 +1,283 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@@ -148,136 +148,6 @@
"random-input-len": 2048, "random-input-len": 2048,
"random-output-len": 128 "random-output-len": 128
} }
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
} }
] ]
} }

View File

@@ -27,7 +27,7 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-
To download and upload the image: To download and upload the image:
\`\`\` \`\`\`
Download images: # Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
@@ -35,8 +35,12 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
Tag and push images: # Tag and push images:
## CUDA
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -62,34 +66,21 @@ docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-a
docker push vllm/vllm-openai:latest-aarch64-cu130 docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm ## ROCm
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker push vllm/vllm-openai-rocm:latest docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker manifest rm vllm/vllm-openai:latest ## CPU
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
# CPU images (vllm/vllm-openai-cpu)
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64 docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
@@ -103,6 +94,20 @@ docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-a
docker push vllm/vllm-openai-cpu:latest-arm64 docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
# Create multi-arch manifest:
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai-cpu:latest || true docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64 docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64

View File

@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com" S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
# Format ROCm version for path (e.g., "7.1" -> "rocm710") # Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')" ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}" ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases ## ROCm Wheel and Docker Image Releases

View File

@@ -83,7 +83,7 @@ case "${1:-}" in
exit 1 exit 1
fi fi
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l) WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2 echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1 exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in
echo "" echo ""
echo "Downloaded wheels:" echo "Downloaded wheels:"
ls -lh artifacts/rocm-base-wheels/ find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l) WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
echo "" echo ""
echo "Total: $WHEEL_COUNT wheels" echo "Total: $WHEEL_COUNT wheels"
echo "========================================" echo "========================================"

View File

@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file # Store PR data in a temp file
PR_DATA=$(mktemp) PR_DATA=$(mktemp)
trap "rm -f $PR_DATA" EXIT trap 'rm -f "$PR_DATA"' EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \ if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \ --limit 1000 \

View File

@@ -1,25 +1,37 @@
#!/bin/bash #!/bin/bash
# This script runs test inside the corresponding ROCm docker container. # This script runs tests inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
set -o pipefail set -o pipefail
# Export Python path # Export Python path
export PYTHONPATH=".." export PYTHONPATH=".."
# Print ROCm version ###############################################################################
echo "--- Confirming Clean Initial State" # Helper Functions
while true; do ###############################################################################
sleep 3
wait_for_clean_gpus() {
local timeout=${1:-300}
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\"" echo "GPUs state is \"clean\""
break return
fi fi
done if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
echo "--- ROCm info"
rocminfo
# cleanup older docker images
cleanup_docker() { cleanup_docker() {
# Get Docker's root directory # Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}') docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -28,15 +40,12 @@ cleanup_docker() {
exit 1 exit 1
fi fi
echo "Docker root directory: $docker_root" echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70 threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed." echo "Docker images and volumes cleanup completed."
else else
@@ -45,31 +54,166 @@ cleanup_docker() {
} }
cleanup_network() { cleanup_network() {
for node in $(seq 0 $((NUM_NODES-1))); do local max_nodes=${NUM_NODES:-2}
if docker pr -a -q -f name="node${node}" | grep -q .; then for node in $(seq 0 $((max_nodes - 1))); do
docker stop "node${node}" if docker ps -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}" || true
fi fi
done done
if docker network ls | grep docker-net; then if docker network ls | grep -q docker-net; then
docker network rm docker-net docker network rm docker-net || true
fi fi
} }
# Call the cleanup docker function is_multi_node() {
local cmds="$1"
# Primary signal: NUM_NODES environment variable set by the pipeline
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
return 0
fi
# Fallback: detect the bracket syntax structurally
# Pattern: [...] && [...] (per-node command arrays)
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
return 0
fi
return 1
}
###############################################################################
# Pytest marker re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around pytest -m marker expressions get stripped:
# pytest -v -s -m 'not cpu_test' v1/core
# becomes:
# pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
# This function detects unquoted multi-word marker expressions and re-quotes
# them so they survive the final bash -c expansion.
###############################################################################
re_quote_pytest_markers() {
  # Restore the single quotes around "-m not <identifier>" pytest marker
  # expressions in a command string.
  #
  # Arguments: $1 - command string to rewrite
  # Outputs:   the rewritten command string on stdout
  #
  # Only the two-word form "not <identifier>" is recognised. An expression
  # that is still quoted is left untouched, because the character after
  # "-m " is then a quote and the pattern does not match.
  local raw_commands="$1"
  local requoted
  # sed program: wrap every "-m not NAME" occurrence as "-m 'not NAME'".
  local -r marker_rule="s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g"

  requoted=$(echo "$raw_commands" | sed -E "$marker_rule")
  echo "$requoted"
}
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
apply_rocm_test_overrides() {
  # Apply the ROCm-specific rewrites to a test command string.
  #
  # Arguments: $1 - command string to rewrite
  # Outputs:   the rewritten command string on stdout
  #
  # Rewrites are applied in this order:
  #   1. models/test_registry.py gets a -k filter deselecting four models.
  #   2. "pytest -v -s lora" is prefixed with VLLM_ROCM_CUSTOM_PAGED_ATTN=0.
  #   3. For each kernels/<suite> mentioned, --ignore flags are appended to
  #      the END of the whole command string.
  #   4. For each " entrypoints/<suite> " mentioned, --ignore flags are
  #      inserted directly after the directory argument.
  local pipeline="$1"
  local suite test_file ignore_flags
  local -a ignored=()

  # --- Model registry filter ---
  local -r registry_cmd="pytest -v -s models/test_registry.py"
  if [[ $pipeline == *"$registry_cmd"* ]]; then
    pipeline=${pipeline//"$registry_cmd"/"$registry_cmd -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
  fi

  # --- LoRA: disable custom paged attention ---
  local -r lora_cmd="pytest -v -s lora"
  if [[ $pipeline == *"$lora_cmd"* ]]; then
    pipeline=${pipeline//"$lora_cmd"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 $lora_cmd"}
  fi

  # --- Kernel ignores (appended to the end of the command string) ---
  for suite in core attention quantization mamba moe; do
    [[ $pipeline == *" kernels/${suite}"* ]] || continue
    case "$suite" in
      core)
        ignored=(
          test_fused_quant_layernorm.py
          test_permute_cols.py
        )
        ;;
      attention)
        ignored=(
          test_attention_selector.py
          test_encoder_decoder_attn.py
          test_flash_attn.py
          test_flashinfer.py
          test_prefix_prefill.py
          test_cascade_flash_attn.py
          test_mha_attn.py
          test_lightning_attn.py
          test_attention.py
        )
        ;;
      quantization)
        ignored=(
          test_int8_quant.py
          test_machete_mm.py
          test_block_fp8.py
          test_block_int8.py
          test_marlin_gemm.py
          test_cutlass_scaled_mm.py
          test_int8_kernel.py
        )
        ;;
      mamba)
        ignored=(
          test_mamba_mixer2.py
          test_causal_conv1d.py
          test_mamba_ssm_ssd.py
        )
        ;;
      moe)
        ignored=(
          test_moe.py
          test_cutlass_moe.py
          test_triton_moe_ptpc_fp8.py
        )
        ;;
    esac
    for test_file in "${ignored[@]}"; do
      pipeline+=" --ignore=kernels/${suite}/${test_file}"
    done
  done

  # --- Entrypoint ignores (inserted right after the directory argument) ---
  for suite in openai llm; do
    [[ $pipeline == *" entrypoints/${suite} "* ]] || continue
    case "$suite" in
      openai)
        ignored=(
          test_audio.py
          test_shutdown.py
          test_completion.py
          test_models.py
          test_lora_adapters.py
          test_return_tokens_as_ids.py
          test_root_path.py
          test_tokenization.py
          test_prompt_validation.py
        )
        ;;
      llm)
        ignored=(
          test_chat.py
          test_accuracy.py
          test_init.py
          test_prompt_validation.py
        )
        ;;
    esac
    # Each flag carries a trailing space so the replacement ends with one,
    # keeping the following argument separated.
    ignore_flags=""
    for test_file in "${ignored[@]}"; do
      ignore_flags+="--ignore=entrypoints/${suite}/${test_file} "
    done
    pipeline=${pipeline//" entrypoints/${suite} "/" entrypoints/${suite} ${ignore_flags}"}
  done

  # Collapse any literal " \ " sequence in the command string to one space.
  pipeline=$(echo "$pipeline" | sed 's/ \\ / /g')

  echo "$pipeline"
}
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
cleanup_docker cleanup_docker
echo "--- Resetting GPUs" echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
while true; do # --- Pull test image ---
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container" echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
@@ -80,144 +224,74 @@ remove_docker_container() {
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
# --- Prepare commands ---
echo "--- Running container" echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface" HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}" mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
commands=$@ commands="$*"
echo "Raw commands: $commands" echo "Raw commands: $commands"
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"} # Fix quoting before ROCm overrides (so overrides see correct structure)
commands=$(re_quote_pytest_markers "$commands")
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then commands=$(apply_rocm_test_overrides "$commands")
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
commands=$(echo "$commands" | sed 's/ \\ / /g')
echo "Final commands: $commands" echo "Final commands: $commands"
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
MYPYTHONPATH=".." MYPYTHONPATH=".."
# Test that we're launching on the machine that has # Verify GPU access
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3) render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2 echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1 exit 1
fi fi
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then # --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
echo "--- Multi-node job detected"
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/') export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then # Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g') # BASH_REMATCH[1] = prefix (everything before first bracket)
# BASH_REMATCH[2] = comma-separated node0 commands
# BASH_REMATCH[3] = comma-separated node1 commands
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}" echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
myIFS=$IFS
IFS=','
read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'" export composite_command="(command rocm-smi || true)"
echo "COMMANDS: ${commands}" saved_IFS=$IFS
composite_command=$(echo "${composite_command} && ${commands}") IFS=','
read -ra node0 <<< "${BASH_REMATCH[2]}"
read -ra node1 <<< "${BASH_REMATCH[3]}"
IFS=$saved_IFS
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
fi
for i in "${!node0[@]}"; do
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${step_cmd}"
composite_command="${composite_command} && ${step_cmd}"
done done
/bin/bash -c "${composite_command}" /bin/bash -c "${composite_command}"
cleanup_network cleanup_network
else else
echo "Failed to parse node commands! Exiting." echo "Multi-node job detected but failed to parse bracket command syntax."
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
echo "Got: $commands"
cleanup_network cleanup_network
exit 111 exit 111
fi fi
else else
echo "--- Single-node job"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \

View File

@@ -0,0 +1,26 @@
#!/bin/bash
# Online-serving smoke test. For each parallelism layout it starts
# `vllm serve` in the background, waits until the API answers on
# localhost:8000, runs a short `vllm bench serve` load against it, and then
# shuts the server down before the next layout is started.
set -euox pipefail

MODEL="meta-llama/Llama-3.2-3B-Instruct"
server_pid=""

# Stop the background server, if one is recorded, and wait for it to exit.
# Sends SIGTERM, polls for up to 60 s, then sends SIGKILL as a last resort.
# Always returns 0 so it is safe to call from the EXIT trap.
stop_server() {
  [[ -n "${server_pid}" ]] || return 0
  kill -s SIGTERM "${server_pid}" 2>/dev/null || true
  local tries=0
  while (( tries < 60 )) && kill -0 "${server_pid}" 2>/dev/null; do
    sleep 1
    tries=$(( tries + 1 ))
  done
  kill -s SIGKILL "${server_pid}" 2>/dev/null || true
  # Reap the child; its exit status (e.g. 143 after SIGTERM) is expected.
  wait "${server_pid}" 2>/dev/null || true
  server_pid=""
}
# Without this, a failing benchmark under `set -e` would leave the server
# running in the background.
trap stop_server EXIT

# Run one serve + benchmark cycle.
#   $1    - label printed as the log section header
#   $2... - extra arguments passed to `vllm serve`
run_serving_benchmark() {
  local label=$1
  shift
  echo "--- ${label}"
  vllm serve "${MODEL}" "$@" &
  server_pid=$!
  timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
  vllm bench serve \
    --backend vllm \
    --dataset-name random \
    --model "${MODEL}" \
    --num-prompts 20 \
    --endpoint /v1/completions
  # Wait for the server to be gone so the next one can bind port 8000 and the
  # readiness poll cannot be answered by the server that is shutting down.
  stop_server
}

run_serving_benchmark "PP+TP" -tp=2 -pp=2
run_serving_benchmark "DP+TP" -tp=2 -dp=2

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1 export TORCH_COMPILE_DISABLE=1
set -xve set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
# Run basic model test # Run basic model test
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
} }
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.

View File

@@ -2,119 +2,19 @@
# This script build the CPU docker image and run the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
set -ex set -euox pipefail
# allow to bind to different cores # allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95} CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1} NUMA_NODE=${NUMA_NODE:-1}
IMAGE_NAME="cpu-test-$NUMA_NODE"
TIMEOUT_VAL=$1
TEST_COMMAND=$2
export CMAKE_BUILD_PARALLEL_LEVEL=32 # building the docker image
echo "--- :docker: Building Docker image"
# Setup cleanup docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
function cpu_tests() {
set -e
export NUMA_NODE=$2
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Run AWQ/GPTQ test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_cpu_wna16.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/lora/test_qwenvl.py"
# online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@@ -7,7 +7,7 @@ set -exuo pipefail
# Try building the docker image # Try building the docker image
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}" image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container" container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t ${image_name} -f - . cat <<EOF | docker build -t "${image_name}" -f - .
FROM gaudi-base-image:latest FROM gaudi-base-image:latest
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
@@ -39,12 +39,12 @@ EOF
# functions, while other platforms only need one remove_docker_container # functions, while other platforms only need one remove_docker_container
# function. # function.
EXITCODE=1 EXITCODE=1
remove_docker_containers() { docker rm -f ${container_name} || true; } remove_docker_containers() { docker rm -f "${container_name}" || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers remove_docker_containers
echo "Running HPU plugin v1 test" echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name=${container_name} --network=host \ docker run --rm --runtime=habana --name="${container_name}" --network=host \
-e HABANA_VISIBLE_DEVICES=all \ -e HABANA_VISIBLE_DEVICES=all \
-e VLLM_SKIP_WARMUP=true \ -e VLLM_SKIP_WARMUP=true \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \

View File

@@ -41,6 +41,7 @@ get_config() {
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1 exit 1
fi fi
# shellcheck source=/dev/null
source "${TEST_RUN_CONFIG_FILE}" source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0 return 0
@@ -48,9 +49,8 @@ get_config() {
# get test running configuration. # get test running configuration.
fetch_vllm_test_cfg fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script. # Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then if ! get_config; then
exit 1 exit 1
fi fi
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}" echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}" builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}" builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir} mkdir -p "${builder_cache_dir}"
# Try building the docker image # Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \ cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \ --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \ --builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
--cache-to type=local,dest=${builder_cache_dir},mode=max \ --cache-to type=local,dest="${builder_cache_dir}",mode=max \
--progress=plain --load -t ${image_name} -f - . --progress=plain --load -t "${image_name}" -f - .
FROM ${BASE_IMAGE_NAME} FROM ${BASE_IMAGE_NAME}
# Define environments # Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards. # e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1 # returns one argument per line: --device, /dev/davinciX, ...
parse_and_gen_devices() { parse_and_gen_devices() {
local input="$1" local input="$1"
local index cards_num local index cards_num
@@ -151,29 +151,24 @@ parse_and_gen_devices() {
return 1 return 1
fi fi
local devices=""
local i=0 local i=0
while (( i < cards_num )); do while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i )) local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}" printf '%s\n' "--device"
printf '%s\n' "/dev/davinci${dev_idx}"
((i++)) ((i++))
done done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
} }
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with # This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend. # the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx} model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir} mkdir -p "${model_cache_dir}"
docker run \ docker run \
${devices} \ "${device_args[@]}" \
--device /dev/davinci_manager \ --device /dev/davinci_manager \
--device /dev/devmm_svm \ --device /dev/devmm_svm \
--device /dev/hisi_hdc \ --device /dev/hisi_hdc \
@@ -182,7 +177,7 @@ docker run \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \ -v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${model_cache_dir}:/root/.cache/modelscope \ -v "${model_cache_dir}":/root/.cache/modelscope \
--entrypoint="" \ --entrypoint="" \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image # Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu . docker build -t "${image_name}" -f docker/Dockerfile.xpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
@@ -39,6 +39,8 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests cd tests

View File

@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
# pull original arch-dependent images from AWS ECR Public # pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
# tag arch-dependent images # tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
# push arch-dependent images to DockerHub # push arch-dependent images to DockerHub
docker push vllm/vllm-openai:$TAG_NAME-x86_64 docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
docker push vllm/vllm-openai:$TAG_NAME-aarch64 docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
# push arch-independent manifest to DockerHub # push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest push vllm/vllm-openai:$TAG_NAME docker manifest push vllm/vllm-openai:"$TAG_NAME"
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"

View File

@@ -1,64 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
#
# Steps: skip on AMD hosts, clone Prime-RL next to this repository's root,
# drop its vllm version pin, install its dependencies with uv, verify that
# both packages import, then run the Prime-RL GPU integration test.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

# The presence of the ROCm tools is used as the signal for an AMD host.
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
  echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
  exit 0
fi

echo "Setting up Prime-RL integration test environment..."

# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
  echo "Removing existing Prime-RL directory..."
  rm -rf "${PRIME_RL_DIR}"
fi

# Install UV if not available
if ! command -v uv &> /dev/null; then
  echo "Installing UV package manager..."
  curl -LsSf https://astral.sh/uv/install.sh | sh
  # The path is only known at run time, so ShellCheck cannot follow it.
  # shellcheck source=/dev/null
  source "$HOME/.local/bin/env"
fi

# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"

echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
# -f keeps the script re-runnable: a plain `ln -s` fails, and therefore aborts
# the script under `set -e`, when the link is already present.
ln -sf /usr/bin/python3 /usr/local/bin/python

# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml

# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras

# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"

echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@@ -51,14 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \ --enable-eplb \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--all2all-backend $BACK \ --all2all-backend "$BACK" \
--port $PORT & --port "$PORT" &
SERVER_PID=$! SERVER_PID=$!
wait_for_server $PORT wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env bash
set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and checks that the measured GSM8K accuracy is at least THRESHOLD.
# NOTE(review): no non-offloaded baseline is run here; THRESHOLD is presumably
# chosen from such a baseline - confirm.
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#   THRESHOLD      minimum acceptable accuracy (default 0.25)
#   NUM_QUESTIONS  value passed to the eval script as --num-questions (default 1319)
#   START_PORT     port the server listens on (default 8030)
# env:
#   OUT_DIR        directory for the result JSON (default /tmp/vllm-scheduled)
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
# Block until the server on 127.0.0.1 answers its /health endpoint.
# Arguments: $1 - TCP port to poll
# Returns:   0 once a /health request succeeds; the non-zero status of
#            `timeout` if 600 s pass without a successful request
wait_for_server() {
  local port=$1
  # The port is handed to the inner shell as a positional argument instead of
  # being spliced into the command text, so its value is never parsed as code.
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      sleep 1
    done' _ "$port"
}
# Model name passed to `vllm serve`; also used to build the result file tag
# and the accuracy report line.
MODEL="deepseek-ai/DeepSeek-V2-Lite"
# Stop the background process recorded in SERVER_PID, if it is still alive.
# Sends the default kill signal, polls up to 20 times at 0.5 s intervals for
# the process to disappear, then sends SIGKILL (errors ignored).
# Does nothing when SERVER_PID is unset, empty, or no longer running.
cleanup() {
  local pid="${SERVER_PID:-}"
  [[ -n "${pid}" ]] || return 0
  kill -0 "${pid}" 2>/dev/null || return 0

  kill "${pid}" 2>/dev/null || true
  local polls=0
  while (( polls < 20 )) && kill -0 "${pid}" 2>/dev/null; do
    sleep 0.5
    polls=$(( polls + 1 ))
  done
  kill -9 "${pid}" 2>/dev/null || true
}
# Make sure the server is torn down on every exit path, including failures
# under `set -e`.
trap cleanup EXIT

# Start the server in the background with the prefetch-offloading flags,
# remember its PID for cleanup, and wait until it reports healthy.
vllm serve "$MODEL" \
  --max-model-len 2048 \
  --offload-group-size 8 \
  --offload-num-in-group 2 \
  --offload-prefetch-step 1 \
  --offload-params w13_weight w2_weight \
  --port "$PORT" &
SERVER_PID=$!
wait_for_server "$PORT"

# Build a file-name tag from the model name by replacing '/', ':', ' ' and
# '\' with '_'. printf is used so no trailing newline reaches tr, and the
# set no longer contains the letter 'n' (the previous '\\n' was read by tr as
# a backslash followed by a literal 'n', which would mangle names with 'n').
TAG=$(printf '%s' "$MODEL" | tr '/: \\' '____')
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"

# Run the GSM8K evaluation against the server and save the results to OUT.
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"

# Report the accuracy and fail the script when it is below THRESHOLD.
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
PY

# Stop the server now and clear the PID so the EXIT trap has nothing to do.
cleanup
SERVER_PID=

View File

@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-eplb \ --enable-eplb \
--all2all-backend $BACK \ --all2all-backend "$BACK" \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
--data-parallel-size ${DATA_PARALLEL_SIZE} \ --data-parallel-size "${DATA_PARALLEL_SIZE}" \
--enable-expert-parallel \ --enable-expert-parallel \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--port $PORT & --port "$PORT" &
SERVER_PID=$! SERVER_PID=$!
wait_for_server $PORT wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
--tensor-parallel-size 4 \ --tensor-parallel-size 4 \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--all2all-backend $BACK \ --all2all-backend "$BACK" \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--gpu-memory-utilization 0.9 \ --gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \ "${PLATFORM_ARGS[@]}" \
--port $PORT & --port "$PORT" &
SERVER_PID=$! SERVER_PID=$!
wait_for_server $PORT wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -9,10 +9,11 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables # For testing on local vm, use `set -a` to export all variables
source /etc/environment source /etc/environment
source $ENV_FILE # shellcheck source=/dev/null
source "$ENV_FILE"
remove_docker_container() { remove_docker_container() {
docker rm -f $CONTAINER_NAME || true; docker rm -f "$CONTAINER_NAME" || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
@@ -41,13 +42,13 @@ echo
echo "starting docker...$CONTAINER_NAME" echo "starting docker...$CONTAINER_NAME"
echo echo
docker run \ docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ -v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
--env-file $ENV_FILE \ --env-file "$ENV_FILE" \
-e HF_TOKEN="$HF_TOKEN" \ -e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \ -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
-e MODEL=$MODEL \ -e MODEL="$MODEL" \
-e WORKSPACE=/workspace \ -e WORKSPACE=/workspace \
--name $CONTAINER_NAME \ --name "$CONTAINER_NAME" \
-d \ -d \
--privileged \ --privileged \
--network host \ --network host \

View File

@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG" echo "logging to $VLLM_LOG"
echo echo
vllm serve $MODEL \ vllm serve "$MODEL" \
--seed 42 \ --seed 42 \
--max-num-seqs $MAX_NUM_SEQS \ --max-num-seqs "$MAX_NUM_SEQS" \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \ --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \ --download_dir "$DOWNLOAD_DIR" \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.." echo "wait for 20 minutes.."
echo echo
# sleep 1200 # sleep 1200
# wait for 10 minutes... # wait for 10 minutes...
for i in {1..120}; do for _ in {1..120}; do
# TODO: detect other type of errors. # TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting." echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo echo
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model "$MODEL" \
--dataset-name sonnet \ --dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \ --dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \ --sonnet-input-len "$INPUT_LEN" \
--sonnet-output-len $OUTPUT_LEN \ --sonnet-output-len "$OUTPUT_LEN" \
--ignore-eos > "$BM_LOG" --ignore-eos > "$BM_LOG"
echo "completed..." echo "completed..."

View File

@@ -76,16 +76,15 @@ mkdir -p "$INDICES_OUTPUT_DIR"
# this indices have relative paths that could work as long as it is next to the wheel directory in s3 # this indices have relative paths that could work as long as it is next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/ # i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/ # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then alias_args=()
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS" if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
else alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
alias_arg=""
fi fi
# HACK: we do not need regex module here, but it is required by pre-commit hook # HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module # To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
# copy indices to /<commit>/ unconditionally # copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX" echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -100,9 +99,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version # re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/" echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*" rm -rf "${INDICES_OUTPUT_DIR:?}/*"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi fi

View File

@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version) RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null) GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION" echo "Release version from Buildkite: $RELEASE_VERSION"
@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory" echo "Wheels copied to local directory"
# generate source tarball # generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
ls -la $DIST_DIR ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name) # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
@@ -65,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1 exit 1
fi fi
python3 -m twine check $PYPI_WHEEL_FILES python3 -m twine check "$PYPI_WHEEL_FILES"
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
echo "Wheels uploaded to PyPI" echo "Wheels uploaded to PyPI"

View File

@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l) WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT" echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi fi
# Extract version from vLLM wheel and update version-specific index # Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1) VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION" echo "Version in wheel: $VERSION"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -14,3 +14,8 @@ steps:
- pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

View File

@@ -17,3 +17,15 @@ steps:
- tests/benchmarks/ - tests/benchmarks/
commands: commands:
- pytest -v -s benchmarks/ - pytest -v -s benchmarks/
- label: Attention Benchmarks Smoke Test (B200)
device: b200
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1

View File

@@ -2,7 +2,7 @@ group: Compile
depends_on: depends_on:
- image-build - image-build
steps: steps:
- label: Sequence Parallel Tests (2 GPUs) - label: Sequence Parallel Correctness Tests (2 GPUs)
timeout_in_minutes: 50 timeout_in_minutes: 50
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_devices: 2 num_devices: 2
@@ -11,12 +11,12 @@ steps:
- vllm/compilation/ - vllm/compilation/
- vllm/v1/worker/ - vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py - vllm/v1/cudagraph_dispatcher.py
- tests/distributed/test_sequence_parallel.py - tests/compile/correctness_e2e/test_sequence_parallel.py
commands: commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1 - export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: Sequence Parallel Tests (2xH100) - label: Sequence Parallel Correctness Tests (2xH100)
timeout_in_minutes: 50 timeout_in_minutes: 50
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: h100 device: h100
@@ -24,24 +24,30 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1 - export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: AsyncTP Correctness Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100) - label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 40 timeout_in_minutes: 20
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: h100 device: h100
num_devices: 2 num_devices: 2
source_file_dependencies: source_file_dependencies:
- vllm/compilation/ - vllm/compilation/
- vllm/model_executor/layers - vllm/model_executor/layers
- tests/compile/distributed/test_fusion_all_reduce.py - tests/compile/passes/distributed/
- tests/compile/distributed/test_sequence_parallelism.py
- tests/compile/distributed/test_async_tp.py
commands: commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1 - export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -s -v tests/compile/passes/distributed
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_async_tp.py
- label: Fusion and Compile Unit Tests (B200) - label: Fusion and Compile Unit Tests (B200)
timeout_in_minutes: 20 timeout_in_minutes: 20
@@ -55,17 +61,17 @@ steps:
- vllm/model_executor/layers/attention/attention.py - vllm/model_executor/layers/attention/attention.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
- tests/compile/test_fusion_attn.py - tests/compile/passes/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py - tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py - tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py - tests/compile/fullgraph/test_full_graph.py
commands: commands:
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
- nvidia-smi - nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_devices=2 is not set # this runner has 2 GPUs available even though num_devices=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
# TODO(luka) move to H100 once pass tests run on H100 # TODO(luka) move to H100 once pass tests run on H100
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
@@ -115,13 +121,10 @@ steps:
optional: true optional: true
commands: commands:
- nvidia-smi - nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops # Run all models but only FLASHINFER, Inductor partition and native custom ops
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
# Run just llama3 (fp8 & fp4) for all config combinations - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
# -k "llama-3"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
- label: Fusion E2E TP2 Quick (H100) - label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20 timeout_in_minutes: 20
@@ -156,7 +159,7 @@ steps:
- tests/compile/fusions_e2e/ - tests/compile/fusions_e2e/
commands: commands:
- nvidia-smi - nvidia-smi
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations # Run just llama3 (fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3" - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100) - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
@@ -191,7 +194,8 @@ steps:
- tests/compile/fusions_e2e/ - tests/compile/fusions_e2e/
commands: commands:
- nvidia-smi - nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops # Run all models but only FLASHINFER, Inductor partition and native custom ops
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# for ar-rms-quant-fp4, also sweep llama3 # for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4" - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"

View File

@@ -9,6 +9,7 @@ steps:
- tests/cuda - tests/cuda
commands: commands:
- pytest -v -s cuda/test_cuda_context.py - pytest -v -s cuda/test_cuda_context.py
- pytest -v -s cuda/test_platform_no_cuda_init.py
- label: Cudagraph - label: Cudagraph
timeout_in_minutes: 20 timeout_in_minutes: 20

View File

@@ -62,6 +62,7 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py - tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed - tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py - tests/v1/engine/test_engine_core_client.py
@@ -96,9 +97,13 @@ steps:
- pytest -v -s distributed/test_symm_mem_allreduce.py - pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests # TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests # when we have multiple distributed example tests
# OLD rlhf examples
- cd ../examples/offline_inference - cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- label: Distributed Tests (8 GPUs)(H100) - label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10 timeout_in_minutes: 10
@@ -140,6 +145,7 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -159,6 +165,7 @@ steps:
num_devices: 2 num_devices: 2
num_nodes: 2 num_nodes: 2
no_plugin: true no_plugin: true
optional: true # TODO: revert once infra issue solved
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
@@ -191,7 +198,18 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)) - label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_devices: 4

View File

@@ -29,15 +29,11 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: Prime-RL Integration (2 GPUs) - label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
timeout_in_minutes: 30 timeout_in_minutes: 60
device: h100
optional: true optional: true
soft_fail: true num_devices: 1
num_devices: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- nvidia-smi - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
- bash .buildkite/scripts/run-prime-rl-test.sh

View File

@@ -28,3 +28,11 @@ steps:
- pytest -v -s v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests # Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine

View File

@@ -24,6 +24,11 @@ steps:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 1) - label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130 timeout_in_minutes: 130
@@ -42,15 +47,13 @@ steps:
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc - tests/entrypoints/rpc
- tests/entrypoints/instrumentator
- tests/tool_use
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator - pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use - pytest -v -s tool_use
- label: Entrypoints Integration (Pooling) - label: Entrypoints Integration (Pooling)
@@ -62,6 +65,11 @@ steps:
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling - pytest -v -s entrypoints/pooling
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Responses API) - label: Entrypoints Integration (Responses API)
timeout_in_minutes: 50 timeout_in_minutes: 50

View File

@@ -115,6 +115,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e # e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py - pytest -v -s tests/models/quantization/test_nvfp4.py
@@ -156,14 +157,3 @@ steps:
- pytest -v -s kernels/moe/test_deepep_moe.py - pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py

View File

@@ -73,3 +73,29 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
- label: GPQA Eval (GPT-OSS) (H100)
timeout_in_minutes: 120
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
- label: GPQA Eval (GPT-OSS) (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt

View File

@@ -16,6 +16,7 @@ steps:
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
# TODO: create another `optional` test group for slow tests
- pytest -v -s -m 'not slow_test' v1/spec_decode - pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s -m 'not cpu_test' v1/metrics
@@ -25,6 +26,11 @@ steps:
# Integration test for streaming correctness (requires special branch). # Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: V1 Others (CPU) - label: V1 Others (CPU)
depends_on: depends_on:
@@ -72,7 +78,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models # for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0 - python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo # for features demo
- python3 offline_inference/prefix_caching.py - python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py - python3 offline_inference/llm_engine_example.py
@@ -108,9 +114,11 @@ steps:
timeout_in_minutes: 50 timeout_in_minutes: 50
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/detokenizer
- tests/multimodal - tests/multimodal
- tests/utils_ - tests/utils_
commands: commands:
- pytest -v -s detokenizer
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
@@ -122,6 +130,8 @@ steps:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/test_pooling_params.py
- tests/test_ray_env.py
- tests/multimodal - tests/multimodal
- tests/renderers - tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
@@ -134,6 +144,8 @@ steps:
- python3 standalone_tests/lazy_imports.py - python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py
- pytest -v -s test_ray_env.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers - pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
@@ -141,20 +153,6 @@ steps:
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config - pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
device: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100) - label: Batch Invariance (H100)
timeout_in_minutes: 25 timeout_in_minutes: 25
device: h100 device: h100

View File

@@ -4,7 +4,6 @@ depends_on:
steps: steps:
- label: Basic Models Tests (Initialization) - label: Basic Models Tests (Initialization)
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -16,7 +15,6 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N - label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
@@ -33,10 +31,17 @@ steps:
timeout_in_minutes: 45 timeout_in_minutes: 45
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/test_terratorch.py
- tests/models/test_transformers.py - tests/models/test_transformers.py
- tests/models/test_registry.py - tests/models/test_registry.py
commands: commands:
- pytest -v -s models/test_transformers.py models/test_registry.py - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Basic Models Test (Other CPU) # 5min - label: Basic Models Test (Other CPU) # 5min
depends_on: depends_on:

View File

@@ -4,7 +4,6 @@ depends_on:
steps: steps:
- label: Language Models Tests (Standard) - label: Language Models Tests (Standard)
timeout_in_minutes: 25 timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -16,7 +15,6 @@ steps:
- label: Language Models Tests (Extra Standard) %N - label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
@@ -32,7 +30,6 @@ steps:
- label: Language Models Tests (Hybrid) %N - label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75 timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -40,7 +37,7 @@ steps:
commands: commands:
# Install fast path packages for testing against transformers # Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests # Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -48,7 +45,6 @@ steps:
- label: Language Models Test (Extended Generation) # 80min - label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -56,13 +52,21 @@ steps:
commands: commands:
# Install fast path packages for testing against transformers # Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL) - label: Language Models Test (PPL)
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -72,17 +76,20 @@ steps:
- label: Language Models Test (Extended Pooling) # 36min - label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language/pooling - tests/models/language/pooling
commands: commands:
- pytest -v -s models/language/pooling -m 'not core_model' - pytest -v -s models/language/pooling -m 'not core_model'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Language Models Test (MTEB) - label: Language Models Test (MTEB)
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/

View File

@@ -3,7 +3,7 @@ depends_on:
- image-build - image-build
steps: steps:
- label: PyTorch Compilation Unit Tests - label: PyTorch Compilation Unit Tests
timeout_in_minutes: 30 timeout_in_minutes: 10
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
@@ -17,6 +17,14 @@ steps:
# (using -0 for proper path handling) # (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Compilation Passes Unit Tests
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/compile/passes
commands:
- pytest -s -v compile/passes --ignore compile/passes/distributed
- label: PyTorch Fullgraph Smoke Test - label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 35 timeout_in_minutes: 35
source_file_dependencies: source_file_dependencies:

View File

@@ -12,3 +12,10 @@ steps:
commands: commands:
- pytest -v -s samplers - pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s samplers

48
.github/CODEOWNERS vendored
View File

@@ -2,42 +2,60 @@
# for more info about CODEOWNERS file # for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review # This lists cover the "core" components of vLLM that require careful review
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn /vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/model_executor/layers/attention @LucasWilkinson /vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
/vllm/lora @jeejeelee
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn /vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256 /vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson /vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
CMakeLists.txt @tlrmchlsmth @LucasWilkinson CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact, # Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people # so spam a lot of people
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg /vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 /vllm/config/cache.py @heheda12345
# Entrypoints
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
/vllm/entrypoints/mcp @heheda12345
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
/vllm/entrypoints/openai/realtime @njhill
/vllm/entrypoints/openai/speech_to_text @NickLucche
/vllm/entrypoints/pooling @noooop
/vllm/entrypoints/sagemaker @DarkLight1337
/vllm/entrypoints/serve @njhill
/vllm/entrypoints/*.py @njhill
/vllm/entrypoints/chat_utils.py @DarkLight1337
/vllm/entrypoints/llm.py @DarkLight1337
# Input/Output Processing
/vllm/sampling_params.py @njhill @NickLucche
/vllm/pooling_params.py @noooop @DarkLight1337
/vllm/tokenizers @DarkLight1337 @njhill
/vllm/renderers @DarkLight1337 @njhill
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
# vLLM V1 # vLLM V1
/vllm/v1/attention @LucasWilkinson /vllm/v1/attention @LucasWilkinson @MatthewBonanni
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill /vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/vllm/v1/sample @22quinn @houseroad @njhill /vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery /vllm/v1/kv_offload @ApostaC @orozery
/vllm/v1/worker/gpu/kv_connector.py @orozery /vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery /vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
# Model runner V2 # Model runner V2
/vllm/v1/worker/gpu @WoosukKwon /vllm/v1/worker/gpu @WoosukKwon
@@ -115,8 +133,8 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/mixtral*.py @patrickvonplaten /vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten /vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten /vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/tokenizers/mistral.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels # Kernels
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep /vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -152,9 +170,7 @@ mkdocs.yaml @hmellor
/examples/pooling @noooop /examples/pooling @noooop
/tests/models/*/pooling* @noooop /tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop /tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop /vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler @noooop /vllm/model_executor/layers/pooler @noooop
# Security guide and policies # Security guide and policies

View File

@@ -19,6 +19,7 @@ jobs:
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with: with:
python-version: '3.12' python-version: '3.12'
cache: 'pip'
- name: Install Python dependencies - name: Install Python dependencies
run: | run: |

3
.gitignore vendored
View File

@@ -238,3 +238,6 @@ ep_kernels_workspace/
vllm/grpc/vllm_engine_pb2.py vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi vllm/grpc/vllm_engine_pb2.pyi
# Ignore generated cpu headers
csrc/cpu/cpu_attn_dispatch_generated.h

View File

@@ -121,24 +121,9 @@ repos:
name: Update Dockerfile dependency graph name: Update Dockerfile dependency graph
entry: tools/pre_commit/update-dockerfile-graph.sh entry: tools/pre_commit/update-dockerfile-graph.sh
language: script language: script
- id: enforce-import-regex-instead-of-re - id: check-forbidden-imports
name: Enforce import regex as re name: Check for forbidden imports
entry: python tools/pre_commit/enforce_regex_import.py entry: python tools/pre_commit/check_forbidden_imports.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/pre_commit/check_triton_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py
language: python language: python
types: [python] types: [python]
additional_dependencies: [regex] additional_dependencies: [regex]
@@ -158,6 +143,11 @@ repos:
name: Check attention backend documentation is up to date name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python language: python
- id: check-boolean-context-manager
name: Check for boolean ops in with-statements
entry: python tools/pre_commit/check_boolean_context_manager.py
language: python
types: [python]
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion

View File

@@ -9,13 +9,14 @@ build:
python: "3.12" python: "3.12"
jobs: jobs:
post_checkout: post_checkout:
- git fetch --unshallow || true - git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv
create_environment:
- uv venv $READTHEDOCS_VIRTUALENV_PATH
install:
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
mkdocs: mkdocs:
configuration: mkdocs.yaml configuration: mkdocs.yaml
fail_on_warning: true fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: requirements/docs.txt

View File

@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm # versions are derived from docker/Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1") set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1") set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
# #
# Try to find python package with an executable that exactly matches # Try to find python package with an executable that exactly matches
@@ -293,6 +293,7 @@ set(VLLM_EXT_SRC
"csrc/fused_qknorm_rope_kernel.cu" "csrc/fused_qknorm_rope_kernel.cu"
"csrc/layernorm_quant_kernels.cu" "csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu" "csrc/sampler.cu"
"csrc/topk.cu"
"csrc/cuda_view.cu" "csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu" "csrc/quantization/w8a8/int8/scaled_quant.cu"
@@ -770,6 +771,24 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
# Step 1: pick the candidate arch list based on the CUDA compiler version and
# intersect it with the requested CUDA_ARCHS. The result lands in
# DSV3_FUSED_A_GEMM_ARCHS.
# NOTE(review): exact matching rules live in cuda_archs_loose_intersection,
# which is defined outside this section - confirm there.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
# Step 2: build the kernel only when the compiler is CUDA >= 12.0 AND the
# intersection above is non-empty; the source is then added to VLLM_EXT_SRC.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
else()
# This branch is also taken when the CUDA compiler is older than 12.0, even
# though the status message below only mentions the target architectures.
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
"in CUDA target architectures.")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels. # moe_data.cu is used by all CUTLASS MoE kernels.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}") cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
@@ -1081,6 +1100,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building Marlin MOE kernels as no compatible archs found" message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures") " in CUDA target architectures")
endif() endif()
# DeepSeek V3 router GEMM kernel - requires SM90+
# Step 1: pick the candidate arch list based on the CUDA compiler version and
# intersect it with the requested CUDA_ARCHS. The result lands in
# DSV3_ROUTER_GEMM_ARCHS.
# NOTE(review): exact matching rules live in cuda_archs_loose_intersection,
# which is defined outside this section - confirm there.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
# Step 2: build the three router GEMM sources only when the compiler is
# CUDA >= 12.0 AND the intersection above is non-empty. Unlike sources that go
# into VLLM_EXT_SRC, these are appended to VLLM_MOE_EXT_SRC.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
set(DSV3_ROUTER_GEMM_SRC
"csrc/moe/dsv3_router_gemm_entry.cu"
"csrc/moe/dsv3_router_gemm_float_out.cu"
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_ROUTER_GEMM_SRC}"
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
else()
# Covers both failure reasons: no matching arch, or CUDA compiler < 12.0.
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
" (requires SM90+ and CUDA >= 12.0)")
endif()
endif() endif()
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")

View File

@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor
## Usage ## Usage
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli). For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).
For full CLI reference see: For full CLI reference see:

View File

@@ -229,3 +229,40 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict:
sum(r.kv_len for r in requests) / len(requests) if requests else 0 sum(r.kv_len for r in requests) / len(requests) if requests else 0
), ),
} }
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
    """
    Summarize which kinds of requests a batch spec contains.

    Args:
        batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
        spec_decode_threshold: Largest q_len for which an extend request is
            reported as "spec-decode" instead of "extend"

    Returns:
        "prefill", "decode", "spec-decode" or "extend" when all classified
        requests share one type; "mixed (a+b...)" with the type names sorted
        when several types occur; "unknown" when no request was classified.
    """

    def classify(req):
        # The checks are ordered: decode first, then prefill, then extend.
        if req.is_decode:
            return "decode"
        if req.is_prefill:
            return "prefill"
        if req.is_extend:
            # Small q_len counts as spec-decode, larger q_len as extend
            # (chunked prefill).
            if req.q_len <= spec_decode_threshold:
                return "spec-decode"
            return "extend"
        # Request matched none of the known categories.
        return None

    labels = {classify(req) for req in parse_batch_spec(batch_spec)}
    labels.discard(None)

    if not labels:
        return "unknown"
    if len(labels) == 1:
        return next(iter(labels))
    # Sorted so the same mix always produces the same string.
    return f"mixed ({'+'.join(sorted(labels))})"

View File

@@ -43,6 +43,7 @@ from common import (
ModelParameterSweep, ModelParameterSweep,
ParameterSweep, ParameterSweep,
ResultsFormatter, ResultsFormatter,
batch_spec_sort_key,
is_mla_backend, is_mla_backend,
) )
@@ -218,10 +219,13 @@ def run_model_parameter_sweep(
by_param_and_spec[key].append(r) by_param_and_spec[key].append(r)
break break
# Sort by param value then spec # Sort by param value then spec (batch_size, q_len, kv_len)
sorted_keys = sorted( sorted_keys = sorted(
by_param_and_spec.keys(), by_param_and_spec.keys(),
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]), key=lambda x: (
int(x[0]) if x[0].isdigit() else x[0],
batch_spec_sort_key(x[1]),
),
) )
current_param_value = None current_param_value = None
@@ -330,7 +334,7 @@ def run_parameter_sweep(
by_spec[spec] = [] by_spec[spec] = []
by_spec[spec].append(r) by_spec[spec].append(r)
for spec in sorted(by_spec.keys()): for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
results = by_spec[spec] results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time) best = min(results, key=lambda r: r.mean_time)
console.print( console.print(
@@ -496,9 +500,12 @@ def main():
if "description" in yaml_config: if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]") console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values # Override args with YAML values, but CLI args take precedence
# (YAML takes precedence unless CLI arg was explicitly set) # Check if CLI provided backends (they would be non-None and not default)
# Backend(s) cli_backends_provided = args.backends is not None or args.backend is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
if "backend" in yaml_config: if "backend" in yaml_config:
args.backend = yaml_config["backend"] args.backend = yaml_config["backend"]
args.backends = None args.backends = None
@@ -544,13 +551,15 @@ def main():
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads) args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size) args.block_size = model.get("block_size", args.block_size)
# Benchmark settings # Benchmark settings (top-level keys)
if "benchmark" in yaml_config: if "device" in yaml_config:
bench = yaml_config["benchmark"] args.device = yaml_config["device"]
args.device = bench.get("device", args.device) if "repeats" in yaml_config:
args.repeats = bench.get("repeats", args.repeats) args.repeats = yaml_config["repeats"]
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters) if "warmup_iters" in yaml_config:
args.profile_memory = bench.get("profile_memory", args.profile_memory) args.warmup_iters = yaml_config["warmup_iters"]
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
# Parameter sweep configuration # Parameter sweep configuration
if "parameter_sweep" in yaml_config: if "parameter_sweep" in yaml_config:

View File

@@ -12,16 +12,36 @@ from typing import Any
import numpy as np import numpy as np
import torch import torch
from batch_spec import get_batch_type, parse_batch_spec
from rich.console import Console from rich.console import Console
from rich.table import Table from rich.table import Table
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
    """
    Build the numeric sort key (batch_size, max_q_len, max_kv_len) for a spec.

    Sorting with this key orders results by number of requests, then by the
    longest query length, then by the longest KV length, instead of by the
    spec's spelling.

    Args:
        spec: Batch specification string accepted by parse_batch_spec.

    Returns:
        (batch_size, max_q_len, max_kv_len); the two maxima are 0 for an empty
        batch, and the whole key is (0, 0, 0) if the spec cannot be processed.
    """
    try:
        parsed = parse_batch_spec(spec)
        q_lens = [req.q_len for req in parsed]
        kv_lens = [req.kv_len for req in parsed]
        return (len(parsed), max(q_lens, default=0), max(kv_lens, default=0))
    except Exception:
        # Best effort: any failure while parsing or reading lengths yields the
        # smallest key rather than aborting the sort.
        return (0, 0, 0)
# Mock classes for vLLM attention infrastructure # Mock classes for vLLM attention infrastructure
class MockHfConfig: class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements.""" """Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict): def __init__(self, mla_dims: dict, index_topk: int | None = None):
self.num_attention_heads = mla_dims["num_q_heads"] self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"] self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"] self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
@@ -32,6 +52,8 @@ class MockHfConfig:
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"] self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"] self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
if index_topk is not None:
self.index_topk = index_topk
def get_text_config(self): def get_text_config(self):
return self return self
@@ -82,6 +104,38 @@ class MockKVBProj:
return (result,) # Return as tuple to match ColumnParallelLinear API return (result,) # Return as tuple to match ColumnParallelLinear API
class MockIndexer:
    """Stand-in Indexer for benchmarking sparse MLA backends.

    Exposes ``topk_indices_buffer``, an int32 tensor of shape
    (max_num_tokens, topk_tokens) that sparse MLA backends read to decide
    which KV cache slots each token attends to.
    """

    def __init__(
        self,
        max_num_tokens: int,
        topk_tokens: int,
        device: torch.device,
    ):
        """Allocate a zero-filled index buffer on ``device``.

        Args:
            max_num_tokens: Number of rows in the buffer.
            topk_tokens: Number of indices stored per row.
            device: Device on which the buffer is allocated.
        """
        self.topk_tokens = topk_tokens
        buffer_shape = (max_num_tokens, topk_tokens)
        self.topk_indices_buffer = torch.zeros(
            buffer_shape, dtype=torch.int32, device=device
        )

    def fill_random_indices(self, num_tokens: int, max_kv_len: int):
        """Overwrite the first ``num_tokens`` rows with random indices.

        Args:
            num_tokens: Number of leading rows to overwrite.
            max_kv_len: Exclusive upper bound of the sampled indices; values
                are drawn from [0, max_kv_len).
        """
        target = self.topk_indices_buffer
        # Sampled on the buffer's own device so the assignment needs no copy
        # across devices.
        target[:num_tokens] = torch.randint(
            low=0,
            high=max_kv_len,
            size=(num_tokens, self.topk_tokens),
            dtype=torch.int32,
            device=target.device,
        )
class MockLayer(AttentionLayerBase): class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl. """Mock attention layer with scale parameters and impl.
@@ -316,14 +370,19 @@ class ResultsFormatter:
backends: List of backend names being compared backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest compare_to_fastest: Show percentage comparison to fastest
""" """
# Group by batch spec # Group by batch spec, preserving first-occurrence order
by_spec = {} by_spec = {}
specs_order = []
for r in results: for r in results:
spec = r.config.batch_spec spec = r.config.batch_spec
if spec not in by_spec: if spec not in by_spec:
by_spec[spec] = {} by_spec[spec] = {}
specs_order.append(spec)
by_spec[spec][r.config.backend] = r by_spec[spec][r.config.backend] = r
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
# Create shortened backend names for display # Create shortened backend names for display
def shorten_backend_name(name: str) -> str: def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display.""" """Shorten long backend names for table display."""
@@ -337,6 +396,8 @@ class ResultsFormatter:
table = Table(title="Attention Benchmark Results") table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True) table.add_column("Batch\nSpec", no_wrap=True)
table.add_column("Type", no_wrap=True)
table.add_column("Batch\nSize", justify="right", no_wrap=True)
multi = len(backends) > 1 multi = len(backends) > 1
for backend in backends: for backend in backends:
@@ -350,12 +411,14 @@ class ResultsFormatter:
table.add_column(col_rel, justify="right", no_wrap=False) table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows # Add rows
for spec in sorted(by_spec.keys()): for spec in specs_order:
spec_results = by_spec[spec] spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success} times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0 best_time = min(times.values()) if times else 0.0
row = [spec] batch_type = get_batch_type(spec)
batch_size = len(parse_batch_spec(spec))
row = [spec, batch_type, str(batch_size)]
for backend in backends: for backend in backends:
if backend in spec_results: if backend in spec_results:
r = spec_results[backend] r = spec_results[backend]
@@ -486,10 +549,11 @@ def get_attention_scale(head_dim: int) -> float:
def is_mla_backend(backend: str) -> bool: def is_mla_backend(backend: str) -> bool:
""" """
Check if backend is an MLA backend using the backend's is_mla() property. Check if backend is an MLA backend using the AttentionBackendEnum.
Args: Args:
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA") backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns: Returns:
True if the backend is an MLA backend, False otherwise True if the backend is an MLA backend, False otherwise
@@ -497,7 +561,8 @@ def is_mla_backend(backend: str) -> bool:
from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.registry import AttentionBackendEnum
try: try:
backend_class = AttentionBackendEnum[backend.upper()].get_class() backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
return backend_class.is_mla() return backend_class.is_mla()
except (KeyError, ValueError, ImportError): except (KeyError, ValueError, ImportError, AttributeError):
return False return False

View File

@@ -3,7 +3,7 @@
model: model:
name: "deepseek-v3" name: "deepseek-v3"
num_layers: 60 num_layers: 60
num_q_heads: 128 num_q_heads: 128 # Base value, can be swept for TP simulation
num_kv_heads: 1 # MLA uses single latent KV num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576 head_dim: 576
kv_lora_rank: 512 kv_lora_rank: 512
@@ -12,6 +12,13 @@ model:
v_head_dim: 128 v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128 block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs: batch_specs:
# Small batches, varying sequence lengths # Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache - "16q1s512" # 16 requests, 512 KV cache
@@ -34,28 +41,30 @@ batch_specs:
# Very large batches # Very large batches
- "128q1s1k" # 128 requests, 1k KV cache - "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache - "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context # Long context
- "32q1s16k" # 32 requests, 16k KV cache - "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache - "32q1s32k" # 32 requests, 32k KV cache
backends: backends:
- cutlass_mla - CUTLASS_MLA
- flashinfer_mla - FLASHINFER_MLA
- flashattn_mla # Hopper only - FLASH_ATTN_MLA # Hopper only
- flashmla # Hopper only - FLASHMLA # Hopper only
device: "cuda:0" device: "cuda:0"
repeats: 5 repeats: 100
warmup_iters: 3 warmup_iters: 10
profile_memory: true profile_memory: true
# Backend-specific tuning # Backend-specific tuning
cutlass_mla: CUTLASS_MLA:
num_kv_splits: auto # or specific value like 4, 8, 16 num_kv_splits: auto # or specific value like 4, 8, 16
flashattn_mla: FLASH_ATTN_MLA:
reorder_batch_threshold: 512 reorder_batch_threshold: 512
flashmla: FLASHMLA:
reorder_batch_threshold: 1 reorder_batch_threshold: 1

View File

@@ -45,10 +45,10 @@ batch_specs:
- "4q4k_60q1s4k" # 4 prefill + 60 decode - "4q4k_60q1s4k" # 4 prefill + 60 decode
backends: backends:
- cutlass_mla - CUTLASS_MLA
- flashinfer_mla - FLASHINFER_MLA
- flashattn_mla # Hopper only - FLASH_ATTN_MLA # Hopper only
- flashmla # Hopper only - FLASHMLA # Hopper only
device: "cuda:0" device: "cuda:0"
repeats: 5 repeats: 5

View File

@@ -0,0 +1,62 @@
# MLA prefill-only benchmark configuration for sparse backends
#
# Benchmarks the sparse MLA backends listed under `backends` on pure-prefill
# and extend workloads, repeating every batch spec for each value of the
# model parameter sweep.

model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128  # Base value; overridden per run by model_parameter_sweep
  num_kv_heads: 1  # MLA uses single latent KV
  head_dim: 576  # Note: 576 = kv_lora_rank (512) + qk_rope_head_dim (64)
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128

# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  # Presumably expands to one result label per (backend, value) pair,
  # e.g. "FLASHMLA_SPARSE_64h" - confirm against the sweep runner.
  label_format: "{backend}_{value}h"

# Batch spec grammar, as used by the sibling configs:
#   "<N>q<query_len>[s<seq_len>]"
# e.g. "2q1ks4k" = 2 requests, 1k-token query, 4k-token sequence.
batch_specs:
  # Pure prefill
  - "1q512"
  - "1q1k"
  - "1q2k"
  - "1q4k"
  - "1q8k"

  # Batched pure prefill
  - "2q512"
  - "2q1k"
  - "2q2k"
  - "2q4k"
  - "2q8k"
  - "4q512"
  - "4q1k"
  - "4q2k"
  - "4q4k"
  - "4q8k"
  - "8q512"
  - "8q1k"
  - "8q2k"
  - "8q4k"
  - "8q8k"

  # Extend (query tokens on top of a longer existing sequence)
  - "1q512s4k"
  - "1q512s8k"
  - "1q1ks8k"
  - "1q2ks8k"
  - "1q2ks16k"
  - "1q4ks16k"

# Names must match AttentionBackendEnum members exactly
backends:
  - FLASHMLA_SPARSE
  - FLASHINFER_MLA_SPARSE

# Benchmark settings (top-level keys, not nested under `benchmark:`)
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -6,7 +6,7 @@
description: "Decode vs Prefill pipeline crossover analysis" description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA # Test FlashAttn MLA
backend: flashattn_mla backend: FLASH_ATTN_MLA
# Mode: decode_vs_prefill comparison (special sweep mode) # Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines # For each batch spec, we'll test both decode and prefill pipelines
@@ -62,11 +62,10 @@ model:
block_size: 128 block_size: 128
# Benchmark settings # Benchmark settings
benchmark: device: "cuda:0"
device: "cuda:0" repeats: 15 # More repeats for spec decode variance
repeats: 15 # More repeats for spec decode variance warmup_iters: 5
warmup_iters: 5 profile_memory: false
profile_memory: false
# Output # Output
output: output:

View File

@@ -41,18 +41,17 @@ batch_specs:
# Backends that support query length > 1 # Backends that support query length > 1
backends: backends:
- flashattn_mla # reorder_batch_threshold = 512 - FLASH_ATTN_MLA # reorder_batch_threshold = 512
- flashmla # reorder_batch_threshold = 1 (tunable) - FLASHMLA # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
# - flashinfer_mla # - FLASHINFER_MLA
# Benchmark settings # Benchmark settings
benchmark: device: "cuda:0"
device: "cuda:0" repeats: 10 # More repeats for statistical significance
repeats: 10 # More repeats for statistical significance warmup_iters: 5
warmup_iters: 5 profile_memory: false
profile_memory: false
# Test these threshold values for optimization # Test these threshold values for optimization
parameter_sweep: parameter_sweep:

View File

@@ -25,14 +25,22 @@ batch_specs:
- "4q1k_16q1s2k" # 4 prefill + 16 decode - "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode - "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Context extension # Speculative decode (q <= 8)
- "q1ks2k" # 1k query, 2k sequence (chunked prefill) - "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
# Context extension (chunked prefill)
- "q1ks2k" # 1k query, 2k sequence
- "2q1ks4k" # 2 requests: 1k query, 4k sequence - "2q1ks4k" # 2 requests: 1k query, 4k sequence
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
backends: backends:
- flash - FLASH_ATTN
- triton - TRITON_ATTN
- flashinfer - FLASHINFER
device: "cuda:0" device: "cuda:0"
repeats: 5 repeats: 5

View File

@@ -8,14 +8,13 @@ This module provides helpers for running MLA backends without
needing full VllmConfig integration. needing full VllmConfig integration.
""" """
import importlib
import numpy as np import numpy as np
import torch import torch
from batch_spec import parse_batch_spec from batch_spec import parse_batch_spec
from common import ( from common import (
BenchmarkResult, BenchmarkResult,
MockHfConfig, MockHfConfig,
MockIndexer,
MockKVBProj, MockKVBProj,
MockLayer, MockLayer,
setup_mla_dims, setup_mla_dims,
@@ -62,6 +61,7 @@ def create_minimal_vllm_config(
block_size: int = 128, block_size: int = 128,
max_num_seqs: int = 256, max_num_seqs: int = 256,
mla_dims: dict | None = None, mla_dims: dict | None = None,
index_topk: int | None = None,
) -> VllmConfig: ) -> VllmConfig:
""" """
Create minimal VllmConfig for MLA benchmarks. Create minimal VllmConfig for MLA benchmarks.
@@ -73,6 +73,8 @@ def create_minimal_vllm_config(
max_num_seqs: Maximum number of sequences max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name) setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
Returns: Returns:
VllmConfig for benchmarking VllmConfig for benchmarking
@@ -82,7 +84,7 @@ def create_minimal_vllm_config(
mla_dims = setup_mla_dims(model_name) mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace) # Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims) mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)
# Create a temporary minimal config.json to avoid HF downloads # Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access # This ensures consistent ModelConfig construction without network access
@@ -120,16 +122,12 @@ def create_minimal_vllm_config(
seed=0, seed=0,
max_model_len=32768, max_model_len=32768,
quantization=None, quantization=None,
quantization_param_path=None,
enforce_eager=False, enforce_eager=False,
max_context_len_to_capture=None,
max_seq_len_to_capture=8192,
max_logprobs=20, max_logprobs=20,
disable_sliding_window=False, disable_sliding_window=False,
skip_tokenizer_init=True, skip_tokenizer_init=True,
served_model_name=None, served_model_name=None,
limit_mm_per_prompt=None, limit_mm_per_prompt=None,
use_async_output_proc=True,
config_format="auto", config_format="auto",
) )
finally: finally:
@@ -180,56 +178,65 @@ def create_minimal_vllm_config(
# ============================================================================ # ============================================================================
# Backend name to class name prefix mapping # Backend-specific properties that can't be inferred from the backend class
_BACKEND_NAME_MAP = { # Keys are AttentionBackendEnum names (uppercase)
"flashattn_mla": "FlashAttnMLA",
"flashmla": "FlashMLA",
"flashinfer_mla": "FlashInferMLA",
"cutlass_mla": "CutlassMLA",
}
# Special properties that differ from defaults
_BACKEND_PROPERTIES = { _BACKEND_PROPERTIES = {
"flashmla": { "FLASHMLA": {
"query_format": "concat", # Single concatenated tensor (vs tuple) "query_format": "concat", # Single concatenated tensor (vs tuple)
"block_size": 64, # FlashMLA uses fixed block size
}, },
"flashinfer_mla": { "FLASHMLA_SPARSE": {
"block_size": 64, # FlashInfer MLA only supports 32 or 64 "query_format": "concat", # Single concatenated tensor (vs tuple)
}, },
} }
def _get_backend_config(backend: str) -> dict: def _get_backend_config(backend: str) -> dict:
""" """
Get backend configuration using naming conventions. Get backend configuration from AttentionBackendEnum.
All MLA backends follow the pattern: Uses the registry to get the backend class and extract configuration
- Module: vllm.v1.attention.backends.mla.{backend} from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).
- Impl: {Name}Impl
- Metadata: {Name}Metadata (or MLACommonMetadata) Args:
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata) backend: Backend name matching AttentionBackendEnum exactly
- MetadataBuilder: {Name}MetadataBuilder (e.g., "FLASHMLA_SPARSE")
Returns:
Dict with backend configuration
""" """
if backend not in _BACKEND_NAME_MAP: from vllm.v1.attention.backends.registry import AttentionBackendEnum
raise ValueError(f"Unknown backend: {backend}")
name = _BACKEND_NAME_MAP[backend] try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. "
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
) from e
# Get block size from backend class
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if hasattr(block_size, "value"):
# Handle MultipleOf enum
block_size = None
# Check if sparse via class method if available
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
# Get properties that can't be inferred
props = _BACKEND_PROPERTIES.get(backend, {}) props = _BACKEND_PROPERTIES.get(backend, {})
# Check if backend uses common metadata (FlashInfer, CUTLASS)
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
return { return {
"module": f"vllm.v1.attention.backends.mla.{backend}", "backend_class": backend_class,
"impl_class": f"{name}Impl", "impl_class": backend_class.get_impl_cls(),
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata", "builder_class": backend_class.get_builder_cls(),
"decode_metadata_class": "MLACommonDecodeMetadata"
if uses_common
else f"{name}DecodeMetadata",
"builder_class": f"{name}MetadataBuilder",
"query_format": props.get("query_format", "tuple"), "query_format": props.get("query_format", "tuple"),
"block_size": props.get("block_size", None), "block_size": block_size,
"is_sparse": is_sparse,
} }
@@ -447,22 +454,26 @@ def _create_backend_impl(
mla_dims: dict, mla_dims: dict,
vllm_config: VllmConfig, vllm_config: VllmConfig,
device: torch.device, device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
): ):
""" """
Create backend implementation instance. Create backend implementation instance.
Args: Args:
backend_cfg: Backend configuration dict backend_cfg: Backend configuration dict from _get_backend_config()
mla_dims: MLA dimension configuration mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance vllm_config: VllmConfig instance
device: Target device device: Target device
max_num_tokens: Maximum number of tokens for sparse indexer buffer
index_topk: Topk value for sparse MLA backends
Returns: Returns:
Tuple of (impl, layer, builder_instance) Tuple of (impl, layer, builder_instance, indexer)
""" """
# Import backend classes # Get classes from backend config (already resolved by _get_backend_config)
backend_module = importlib.import_module(backend_cfg["module"]) impl_class = backend_cfg["impl_class"]
impl_class = getattr(backend_module, backend_cfg["impl_class"]) builder_class = backend_cfg["builder_class"]
# Calculate scale # Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
@@ -474,27 +485,45 @@ def _create_backend_impl(
v_head_dim=mla_dims["v_head_dim"], v_head_dim=mla_dims["v_head_dim"],
) )
# Create impl # Create indexer for sparse backends
impl = impl_class( indexer = None
num_heads=mla_dims["num_q_heads"], if backend_cfg.get("is_sparse", False):
head_size=mla_dims["head_dim"], if index_topk is None:
scale=scale, index_topk = 2048 # Default topk for sparse MLA
num_kv_heads=mla_dims["num_kv_heads"], indexer = MockIndexer(
alibi_slopes=None, max_num_tokens=max_num_tokens,
sliding_window=None, topk_tokens=index_topk,
kv_cache_dtype="auto", device=device,
logits_soft_cap=None,
attn_type="decoder",
kv_sharing_target_layer_name=None,
q_lora_rank=None,
kv_lora_rank=mla_dims["kv_lora_rank"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
kv_b_proj=mock_kv_b_proj,
) )
# Build impl kwargs
impl_kwargs = {
"num_heads": mla_dims["num_q_heads"],
"head_size": mla_dims["head_dim"],
"scale": scale,
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": "auto",
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
"q_lora_rank": None,
"kv_lora_rank": mla_dims["kv_lora_rank"],
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
"v_head_dim": mla_dims["v_head_dim"],
"kv_b_proj": mock_kv_b_proj,
}
# Add indexer for sparse backends
if indexer is not None:
impl_kwargs["indexer"] = indexer
# Create impl
impl = impl_class(**impl_kwargs)
# Initialize DCP attributes # Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1): if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
impl.dcp_world_size = 1 impl.dcp_world_size = 1
@@ -515,9 +544,7 @@ def _create_backend_impl(
# Create builder instance if needed # Create builder instance if needed
builder_instance = None builder_instance = None
if backend_cfg["builder_class"]: if builder_class:
builder_class = getattr(backend_module, backend_cfg["builder_class"])
# Populate static_forward_context so builder can find the layer # Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass # MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer} vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
@@ -529,7 +556,7 @@ def _create_backend_impl(
device=device, device=device,
) )
return impl, layer, builder_instance return impl, layer, builder_instance, indexer
# ============================================================================ # ============================================================================
@@ -594,6 +621,7 @@ def _run_single_benchmark(
backend_cfg: dict, backend_cfg: dict,
mla_dims: dict, mla_dims: dict,
device: torch.device, device: torch.device,
indexer=None,
) -> BenchmarkResult: ) -> BenchmarkResult:
""" """
Run a single benchmark iteration. Run a single benchmark iteration.
@@ -606,6 +634,7 @@ def _run_single_benchmark(
backend_cfg: Backend configuration dict backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration mla_dims: MLA dimension configuration
device: Target device device: Target device
indexer: Optional MockIndexer for sparse backends
Returns: Returns:
BenchmarkResult with timing statistics BenchmarkResult with timing statistics
@@ -613,7 +642,9 @@ def _run_single_benchmark(
# Parse batch spec # Parse batch spec
requests = parse_batch_spec(config.batch_spec) requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests] q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens) total_q = sum(q_lens)
max_kv_len = max(kv_lens)
# Determine block size # Determine block size
block_size = backend_cfg["block_size"] or config.block_size block_size = backend_cfg["block_size"] or config.block_size
@@ -641,8 +672,16 @@ def _run_single_benchmark(
torch.bfloat16, torch.bfloat16,
) )
# Determine which forward method to use based on metadata # Fill indexer with random indices for sparse backends
if metadata.decode is not None: is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
forward_fn = lambda: impl._forward_decode( forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer decode_inputs, kv_cache, metadata, layer
) )
@@ -693,11 +732,13 @@ def _run_single_benchmark(
def _run_mla_benchmark_batched( def _run_mla_benchmark_batched(
backend: str, backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
) -> list[BenchmarkResult]: ) -> list[BenchmarkResult]:
""" """
Unified batched MLA benchmark runner for all backends. Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
This function reuses backend initialization across multiple benchmarks This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead. to avoid setup/teardown overhead.
@@ -707,6 +748,7 @@ def _run_mla_benchmark_batched(
configs_with_params: List of (config, threshold, num_splits) tuples configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only) - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only) - num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns: Returns:
List of BenchmarkResult objects List of BenchmarkResult objects
@@ -730,19 +772,27 @@ def _run_mla_benchmark_batched(
if mla_dims is None: if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3") mla_dims = setup_mla_dims("deepseek-v3")
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Create and set vLLM config for MLA (reused across all benchmarks) # Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config( vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path model_name="deepseek-v3", # Used only for model path
block_size=block_size, block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
) )
results = [] results = []
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
# Create backend impl, layer, and builder (reused across benchmarks) # Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance = _create_backend_impl( impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg, mla_dims, vllm_config, device backend_cfg,
mla_dims,
vllm_config,
device,
index_topk=index_topk if is_sparse else None,
) )
# Run each benchmark with the shared impl # Run each benchmark with the shared impl
@@ -768,6 +818,7 @@ def _run_mla_benchmark_batched(
backend_cfg, backend_cfg,
mla_dims, mla_dims,
device, device,
indexer=indexer,
) )
results.append(result) results.append(result)
@@ -793,20 +844,24 @@ def run_mla_benchmark(
config, config,
reorder_batch_threshold: int | None = None, reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None, num_kv_splits: int | None = None,
index_topk: int = 2048,
) -> BenchmarkResult | list[BenchmarkResult]: ) -> BenchmarkResult | list[BenchmarkResult]:
""" """
Unified MLA benchmark runner for all backends. Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
Always uses batched execution internally for optimal performance. Always uses batched execution internally for optimal performance.
Args: Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla) backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only) (single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only) num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns: Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode) BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -816,9 +871,9 @@ def run_mla_benchmark(
# Already in batched format # Already in batched format
if len(config) > 0 and isinstance(config[0], tuple): if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits # Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla"): if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"):
configs_with_params = [(cfg, param, None) for cfg, param in config] configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla or flashinfer_mla else: # cutlass_mla, flashinfer_mla, or sparse backends
configs_with_params = [(cfg, None, param) for cfg, param in config] configs_with_params = [(cfg, None, param) for cfg, param in config]
else: else:
# Format: [cfg, ...] - just configs # Format: [cfg, ...] - just configs
@@ -830,7 +885,7 @@ def run_mla_benchmark(
return_single = True return_single = True
# Use unified batched execution # Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params) results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
# Return single result or list based on input # Return single result or list based on input
return results[0] if return_single else results return results[0] if return_single else results

View File

@@ -8,7 +8,9 @@ This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration. (FlashAttention, Triton, FlashInfer) with real vLLM integration.
""" """
import logging
import types import types
from contextlib import contextmanager
import numpy as np import numpy as np
import torch import torch
@@ -24,8 +26,13 @@ from vllm.config import (
ParallelConfig, ParallelConfig,
SchedulerConfig, SchedulerConfig,
VllmConfig, VllmConfig,
set_current_vllm_config,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
get_kv_cache_layout,
set_kv_cache_layout,
) )
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================ # ============================================================================
@@ -33,37 +40,41 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================ # ============================================================================
_BACKEND_CONFIG = {
"flash": {
"module": "vllm.v1.attention.backends.flash_attn",
"backend_class": "FlashAttentionBackend",
"dtype": torch.float16,
"cache_layout": "standard",
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
},
"triton": {
"module": "vllm.v1.attention.backends.triton_attn",
"backend_class": "TritonAttentionBackend",
"dtype": torch.float32,
"cache_layout": "standard",
},
"flashinfer": {
"module": "vllm.v1.attention.backends.flashinfer",
"backend_class": "FlashInferBackend",
"dtype": torch.float16,
"cache_layout": "flashinfer",
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
},
}
def _get_backend_config(backend: str) -> dict: def _get_backend_config(backend: str) -> dict:
if backend not in _BACKEND_CONFIG: """
Get backend configuration from AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER")
Returns:
Dict with backend_class
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
raise ValueError( raise ValueError(
f"Unknown backend: {backend}. " f"Unknown backend: {backend}. Valid backends: {valid_backends}"
f"Available: {', '.join(_BACKEND_CONFIG.keys())}" ) from e
)
return _BACKEND_CONFIG[backend] return {"backend_class": backend_class}
@contextmanager
def log_warnings_and_errors_only():
    """Temporarily set vLLM logger to WARNING level.

    On entry the "vllm" logger's own level is recorded and replaced with
    ``logging.WARNING``; on exit the recorded level is put back. The
    restore runs in a ``finally`` clause, so it also happens when the
    body of the ``with`` block raises.

    Yields:
        None. The context manager is used purely for its side effect on
        the "vllm" logger.
    """
    vllm_logger = logging.getLogger("vllm")
    # Remember the logger's explicitly-set level (not the effective one) so
    # that a NOTSET logger goes back to inheriting from its parent afterwards.
    previous_level = vllm_logger.level
    vllm_logger.setLevel(logging.WARNING)
    try:
        yield
    finally:
        vllm_logger.setLevel(previous_level)
# ============================================================================ # ============================================================================
@@ -88,11 +99,7 @@ def _build_common_attn_metadata(
query_start_loc_cpu = query_start_loc.cpu() query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device) seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
seq_lens_cpu = seq_lens.cpu() max_seq_len = int(seq_lens.max().item())
max_seq_len = int(seq_lens_cpu.max())
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
max_blocks = (max(kv_lens) + block_size - 1) // block_size max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks num_blocks = batch_size * max_blocks
@@ -107,8 +114,6 @@ def _build_common_attn_metadata(
query_start_loc=query_start_loc, query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu, query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens, seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=batch_size, num_reqs=batch_size,
num_actual_tokens=total_tokens, num_actual_tokens=total_tokens,
max_query_len=max_query_len, max_query_len=max_query_len,
@@ -121,7 +126,6 @@ def _build_common_attn_metadata(
def _create_vllm_config( def _create_vllm_config(
config: BenchmarkConfig, config: BenchmarkConfig,
dtype: torch.dtype,
max_num_blocks: int, max_num_blocks: int,
) -> VllmConfig: ) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods.""" """Create a VllmConfig for benchmarking with mock model methods."""
@@ -129,7 +133,7 @@ def _create_vllm_config(
model="meta-llama/Meta-Llama-3-8B", model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B", tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False, trust_remote_code=False,
dtype=dtype, dtype="auto", # Use model's native dtype
seed=0, seed=0,
max_model_len=1024, max_model_len=1024,
) )
@@ -198,15 +202,12 @@ def _create_backend_impl(
backend_cfg: dict, backend_cfg: dict,
config: BenchmarkConfig, config: BenchmarkConfig,
device: torch.device, device: torch.device,
dtype: torch.dtype,
): ):
"""Create backend implementation instance.""" """Create backend implementation instance."""
import importlib backend_class = backend_cfg["backend_class"]
backend_module = importlib.import_module(backend_cfg["module"])
backend_class = getattr(backend_module, backend_cfg["backend_class"])
scale = get_attention_scale(config.head_dim) scale = get_attention_scale(config.head_dim)
dtype = backend_cfg["dtype"]
impl = backend_class.get_impl_cls()( impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads, num_heads=config.num_q_heads,
@@ -227,7 +228,7 @@ def _create_backend_impl(
layer = MockLayer(device, kv_cache_spec=kv_cache_spec) layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer, dtype return backend_class, impl, layer
def _create_metadata_builder( def _create_metadata_builder(
@@ -235,11 +236,44 @@ def _create_metadata_builder(
kv_cache_spec: FullAttentionSpec, kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig, vllm_config: VllmConfig,
device: torch.device, device: torch.device,
backend_name: str = "",
): ):
"""Create metadata builder instance.""" """Create metadata builder instance."""
return backend_class.get_builder_cls()( layer_names = ["layer_0"]
builder_cls = backend_class.get_builder_cls()
# Flashinfer needs get_per_layer_parameters mocked since we don't have
# real model layers registered
if backend_name == "FLASHINFER":
import unittest.mock
from vllm.v1.attention.backends.utils import PerLayerParameters
def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
head_size = vllm_config.model_config.get_head_size()
return {
layer_name: PerLayerParameters(
window_left=-1, # No sliding window
logits_soft_cap=0.0, # No soft cap
sm_scale=1.0 / (head_size**0.5), # Standard scale
)
for layer_name in layer_names
}
with unittest.mock.patch(
"vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
mock_get_per_layer_parameters,
):
return builder_cls(
kv_cache_spec=kv_cache_spec, kv_cache_spec=kv_cache_spec,
layer_names=["layer_0"], layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config, vllm_config=vllm_config,
device=device, device=device,
) )
@@ -281,39 +315,44 @@ def _create_input_tensors(
def _create_kv_cache( def _create_kv_cache(
config: BenchmarkConfig, config: BenchmarkConfig,
max_num_blocks: int, max_num_blocks: int,
cache_layout: str, backend_class,
device: torch.device, device: torch.device,
dtype: torch.dtype, dtype: torch.dtype,
) -> list: ) -> list:
"""Create KV cache tensors for all layers.""" """Create KV cache tensors for all layers using the backend's methods.
if cache_layout == "flashinfer":
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim] Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
cache_list = [ to create the cache with the correct shape and memory layout.
torch.zeros( """
max_num_blocks, # Get the logical shape from the backend
2, cache_shape = backend_class.get_kv_cache_shape(
config.block_size, num_blocks=max_num_blocks,
config.num_kv_heads, block_size=config.block_size,
config.head_dim, num_kv_heads=config.num_kv_heads,
device=device, head_size=config.head_dim,
dtype=dtype,
) )
for _ in range(config.num_layers)
] # Get the stride order for custom memory layout
else: try:
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim] stride_order = backend_class.get_kv_cache_stride_order()
cache_list = [ assert len(stride_order) == len(cache_shape)
torch.zeros( except (AttributeError, NotImplementedError):
2, stride_order = tuple(range(len(cache_shape)))
max_num_blocks,
config.block_size, # Permute shape to physical layout order
config.num_kv_heads, physical_shape = tuple(cache_shape[i] for i in stride_order)
config.head_dim,
device=device, # Compute inverse permutation to get back to logical view
dtype=dtype, inv_order = [stride_order.index(i) for i in range(len(stride_order))]
)
for _ in range(config.num_layers) cache_list = []
] for _ in range(config.num_layers):
# Allocate in physical layout order (contiguous in memory)
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
# Permute to logical view
cache = cache.permute(*inv_order)
cache_list.append(cache)
return cache_list return cache_list
@@ -396,7 +435,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
""" """
Run standard attention benchmark with real kernels. Run standard attention benchmark with real kernels.
Supports: flash, triton, flashinfer Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER
Args: Args:
config: Benchmark configuration config: Benchmark configuration
@@ -411,20 +450,39 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
requests = parse_batch_spec(config.batch_spec) requests = parse_batch_spec(config.batch_spec)
if config.backend == "flashinfer": if config.backend == "FLASHINFER":
requests = reorder_for_flashinfer(requests) requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests] q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests] kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens) total_q = sum(q_lens)
max_kv = max(kv_lens) max_kv = max(kv_lens)
batch_size = len(q_lens)
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size # Calculate total blocks needed: batch_size * max_blocks_per_request
max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
max_num_blocks = batch_size * max_blocks_per_request
backend_class, impl, layer, dtype = _create_backend_impl( # Suppress vLLM logs during setup to reduce spam
backend_cfg, config, device with log_warnings_and_errors_only():
# Create vllm_config first - uses model's native dtype via "auto"
vllm_config = _create_vllm_config(config, max_num_blocks)
dtype = vllm_config.model_config.dtype
# Wrap everything in set_current_vllm_config context
# This is required for backends like flashinfer that need global config
with set_current_vllm_config(vllm_config):
backend_class, impl, layer = _create_backend_impl(
backend_cfg, config, device, dtype
) )
# Set KV cache layout if the backend requires a specific one
# (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
required_layout = backend_class.get_required_kv_cache_layout()
if required_layout is not None:
set_kv_cache_layout(required_layout)
get_kv_cache_layout.cache_clear()
common_metadata = _build_common_attn_metadata( common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device q_lens, kv_lens, config.block_size, device
) )
@@ -436,10 +494,8 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
dtype=dtype, dtype=dtype,
) )
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
builder = _create_metadata_builder( builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device backend_class, kv_cache_spec, vllm_config, device, config.backend
) )
attn_metadata = builder.build( attn_metadata = builder.build(
@@ -447,10 +503,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
common_attn_metadata=common_metadata, common_attn_metadata=common_metadata,
) )
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype) q_list, k_list, v_list = _create_input_tensors(
config, total_q, device, dtype
)
cache_list = _create_kv_cache( cache_list = _create_kv_cache(
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype config, max_num_blocks, backend_class, device, dtype
) )
times, mem_stats = _run_single_benchmark( times, mem_stats = _run_single_benchmark(

View File

@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT" echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ====================" echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf $LOG_FOLDER rm -rf "$LOG_FOLDER"
rm -rf $PROFILE_PATH rm -rf "$PROFILE_PATH"
mkdir -p $LOG_FOLDER mkdir -p "$LOG_FOLDER"
mkdir -p $PROFILE_PATH mkdir -p "$PROFILE_PATH"
cd "$BASE/vllm" cd "$BASE/vllm"
@@ -114,7 +114,7 @@ start_server() {
# wait for 10 minutes... # wait for 10 minutes...
server_started=0 server_started=0
for i in {1..60}; do for _ in {1..60}; do
# This line checks whether the server is still alive or not, # This line checks whether the server is still alive or not,
# since that we should always have permission to send signal to the server process. # since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break kill -0 $server_pid 2> /dev/null || break
@@ -145,12 +145,12 @@ run_benchmark() {
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log" echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log rm -f "$vllm_log"
pkill -if "vllm serve" || true pkill -if "vllm serve" || true
echo "starting server..." echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead # Call start_server without a profile_dir to avoid profiling overhead
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log "" start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
result=$? result=$?
if [[ "$result" -eq 1 ]]; then if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -168,15 +168,15 @@ run_benchmark() {
# --profile flag is removed from this call # --profile flag is removed from this call
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model "$MODEL" \
--dataset-name random \ --dataset-name random \
--random-input-len $adjusted_input_len \ --random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \ --random-output-len "$OUTPUT_LEN" \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate inf \ --request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 1000 \ --num-prompts 1000 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \ --host "$HOSTNAME" \
@@ -195,20 +195,20 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1)) request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do while ((request_rate > 0)); do
# clear prefix cache # clear prefix cache
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
sleep 5 sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model "$MODEL" \
--dataset-name random \ --dataset-name random \
--random-input-len $adjusted_input_len \ --random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \ --random-output-len "$OUTPUT_LEN" \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate $request_rate \ --request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \ --host "$HOSTNAME" \
@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98
find_gpu_memory_utilization=0 find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument # Pass empty string for profile_dir argument
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$? result=$?
if [[ "$result" -eq 0 ]]; then if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1 find_gpu_memory_utilization=1
@@ -274,7 +274,7 @@ fi
for num_seqs in "${num_seqs_list[@]}"; do for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
done done
done done
echo "finish permutations" echo "finish permutations"
@@ -285,7 +285,7 @@ echo "finish permutations"
if (( $(echo "$best_throughput > 0" | bc -l) )); then if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..." echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput" echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
echo echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt" vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
# Start server with the best params and profiling ENABLED # Start server with the best params and profiling ENABLED
echo "Starting server for profiling..." echo "Starting server for profiling..."
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH" start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag # Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..." echo "Running benchmark with profiling..."
@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
adjusted_input_len=$(( INPUT_LEN - prefix_len )) adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model "$MODEL" \
--dataset-name random \ --dataset-name random \
--random-input-len $adjusted_input_len \ --random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \ --random-output-len "$OUTPUT_LEN" \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate $best_request_rate \ --request-rate "$best_request_rate" \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \ --host "$HOSTNAME" \

View File

@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do
else else
STATUS="FAILURE" STATUS="FAILURE"
((FAILURE_COUNT++)) ((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)") FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
fi fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE") RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")

View File

@@ -0,0 +1,471 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark comparing Triton vs PyTorch sort-based top-k/top-p implementations.
Compares:
- apply_top_k_top_p_triton (Triton binary search)
- apply_top_k_top_p_pytorch (PyTorch sort-based)
Scenarios:
- top_k only (whole batch, partial batch)
- top_p only (whole batch, partial batch)
- mix of top_k and top_p
"""
import argparse
import gc
from dataclasses import dataclass
import torch
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch
from vllm.v1.sample.ops.topk_topp_triton import (
apply_top_k_top_p_triton,
reset_buffer_cache,
)
@dataclass
class BenchmarkConfig:
    """Configuration for a benchmark run."""

    # Identifier of the form "<scenario>_b<batch>_v<vocab // 1000>k";
    # print_summary_table() recovers the scenario by splitting on "_b".
    name: str
    # Number of rows in the logits tensor for this run.
    batch_size: int
    # Number of columns (vocabulary entries) in the logits tensor.
    vocab_size: int
    # k and p can be tensors or None
    k_values: torch.Tensor | None  # [batch_size] or None
    p_values: torch.Tensor | None  # [batch_size] or None
    # Human-readable description printed while the benchmark is running.
    description: str
    ops_pct: float = 0.0  # Percentage of ops relative to batch size
def calculate_ops_pct(
k_values: torch.Tensor | None,
p_values: torch.Tensor | None,
vocab_size: int,
batch_size: int,
) -> float:
"""
Calculate the percentage of active top-k and top-p operations.
Returns percentage where 100% = batch_size ops.
E.g., if all rows have both top-k and top-p active, returns 200%.
"""
active_ops = 0
if k_values is not None:
# Count rows where k < vocab_size (active top-k filtering)
active_ops += (k_values < vocab_size).sum().item()
if p_values is not None:
# Count rows where p < 1.0 (active top-p filtering)
active_ops += (p_values < 1.0).sum().item()
return (active_ops / batch_size) * 100 if batch_size > 0 else 0.0
def create_logits(
    batch_size: int, vocab_size: int, device: str = "cuda"
) -> torch.Tensor:
    """Build a ``[batch_size, vocab_size]`` float32 tensor of peaked logits.

    A single Zipf-like template (probability proportional to rank^-1.1,
    normalized, then log-transformed) is shared by every row; each row holds
    an independent random permutation of that template, so a few tokens carry
    most of the probability mass, as in real model outputs.
    """
    # Template: log of normalized rank^-1.1 weights (unnormalized logits are fine).
    rank_idx = torch.arange(1, vocab_size + 1, dtype=torch.float32, device=device)
    weights = rank_idx.pow(-1.1)
    template = (weights / weights.sum()).log()

    # Fill each row with its own shuffle of the template; one randperm per
    # row, drawn in row order.
    shuffled = torch.empty(
        (batch_size, vocab_size), dtype=template.dtype, device=template.device
    )
    for row in range(batch_size):
        shuffled[row] = template[torch.randperm(vocab_size, device=device)]
    return shuffled
def measure_memory() -> tuple[int, int]:
    """Return (allocated, peak_allocated) CUDA memory in bytes.

    The first value is ``torch.cuda.memory_allocated()`` and the second is
    ``torch.cuda.max_memory_allocated()``; neither is the allocator's
    *reserved* memory.
    """
    # Wait for queued GPU work so the counters reflect completed kernels.
    torch.cuda.synchronize()
    return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
def reset_memory_stats():
    """Release cached buffers and reset CUDA peak-memory statistics.

    The order matters: memory is freed first and the peak counter is reset
    last, so the new peak baseline does not include allocations that are
    about to be released.
    """
    # Clear the Triton implementation's buffer cache
    # (NOTE(review): semantics of reset_buffer_cache are defined elsewhere).
    reset_buffer_cache()
    # Drop unreachable Python objects so any tensors they hold are freed ...
    gc.collect()
    # ... then hand the freed blocks back from the caching allocator ...
    torch.cuda.empty_cache()
    # ... and only now start a fresh peak measurement.
    torch.cuda.reset_peak_memory_stats()
def benchmark_function(
func,
logits: torch.Tensor,
k: torch.Tensor | None,
p: torch.Tensor | None,
warmup_iters: int = 5,
benchmark_iters: int = 20,
) -> tuple[float, int]:
"""
Benchmark a function and return (avg_time_ms, peak_memory_bytes).
Returns average time in milliseconds and peak memory usage.
"""
# Warmup
for _ in range(warmup_iters):
logits_copy = logits.clone()
func(logits_copy, k, p)
torch.cuda.synchronize()
# Reset memory stats before benchmark
reset_memory_stats()
# Benchmark
start_events = [
torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)
]
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)]
for i in range(benchmark_iters):
logits_copy = logits.clone()
start_events[i].record()
func(logits_copy, k, p)
end_events[i].record()
torch.cuda.synchronize()
# Calculate timing
times = [
start_events[i].elapsed_time(end_events[i]) for i in range(benchmark_iters)
]
avg_time = sum(times) / len(times)
# Get peak memory
_, peak_memory = measure_memory()
return avg_time, peak_memory
def _make_config(
    scenario: str,
    detail: str,
    batch_size: int,
    vocab_size: int,
    k_values: torch.Tensor | None,
    p_values: torch.Tensor | None,
) -> BenchmarkConfig:
    """Build one BenchmarkConfig, adding the shared name/description suffixes."""
    return BenchmarkConfig(
        name=f"{scenario}_b{batch_size}_v{vocab_size // 1000}k",
        batch_size=batch_size,
        vocab_size=vocab_size,
        k_values=k_values,
        p_values=p_values,
        description=f"{detail}, batch={batch_size}, vocab={vocab_size}",
        ops_pct=calculate_ops_pct(k_values, p_values, vocab_size, batch_size),
    )


def create_benchmark_configs(
    batch_sizes: list[int],
    vocab_sizes: list[int],
    device: str = "cuda",
) -> list[BenchmarkConfig]:
    """Create all benchmark configurations.

    For every (vocab_size, batch_size) pair, six scenarios are generated in
    this order: top-k whole batch, top-k partial batch, top-p whole batch,
    top-p partial batch, top-k + top-p whole batch, and a mixed partial batch.

    Args:
        batch_sizes: Batch sizes to generate scenarios for.
        vocab_sizes: Vocabulary sizes to generate scenarios for.
        device: Device on which the k/p tensors are allocated.

    Returns:
        Configurations ordered by vocab size, then batch size, then scenario.
    """
    configs = []
    for vocab_size in vocab_sizes:
        for batch_size in batch_sizes:
            half = batch_size // 2
            third = batch_size // 3

            # 1. Top-k only - whole batch (all rows have k < vocab_size)
            k_all = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
            configs.append(
                _make_config(
                    "topk_whole",
                    "Top-k only (whole batch, k=50)",
                    batch_size,
                    vocab_size,
                    k_all,
                    None,
                )
            )

            # 2. Top-k only - partial batch (half have k=50, half k=vocab_size)
            k_partial = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
            k_partial[half:] = vocab_size  # No filtering for second half
            configs.append(
                _make_config(
                    "topk_partial",
                    "Top-k only (partial batch, 50% k=50, 50% k=vocab)",
                    batch_size,
                    vocab_size,
                    k_partial,
                    None,
                )
            )

            # 3. Top-p only - whole batch (all rows have p < 1.0)
            p_all = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
            configs.append(
                _make_config(
                    "topp_whole",
                    "Top-p only (whole batch, p=0.9)",
                    batch_size,
                    vocab_size,
                    None,
                    p_all,
                )
            )

            # 4. Top-p only - partial batch (half have p=0.9, half have p=1.0)
            p_partial = torch.full(
                (batch_size,), 0.9, dtype=torch.float32, device=device
            )
            p_partial[half:] = 1.0  # No filtering for second half
            configs.append(
                _make_config(
                    "topp_partial",
                    "Top-p only (partial batch, 50% p=0.9, 50% p=1.0)",
                    batch_size,
                    vocab_size,
                    None,
                    p_partial,
                )
            )

            # 5. Mix of top-k and top-p (both applied to whole batch)
            k_mix = torch.full((batch_size,), 100, dtype=torch.int32, device=device)
            p_mix = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
            configs.append(
                _make_config(
                    "topk_topp_whole",
                    "Top-k + Top-p (whole batch, k=100, p=0.9)",
                    batch_size,
                    vocab_size,
                    k_mix,
                    p_mix,
                )
            )

            # 6. Mix with partial application: start from "no filtering"
            # everywhere, then enable k only / p only / both per third.
            # For batch_size < 3, `third` is 0 and every row falls in the
            # "both" group.
            k_mixed = torch.full(
                (batch_size,), vocab_size, dtype=torch.int32, device=device
            )
            p_mixed = torch.full((batch_size,), 1.0, dtype=torch.float32, device=device)
            # First third: k only
            k_mixed[:third] = 50
            # Second third: p only
            p_mixed[third : 2 * third] = 0.5
            # Last third: both k and p
            k_mixed[2 * third :] = 100
            p_mixed[2 * third :] = 0.9
            configs.append(
                _make_config(
                    "mixed_partial",
                    # Description states the values actually assigned above.
                    "Mixed partial (1/3 k=50, 1/3 p=0.5, 1/3 k=100 + p=0.9)",
                    batch_size,
                    vocab_size,
                    k_mixed,
                    p_mixed,
                )
            )
    return configs
def format_memory(bytes_val: int) -> str:
    """Render a byte count using the largest binary unit that fits.

    Values of at least 1 KB are shown with two decimals in KB, MB or GB
    (powers of 1024); smaller values are shown as a plain byte count.
    """
    # Largest unit first so the first threshold that is met wins.
    units = (("GB", 1024**3), ("MB", 1024**2), ("KB", 1024))
    for suffix, scale in units:
        if bytes_val >= scale:
            return f"{bytes_val / scale:.2f} {suffix}"
    return f"{bytes_val} B"
def run_benchmark(
    configs: list[BenchmarkConfig],
    warmup_iters: int = 5,
    benchmark_iters: int = 20,
    verbose: bool = True,
):
    """Benchmark both implementations on every config and collect results.

    For each config a fresh logits tensor is created, the Triton
    implementation is measured first and the PyTorch one second, and a dict
    with timings, peak memory, speedup and memory ratio is appended to the
    returned list. Per-config details are printed when ``verbose`` is set.
    """

    def _measure(impl, logits, cfg):
        # Start every measurement from a clean memory baseline.
        reset_memory_stats()
        return benchmark_function(
            impl,
            logits,
            cfg.k_values,
            cfg.p_values,
            warmup_iters,
            benchmark_iters,
        )

    banner = "=" * 100
    print(banner)
    print("Top-k/Top-p Benchmark: Triton vs PyTorch Sort-based")
    print(banner)
    print()

    results = []
    for config in configs:
        if verbose:
            print(f"Running: {config.description}")

        # Both implementations are fed clones of the same logits.
        logits = create_logits(config.batch_size, config.vocab_size)
        triton_time, triton_mem = _measure(apply_top_k_top_p_triton, logits, config)
        pytorch_time, pytorch_mem = _measure(apply_top_k_top_p_pytorch, logits, config)

        # Ratios above 1 favor Triton; a zero denominator maps to infinity.
        speedup = pytorch_time / triton_time if triton_time > 0 else float("inf")
        mem_ratio = pytorch_mem / triton_mem if triton_mem > 0 else float("inf")

        results.append(
            {
                "config": config,
                "triton_time_ms": triton_time,
                "pytorch_time_ms": pytorch_time,
                "triton_mem": triton_mem,
                "pytorch_mem": pytorch_mem,
                "speedup": speedup,
                "mem_ratio": mem_ratio,
            }
        )

        if verbose:
            print(f" Triton: {triton_time:.3f} ms, {format_memory(triton_mem)}")
            print(f" PyTorch: {pytorch_time:.3f} ms, {format_memory(pytorch_mem)}")
            print(f" Speedup: {speedup:.2f}x, Memory ratio: {mem_ratio:.2f}x")
            print()

        # Release this config's logits before moving on to the next one.
        del logits
        reset_memory_stats()
    return results
def print_summary_table(results: list[dict]):
    """Print one aligned row per result, with a rule between vocab sizes.

    Each entry of ``results`` is a dict as produced by ``run_benchmark``:
    it must provide "config", "triton_time_ms", "pytorch_time_ms", "speedup",
    "triton_mem" and "pytorch_mem".
    """
    heavy_rule = "=" * 130
    light_rule = "-" * 130

    print()
    print(heavy_rule)
    print("SUMMARY TABLE")
    print(heavy_rule)
    print()

    # Column titles; widths mirror the row cells below.
    titles = [
        f"{'Scenario':<40}",
        f"{'Batch':>6}",
        f"{'Vocab':>7}",
        f"{'Ops%':>6}",
        f"{'Triton (ms)':>12}",
        f"{'PyTorch (ms)':>13}",
        f"{'Speedup':>8}",
        f"{'Tri Mem':>10}",
        f"{'Pyt Mem':>10}",
    ]
    print(" ".join(titles))
    print(light_rule)

    # Rows arrive ordered by vocab size; draw a rule whenever it changes.
    last_vocab = None
    for entry in results:
        cfg = entry["config"]
        if last_vocab != cfg.vocab_size:
            if last_vocab is not None:
                print(light_rule)
            last_vocab = cfg.vocab_size

        # The scenario label is the part of the name before "_b<batch>".
        label = cfg.name.split("_b")[0]
        cells = [
            f"{label:<40}",
            f"{cfg.batch_size:>6}",
            f"{cfg.vocab_size:>7}",
            f"{cfg.ops_pct:>5.0f}%",
            f"{entry['triton_time_ms']:>12.3f}",
            f"{entry['pytorch_time_ms']:>13.3f}",
            f"{entry['speedup']:>7.2f}x",
            f"{format_memory(entry['triton_mem']):>10}",
            f"{format_memory(entry['pytorch_mem']):>10}",
        ]
        print(" ".join(cells))
    print(heavy_rule)
def main():
    """Parse CLI options, run every benchmark scenario and print a summary.

    Prints an error and returns early when CUDA is not available.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark Triton vs PyTorch sort-based top-k/top-p implementations"
    )
    parser.add_argument(
        "--batch-sizes",
        type=int,
        nargs="+",
        default=[1, 4, 16, 64, 128, 512, 1024, 2048],
        # Help text lists the real default so --help does not mislead users.
        help="Batch sizes to test (default: 1 4 16 64 128 512 1024 2048)",
    )
    parser.add_argument(
        "--vocab-sizes",
        type=int,
        nargs="+",
        default=[32768, 131072],  # 32k, 128k
        help="Vocabulary sizes to test (default: 32768 131072)",
    )
    parser.add_argument(
        "--warmup-iters",
        type=int,
        default=5,
        help="Number of warmup iterations (default: 5)",
    )
    parser.add_argument(
        "--benchmark-iters",
        type=int,
        default=20,
        help="Number of benchmark iterations (default: 20)",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Only print summary table",
    )
    args = parser.parse_args()

    # Print configuration
    print(f"Batch sizes: {args.batch_sizes}")
    print(f"Vocab sizes: {args.vocab_sizes}")
    print(f"Warmup iterations: {args.warmup_iters}")
    print(f"Benchmark iterations: {args.benchmark_iters}")
    print()

    # Check CUDA
    if not torch.cuda.is_available():
        print("ERROR: CUDA is not available. This benchmark requires a GPU.")
        return
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU: {device_name}")
    print()

    # Create configs
    configs = create_benchmark_configs(
        args.batch_sizes,
        args.vocab_sizes,
    )

    # Run benchmarks
    results = run_benchmark(
        configs,
        warmup_iters=args.warmup_iters,
        benchmark_iters=args.benchmark_iters,
        verbose=not args.quiet,
    )

    # Print summary
    print_summary_table(results)


if __name__ == "__main__":
    main()

View File

@@ -13,6 +13,7 @@ from torch.utils.benchmark import Measurement as TMeasurement
from tqdm import tqdm from tqdm import tqdm
import vllm._custom_ops as ops import vllm._custom_ops as ops
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8, per_token_group_quant_fp8,
@@ -291,6 +292,7 @@ def print_timers(timers: Iterable[TMeasurement]):
compare.print() compare.print()
@default_vllm_config()
def main(): def main():
torch.set_default_device("cuda") torch.set_default_device("cuda")
bench_params = get_bench_params() bench_params = get_bench_params()

View File

@@ -7,6 +7,7 @@ import itertools
import torch import torch
import vllm.model_executor.layers.activation # noqa F401 import vllm.model_executor.layers.activation # noqa F401
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.custom_op import op_registry from vllm.model_executor.custom_op import op_registry
from vllm.triton_utils import triton from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -18,6 +19,7 @@ intermediate_size = [3072, 9728, 12288]
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
@default_vllm_config()
def benchmark_activation( def benchmark_activation(
batch_size: int, batch_size: int,
seq_len: int, seq_len: int,

View File

@@ -8,6 +8,7 @@ os.environ["VLLM_USE_DEEP_GEMM"] = "0"
import torch import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp, W8A8BlockFp8LinearOp,
) )
@@ -40,6 +41,7 @@ DEEPSEEK_V3_SHAPES = [
] ]
@default_vllm_config()
def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
"""Build runner function for w8a8 block fp8 matmul.""" """Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2 factor_for_scale = 1e-2

View File

@@ -11,6 +11,7 @@ import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_dummy_moe_config from tests.kernels.moe.utils import make_dummy_moe_config
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
@@ -161,7 +162,7 @@ def bench_run(
w2_fp8q_cutlass, w2_fp8q_cutlass,
topk_weights, topk_weights,
topk_ids, topk_ids,
activation="silu", activation=MoEActivation.SILU,
global_num_experts=num_experts, global_num_experts=num_experts,
) )
torch.cuda.synchronize() torch.cuda.synchronize()

View File

@@ -30,6 +30,9 @@ import torch.distributed as dist
from torch.distributed import ProcessGroup from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.flashinfer_all_reduce import (
FlashInferAllReduce,
)
from vllm.distributed.device_communicators.pynccl import ( from vllm.distributed.device_communicators.pynccl import (
PyNcclCommunicator, PyNcclCommunicator,
register_nccl_symmetric_ops, register_nccl_symmetric_ops,
@@ -44,7 +47,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__) logger = init_logger(__name__)
# Default sequence lengths to benchmark # Default sequence lengths to benchmark
DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192] DEFAULT_SEQUENCE_LENGTHS = [16, 64, 128, 512, 1024, 2048, 4096, 8192]
# Fixed hidden size and dtype for all benchmarks # Fixed hidden size and dtype for all benchmarks
HIDDEN_SIZE = 8192 HIDDEN_SIZE = 8192
@@ -81,6 +84,7 @@ class CommunicatorBenchmark:
self.symm_mem_comm = None self.symm_mem_comm = None
self.symm_mem_comm_multimem = None self.symm_mem_comm_multimem = None
self.symm_mem_comm_two_shot = None self.symm_mem_comm_two_shot = None
self.fi_ar_comm = None
self._init_communicators() self._init_communicators()
@@ -161,6 +165,22 @@ class CommunicatorBenchmark:
) )
self.symm_mem_comm_two_shot = None self.symm_mem_comm_two_shot = None
try:
self.fi_ar_comm = FlashInferAllReduce(
group=self.cpu_group,
device=self.device,
)
if not self.fi_ar_comm.disabled:
logger.info("Rank %s: FlashInferAllReduce initialized", self.rank)
else:
logger.info("Rank %s: FlashInferAllReduce disabled", self.rank)
self.fi_ar_comm = None
except Exception as e:
logger.warning(
"Rank %s: Failed to initialize FlashInferAllReduce: %s", self.rank, e
)
self.fi_ar_comm = None
def benchmark_allreduce( def benchmark_allreduce(
self, sequence_length: int, num_warmup: int, num_trials: int self, sequence_length: int, num_warmup: int, num_trials: int
) -> dict[str, float]: ) -> dict[str, float]:
@@ -180,7 +200,8 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.custom_all_reduce(t), lambda t, c=comm: c.custom_all_reduce(t),
lambda t, c=comm: c.should_custom_ar(t), lambda t, c=comm: c.should_custom_ar(t),
comm.capture(), comm.capture(),
"1stage", # env variable value {"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"},
None, # no destroy function
) )
) )
# CustomAllreduce two-shot # CustomAllreduce two-shot
@@ -190,7 +211,8 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.custom_all_reduce(t), lambda t, c=comm: c.custom_all_reduce(t),
lambda t, c=comm: c.should_custom_ar(t), lambda t, c=comm: c.should_custom_ar(t),
comm.capture(), comm.capture(),
"2stage", # env variable value {"VLLM_CUSTOM_ALLREDUCE_ALGO": "2stage"},
None, # no destroy function
) )
) )
@@ -202,7 +224,8 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t), lambda t, c=comm: c.all_reduce(t),
lambda t: True, # Always available if initialized lambda t: True, # Always available if initialized
nullcontext(), nullcontext(),
None, # no env variable needed {}, # no env variable needed
None, # no destroy function
) )
) )
communicators.append( communicators.append(
@@ -211,7 +234,8 @@ class CommunicatorBenchmark:
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t), lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized lambda t: True, # Always available if initialized
nullcontext(), nullcontext(),
None, # no env variable needed {}, # no env variable needed
None, # no destroy function
) )
) )
@@ -223,7 +247,8 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t), lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_symm_mem(t), lambda t, c=comm: c.should_use_symm_mem(t),
nullcontext(), nullcontext(),
None, # no env variable needed {}, # no env variable needed
None, # no destroy function
) )
) )
@@ -235,19 +260,48 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t), lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_symm_mem(t), lambda t, c=comm: c.should_use_symm_mem(t),
nullcontext(), nullcontext(),
None, # no env variable needed {}, # no env variable needed
None, # no destroy function needed
)
)
if self.fi_ar_comm is not None:
comm = self.fi_ar_comm
communicators.append(
(
"flashinfer_trtllm",
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_fi_ar(t),
nullcontext(),
{"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "trtllm"},
lambda c=comm: c.destroy(),
)
)
communicators.append(
(
"flashinfer_mnnvl",
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_fi_ar(t),
nullcontext(),
{"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "mnnvl"},
lambda c=comm: c.destroy(),
) )
) )
# Benchmark each communicator # Benchmark each communicator
for name, allreduce_fn, should_use_fn, context, env_var in communicators: for (
# Set environment variable if needed name,
if env_var is not None: allreduce_fn,
os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var should_use_fn,
else: context,
# Clear the environment variable to avoid interference env_dict,
os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None) destroy_fn,
) in communicators:
# Save original values and apply new environment variables
saved_env = {key: os.environ.get(key) for key in env_dict}
for key, value in env_dict.items():
os.environ[key] = value
try:
latency = self.benchmark_allreduce_single( latency = self.benchmark_allreduce_single(
sequence_length, sequence_length,
allreduce_fn, allreduce_fn,
@@ -258,6 +312,15 @@ class CommunicatorBenchmark:
) )
if latency is not None: if latency is not None:
results[name] = latency results[name] = latency
finally:
if destroy_fn is not None:
destroy_fn()
# Restore environment variables to their original state
for key, original_value in saved_env.items():
if original_value is None:
os.environ.pop(key, None)
else:
os.environ[key] = original_value
return results return results

View File

@@ -5,8 +5,11 @@
Benchmark for FlashInfer fused collective operations vs standard operations. Benchmark for FlashInfer fused collective operations vs standard operations.
This benchmark compares: This benchmark compares:
1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant) 1. FlashInfer's allreduce_fusion with trtllm backend
2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations (fused allreduce + rmsnorm + optional FP8/FP4 quant)
2. FlashInfer's allreduce_fusion with mnnvl backend
(fused allreduce + rmsnorm only, no quantization support)
3. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
Usage with torchrun: Usage with torchrun:
torchrun --nproc_per_node=2 benchmark_fused_collective.py torchrun --nproc_per_node=2 benchmark_fused_collective.py
@@ -24,7 +27,6 @@ import torch.distributed as dist # type: ignore
from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.distributed import ( from vllm.distributed import (
get_tp_group,
tensor_model_parallel_all_reduce, tensor_model_parallel_all_reduce,
) )
from vllm.distributed.parallel_state import ( from vllm.distributed.parallel_state import (
@@ -49,14 +51,19 @@ SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant
logger = init_logger(__name__) logger = init_logger(__name__)
# Try to import FlashInfer # Try to import FlashInfer
TorchDistBackend = None
try: try:
import flashinfer.comm as flashinfer_comm # type: ignore import flashinfer.comm as flashinfer_comm # type: ignore
from flashinfer.comm.mnnvl import ( # type: ignore
if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"): TorchDistBackend,
flashinfer_comm = None
logger.warning(
"FlashInfer comm module found but missing trtllm_allreduce_fusion"
) )
if not (
hasattr(flashinfer_comm, "allreduce_fusion")
and hasattr(flashinfer_comm, "create_allreduce_fusion_workspace")
):
flashinfer_comm = None
logger.warning("FlashInfer comm module found but missing allreduce_fusion API")
except ImportError: except ImportError:
flashinfer_comm = None flashinfer_comm = None
logger.warning("FlashInfer not found, only benchmarking standard operations") logger.warning("FlashInfer not found, only benchmarking standard operations")
@@ -74,57 +81,70 @@ _FI_MAX_SIZES = {
8: 64 * MiB, # 64MB 8: 64 * MiB, # 64MB
} }
# Global workspace tensor for FlashInfer # Global workspace tensors for FlashInfer (keyed by backend name)
_FI_WORKSPACE_TENSOR = None _FI_WORKSPACES: dict = {}
# Backends to benchmark
FLASHINFER_BACKENDS = ["trtllm", "mnnvl"]
def setup_flashinfer_workspace( def setup_flashinfer_workspace(
backend: str,
world_size: int, world_size: int,
rank: int, rank: int,
hidden_dim: int, hidden_dim: int,
max_token_num: int, max_token_num: int,
use_fp32_lamport: bool = False, dtype: torch.dtype,
): ):
"""Setup FlashInfer workspace for fused allreduce operations.""" """Setup FlashInfer workspace for fused allreduce operations."""
global _FI_WORKSPACE_TENSOR global FI_WORKSPACES
if flashinfer_comm is None: if flashinfer_comm is None:
return None, None return None
if world_size not in _FI_MAX_SIZES: if world_size not in _FI_MAX_SIZES:
logger.warning("FlashInfer not supported for world size %s", world_size) logger.warning("FlashInfer not supported for world size %s", world_size)
return None, None return None
try: try:
# Create IPC workspace kwargs = {}
ipc_handles, workspace_tensor = ( if TorchDistBackend is not None:
flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion( kwargs["comm_backend"] = TorchDistBackend(group=dist.group.WORLD)
tp_rank=rank,
tp_size=world_size, workspace = flashinfer_comm.create_allreduce_fusion_workspace(
backend=backend,
world_size=world_size,
rank=rank,
max_token_num=max_token_num, max_token_num=max_token_num,
hidden_dim=hidden_dim, hidden_dim=hidden_dim,
group=get_tp_group().device_group, dtype=dtype,
use_fp32_lamport=use_fp32_lamport, **kwargs,
)
) )
_FI_WORKSPACE_TENSOR = workspace_tensor _FI_WORKSPACES[backend] = workspace
return ipc_handles, workspace_tensor return workspace
except Exception as e: except Exception as e:
logger.error("Failed to setup FlashInfer workspace: %s", e) logger.error(
return None, None "Failed to setup FlashInfer workspace (backend=%s): %s", backend, e
)
return None
def cleanup_flashinfer_workspace(ipc_handles): def cleanup_flashinfer_workspaces():
"""Cleanup FlashInfer workspace.""" """Cleanup all FlashInfer workspaces."""
if flashinfer_comm is None or ipc_handles is None: if flashinfer_comm is None:
return return
for backend, workspace in _FI_WORKSPACES.items():
try: try:
group = get_tp_group().device_group workspace.destroy()
flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group)
except Exception as e: except Exception as e:
logger.error("Failed to cleanup FlashInfer workspace: %s", e) logger.error(
"Failed to cleanup FlashInfer workspace (backend=%s): %s",
backend,
e,
)
_FI_WORKSPACES.clear()
class FlashInferFusedAllReduceParams: class FlashInferFusedAllReduceParams:
@@ -132,25 +152,15 @@ class FlashInferFusedAllReduceParams:
def __init__( def __init__(
self, self,
rank: int,
world_size: int,
use_fp32_lamport: bool = False,
max_token_num: int = 1024, max_token_num: int = 1024,
): ):
self.rank = rank
self.world_size = world_size
self.use_fp32_lamport = use_fp32_lamport
self.trigger_completion_at_end = True
self.launch_with_pdl = True self.launch_with_pdl = True
self.fp32_acc = True self.fp32_acc = True
self.max_token_num = max_token_num self.max_token_num = max_token_num
def get_trtllm_fused_allreduce_kwargs(self): def get_flashinfer_fused_allreduce_kwargs(self):
return { return {
"world_rank": self.rank,
"world_size": self.world_size,
"launch_with_pdl": self.launch_with_pdl, "launch_with_pdl": self.launch_with_pdl,
"trigger_completion_at_end": self.trigger_completion_at_end,
"fp32_acc": self.fp32_acc, "fp32_acc": self.fp32_acc,
} }
@@ -161,11 +171,12 @@ def flashinfer_fused_allreduce_rmsnorm(
rms_gamma: torch.Tensor, rms_gamma: torch.Tensor,
rms_eps: float, rms_eps: float,
allreduce_params: "FlashInferFusedAllReduceParams", allreduce_params: "FlashInferFusedAllReduceParams",
workspace: object,
use_oneshot: bool, use_oneshot: bool,
norm_out: torch.Tensor | None = None, norm_out: torch.Tensor | None = None,
): ):
"""FlashInfer fused allreduce + rmsnorm operation.""" """FlashInfer fused allreduce + rmsnorm operation."""
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: if flashinfer_comm is None or workspace is None:
raise RuntimeError("FlashInfer not available or workspace not initialized") raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None: if norm_out is None:
@@ -174,24 +185,25 @@ def flashinfer_fused_allreduce_rmsnorm(
else: else:
residual_out = input_tensor residual_out = input_tensor
flashinfer_comm.trtllm_allreduce_fusion( layout_code = None
allreduce_in=input_tensor, if workspace.backend == "trtllm":
token_num=input_tensor.shape[0], layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4
flashinfer_comm.allreduce_fusion(
input=input_tensor,
workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
residual_in=residual, residual_in=residual,
residual_out=residual_out, residual_out=residual_out,
norm_out=norm_out, norm_out=norm_out,
rms_gamma=rms_gamma, rms_gamma=rms_gamma,
rms_eps=rms_eps, rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
allreduce_out=None,
quant_out=None, quant_out=None,
scale_out=None, scale_out=None,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, layout_code=layout_code,
scale_factor=None, scale_factor=None,
use_oneshot=use_oneshot, use_oneshot=use_oneshot,
**allreduce_params.get_trtllm_fused_allreduce_kwargs(), **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
) )
@@ -202,12 +214,16 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
rms_eps: float, rms_eps: float,
scale_factor: torch.Tensor, scale_factor: torch.Tensor,
allreduce_params: FlashInferFusedAllReduceParams, allreduce_params: FlashInferFusedAllReduceParams,
workspace: object,
use_oneshot: bool = True, use_oneshot: bool = True,
norm_out: torch.Tensor | None = None, norm_out: torch.Tensor | None = None,
quant_out: torch.Tensor | None = None, quant_out: torch.Tensor | None = None,
): ):
"""FlashInfer fused allreduce + rmsnorm + FP8 quantization.""" """FlashInfer fused allreduce + rmsnorm + FP8 quantization.
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
Note: Only supported by the trtllm backend.
"""
if flashinfer_comm is None or workspace is None:
raise RuntimeError("FlashInfer not available or workspace not initialized") raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None: if norm_out is None:
@@ -216,24 +232,21 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
else: else:
residual_out = input_tensor residual_out = input_tensor
flashinfer_comm.trtllm_allreduce_fusion( flashinfer_comm.allreduce_fusion(
allreduce_in=input_tensor, input=input_tensor,
token_num=input_tensor.shape[0], workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
residual_in=residual, residual_in=residual,
residual_out=residual_out, residual_out=residual_out,
norm_out=norm_out, norm_out=norm_out,
rms_gamma=rms_gamma, rms_gamma=rms_gamma,
rms_eps=rms_eps, rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
allreduce_out=None,
quant_out=quant_out, quant_out=quant_out,
scale_out=None, scale_out=None,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
scale_factor=scale_factor, scale_factor=scale_factor,
use_oneshot=use_oneshot, use_oneshot=use_oneshot,
**allreduce_params.get_trtllm_fused_allreduce_kwargs(), **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
) )
@@ -244,13 +257,17 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
rms_eps: float, rms_eps: float,
input_global_scale: torch.Tensor, input_global_scale: torch.Tensor,
allreduce_params: FlashInferFusedAllReduceParams, allreduce_params: FlashInferFusedAllReduceParams,
workspace: object,
quant_out: torch.Tensor, quant_out: torch.Tensor,
use_oneshot: bool, use_oneshot: bool,
output_scale: torch.Tensor, output_scale: torch.Tensor,
norm_out: torch.Tensor | None = None, norm_out: torch.Tensor | None = None,
): ):
"""FlashInfer fused allreduce + rmsnorm + FP4 quantization.""" """FlashInfer fused allreduce + rmsnorm + FP4 quantization.
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
Note: Only supported by the trtllm backend.
"""
if flashinfer_comm is None or workspace is None:
raise RuntimeError("FlashInfer not available or workspace not initialized") raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None: if norm_out is None:
@@ -259,24 +276,21 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
else: else:
residual_out = input_tensor residual_out = input_tensor
flashinfer_comm.trtllm_allreduce_fusion( flashinfer_comm.allreduce_fusion(
allreduce_in=input_tensor, input=input_tensor,
token_num=input_tensor.shape[0], workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
residual_in=residual, residual_in=residual,
residual_out=residual_out, residual_out=residual_out,
norm_out=norm_out, norm_out=norm_out,
rms_gamma=rms_gamma, rms_gamma=rms_gamma,
rms_eps=rms_eps, rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
allreduce_out=None,
quant_out=quant_out, quant_out=quant_out,
scale_out=output_scale, scale_out=output_scale,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
scale_factor=input_global_scale, scale_factor=input_global_scale,
use_oneshot=use_oneshot, use_oneshot=use_oneshot,
**allreduce_params.get_trtllm_fused_allreduce_kwargs(), **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
) )
@@ -409,13 +423,16 @@ def run_benchmarks(
dtype: torch.dtype, dtype: torch.dtype,
use_residual: bool, use_residual: bool,
allreduce_params: FlashInferFusedAllReduceParams | None, allreduce_params: FlashInferFusedAllReduceParams | None,
workspaces: dict,
quant_modes: set[str], quant_modes: set[str],
no_oneshot: bool, no_oneshot: bool,
): ):
"""Run all benchmarks for given configuration. """Run all benchmarks for given configuration.
Args: Args:
quant_mode: "none", "fp8_only", "fp4_only", or "all" allreduce_params: Shared parameters for FlashInfer fused allreduce.
workspaces: Dict mapping backend name ("trtllm", "mnnvl") to workspace.
quant_modes: Set of quantization modes: "none", "fp8", "fp4".
""" """
( (
input_tensor, input_tensor,
@@ -431,18 +448,18 @@ def run_benchmarks(
rms_eps = 1e-6 rms_eps = 1e-6
results = {} results = {}
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
use_oneshot_options = [False] if no_oneshot else [True, False] use_oneshot_options = [False] if no_oneshot else [True, False]
# Create RMSNorm and QuantFP8 layers once for native benchmarks
if "none" in quant_modes: if "none" in quant_modes:
# Standard AllReduce + RMSNorm # Standard AllReduce + RMSNorm
# Re-create VllmFusedAllreduce per config so CustomOp binds the
# correct forward method (native vs custom kernel).
for custom_op in ["-rms_norm", "+rms_norm"]: for custom_op in ["-rms_norm", "+rms_norm"]:
with set_current_vllm_config( with set_current_vllm_config(
VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op])) VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op]))
): ):
try: try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
suffix = ( suffix = (
"_custom_rms_norm" if "+" in custom_op else "_native_rms_norm" "_custom_rms_norm" if "+" in custom_op else "_native_rms_norm"
) )
@@ -461,6 +478,7 @@ def run_benchmarks(
VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"])) VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
): ):
try: try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_native_compiled = torch.compile( standard_allreduce_rmsnorm_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm, vllm_fused_allreduce.allreduce_rmsnorm,
fullgraph=True, fullgraph=True,
@@ -476,10 +494,11 @@ def run_benchmarks(
logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e) logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
results["standard_allreduce_rmsnorm_native_compiled"] = float("inf") results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
# FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot # FlashInfer Fused AllReduce + RMSNorm (all backends)
if flashinfer_comm is not None and allreduce_params is not None: for backend, workspace in workspaces.items():
for use_oneshot in use_oneshot_options: for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot" suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_{backend}_fused_allreduce_rmsnorm{suffix}"
try: try:
time_ms = benchmark_operation( time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm, flashinfer_fused_allreduce_rmsnorm,
@@ -489,14 +508,17 @@ def run_benchmarks(
rms_gamma=rms_gamma, rms_gamma=rms_gamma,
rms_eps=rms_eps, rms_eps=rms_eps,
allreduce_params=allreduce_params, allreduce_params=allreduce_params,
workspace=workspace,
use_oneshot=use_oneshot, use_oneshot=use_oneshot,
) )
results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms results[key] = time_ms
except Exception as e: except Exception as e:
logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e) logger.error(
results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float( "FlashInfer (%s) Fused AllReduce+RMSNorm failed: %s",
"inf" backend,
e,
) )
results[key] = float("inf")
if "fp8" in quant_modes: if "fp8" in quant_modes:
# Standard AllReduce + RMSNorm + FP8 Quant # Standard AllReduce + RMSNorm + FP8 Quant
@@ -505,7 +527,7 @@ def run_benchmarks(
"_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm" "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
) )
for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]: for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]:
suffix += ( op_suffix = suffix + (
"_custom_quant_fp8" "_custom_quant_fp8"
if "+" in quant_fp8_custom_op if "+" in quant_fp8_custom_op
else "_native_quant_fp8" else "_native_quant_fp8"
@@ -518,16 +540,17 @@ def run_benchmarks(
) )
): ):
try: try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
time_ms = benchmark_operation( time_ms = benchmark_operation(
vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant, vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
input_tensor, input_tensor,
residual=residual, residual=residual,
scale_factor=scale_fp8, scale_factor=scale_fp8,
) )
results[f"standard_allreduce{suffix}"] = time_ms results[f"standard_allreduce{op_suffix}"] = time_ms
except Exception as e: except Exception as e:
logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e) logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e)
results[f"standard_allreduce{suffix}"] = float("inf") results[f"standard_allreduce{op_suffix}"] = float("inf")
# Standard AllReduce + RMSNorm + FP8 Quant Native Compiled # Standard AllReduce + RMSNorm + FP8 Quant Native Compiled
with set_current_vllm_config( with set_current_vllm_config(
@@ -538,6 +561,7 @@ def run_benchmarks(
) )
): ):
try: try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile( standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant, vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
fullgraph=True, fullgraph=True,
@@ -560,10 +584,12 @@ def run_benchmarks(
"inf" "inf"
) )
# FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant (trtllm only)
if flashinfer_comm is not None and allreduce_params is not None: if "trtllm" in workspaces:
trtllm_ws = workspaces["trtllm"]
for use_oneshot in use_oneshot_options: for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot" suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp8_quant{suffix}"
try: try:
time_ms = benchmark_operation( time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp8_quant, flashinfer_fused_allreduce_rmsnorm_fp8_quant,
@@ -575,19 +601,16 @@ def run_benchmarks(
scale_factor=scale_fp8, scale_factor=scale_fp8,
quant_out=quant_out_fp8, quant_out=quant_out_fp8,
allreduce_params=allreduce_params, allreduce_params=allreduce_params,
workspace=trtllm_ws,
use_oneshot=use_oneshot, use_oneshot=use_oneshot,
) )
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = ( results[key] = time_ms
time_ms
)
except Exception as e: except Exception as e:
logger.error( logger.error(
"FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s", "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP8 failed: %s",
e, e,
) )
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = ( results[key] = float("inf")
float("inf")
)
if "fp4" in quant_modes and current_platform.has_device_capability(100): if "fp4" in quant_modes and current_platform.has_device_capability(100):
# Standard AllReduce + RMSNorm + FP4 Quant # Standard AllReduce + RMSNorm + FP4 Quant
@@ -603,6 +626,7 @@ def run_benchmarks(
) )
): ):
try: try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
time_ms = benchmark_operation( time_ms = benchmark_operation(
vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant, vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
input_tensor, input_tensor,
@@ -621,6 +645,7 @@ def run_benchmarks(
VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"])) VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
): ):
try: try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile( standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant, vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
fullgraph=True, fullgraph=True,
@@ -645,10 +670,12 @@ def run_benchmarks(
"inf" "inf"
) )
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant (trtllm only)
if flashinfer_comm is not None and allreduce_params is not None: if "trtllm" in workspaces:
trtllm_ws = workspaces["trtllm"]
for use_oneshot in use_oneshot_options: for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot" suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp4_quant{suffix}"
try: try:
time_ms = benchmark_operation( time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp4_quant, flashinfer_fused_allreduce_rmsnorm_fp4_quant,
@@ -659,49 +686,18 @@ def run_benchmarks(
rms_eps=rms_eps, rms_eps=rms_eps,
input_global_scale=scale_fp4, input_global_scale=scale_fp4,
allreduce_params=allreduce_params, allreduce_params=allreduce_params,
workspace=trtllm_ws,
quant_out=fp4_quant_out, quant_out=fp4_quant_out,
output_scale=fp4_output_scale, output_scale=fp4_output_scale,
use_oneshot=use_oneshot, use_oneshot=use_oneshot,
) )
results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = ( results[key] = time_ms
time_ms
)
except Exception as e: except Exception as e:
logger.error( logger.error(
"FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s", "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP4 failed: %s",
e, e,
) )
results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = ( results[key] = float("inf")
float("inf")
)
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
if flashinfer_comm is not None and allreduce_params is not None:
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp4_quant,
input_tensor,
residual=residual,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
input_global_scale=scale_fp4,
allreduce_params=allreduce_params,
quant_out=fp4_quant_out,
output_scale=fp4_output_scale,
use_oneshot=False,
)
results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
time_ms
)
except Exception as e:
logger.error(
"FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
e,
)
results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
"inf"
)
return results return results
@@ -1039,24 +1035,33 @@ def main():
configs = list(itertools.product(args.num_tokens, dtypes, residual_options)) configs = list(itertools.product(args.num_tokens, dtypes, residual_options))
# Setup FlashInfer workspace if available # Setup FlashInfer workspaces for all backends
ipc_handles = None
allreduce_params = None allreduce_params = None
if flashinfer_comm is not None: if flashinfer_comm is not None:
# Use the largest hidden dimension for workspace setup # Use the largest hidden dimension for workspace setup
max_element_size = max(torch.finfo(dt).bits // 8 for dt in dtypes)
workspace_dtype = (
torch.float32
if max_element_size == 4
else (torch.bfloat16 if torch.bfloat16 in dtypes else torch.float16)
)
max_num_token = _FI_MAX_SIZES.get(world_size) // ( max_num_token = _FI_MAX_SIZES.get(world_size) // (
args.hidden_dim * world_size * 2 args.hidden_dim * max_element_size
) )
ipc_handles, workspace_tensor = setup_flashinfer_workspace( for backend in FLASHINFER_BACKENDS:
world_size, rank, args.hidden_dim, max_num_token setup_flashinfer_workspace(
) backend=backend,
if workspace_tensor is not None:
allreduce_params = FlashInferFusedAllReduceParams(
rank=rank,
world_size=world_size, world_size=world_size,
rank=rank,
hidden_dim=args.hidden_dim,
max_token_num=max_num_token,
dtype=workspace_dtype,
)
if _FI_WORKSPACES:
allreduce_params = FlashInferFusedAllReduceParams(
max_token_num=max_num_token, max_token_num=max_num_token,
) )
@@ -1081,6 +1086,7 @@ def main():
dtype, dtype,
use_residual, use_residual,
allreduce_params, allreduce_params,
workspaces=_FI_WORKSPACES,
quant_modes=quant_modes, quant_modes=quant_modes,
no_oneshot=args.no_oneshot, no_oneshot=args.no_oneshot,
) )
@@ -1119,11 +1125,13 @@ def main():
finally: finally:
# Cleanup # Cleanup
if ipc_handles is not None: cleanup_flashinfer_workspaces()
cleanup_flashinfer_workspace(ipc_handles)
dist.barrier() dist.barrier()
if __name__ == "__main__": if __name__ == "__main__":
from vllm.config import VllmConfig, set_current_vllm_config
with set_current_vllm_config(VllmConfig()):
main() main()

View File

@@ -5,12 +5,14 @@ import time
import torch import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@torch.inference_mode() @torch.inference_mode()
@default_vllm_config()
def main( def main(
num_tokens: int, num_tokens: int,
hidden_size: int, hidden_size: int,

View File

@@ -16,6 +16,7 @@ import torch
from ray.experimental.tqdm_ray import tqdm from ray.experimental.tqdm_ray import tqdm
from vllm.model_executor.layers.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEConfig,
FusedMoEParallelConfig, FusedMoEParallelConfig,
@@ -99,13 +100,38 @@ def benchmark_config(
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
use_int4_w4a16: bool = False,
num_iters: int = 100, num_iters: int = 100,
block_quant_shape: list[int] = None, block_quant_shape: list[int] = None,
use_deep_gemm: bool = False, use_deep_gemm: bool = False,
) -> float: ) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype) x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int8_w8a16: if use_int4_w4a16:
# Int4 packed weights: 2 int4 values per uint8 byte
# K dimension is packed (halved)
intermediate_size = shard_intermediate_size // 2 # after silu_and_mul
w1 = torch.randint(
0,
255,
(
num_experts,
shard_intermediate_size,
hidden_size // 2, # int4 packing
),
dtype=torch.uint8,
)
w2 = torch.randint(
0,
255,
(
num_experts,
hidden_size,
intermediate_size // 2, # int4 packing
),
dtype=torch.uint8,
)
elif use_int8_w8a16:
w1 = torch.randint( w1 = torch.randint(
-127, -127,
127, 127,
@@ -139,7 +165,20 @@ def benchmark_config(
w2_scale = None w2_scale = None
a1_scale = None a1_scale = None
a2_scale = None a2_scale = None
if use_int8_w8a16: if use_int4_w4a16:
if block_quant_shape is None:
raise ValueError("block_quant_shape is required for int4_w4a16")
group_size = block_quant_shape[1]
# Scales shape: (E, N, K // group_size) in fp16
w1_scale = torch.rand(
(num_experts, shard_intermediate_size, hidden_size // group_size),
dtype=dtype,
)
w2_scale = torch.rand(
(num_experts, hidden_size, intermediate_size // group_size),
dtype=dtype,
)
elif use_int8_w8a16:
w1_scale = torch.randn( w1_scale = torch.randn(
(num_experts, 2 * shard_intermediate_size), dtype=torch.float32 (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
) )
@@ -198,6 +237,7 @@ def benchmark_config(
a1_scale=a1_scale, a1_scale=a1_scale,
a2_scale=a2_scale, a2_scale=a2_scale,
block_shape=block_quant_shape, block_shape=block_quant_shape,
weight_dtype="int4" if use_int4_w4a16 else None,
) )
deep_gemm_experts = None deep_gemm_experts = None
@@ -211,7 +251,8 @@ def benchmark_config(
hidden_dim=hidden_size, hidden_dim=hidden_size,
intermediate_size_per_partition=shard_intermediate_size, intermediate_size_per_partition=shard_intermediate_size,
num_local_experts=num_experts, num_local_experts=num_experts,
activation="silu", num_logical_experts=num_experts,
activation=MoEActivation.SILU,
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
in_dtype=init_dtype, in_dtype=init_dtype,
routing_method=RoutingMethodType.TopK, routing_method=RoutingMethodType.TopK,
@@ -226,9 +267,10 @@ def benchmark_config(
x, input_gating, topk, renormalize=not use_deep_gemm x, input_gating, topk, renormalize=not use_deep_gemm
) )
inplace = not disable_inplace()
if use_deep_gemm: if use_deep_gemm:
return deep_gemm_experts( return deep_gemm_experts(
x, w1, w2, topk_weights, topk_ids, inplace=True x, w1, w2, topk_weights, topk_ids, inplace=inplace
) )
return fused_experts( return fused_experts(
x, x,
@@ -236,7 +278,7 @@ def benchmark_config(
w2, w2,
topk_weights, topk_weights,
topk_ids, topk_ids,
inplace=True, inplace=inplace,
quant_config=quant_config, quant_config=quant_config,
) )
@@ -478,6 +520,7 @@ class BenchmarkWorker:
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
use_int4_w4a16: bool = False,
block_quant_shape: list[int] = None, block_quant_shape: list[int] = None,
use_deep_gemm: bool = False, use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]: ) -> tuple[dict[str, int], float]:
@@ -485,7 +528,10 @@ class BenchmarkWorker:
set_random_seed(self.seed) set_random_seed(self.seed)
dtype_str = _get_config_dtype_str( dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8,
use_int4_w4a16=use_int4_w4a16,
) )
# NOTE(woosuk): The current naming convention uses w2.shape[2], which # NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul. # is the intermediate size after silu_and_mul.
@@ -516,6 +562,7 @@ class BenchmarkWorker:
dtype, dtype,
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16,
num_iters=100, num_iters=100,
block_quant_shape=block_quant_shape, block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm, use_deep_gemm=use_deep_gemm,
@@ -532,6 +579,7 @@ class BenchmarkWorker:
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
use_int4_w4a16: bool,
search_space: list[dict[str, int]], search_space: list[dict[str, int]],
block_quant_shape: list[int], block_quant_shape: list[int],
use_deep_gemm: bool, use_deep_gemm: bool,
@@ -542,7 +590,7 @@ class BenchmarkWorker:
best_config = None best_config = None
best_time = float("inf") best_time = float("inf")
if current_platform.is_rocm(): if current_platform.is_rocm():
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
search_space = prune_rocm_search_space( search_space = prune_rocm_search_space(
num_tokens, num_tokens,
shard_intermediate_size, shard_intermediate_size,
@@ -571,6 +619,7 @@ class BenchmarkWorker:
dtype, dtype,
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_int4_w4a16,
num_iters=20, num_iters=20,
block_quant_shape=block_quant_shape, block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm, use_deep_gemm=use_deep_gemm,
@@ -618,6 +667,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
else {} else {}
), ),
**({"kpack": config["kpack"]} if "kpack" in config else {}), **({"kpack": config["kpack"]} if "kpack" in config else {}),
**({"SPLIT_K": config["SPLIT_K"]} if "SPLIT_K" in config else {}),
} }
@@ -630,11 +680,15 @@ def save_configs(
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
use_int4_w4a16: bool,
block_quant_shape: list[int], block_quant_shape: list[int],
save_dir: str, save_dir: str,
) -> None: ) -> None:
dtype_str = _get_config_dtype_str( dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8,
use_int4_w4a16=use_int4_w4a16,
) )
# NOTE(woosuk): The current naming convention uses w2.shape[2], which # NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -686,6 +740,7 @@ def get_model_params(config):
"DeepseekV2ForCausalLM", "DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM", "DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM", "DeepseekV32ForCausalLM",
"GlmMoeDsaForCausalLM",
"Glm4MoeForCausalLM", "Glm4MoeForCausalLM",
"Glm4MoeLiteForCausalLM", "Glm4MoeLiteForCausalLM",
"NemotronHForCausalLM", "NemotronHForCausalLM",
@@ -735,6 +790,38 @@ def get_model_params(config):
return E, topk, intermediate_size, hidden_size return E, topk, intermediate_size, hidden_size
def get_quantization_group_size(config) -> int | None:
"""Extract the quantization group size from the HF model config.
This reads directly from the HuggingFace config object (as returned by
``get_config()``), not from vLLM's quantization config classes.
Supports AWQ/GPTQ-style configs (direct 'group_size' key) and
compressed-tensors configs (nested inside 'config_groups').
"""
quantization_config = getattr(config, "quantization_config", {})
if not isinstance(quantization_config, dict):
return None
# AWQ / GPTQ style: group_size is a top-level key
gs = quantization_config.get("group_size")
if gs is not None:
return gs
# compressed-tensors style: group_size is nested in config_groups
config_groups = quantization_config.get("config_groups", {})
if not isinstance(config_groups, dict):
return None
for group_cfg in config_groups.values():
if not isinstance(group_cfg, dict):
continue
weights = group_cfg.get("weights", {})
if not isinstance(weights, dict):
continue
gs = weights.get("group_size")
if gs is not None:
return gs
return None
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
@@ -753,7 +840,20 @@ def main(args: argparse.Namespace):
dtype = torch.float16 if current_platform.is_rocm() else config.dtype dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
use_int4_w4a16 = args.dtype == "int4_w4a16"
block_quant_shape = get_weight_block_size_safety(config) block_quant_shape = get_weight_block_size_safety(config)
if use_int4_w4a16:
group_size = get_quantization_group_size(config)
if group_size is None:
raise ValueError(
"Could not determine group_size from model config. "
"The model's quantization_config must contain a 'group_size' "
"field (AWQ/GPTQ) or 'config_groups.*.weights.group_size' "
"(compressed-tensors)."
)
# For int4_w4a16, block_shape = [0, group_size]
# block_shape[0]=0 means no block quantization on N dimension
block_quant_shape = [0, group_size]
if args.batch_size is None: if args.batch_size is None:
batch_sizes = [ batch_sizes = [
@@ -807,8 +907,20 @@ def main(args: argparse.Namespace):
return ray.get(outputs) return ray.get(outputs)
if args.tune: if args.tune:
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) # int4_w4a16 weights are uint8-packed, not fp16; treat like fp8 for
search_space = get_configs_compute_bound(is_fp16, block_quant_shape) # search space generation (no matrix_instr_nonkdim/kpack exploration).
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
# For int4_w4a16, the group_size constraint on BLOCK_SIZE_K does not
# apply: the gptq_awq kernel handles arbitrary BLOCK_SIZE_K regardless
# of group_size. Skip block_quant_shape filtering to keep the full
# search space (e.g. BLOCK_SIZE_K=64 with group_size=128).
tune_block_quant_shape = None if use_int4_w4a16 else block_quant_shape
search_space = get_configs_compute_bound(is_fp16, tune_block_quant_shape)
if use_int4_w4a16:
# SPLIT_K is a required kernel constexpr for gptq_awq kernel;
# only SPLIT_K=1 is used at runtime, so fix it during tuning.
for cfg in search_space:
cfg["SPLIT_K"] = 1
print(f"Start tuning over {len(search_space)} configurations...") print(f"Start tuning over {len(search_space)} configurations...")
if use_deep_gemm: if use_deep_gemm:
raise ValueError( raise ValueError(
@@ -828,6 +940,7 @@ def main(args: argparse.Namespace):
dtype, dtype,
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_int4_w4a16,
search_space, search_space,
block_quant_shape, block_quant_shape,
use_deep_gemm, use_deep_gemm,
@@ -847,6 +960,7 @@ def main(args: argparse.Namespace):
dtype, dtype,
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_int4_w4a16,
block_quant_shape, block_quant_shape,
args.save_dir, args.save_dir,
) )
@@ -865,6 +979,7 @@ def main(args: argparse.Namespace):
dtype, dtype,
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_int4_w4a16,
block_quant_shape, block_quant_shape,
use_deep_gemm, use_deep_gemm,
) )
@@ -887,7 +1002,10 @@ if __name__ == "__main__":
) )
parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true") parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
parser.add_argument( parser.add_argument(
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" "--dtype",
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
default="auto",
) )
parser.add_argument("--use-deep-gemm", action="store_true") parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument( parser.add_argument(

View File

@@ -0,0 +1,278 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark comparing old vs new default fused MoE configs.
Runs the triton fused_moe kernel with three configurations for each scenario:
1. Tuned config (from JSON file, if available) — the target to match
2. Old default (the hardcoded defaults before this change)
3. New default (the improved defaults)
Usage:
python benchmarks/kernels/benchmark_moe_defaults.py
Produces a table showing kernel time (us) and speedup of new vs old defaults.
"""
import torch
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
get_default_config,
get_moe_configs,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.torch_utils import set_random_seed
FP8_DTYPE = current_platform.fp8_dtype()
def old_default_config(M, E, N, K, topk, dtype=None, block_shape=None):
    """Rebuild the pre-change default fused-MoE kernel config for comparison.

    These are the original defaults before
    https://github.com/vllm-project/vllm/pull/34846.

    Only ``M``, ``E``, ``dtype`` and ``block_shape`` affect the result;
    ``N``, ``K`` and ``topk`` are accepted but not read here.

    Returns:
        A dict of kernel launch parameters. The block-quantized fp8 variant
        also carries ``num_warps`` and ``num_stages``.
    """
    # Block-quantized fp8: tile N/K follow the quantization block shape.
    if dtype == "fp8_w8a8" and block_shape is not None:
        stages = 2 if current_platform.is_rocm() else 3
        return {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": block_shape[0],
            "BLOCK_SIZE_K": block_shape[1],
            "GROUP_SIZE_M": 32,
            "SPLIT_K": 1,
            "num_warps": 4,
            "num_stages": stages,
        }

    # Otherwise choose tiles by comparing the batch size to the expert count.
    if M <= E:
        tile_m, tile_n, tile_k, group_m = 16, 32, 64, 1
    else:
        tile_m, tile_n, tile_k, group_m = 64, 64, 32, 8
    return {
        "BLOCK_SIZE_M": tile_m,
        "BLOCK_SIZE_N": tile_n,
        "BLOCK_SIZE_K": tile_k,
        "GROUP_SIZE_M": group_m,
        "SPLIT_K": 1,
    }
def benchmark_config(
config,
M,
E,
N,
K,
topk,
dtype,
use_fp8=False,
block_shape=None,
num_iters=100,
):
"""Time one kernel config and return the mean time per iteration in microseconds.

Args:
config: Kernel config handed to ``override_config`` for every call.
M: Number of rows in the random activation tensor.
E: Number of experts (leading dim of both weight tensors and the
width of the gating tensor).
N: w1 is created as (E, 2 * N, K) and w2 as (E, K, N).
K: Trailing dim of the activations and of w1.
topk: Forwarded to ``fused_topk``.
dtype: Tensor dtype for the non-fp8 path (ignored when ``use_fp8``).
use_fp8: Cast the weights to ``FP8_DTYPE`` and supply random scales.
block_shape: Optional two-element block shape, unpacked as
``(bsn, bsk)`` to size the per-tile weight scales.
num_iters: Number of timed iterations.

Returns:
Elapsed CUDA-event time divided by ``num_iters``, converted ms -> us.
"""
# fp8 runs build their tensors in fp16 first; `dtype` is only used when
# use_fp8 is False.
init_dtype = torch.float16 if use_fp8 else dtype
# Random inputs, scaled down by a factor of 10.
a = torch.randn(M, K, device="cuda", dtype=init_dtype) / 10
w1 = torch.randn(E, 2 * N, K, device="cuda", dtype=init_dtype) / 10
w2 = torch.randn(E, K, N, device="cuda", dtype=init_dtype) / 10
w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None
if use_fp8:
if block_shape is not None:
# One weight scale per (bsn x bsk) tile; tile counts come from
# triton.cdiv over the corresponding weight dimensions.
bsn, bsk = block_shape
n_tiles_w1 = triton.cdiv(2 * N, bsn)
k_tiles_w1 = triton.cdiv(K, bsk)
n_tiles_w2 = triton.cdiv(K, bsn)
k_tiles_w2 = triton.cdiv(N, bsk)
w1_scale = torch.rand(
E, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32
)
w2_scale = torch.rand(
E, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32
)
else:
# No block shape: one scale value per expert for each weight tensor.
w1_scale = torch.rand(E, device="cuda", dtype=torch.float32)
w2_scale = torch.rand(E, device="cuda", dtype=torch.float32)
# Single-element activation scales.
a1_scale = torch.rand(1, device="cuda", dtype=torch.float32)
a2_scale = torch.rand(1, device="cuda", dtype=torch.float32)
# Only weights are stored in fp8; activations stay in bf16/fp16
# and get dynamically quantized inside the kernel.
w1 = w1.to(FP8_DTYPE)
w2 = w2.to(FP8_DTYPE)
# NOTE(review): quant_dtype is hard-coded to torch.float8_e4m3fn while the
# weights above are cast to FP8_DTYPE (current_platform.fp8_dtype());
# confirm the two agree on every target platform.
quant_config = FusedMoEQuantConfig.make(
quant_dtype=torch.float8_e4m3fn if use_fp8 else None,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_shape,
)
# Random gating values, one row per activation row and one column per expert.
gating = torch.randn(M, E, device="cuda", dtype=torch.float32)
# Warmup
# 20 untimed iterations of the same call sequence that is timed below.
for _ in range(20):
with override_config(config):
topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
# Benchmark
# The timed region covers fused_topk as well as fused_experts on every
# iteration, so the reported time is for both calls together.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(num_iters):
with override_config(config):
topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
end.record()
# Synchronize so the end event has completed before elapsed_time is read.
torch.cuda.synchronize()
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
# Model configurations: (name, E, N, K, topk, dtype_str, use_fp8, block_shape)
# N = moe_intermediate_size // tp_size (the value used in config file lookup)
#
# Every row is benchmarked once per entry in BATCH_SIZES, so each scenario
# must appear exactly once; a repeated row only re-runs and re-prints the
# same table under the same name.
MODELS = [
    # --- Few experts ---
    ("Mixtral bf16", 8, 7168, 4096, 2, None, False, None),
    ("Mixtral fp8", 8, 7168, 4096, 2, "fp8_w8a8", True, None),
    # --- Many experts: real model shapes at tp=1 ---
    # Qwen2-MoE-57B: E=60, topk=4, N=1408, K=2048
    ("Qwen2-MoE bf16", 60, 1408, 2048, 4, None, False, None),
    # DeepSeek-V2: E=64, topk=6, N=1407, K=4096
    # (use 1408 to avoid odd alignment; real model is 1407)
    ("DeepSeek-V2 bf16", 64, 1408, 4096, 6, None, False, None),
    # OLMoE-7B: E=64, topk=8, N=2048, K=2048
    ("OLMoE bf16", 64, 2048, 2048, 8, None, False, None),
    # GLM-4-100B-A10B: E=128, topk=8, N=1408, K=4096
    ("GLM-4-MoE bf16", 128, 1408, 4096, 8, None, False, None),
    # Qwen3-30B-A3B: E=128, topk=8, N=768, K=2048
    ("Qwen3-MoE bf16", 128, 768, 2048, 8, None, False, None),
    # DeepSeek-V3 / MiMo-V2-Flash: E=256, topk=8, N=2048, K=7168
    ("DeepSeek-V3 bf16", 256, 2048, 7168, 8, None, False, None),
    # Qwen3.5-70B-A22B (Qwen3-Next) at tp=1: E=512, topk=10, N=512, K=2048
    ("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
    # E=128 N=1856 bf16
    ("E128 N1856 bf16", 128, 1856, 4096, 8, None, False, None),
    # E=256 N=512 bf16 (DS-V3 tp=4)
    ("DS-V3 tp4 bf16", 256, 512, 7168, 8, None, False, None),
    # E=512 N=256 bf16 (Qwen3-Next tp=2)
    ("Qwen3-Next tp2", 512, 256, 2048, 10, None, False, None),
    # --- FP8 block quant (many experts) ---
    # DS-V3 tp=4: E=256, N=512, fp8 block
    ("DS-V3 tp4 fp8blk", 256, 512, 7168, 8, "fp8_w8a8", True, [128, 128]),
    # DS-V3 tp=8: E=256, N=256, fp8 block
    ("DS-V3 tp8 fp8blk", 256, 256, 7168, 8, "fp8_w8a8", True, [128, 128]),
    # Qwen3-Next tp=2 fp8 block
    ("Qwen3-Next tp2 fp8blk", 512, 256, 2048, 10, "fp8_w8a8", True, [128, 128]),
]
# Batch sizes (M) swept for every model row above.
BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
def main():
    """Benchmark tuned, old-default and new-default configs for every model.

    For each row of ``MODELS`` a table is printed with one line per entry of
    ``BATCH_SIZES``: the time for the tuned config (when ``get_moe_configs``
    returns one), the old default, the new default, and the new/old and
    new/tuned ratios. Lines where the new default is more than 5% slower
    than the old one are tagged with `` <--``.
    """
    set_random_seed(0)
    torch.set_default_device("cuda")
    dtype = torch.bfloat16

    for name, E, N, K, topk, dtype_str, use_fp8, block_shape in MODELS:
        print(f"\n{'=' * 90}")
        print(f" {name} (E={E}, N={N}, K={K}, topk={topk})")
        print(f"{'=' * 90}")

        # Try to load tuned config
        if block_shape:
            block_n, block_k = block_shape[0], block_shape[1]
        else:
            block_n, block_k = None, None
        tuned = get_moe_configs(E, N, dtype_str, block_n, block_k)
        has_tuned = tuned is not None
        print(f" Tuned config available: {has_tuned}")

        hdr = (
            f"{'Batch':>6} | {'Tuned (us)':>11} | {'Old (us)':>11} | "
            f"{'New (us)':>11} | {'New/Old':>8} | {'New/Tuned':>10}"
        )
        print(f" {hdr}")
        print(f" {'-' * len(hdr)}")

        # Keyword arguments shared by every timing call for this model.
        shared_kwargs = {"use_fp8": use_fp8, "block_shape": block_shape}

        for M in BATCH_SIZES:
            old_cfg = old_default_config(M, E, N, K, topk, dtype_str, block_shape)
            new_cfg = get_default_config(M, E, N, K, topk, dtype_str, block_shape)

            # Timing order is tuned -> old -> new.
            t_tuned = None
            if has_tuned:
                # Use the tuned entry whose batch-size key is closest to M.
                nearest = min(tuned.keys(), key=lambda x: abs(x - M))
                tuned_cfg = tuned[nearest]
                t_tuned = benchmark_config(
                    tuned_cfg, M, E, N, K, topk, dtype, **shared_kwargs
                )
            t_old = benchmark_config(old_cfg, M, E, N, K, topk, dtype, **shared_kwargs)
            t_new = benchmark_config(new_cfg, M, E, N, K, topk, dtype, **shared_kwargs)

            ratio_new_old = t_new / t_old
            if t_tuned:
                tuned_str = f"{t_tuned:11.2f}"
                ratio_tuned = f"{t_new / t_tuned:10.2f}x"
            else:
                tuned_str = f"{'N/A':>11}"
                ratio_tuned = f"{'N/A':>10}"

            # flag regressions where new default is >5% slower than old
            marker = ""
            if ratio_new_old > 1.05:
                marker = " <--"
            print(
                f" {M:>6} | {tuned_str} | {t_old:11.2f} | {t_new:11.2f} "
                f"| {ratio_new_old:7.2f}x | {ratio_tuned}{marker}"
            )


if __name__ == "__main__":
    main()

View File

@@ -44,10 +44,8 @@ def benchmark_permute(
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
# output_hidden_states = torch.empty_like(hidden_states) # output_hidden_states = torch.empty_like(hidden_states)
if use_fp8_w8a8: if use_fp8_w8a8:
align_block_size = 128 # deepgemm needs 128 m aligned block
qhidden_states, scale = _fp8_quantize(hidden_states, None, None) qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
else: else:
align_block_size = None
qhidden_states = hidden_states qhidden_states = hidden_states
gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
@@ -67,7 +65,6 @@ def benchmark_permute(
topk_ids=topk_ids, topk_ids=topk_ids,
n_expert=num_experts, n_expert=num_experts,
expert_map=None, expert_map=None,
align_block_size=align_block_size,
) )
# JIT compilation & warmup # JIT compilation & warmup
@@ -117,10 +114,8 @@ def benchmark_unpermute(
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_fp8_w8a8: if use_fp8_w8a8:
align_block_size = 128 # deepgemm needs 128 m aligned block
qhidden_states, scale = _fp8_quantize(hidden_states, None, None) qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
else: else:
align_block_size = None
qhidden_states = hidden_states qhidden_states = hidden_states
input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
@@ -142,7 +137,6 @@ def benchmark_unpermute(
topk_ids=topk_ids, topk_ids=topk_ids,
n_expert=num_experts, n_expert=num_experts,
expert_map=None, expert_map=None,
align_block_size=align_block_size,
) )
# convert to fp16/bf16 as gemm output # convert to fp16/bf16 as gemm output
return ( return (

View File

@@ -36,6 +36,7 @@ from typing import Any
import numpy as np import numpy as np
import torch import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.transformers_utils.config import get_config from vllm.transformers_utils.config import get_config
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -78,6 +79,7 @@ def calculate_stats(times: list[float]) -> dict[str, float]:
} }
@default_vllm_config()
def benchmark_mrope( def benchmark_mrope(
model_name: str, model_name: str,
num_tokens: int, num_tokens: int,

View File

@@ -7,6 +7,7 @@ from unittest.mock import patch
import pandas as pd import pandas as pd
import torch import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton from vllm.triton_utils import triton
@@ -84,6 +85,7 @@ def calculate_diff(
configs = [] configs = []
@default_vllm_config()
def benchmark_quantization( def benchmark_quantization(
batch_size, batch_size,
hidden_size, hidden_size,

View File

@@ -5,6 +5,7 @@ import itertools
import torch import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.triton_utils import triton from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -29,6 +30,7 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
args={}, args={},
) )
) )
@default_vllm_config()
def benchmark(batch_size, seq_len, num_heads, provider): def benchmark(batch_size, seq_len, num_heads, provider):
dtype = torch.bfloat16 dtype = torch.bfloat16
max_position = 8192 max_position = 8192

View File

@@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do
usage usage
;; ;;
*) *)
echo "Unknown argument: $1\n" printf "Unknown argument: %s\n" "$1"
usage usage
;; ;;
esac esac
@@ -84,15 +84,17 @@ mkdir -p "$OUTPUT_DIR"
QPS_VALUES=(25 20 15 10 5 1) QPS_VALUES=(25 20 15 10 5 1)
# Common parameters # Common parameters
COMMON_PARAMS="--backend $BACKEND \ COMMON_PARAMS=(
--model $MODEL \ --backend "$BACKEND"
--dataset $DATASET \ --model "$MODEL"
--structured-output-ratio $STRUCTURED_OUTPUT_RATIO \ --dataset "$DATASET"
--save-results \ --structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
--result-dir $OUTPUT_DIR \ --save-results
--output-len $MAX_NEW_TOKENS \ --result-dir "$OUTPUT_DIR"
--port $PORT \ --output-len "$MAX_NEW_TOKENS"
--tokenizer-mode $TOKENIZER_MODE" --port "$PORT"
--tokenizer-mode "$TOKENIZER_MODE"
)
echo "Starting structured output benchmark with model: $MODEL" echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND" echo "Backend: $BACKEND"
@@ -109,17 +111,17 @@ for qps in "${QPS_VALUES[@]}"; do
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
# Construct filename for this run # Construct filename for this run
FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json"
NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc) NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part
echo "Running benchmark with $NUM_PROMPTS prompts" echo "Running benchmark with $NUM_PROMPTS prompts"
# Run the benchmark # Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
--request-rate $qps \ --request-rate "$qps" \
--result-filename "$FILENAME" \ --result-filename "$FILENAME" \
--num-prompts $NUM_PROMPTS --num-prompts "$NUM_PROMPTS"
echo "Completed benchmark with QPS: $qps" echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------" echo "----------------------------------------"

Some files were not shown because too many files have changed in this diff Show More