Compare commits

...

4013 Commits

Author SHA1 Message Date
Tao He
880c741bb6 [Bugfix] fixes the causal_conv1d_update kernel update non-speculative decoding cases (#24680)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-11 18:16:43 -07:00
RichardoMu
40b6c9122b [V1] feat:add engine v1 tracing (#20372)
Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
Signed-off-by: Ye Zhang <zhysishu@gmail.com>
Signed-off-by: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Mu Huai <tianbowen.tbw@antgroup.com>
Co-authored-by: Ye Zhang <zhysishu@gmail.com>
Co-authored-by: Benjamin Bartels <benjamin@bartels.dev>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: 瑜琮 <ly186375@antfin.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-09-11 17:10:39 -07:00
Lucas Wilkinson
2e6bc46821 [Startup] Make DeepGEMM warmup scale with max-num-batched-tokens (#24693)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-11 20:10:19 -04:00
Wentao Ye
fcba05c435 [Bug] Fix Layer weight_block_size Assertion Issue (#24674)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-11 19:47:59 -04:00
Zazzle516
7a30fa8708 [Doc] Clarify cudagraph capture size logic and default behavior in scheduler (#18698)
Signed-off-by: Zazzle516 <2405677060@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-11 23:18:09 +00:00
Chen Zhang
f82f7a8990 [Qwen3-Next] MOE configs for H100 TP4 (#24699)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-11 15:45:52 -07:00
Michael Goin
c3aea10dc8 [Perf] Use upstream CUTLASS for SM90 Block FP8 kernel (#23280)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-11 15:43:14 -07:00
Matthew Bonanni
d4fd2768ef [Bugfix][Attention] Fix FlashInfer MLA block size logic (#24692)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
2025-09-11 22:39:42 +00:00
Vadim Gimpelson
7a70a71892 [Qwen3-Next] Add B200 MoE configs for Qwen3-next (#24698)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-09-11 15:34:58 -07:00
Zhewen Li
7d4651997a [CI/Build] Add bc-linter to vLLM CI (#21234)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-09-11 15:34:36 -07:00
Woosuk Kwon
569bf1c9c0 [Qwen3-Next] MoE configs for H200 TP=1,2,4 (#24695)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-11 14:38:16 -07:00
Wentao Ye
1ec20355f5 [Bugfix] Set VLLM_ALLREDUCE_USE_SYMM_MEM default to False (#24696)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-11 14:32:27 -07:00
Xiaozhu Meng
e42af78b18 [flashinfer] [kernel] support for fp8 kv cache for trtllm prefill attention (#24197)
Signed-off-by: Xiaozhu <mxz297@gmail.com>
2025-09-11 14:20:09 -07:00
Duncan Moss
074854b24f [Kernel][B200] mxfp4 fused cutlass moe (#23696)
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-11 17:04:56 -04:00
Andrew Xia
79ac59f32e Update Spec Decode metrics to include drafted and accepted token throughput (#24127)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-11 19:58:43 +00:00
Nick Hill
b971f91504 [BugFix] Fix tokenize asyncio task leak (#24677)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-11 19:44:04 +00:00
Woosuk Kwon
c733bd5e87 [Qwen3-Next] Add MoE Config for H200 (#24688)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-11 12:40:15 -07:00
Wentao Ye
a892b259b4 [Doc] Remove Useless Comments (#24687)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-11 12:25:47 -07:00
Peter Salas
127ded0a9e [Ultravox] Use wrapped_model_config to instantiate inner model (#24679)
Signed-off-by: Peter Salas <peter@fixie.ai>
2025-09-11 18:52:24 +00:00
Isotr0py
bb2b5126da [VLM] Migrate remain DP-supported ViT models to use disable_tp (#24363)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-11 18:30:41 +00:00
Harry Mellor
361ae27f8a [Docs] Fix formatting of transcription doc (#24676)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-11 11:18:06 -07:00
co63oc
e26fef8397 fix some typos (#24616)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
2025-09-11 10:48:46 -07:00
Harry Mellor
c1eda615ba Fix model name included in responses (#24663)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-11 10:47:51 -07:00
Konrad Zawora
4aa23892d6 [Bugfix] Fix platform-specific routing in CustomOp implementations (#24444)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
2025-09-11 17:15:01 +00:00
Ilya Markov
1fdd5c42d7 [Kernels] Enable Torch Symmetric Memory All-Reduce By Default (#24111)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-11 09:45:31 -07:00
Isotr0py
bcbe2a4d9e [VLM] Optimize GLM4.5-V-style video processing to only decode necessary frames (#24161)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-11 09:44:34 -07:00
Harry Mellor
51d41265ad [Docs] Fix typos in EP deployment doc (#24669)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-11 09:07:23 -07:00
Wentao Ye
4984a291d5 [Doc] Fix Markdown Pre-commit Error (#24670)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-11 09:05:59 -07:00
Nicolò Lucchesi
404c85ca72 [Docs] Add transcription support to model (#24664)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-11 07:39:01 -07:00
Jee Jee Li
817beef7f3 [Bugifx] Fix qwen-next packed_modules_mapping (#24656)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-11 22:26:17 +08:00
Mengqing Cao
4f6593b058 [HybridKVCache][Platform] Add support_hybrid_kv_cache for platform (#24646)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-11 21:47:58 +08:00
Boyuan Feng
94e6b2d55f Allow users to specify kv cache memory size (#21489)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-11 13:41:07 +00:00
wang.yuqi
fd1ce98cdd [CI] Split mteb test from Language Models Test (#24634)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-11 06:37:51 -07:00
Jee Jee Li
d11ec124a0 [Bench] Add qwen-next in benchmark_moe.py (#24661)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-11 21:29:43 +08:00
youkaichao
f510715882 [build] add torch to tool.uv no-build-isolation-package (#24303)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-11 13:19:44 +00:00
Tao He
f946197473 [Docs] Fixes a typo in the qwen3next model name. (#24654)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-09-11 19:35:14 +08:00
Fanli Lin
0cd72a7b72 [XPU] add missing dependency tblib for XPU CI (#24639)
Signed-off-by: Fanli Lin <fanli.lin@intel.com>
2025-09-11 11:22:33 +00:00
Harry Mellor
5f5271f1ee Move LoRAConfig from config/__init__.py to config/lora.py (#24644)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-11 11:01:38 +00:00
Harry Mellor
d6249d0699 Fix typing for safetensors_load_strategy (#24641)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-11 10:41:39 +00:00
wang.yuqi
25bb9e8c65 [CI Failure] fix models/language/pooling/test_auto_prefix_cache_support.py (#24636)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-11 03:31:23 -07:00
Nicolò Lucchesi
a1213fae5f [Misc] Add @NickLucche to codeowners (#24647)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-11 17:18:09 +08:00
wang.yuqi
a8b0361c92 [CI] Split pooling from entrypoints Test (#24632)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-11 01:53:09 -07:00
Kyuyeun Kim
ed5ae4aace [Bugfix] Fix _synced_weight_loader (#24565)
Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com>
2025-09-11 16:52:33 +08:00
Xingyu Liu
0fc36463e0 [CI]Add transformers_utils to Async Engine, Inputs, Utils, Worker Test (#24615)
Signed-off-by: Xingyu Liu <charlotteliu12x@gmail.com>
2025-09-11 01:52:10 -07:00
Michael Yao
d14c4ebf08 [Docs] Use 1-2-3 list for deploy steps in deployment/frameworks/ (#24633)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-11 01:50:12 -07:00
Russell Bryant
ba6011027d [Docs] Update V1 doc to reflect whisper support (#24606)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-11 01:50:08 -07:00
Michael Yao
85df8afdae [Docs] Revise frameworks/anything-llm.md (#24489)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-11 01:50:05 -07:00
Cyrus Leung
6aeb1dab4a [Bugfix] Fix incorrect import of CacheConfig (#24631)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-11 01:48:25 -07:00
Tao He
e93f4cc9e3 Add the support for the qwen3 next model (a hybrid attention model). (#24526)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-11 15:32:09 +08:00
Jerry Zhang
2048c4e379 [torchao] Support quantization configs using module swap (#21982)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-09-10 23:53:24 -07:00
Chenxi Yang
d13360183a Remove redundant all gather + split (#23441)
Co-authored-by: Chenxi Yang <cxyang@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2025-09-10 23:45:07 -07:00
TaehyunKim
9bd831f501 [Model] New model support for Motif-1-Tiny (#23414)
Signed-off-by: ca1207 <ca1207zzz@gmail.com>
Signed-off-by: TaehyunKim <73943231+ca1207@users.noreply.github.com>
Co-authored-by: WyldeCat <skan1543@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-10 23:29:40 -07:00
Didier Durand
e2b1f863aa [Doc]: fixing doc typos (#24635)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-10 23:19:28 -07:00
shengshiqi-google
41329a0ff9 [Core] feat: Add --safetensors-load-strategy flag for faster safetensors loading from Lustre (#24469)
Signed-off-by: Shiqi Sheng <shengshiqi@google.com>
Signed-off-by: shengshiqi-google <160179165+shengshiqi-google@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-10 23:10:01 -07:00
Tomas Ruiz
ee0bc5e1b4 Enable --profile in 'vllm bench throughput' (#24575)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
2025-09-10 23:06:19 -07:00
Saman A. Pour
3d1393f6fc Kimi K2 Fused MoE kernels Optimization configs (#24597)
Signed-off-by: Saman Keon <samanamp@outlook.com>
2025-09-10 23:06:16 -07:00
Guy Stone
8a894084d2 [Engine][Chore] use local variable and remove output var assignment (#24554)
Signed-off-by: Guy Stone <guys@spotify.com>
2025-09-10 23:05:42 -07:00
Nick Hill
e2d8c27f68 [BugFix] Fix pipeline parallel (#24621)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-10 23:05:30 -07:00
Li, Jiang
29799ddacc [Bugfix] Add missing VIT backend dispatch on CPU (#24623)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-10 22:28:41 -07:00
Peter Salas
f17a6aa4ec [Ultravox] Fix Gemma instantiation, support quantization via --hf-overrides (#24131)
Signed-off-by: Peter Salas <peter@fixie.ai>
2025-09-10 22:25:34 -07:00
Wenlong Wang
6c8deacd72 [Bug] [Spec Decode] Fix model_initialization test and mismatch in aux_hidden_layers (#24613)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-09-10 21:23:18 -07:00
Chauncey
55b823ba0f Add @chaunceyjiang to codeowner for reasoning Reasoning and Tool parser (#24406)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-11 04:23:04 +00:00
youkaichao
8c5a747246 [distributed] update known issues (#24624)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-09-11 11:09:38 +08:00
Alexandre Marques
5931b7e5d9 [Models][Quantization] Add quantization configuration update in Voxtral model (#24122)
Signed-off-by: Alexandre Marques <almarque@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-10 19:13:56 -07:00
Jonathan Berkhahn
cc99baf14d [Misc] Make timeout passable in init_distributed_environment (#24522)
Signed-off-by: jberkhahn <jaberkha@us.ibm.com>
2025-09-10 15:41:12 -07:00
Hanjie Qiu
dcb28a332b [Kernel] Flashinfer MLA (trtllm-gen) decode kernel integration (#21078)
Signed-off-by: hjjq <hanjieq@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-10 15:31:10 -07:00
Michael Goin
fba7856581 [Perf] Warmup FlashInfer attention during startup (#23439)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Matthew Bonanni <mbonanni001@gmail.com>
2025-09-10 15:03:17 -07:00
Chen Zhang
b5e383cd8b [gpt-oss] raise error for flashinfer backend without trtllm (#24482)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-10 14:33:13 -07:00
Gregory Shtrasberg
9a161307f5 [torch.compile][ROCm][V1] Enable attention output FP8 fusion for V1 attention backends (#19767)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-10 13:59:55 -07:00
Russell Bryant
37e8182bfe [v1] Add Whisper model support (encoder-decoder) (#21088)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: NickLucche <nlucches@redhat.com>
2025-09-10 13:53:35 -07:00
Nick Hill
4db4426404 [CI] Fail subprocess tests with root-cause error (#23795)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-10 13:53:21 -07:00
Thien Tran
a0933c3bd6 [Bugfix] Enable FP8 KV cache for FlashInfer and Triton backend on non-sm100 GPUs (#24577)
Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
2025-09-10 12:33:41 -07:00
rongfu.leng
09e68bce34 [Misc] update log level debug to warning when process port is used by (#24226)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-10 11:32:57 -07:00
Xingyu Liu
9fb74c27a7 [Core] Support configuration parsing plugin (#24277)
Signed-off-by: Xingyu Liu <charlotteliu12x@gmail.com>
Signed-off-by: Xingyu Liu <38244988+charlotte12l@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-10 11:32:43 -07:00
Ming Yang
4032949630 [Bugfix] Fix DeepEP config for DP4TP4 (#23619)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-09-10 10:37:56 -07:00
tomeras91
08abfa78ec [Bugfix] fix modelopt exclude_modules name mapping (#24178)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-10 10:20:46 -07:00
Shiyan Deng
2bef2d1405 [Logging] allow config logging stream (#24336)
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
2025-09-10 15:02:01 +00:00
Robin
36cacd0958 [Doc] Add documentation for GLM-4.5 series models: tool-calling and reasoning parser (#24589)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-09-10 07:50:55 -07:00
Jee Jee Li
bb3eb80d92 [Core] Split LoRA layers (#24574)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-10 07:47:51 -07:00
pwschuurman
fcc0a3130a [CI] Fix tensorizer test assertion (#24545)
Signed-off-by: Peter Schuurman <psch@google.com>
2025-09-10 06:57:36 -07:00
zzhxxx
736569da8d [Platform] Custom ops support for LMhead and LogitsProcessor (#23564)
Signed-off-by: zzhx1 <zzh_201018@outlook.com>
2025-09-10 06:26:31 -07:00
Kay Yan
2eb9986a2d [BugFix] python collect_env.py and vllm collect-env compatibility with uv venv (#24066)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-09-10 21:25:33 +08:00
Hyogeun Oh (오효근)
ccee371e86 [Docs] Fix warnings in mkdocs build (continued) (#24092)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-10 06:23:28 -07:00
RoadToNowhereX
c0bd6a684a Fix Auto_Round Quatization Loading on SM75 and Lower GPUs (#24217)
Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-10 06:22:31 -07:00
co63oc
3144d90217 fix some typos (#24167)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-09-10 06:21:23 -07:00
Daniele
2f5e5c18de [CI/Build] bump timm dependency (#24189)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-09-10 06:20:59 -07:00
wang.yuqi
bd98842c8a [CI] Add PPL test for generation models (#24485)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-10 06:16:39 -07:00
Lifans
d6069887c6 [rocm] enable torchao quantization for rocm (#24400)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-09-10 06:16:21 -07:00
Ye (Charlotte) Qi
492196ed0e [CI/Build] split true unit tests to Entrypoints Unit Tests (#24418)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-10 06:16:07 -07:00
Nick Hill
f4f1a8df22 [BugFix] Ensure integrity of reused CPU tensors during async scheduling (#24527)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: guoze.lin <guozelin@tencent.com>
2025-09-10 21:15:14 +08:00
lacora
0b9a612fa3 [BugFix][easy] Fix flaky test test_gpt_oss_multi_turn_chat (#24549)
Signed-off-by: lacora2017 <yehu@meta.com>
Co-authored-by: lacora2017 <yehu@meta.com>
2025-09-10 21:14:55 +08:00
Wenlong Wang
4c04eef706 [BugFix][Multi Modal] Fix TensorSchema shape mismatch in Molmo (#24559)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-09-10 06:14:27 -07:00
Harry Mellor
f36355abfd Move LoadConfig from config/__init__.py to config/load.py (#24566)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-10 06:14:18 -07:00
Yash Pratap Singh
9e3c3a7df2 [LoRA]: Add LoRA support to Mistral's Voxtral models (#24517)
Signed-off-by: Yash Pratap Singh <yashsingh20001@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-10 06:12:03 -07:00
baonudesifeizhai
6cbd41909e Feature/vit attention unification# 23880 (#23978)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-10 06:10:14 -07:00
danielafrimi
72d30108a0 Support for NemotronH Nano VLM (#23644)
Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com>
2025-09-10 06:10:06 -07:00
Tyler Michael Smith
8b83b93739 [Docs] Document the extra memory footprint overhead when using EPLB (#24537)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-10 06:09:49 -07:00
Harry Mellor
9dbefd88e9 [Docs] Improve organisation of API Reference nav (#24569)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-10 06:08:21 -07:00
vllmellm
7c195d43da [ROCm][Bugfix] Fix Aiter RMSNorm (#23412)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-09-10 21:08:03 +08:00
Lucas Wilkinson
0ae43dbf8c [Attention] add DCP support for FLASH_ATTN_MLA backend (#24453)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
2025-09-10 17:19:26 +08:00
li-jinpeng
267c80d31f [Model] Limit CPU threads for image transformations in InternVL to reduce cpu contention. (#24519)
Signed-off-by: li-jinpeng <3332126450@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-10 16:45:44 +08:00
Flora Feng
77f62613f9 Consolidate rendering parameters into RenderConfig dataclass (#24543)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-09-10 08:44:47 +00:00
Remy
feaf202e93 [Bugfix] Guard _may_reorder_batch for encoder-only models on CPU (#24319) (#24348)
Signed-off-by: Remy <eunhwan.shin@dtonic.io>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-09-10 14:24:42 +08:00
Simon Mo
91130ae376 [docs] promo pytorch conf and ray summit (#24562)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-09-09 23:24:20 -07:00
Harry Mellor
e40827280b [Docs] Enable relative links in examples to function when rendered in the docs (#24041)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-09 21:40:45 -07:00
pwschuurman
4377b1ae3b [Bugfix] Update Run:AI Model Streamer Loading Integration (#23845)
Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai>
Signed-off-by: Peter Schuurman <psch@google.com>
Co-authored-by: Omer Dayan (SW-GPU) <omer@run.ai>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-09 21:37:17 -07:00
Chenheli Hua
009d689b0c [Core] Simplify and unify mm uuid handling & auto-generated mm hash overrides processing. (#24271)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-09-09 21:36:09 -07:00
Wei
0efdb5c3ba [gpt-oss] Cache permute indices for faster MXFP4 MoE layer loading (#24154)
Signed-off-by: Wei Wei <wwei6@meta.com>
2025-09-10 04:27:53 +00:00
Wenlong Wang
53b42f4102 [BugFix][Spec Decode] Fix out-of-range index triggered by eagle3; re-enable test for LlamaForCausalLMEagle3 (#24392)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-09-09 21:24:23 -07:00
Chauncey
309d7aa401 [P/D] MultiConnector supports shutdown (#24425)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-09 21:24:11 -07:00
Yihua Cheng
b4a01aaf95 [KV Connector] More async support for get_num_new_matched_tokens (#23620)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
2025-09-09 21:23:37 -07:00
Nick Hill
83dd28aae4 [CI] Adjust threshold for flaky ngram spec decoding test (#24528)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-09 21:07:33 -07:00
Nick Hill
f88e84016f [BugFix] Fix async core engine client finalizer (#24540)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-09 21:07:13 -07:00
Ignacio Sica
3c2156b3af [Hardware][Apple-CPU] Enable native bfloat16 on Apple Silicon (M2 and later) (#24129)
Signed-off-by: ignaciosica <mignacio.sica@gmail.com>
2025-09-10 03:50:21 +00:00
Nick Hill
7e7db04310 [CI] Retry flaky fp8 cutlass mla tests (#24536)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-09 20:33:10 -07:00
Chen Zhang
41f160b974 Add @heheda12345 to CODEOWNERS of KVCacheManager related code (#24546)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-10 03:30:32 +00:00
Yong Hoon Shin
dc625ea6b8 [Perf] Convert np array to torch tensor to index into block table for attn chunking (#24474)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-09-09 20:01:06 -07:00
bnellnm
b23fb78623 [Bugfix] Fix for 24530. Fix naive all2all shared expert overlap. (#24538) 2025-09-09 17:53:53 -07:00
Tyler Michael Smith
561f38dc3c [Bugfix] Improve EPLB config validation error message (#24524)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-10 00:32:36 +00:00
Charlie Fu
73e688cb79 [ROCm][Feature] Enable Pipeline Parallelism with Ray Compiled Graph on ROCm (#24275)
Signed-off-by: charlifu <charlifu@amd.com>
2025-09-09 23:27:35 +00:00
Ekagra Ranjan
fb1a8f932a [Benchmark] Add option to skip oversampling in benchmark (#24457)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
2025-09-09 22:00:17 +00:00
Ekagra Ranjan
0dc9cbb527 [Benchmark] Update bench doc with mtbench, blazedit, spec bench (#24450)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
2025-09-09 21:15:41 +00:00
Jiangyun Zhu
b5fb3005a8 [Log] Use a relative path in debug-level logs to distinguish files with identical names (#23846)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-09-09 16:46:35 -04:00
Wentao Ye
15de5ff9ea [Feature] Disallow FlashMLA on Blackwell (#24521)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-09 14:59:34 -04:00
Jiangyun Zhu
b8a93076d3 [CI] execute all piecewise compilation tests together (#24502)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-09-09 11:05:25 -07:00
Chenyaaang
c3f9773b2c [TPU] Fix tpu structured decoding in mixed batches (#24458)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-09-09 11:04:25 -07:00
Nicolò Lucchesi
3707cb2505 [Docs] Gemma3n transcriptions endpoint support (#24512)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-09 11:03:32 -07:00
Kazuhiro Serizawa
920ed46b09 [Misc] bump outlines_core to fix the version conflicts with outlines >= 1.2.0 (#24368)
Signed-off-by: Kazuhiro Serizawa <nserihiro@gmail.com>
Signed-off-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-09-09 10:59:46 -07:00
Flora Feng
15cb047e25 Extend renderer with embedding support and integrate completion endpoint (#24405)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-09-10 01:46:46 +08:00
Jee Jee Li
9ad0688e43 [Bugfix] Fix hidden_size for multimodal classification model (#24501)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-09 10:37:25 -07:00
Gregory Shtrasberg
b9a1c4c8a2 [ROCm][CI/Build] Sync ROCm dockerfiles with the ROCm fork (#24279)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-09 12:21:56 -04:00
youkaichao
1aa427fdc1 [Kernels] Add Flash Linear Attention Kernels (#24518)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-09-10 00:04:41 +08:00
Micah Williamson
1c63a16b65 [Core] Run garbage collector after CUDA graph capture to fix throughput regression (#24128)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-09 10:38:10 -04:00
d.transposed
922d3b401b [Bugfix] Handle the edge case in detokenizer where processed tokens contain both stop str and eos token (#23938)
Signed-off-by: dtransposed <damian.bogunowicz@gmail.com>
2025-09-09 07:30:24 -07:00
wang.yuqi
19332c0479 [Model] Systematic support for fp32 head, pooling models part (#23810)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-09 07:29:50 -07:00
Wentao Ye
a55cf41a09 [Compilation][WideEP] Enable Piecewise CUDAGraph for DeepEPHT (#24123) 2025-09-09 10:21:10 -04:00
Ye (Charlotte) Qi
6fb2788163 [CI/Build][Doc] Fully deprecate old bench scripts for serving / throughput / latency (#24411)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-09 10:02:35 +00:00
Weixiao Huang
3d2a2de8f7 [RL] fast weight update with zmq + ipc handles (#24295)
Signed-off-by: huangweixiao <huangweixiao@msh.team>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-09-09 16:57:46 +08:00
Chen Zhang
1116590b16 [gpt-oss] Validate gpt-oss python tool during initialization (#23856)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-09 08:37:48 +00:00
Roger Wang
ccb97338af [Misc] Add Codex settings to gitignore (#24493)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-09-09 01:25:44 -07:00
Ye (Charlotte) Qi
45c9cb5835 [Misc] Add claude settings to gitignore (#24492)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-09 01:14:45 -07:00
WeiQing Chen
e283976f3a [Performance][MM] Building the inverse permutation in O(n) time in Qwen2_5_VisionTransformer (#24443)
Signed-off-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: Junhong <liujunhong11@huawei.com>
2025-09-09 00:24:11 -07:00
Didier Durand
46876dff32 [Doc]: fixing typos to improve docs (#24480)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-08 23:06:04 -07:00
Ming Yang
1823a00d67 [Misc] Support bench serve long context (#24373)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-09-08 22:53:10 -07:00
Mickaël Seznec
ed16d0f26f [Doc] mention fpdb for multiprocess breakpoints (#24452)
Signed-off-by: Mickael Seznec <mickael@mistral.ai>
2025-09-08 21:46:45 -07:00
22quinn
0cdd213641 [Misc] Improve Worker process title and logging prefix (#22205)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-09-08 21:43:48 -07:00
Cyrus Leung
948dd3443b [Bugfix] Fix Apertus HF repo name (#24447)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-08 21:40:29 -07:00
cong-meta
b2f7745774 Add data_parallel_size to VllmConfig string representation (#24298)
Co-authored-by: Cong Chen <congc@meta.com>
2025-09-08 21:35:18 -07:00
Zebing Lin
82dfb12e52 [Core] Use sha256 bytes instead of BlockHash to reduce GC overhead (#23673)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-09-08 21:34:37 -07:00
elvischenv
bba1042c6f [Flashinfer] Support Flashinfer TRTLLM FP8-qkv BF16/FP16-out Attention Kernel (#23647)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-09-08 20:53:07 -07:00
CSWYF3634076
b6fbc15634 [BugFix][Model] Fix Ernie4.5-VL hanging on long inputs (#24074)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-09-09 11:37:16 +08:00
Harry Mellor
3e0d4a3475 Move KVTransferConfig from config/__init__.py to config/kv_transfer.py (#24434)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-08 20:30:32 -07:00
dependabot[bot]
562663a044 Bump actions/github-script from 7.0.1 to 8.0.0 (#24413)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-09-09 03:12:44 +00:00
dependabot[bot]
ed1623a88a Bump actions/stale from 9.1.0 to 10.0.0 (#24412)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-09-09 03:11:20 +00:00
cjackal
13b89bd823 [doc] update vllm serve cli args documentation (#24329)
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
2025-09-09 03:07:58 +00:00
dependabot[bot]
22a0070530 Bump actions/setup-python from 5.4.0 to 6.0.0 (#24414)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-09-09 02:54:58 +00:00
zhiweiz
170129eb28 [gpt-oss] Harmony changes with container tool support (#23386)
Signed-off-by: zhiweiz <zhiweiz@fb.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
Co-authored-by: zhiweiz <zhiweiz@fb.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2025-09-08 19:03:50 -07:00
Tyler Michael Smith
955c624915 [Bugfix][Wide EP] Fix redundant work when using DeepEP, TP Attn, and EP MoE (#24134)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-09-08 19:01:51 -07:00
Zhiyu
4f87abdcc6 Update reviewers for modelopt related files (#24468) 2025-09-09 01:53:13 +00:00
Sahithi Chigurupati
6910b56da2 [CI] Add nightly multiarch manifests to dockerhub (#24102)
Signed-off-by: Sahithi Chigurupati <chigurupati.sahithi@gmail.com>
Signed-off-by: Simon Mo <simon.mo@hey.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-09-09 01:18:09 +00:00
R3hankhan
e10fef0883 [Hardware][IBM Z] Fix Outlines Core issue for s390x (#24034)
Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com>
2025-09-08 16:50:34 -07:00
Chauncey
e680723eba [Bugfix] Disable the statslogger if the api_server_count is greater than 1 (#22227)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-09-08 15:28:03 -07:00
Matthew Bonanni
620db1fc58 [Attention] FlashAttention MLA cudagraph support (#23958)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-08 22:05:26 +00:00
Ekagra Ranjan
41183c1fe0 [Spec Decode] Fix offline spec_decode.py (#24257)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-08 20:44:13 +00:00
Yang Kaiyong
43d9ad03ba [Model loader]: support multi-thread model weight loading (#23928)
Signed-off-by: Yang Kaiyong <yangkaiyong.yky@antgroup.com>
Signed-off-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-09-08 18:49:39 +00:00
Jiangyun Zhu
7be141b2c5 [CI] Enable encoder model compilation test (#24442)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-09-08 11:48:06 -07:00
Jee Jee Li
8d7f39b48c [Model] Remove quantized mixtral (#24437)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-08 11:02:14 -07:00
Ekagra Ranjan
cd08636926 [Spec Decode][Benchmark] Add Blitzedit dataset (#23605)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-08 10:32:52 -07:00
Ekagra Ranjan
3feeeb9fea [Spec Decode][Benchmark] Add Spec Bench Dataset for benchmarking (#23563)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
2025-09-08 10:32:42 -07:00
Jee Jee Li
6f4a82f8b5 [Model] Enable BNB support for qwen2_5_omni_thinker (#24420)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-08 09:37:08 -07:00
rongfu.leng
c44797a4d6 [Docs]add eplb_config param use docs (#24213)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-09-08 09:36:57 -07:00
Didier Durand
55be93baf5 [Doc]: fix 2 hyperlinks leading to Ray site after they changed Ray's doc structure (#24438)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-08 09:36:54 -07:00
Harry Mellor
717fc00e98 [Docs] Move feature compatibility tables to README (#24431)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-08 06:45:14 -07:00
Chenheli Hua
01dfb5e982 [Frontend] User-provided uuids for medias in chat. (RFC #22044) (#23449)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-09-08 06:42:20 -07:00
Harry Mellor
03dd652c16 Move KVEventsConfig from config/__init__.py to config/kv_events.py (#24433)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-08 06:41:27 -07:00
Christian Pinto
9cd76b71ab [Misc] Terratorch related fixes (#24337)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-08 06:40:26 -07:00
tomeras91
e041314184 [Bugfix] Fix mamba2 prefill chunking (#23279)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-08 11:42:41 +00:00
Li Wang
5e537f45b4 [Bugfix] Fix get_quant_config when using modelscope (#24421)
Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-08 11:03:02 +00:00
Michael Yao
c2a8b08fcd [Doc] Fix issues in integrations/llamastack.md (#24428)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-08 02:28:32 -07:00
Didier Durand
f4962a6d55 [Doc]: fix typos in Python comments (#24417)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-08 00:22:16 -07:00
Michael Yao
2f0b833a05 [Docs] Fix a tip indentation and typo (#24419)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-08 00:19:40 -07:00
Chauncey
425b04b8f4 [gpt-oss][Responses API] Fix the function call id format (#24409)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-08 06:49:52 +00:00
Chatcharin Sangbutsarakum
60f0843ef8 [Model] Remove unnecessary CUDA sync of Qwen2VL image and video preprocess (#24334)
Signed-off-by: Win <chatcharinsang@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-07 23:11:12 -07:00
Chatcharin Sangbutsarakum
8a46602606 [Model] Remove unnecessary CUDA sync of GLM-4.1V image and video preprocess (#24332)
Signed-off-by: Win <chatcharinsang@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-07 23:10:54 -07:00
Chauncey
61aa4b2901 [P/D] Add a shutdown method to the Connector API (#22699)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-07 23:07:00 -07:00
Al-Ekram Elahee Hridoy
8c892b1831 [Doc] Fix UTF-8 encoding issues in documentation generation on Windows (#24361)
Signed-off-by: alekramelaheehridoy <aliqramalaheehridoy@gmail.com>
Signed-off-by: alekramelaheehridoy <alekramelaheehridoy@gmail.com>
Co-authored-by: alekramelaheehridoy <alekramelaheehridoy@gmail.com>
2025-09-07 22:33:52 -07:00
Chenheli Hua
3bca396f79 [CI/Build] Fix local image inputs in test_pixtral.py (#24401)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-08 03:31:35 +00:00
22quinn
3a3e91bdfe [CI/Build] Disable flaky test_structured_output tests (#24404)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-09-08 02:51:59 +00:00
Xingyu Liu
b3d7e3c845 [Sampler] Support returning all prompt logprobs (#23868)
Signed-off-by: Xingyu Liu <charlotteliu12x@gmail.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-07 19:34:31 -07:00
Yan Ma
67841317d1 [xpu] upgrade ipex/python3.12 for xpu (#23830)
Signed-off-by: Yan Ma <yan.ma@intel.com>
2025-09-08 02:07:16 +00:00
Ming Yang
86173ad593 [Kernel] Support decode context parallelism on Blackwell with CUTLASS MLA (#24385)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-09-08 09:27:12 +08:00
Lucia Fang
795b6951cd Add @luccafong to codeowner for spec decode (#24397)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-09-08 08:30:27 +08:00
Woosuk Kwon
2e5d21378d Skip MM Encoder for non-first PP ranks (#24387)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-07 09:38:35 -07:00
Flora Feng
0661cb9df3 Add renderer-based prompt processing for embedding and classification endpoints (#24356)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-09-07 08:26:48 +00:00
Woosuk Kwon
105d3d62ef [TPU] Remove TopKTopPSampler dependency for TPU sampler (#24391)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-07 01:12:36 -07:00
Jee Jee Li
62f66be1f7 [Bugfix] Fix Qwen3-coder moe tuned config (#24072)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-07 05:19:46 +00:00
Ye (Charlotte) Qi
81c53ef55c [Misc] collect flashinfer version in collect_env.py (#24378)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-07 03:30:41 +00:00
Saman A. Pour
75334956c2 QWEN3 Thinking Fused MoE kernels Optimization configs (#24330)
Signed-off-by: Saman Keon <samanamp@outlook.com>
2025-09-07 03:18:54 +00:00
Jiangyun Zhu
77aec83b8c [Benchmark] add benchmark for custom activation op (#23908)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-06 20:12:05 -07:00
Aaron Pham
e67597545b [CI][Fix] deterministic seed for flaky CI runs on structured outputs (#24380)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-09-07 11:10:40 +08:00
Benji Beck
37a6fa95fd Migrate Qwen2 inputs to TensorSchema (#23475)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-06 20:07:31 -07:00
youkaichao
558f0907dc [attention][DCP] use AttentionImpl.need_to_return_lse_for_decode (#24372)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-09-07 01:18:59 +00:00
Woosuk Kwon
4172235ab7 [V0 deprecation] Deprecate V0 Neuron backend (#21159)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-06 16:15:18 -07:00
Bangsheng Tang
848562bd49 break execute_model in gpu_model_runner into sub-functions for custom scopes (#24265)
Co-authored-by: Bangsheng Tang <bangsheng@meta.com>
2025-09-06 14:02:47 -07:00
elvischenv
e68dc2f014 [Bugfix] Fix unstable silu_mul+nvfp4 quant fusion test (#24370)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-09-06 20:39:34 +00:00
Ye (Charlotte) Qi
a3645ed94d [Frontend][Responses API] Support reporting tool output tokens and fix reasoning token count (#24285)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-06 13:27:15 -07:00
Aaron Pham
fb691ee4e7 [Fix] [gpt-oss] fix non-tool calling path for chat completion (#24324) 2025-09-06 19:10:32 +00:00
Ashwin Phadke
6024d115cd Lora bias(enable_lora_bias) deprecate warning (#24339)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-07 00:42:19 +08:00
Jee Jee Li
7555d6b34a [Bugfix] Fix test_mixtral_moe (#24371) 2025-09-06 09:32:03 -07:00
Isotr0py
00a4e56d8d [Bugfix] Fix broken deepseek fp8 TP weights loading (#24367)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-06 09:23:12 -07:00
mohankku
0eadaeff7e [Bugfix] Avoid uninitialized usage of azp_val when AZP is false. (#24335)
Signed-off-by: Mohan Kumar Kumar <mohan.cbein@gmail.com>
Signed-off-by: mohankku <mohan.cbein@gmail.com>
2025-09-06 08:17:03 -07:00
Benjamin Chislett
0077c8634e Add @benchislett to codeowner for spec decode and structured outputs (#24362)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-09-06 22:03:35 +08:00
Roger Wang
b121ca22ad [CI] Disable flaky structured output test from CI (#24366)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-06 13:31:56 +00:00
Roger Wang
eddaafc1c7 [Multimodal] Improve max video embedding length estimation in V1 (#24312)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-09-06 02:33:19 -07:00
Andrew Sansom
305a1cc0d2 refactor: Turn GPUModelRunner.inputs_embeds to a CpuGpuBuffer (#24345)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-05 23:01:23 -07:00
wang.yuqi
6d6c6b05d3 [New Model]: google/embeddinggemma-300m (#24318)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-05 22:58:36 -07:00
Isotr0py
53b19ccdd5 [Core] Allow disabling TP sharding for parallel Linear layer (#23024)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-05 22:53:58 -07:00
Nick Hill
6432739ef1 [Bugfix] Catch and log invalid token ids in detokenizer (#24351)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-05 22:30:22 -07:00
yzds
ac201a0eaf [Feature] Support Decode Context Parallel (DCP) for MLA (#23734)
Signed-off-by: hongchao <hongchao@msh.team>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: hongchao <hongchao@msh.team>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-09-06 13:24:05 +08:00
Yong Hoon Shin
3c529fc994 [KV Sharing] Raise error if using eagle with fast prefill (#24350)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-09-05 20:22:40 -07:00
Didier Durand
35bf193864 [Doc]: fix typos in Python comments (#24294)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-05 19:41:12 -07:00
22quinn
35efa70297 Add @22quinn as code reviewer for RL related components (#24346) 2025-09-06 01:56:15 +00:00
Benjamin Chislett
cee182b297 [Perf][V1] Fully overlap model execution (#23569)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-09-05 18:20:17 -07:00
Rafael Vasquez
c954c6629c [CI] Add timeouts to tests (#24260)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-09-05 17:26:22 -07:00
Shiyan Deng
9dfbeb41e5 [RFC] allow cancelation after shutdown in blocking collective_rpc (#23390)
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
2025-09-05 14:14:18 -07:00
elvischenv
eedb2a2a10 [Bugfix] Fix silu_mul+quant fusion test (#24341)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-09-05 20:13:42 +00:00
Chauncey
23a6c5280e [gpt-oss][Bugfix]Fix streamableparser for missing handling of certain token_ids (#24306)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-05 10:26:00 -07:00
youkaichao
7812bcf278 [docs] add shenzhen meetup (#24326)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-09-05 22:48:42 +08:00
Louie Tsai
006e7a34ae Adding int4 and int8 models for CPU benchmarking (#23709)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-09-05 20:08:50 +08:00
liuzhenwei
e599e2c65e [XPU][P/D] Add XPU support in NixlConnector (#22436)
Signed-off-by: zhenwei <zhenwei.liu@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-04 21:03:12 -07:00
Aaron Pham
c29fb540ff [gpt-oss] tool parser supports for /chat/completions [1/n] (#22386)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-09-04 20:39:12 -07:00
Nicolò Lucchesi
65e038931d [Frontend] Skip unnecessary detokenization when token_id is requested (#24236)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-04 23:04:12 +00:00
Zhuohan Li
886ccbe5ba [CI/Build] Reduce the number of redundant cases to test for LoRA (#24276)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-09-04 21:58:44 +00:00
elvischenv
adc3ddb430 [Bugfix][Misc] Fix silu_and_mul_nvfp4_quant issue and extract common utils for nvfp4 kernel source files (#23727)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-04 14:25:45 -07:00
Seiji Eicher
60b755cbcb [Misc] Have AsyncLLM custom_stat_loggers extend default logger list (#20952)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-09-04 14:25:30 -07:00
Saman A. Pour
482e52f56c QWEN3 Coder Fused MoE kernels Optimization configs (#24266)
Signed-off-by: Saman Keon <samanamp@outlook.com>
2025-09-04 20:33:43 +00:00
Po-Han Huang (NVIDIA)
78336a0c3e Upgrade FlashInfer to v0.3.0 (#24086)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-09-04 09:49:20 -07:00
Jee Jee Li
94866d7c93 [Misc] Slight improve deepgemm print (#24085)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-04 16:06:51 +00:00
Didier Durand
83609ca91d [Doc]: fix typos in Python comments (#24173)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-04 08:52:17 -07:00
Nick Hill
e41a0fa377 [Perf] Freeze core engine proc heap after init (#24008)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-04 22:55:23 +08:00
nvjullin
37241077d5 [Misc] Removed force_fp8_e4m3fnuz from FP8LinearOp (#23725)
Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-04 09:25:40 -04:00
Yash Pratap Singh
c9f7081f9c [LoRA]: Add lora support to qwen-2.5-omni (#24231) 2025-09-04 05:50:50 -07:00
Kunshang Ji
16ded21eeb [XPU] support Triton Attention backend on Intel GPU (#24149)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-04 20:41:08 +08:00
nopperl
2b30afa442 Use hidden_size_per_head as head_size fallback (#24221)
Signed-off-by: nopperl <54780682+nopperl@users.noreply.github.com>
2025-09-04 12:59:16 +01:00
Jiangyun Zhu
eafa8dcde6 [Model] Add pp support for hunyuan (#24212)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-09-04 03:58:26 -07:00
TJian
6c7af8110a [Doc] Update vLLM Singapore Meetup info (#24234)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-09-04 02:58:18 -07:00
Kebe
8f423e5f43 [Feature][Response API] Add streaming support for non-harmony (#23741)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-09-04 17:49:06 +08:00
Ignacio Sica
369a079568 [Hardware][Apple-CPU] Disable OneDNN build for Apple Silicon (#24200)
Signed-off-by: ignaciosica <mignacio.sica@gmail.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-09-04 02:48:25 -07:00
Lucas Wilkinson
402759d472 [Attention] FlashAttn MLA (#14258)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
2025-09-04 02:47:59 -07:00
Fanli Lin
2c301ee2eb [Bugfix] Fix Incremental Detokenization with tokenizers == 0.22.0 (#24159)
Signed-off-by: Fanli Lin <fanli.lin@intel.com>
Signed-off-by: Fanli Lin <fanli0116@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-04 02:47:08 -07:00
whx
3efb9f4d95 [Attention][Platform] Refactor MLA to support Custom Op (#23332)
Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-04 02:46:37 -07:00
anthonsu
04f3c35cff Improve flexibility of auto_tune.sh execution. (#23766)
Signed-off-by: Anthony Su <50185138+anthonsu@users.noreply.github.com>
Signed-off-by: anthonsu <50185138+anthonsu@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-04 09:41:41 +00:00
mgazz
51d5e9be7d [Core][Model] Terratorch backend integration (#23513)
Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com>
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Co-authored-by: Christian Pinto <christian.pinto@ibm.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-04 00:22:41 -07:00
bingchen-mi
e7fc70016f [Model] Add MiDashengLM model support (#23652)
Signed-off-by: chenbing8 <chenbing8@xiaomi.com>
Signed-off-by: bingchen-mi <chenbing8@xiaomi.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-04 00:08:09 -07:00
Weida Hong
12e1e63cc5 [Misc] Enhance output readability of helper script (#24214)
Signed-off-by: Weida Hong <wdhongtw@google.com>
2025-09-04 06:38:26 +00:00
Li, Jiang
57b1ce94f7 [CPU] Refactor CPU unquantized linear (#24150)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-04 14:28:45 +08:00
Benji Beck
cb55ad86fe Migrate ultravox inputs to TensorSchema (#23503)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-09-04 06:09:11 +00:00
Flora Feng
712b273f65 [Refactor] Introduce basic Renderer for completion-style request (#24010)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-09-04 05:21:12 +00:00
Qiming Zhang
e919d6f549 [Kernel][Bugfix] Fix grouped topk cu (#24146)
Signed-off-by: mayuyuace <qiming1.zhang@intel.com>
2025-09-04 12:37:37 +08:00
wuhang
a38f8bd54c [Feature][Responses API]Support MCP tools with streaming mode + background mode (#23927)
Signed-off-by: wuhang <wuhang6@huawei.com>
2025-09-04 04:05:10 +00:00
Peter Pan
b5ee1e3261 Remove deprecated PyNcclConnector (#24151)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-09-03 22:49:16 +00:00
George Nagy II
36c260dad6 [Feature][gpt-oss] Add support for num_cached_tokens and num_reasoning_tokens tracking (#23460)
Signed-off-by: George Nagy II <george.nagy0969@gmail.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-03 21:08:47 +00:00
Kebe
a43a3f1770 [Bugfix][DP] DP distribution does not require ray[default] (#23822)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-09-03 13:21:36 -07:00
WeiQing Chen
6adaed42f4 [Feature][P/D]: Optimize NIXL Connector xfer Launch (#23887)
Signed-off-by: ycyaw66 <497410282@qq.com>
Co-authored-by: ycyaw66 <497410282@qq.com>
2025-09-03 19:14:30 +00:00
Matthew Bonanni
a742322092 [Attention] Blackwell FP8 MLA support with CUTLASS_MLA backend (#23289)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-09-03 14:05:24 -04:00
Benji Beck
731a6940e3 Migrate whisper inputs to TensorSchema (#23505)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-09-03 18:04:00 +00:00
bnellnm
e9b92dcd89 [Kernels] Overlap shared experts with send/recv (#23273)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-03 12:35:18 -04:00
nopperl
fa4311d85f [V1] v1 engine + full CUDA graph support for PLaMo2 (#23998)
Signed-off-by: Hemmi Shinichi <shemmi@preferred.jp>
Signed-off-by: nopperl <54780682+nopperl@users.noreply.github.com>
Co-authored-by: Hemmi Shinichi <shemmi@preferred.jp>
Co-authored-by: Thomas Parnell <tom.parnell@gmail.com>
2025-09-03 08:24:02 -07:00
Burkhard Ringlein
6d80ae83e1 [Bugfix] Fixing division by zero in triton_attn if query_heads/kv_heads > 16 (#23424)
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
2025-09-03 15:01:09 +00:00
dongbo910220
4ba0c587ba FIX: Add libnuma-dev to Dockerfile for dev stage (#20388)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-09-03 07:17:20 -07:00
qscqesze
6997a25ac6 [Model] Remove useless code from MiniMax implementation (#23982)
Signed-off-by: QscQ <qscqesze@gmail.com>
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-09-03 11:27:04 +00:00
Jakub Smid
28f350e147 Support add_generation_prompt in embeddings endpoint with chat request (#23931)
Signed-off-by: biba10 <jaksmid@seznam.cz>
2025-09-03 10:47:55 +00:00
wang.yuqi
51383bd472 [CI] Accelerate mteb test by setting SentenceTransformers mteb score to a constant (#24088)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-03 17:23:56 +08:00
Isotr0py
9c99e4871f [Misc] Clean up deadcode for legacy processing pipeline (#24153)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-03 08:34:29 +00:00
dsinghvi
70549c1245 [CI/Build] Serve images used by multimodal tests through local HTTP Server (#23907)
Signed-off-by: Divyansh Singhvi <divyanshsinghvi@gmail.com>
Signed-off-by: dsinghvi <divyanshsinghvi@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-09-03 16:13:11 +08:00
Nicolò Lucchesi
f0c503f66e [Nixl] Heterogeneous TP support FlashInfer (#20189)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-03 15:19:54 +08:00
youkaichao
f38035c123 [distributed][rl] remove nccl cumem env var override (#24141)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-03 06:45:25 +00:00
Yong Hoon Shin
426cc8629f [BugFix] Fix routed_scaling_factor double mul for dots1 and glm4 MoE models (#24132)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-09-03 04:57:59 +00:00
Jiangyun Zhu
e81d4e69c1 [Misc] Add check for dual_chunk_attention (#24070)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-09-03 04:19:14 +00:00
Didier Durand
02d411fdb2 [Doc]: fix typos in Python comments (#24115)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-02 21:14:07 -07:00
Didier Durand
d7e1e59972 [Doc]: fix typos in Python comments (#24093)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-02 21:05:45 -07:00
Wentao Ye
c4ed78b14f [Compile] Fix Compile Warning for w4a8_mm_entry.cu (#23660)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-02 20:45:52 -07:00
co63oc
1bd007f234 fix some typos (#24071)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
2025-09-02 20:44:50 -07:00
afeldman-nm
136d853e65 [V1] Wrapper which plumbs request-level logits processors into vLLM batch-level logits processing (#23656)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
2025-09-03 02:52:51 +00:00
Russell Bryant
e32a0e8678 Upgrade xgrammar to 0.1.23 (#22988)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-03 02:32:59 +00:00
youkaichao
42dc59dbac Update release pipeline post PyTorch 2.8.0 update (#24073)
Signed-off-by: Huy Do <huydhn@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Huy Do <huydhn@gmail.com>
2025-09-03 10:09:19 +08:00
Chaojun Zhang
862f2ef893 [XPU] Fix the bug of LoRA logits on the XPU platform (#24081)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
2025-09-03 08:21:18 +08:00
Matthew Bonanni
2fd1a40a54 [CI/Build] Disable SiluMul NVFP4 quant fusion tests (#24121)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-09-02 16:50:28 -07:00
Wentao Ye
930a24144c [Bug] R1 Accuracy: Fix routed_scaling_factor Double Mul Issue (#24119)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-02 22:22:30 +00:00
rasmith
457e471971 [AMD][Kernel][Bugfix] Cast offsets tensor bn to tl.int64 to avoid GPU segfault (#23692)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-09-02 22:13:57 +00:00
Thomas Parnell
d328f7894f [CI] Enable all hf transformers baselines in test_hybrid (#23936)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-09-02 20:15:06 +00:00
Wentao Ye
98aee612aa [Log] Only Print Profiler Results on Rank 0 (#23370)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-02 18:53:34 +00:00
nathan
598bd74cf8 Fix weights loading for Apertus (#24100)
Signed-off-by: Nathan Ranchin <nranchin@student.ethz.ch>
2025-09-02 18:34:28 +00:00
Mark McLoughlin
2417798471 [Metrics] Deprecate TPOT in favor of ITL (#24110)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-09-02 18:10:10 +00:00
Kyuyeun Kim
9480ae24e3 [Bugfix] Fix packed_factor missing attribute error (#23902)
Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com>
2025-09-02 10:56:31 -07:00
Chenheli Hua
f399182e8c Run ruff format on a few files. (#24075)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-09-02 17:55:32 +00:00
Kyle Sayers
1c41310584 [Bugfix] Fix transform_config parsing in Compressed Tensors (#23945)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-09-02 13:54:10 -04:00
Jiangyun Zhu
c83c4ff815 [Benchmark] Add support for local hf dataset path in benchmark (#23999)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-09-02 17:49:16 +00:00
Peter Pan
0e1759cd54 [docs] add SYS_NICE cap & security-opt for docker/k8s (#24017)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Peter Pan <peter.pan@daocloud.io>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-02 17:27:20 +00:00
Michael Goin
e66ed3e675 [CI Failure] Skip failing nvfp4 silu test (#23959)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-02 13:18:15 -04:00
wang.yuqi
e0653f6c0b [Model] Classification models support logit_bias / sigmoid_normalize (#24031)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-02 16:48:57 +00:00
Kyungmin Lee
38ba061f6f [BugFix] Fix EXAONE4 rotary embeddings (#23918)
Signed-off-by: lkm2835 <lkm2835@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-02 14:40:55 +00:00
Nicolò Lucchesi
0a74e9d0f2 [Gemma3n] Fix audio batching (#24052)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-02 22:23:35 +08:00
Christian Berge
8bd5844989 correct LWS deployment yaml (#23104)
Signed-off-by: cberge908 <42270330+cberge908@users.noreply.github.com>
2025-09-02 12:04:59 +00:00
Aziz
ce30dca5c4 [CI]: reduce HTTP calls inside entrypoints openai tests (#23646)
Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
Signed-off-by: Aziz <azizbenothman76@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-02 10:49:32 +00:00
WeiQing Chen
2f0bab3f26 [Model] Support dp on ViT on GLM-4.5V (#23168)
Signed-off-by: David Chen <530634352@qq.com>
2025-09-02 10:48:18 +00:00
Didier Durand
fad73be1a5 [Doc]: fix typos in Python comments (#24077)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-02 02:38:55 -07:00
Benji Beck
56d04089ef Migrate Interns1 inputs to TensorSchema (#23510)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-09-02 04:35:45 +00:00
Yan Ma
7be0cb8e9e [XPU][Feature] fp8 online quantization support for XPU (#23148)
Signed-off-by: Yan Ma <yan.ma@intel.com>
Co-authored-by: Qiming Zhang <qiming1.zhang@intel.com>
2025-09-02 04:06:53 +00:00
Benji Beck
1fa1d6a9a0 Migrate OvisImagePatchInputs to TensorSchema (#22024)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-09-02 12:01:36 +08:00
Maximilien de Bayser
d59c986444 Remove runtime checks based on pooling params (#24051)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-09-02 11:54:37 +08:00
damon
04d0c60770 [Bugfix] Fix the issue that Blip2ForConditionalGeneration' object has… (#24028)
Signed-off-by: Dazhi Jiang <dazhi_jiang@163.com>
2025-09-02 11:54:20 +08:00
Asaf Joseph Gardin
2b41cbbf03 [V1][Mamba1] - FP32 SSM Kernel Support (#23506)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-09-01 20:53:00 -07:00
Didier Durand
0235103cbb [Doc]: fix typos in Python comments (#24042)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-01 19:07:45 -07:00
Lucia Fang
a344a5aa0a [bugfix]fix MTP hidden states (#24056)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-09-01 21:09:37 +00:00
Woosuk Kwon
5685370271 [Chore][V0 Deprecation] Move LogProb to a separate file (#24055)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 12:07:53 -07:00
WeiQing Chen
a0e0efd6bd [Model] Support DP for ViT on Kimi-VL-A3B-Thinking-2506 (#23817)
Signed-off-by: Junhong <liujunhong11@huawei.com>
Signed-off-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com>
Co-authored-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-09-01 16:56:56 +00:00
Christian Pinto
cf91a89dd2 [docs][misc] IOProcessor plugins fixes (#24046)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
2025-09-01 09:17:41 -07:00
Woosuk Kwon
39a22dcaac [Misc] Minor code simplification for spec decode (#24053)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 08:54:01 -07:00
Julien Debache
41c80698b3 Document multi-proc method selection for profiling (#23802)
Signed-off-by: jdebache <jdebache@nvidia.com>
2025-09-01 06:28:26 -07:00
Kwai-Keye
7c8271cd1e [Model]: support KeyeVL-1_5-8B (#23838)
Signed-off-by: wangruitao <wangruitao@kuaishou.com>
Co-authored-by: wangruitao <wangruitao@kuaishou.com>
2025-09-01 03:50:27 -07:00
Kay Yan
3e330fcb21 [Doc]: Fix CPU install docs: force torch-backend=cpu to avoid GPU torchvision errors (#24033)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-09-01 03:34:52 -07:00
Nicolò Lucchesi
d46934b229 [Frontend] Gemma3n audio transcriptions/translations endpoint (#23735)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-01 18:07:46 +08:00
Didier Durand
107284959a [Doc]: fix typos in Python comments (#24026)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-01 09:38:20 +00:00
Jee Jee Li
dc1a53186d [Kernel] Update DeepGEMM to latest commit (#23915)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-01 02:38:04 -07:00
wang.yuqi
55602bb2e6 [Frontend] Update the warning log when using VLLM_ALLOW_LONG_MAX_MODEL_LEN (#20904)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-01 08:50:25 +00:00
Isotr0py
d7fbc6ddac [Misc] Enable V1 FP16 inference on pre-Ampere GPUs (#24022)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-01 08:12:22 +00:00
Ning Xie
5438967fbc [Misc] add hash_function doc string (#24014)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-31 23:11:20 -07:00
Code Jesus
422e793fa6 [Bugfix] Add support for <tool_call> format in streaming mode for XLAM Tool Parser (#22769)
Signed-off-by: Devon Peroutky <devon@kindo.ai>
2025-09-01 14:07:54 +08:00
Christian Pinto
1cb39dbcdd [Misc] IO Processor plugins for pooling models (#22820)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-31 23:07:12 -07:00
Benji Beck
437c3ce026 Migrate Phi4 inputs to TensorSchema (#23471)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-09-01 14:05:59 +08:00
Ning Xie
499b074bfd [Misc] refactor code by import as for torch._inductor.config (#23677)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-09-01 14:05:42 +08:00
Isotr0py
ff0e59d83a [CI/Build] Improve Tensor Schema tests speed by avoid engine core initialization (#23357)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-31 22:52:20 -07:00
Woosuk Kwon
b55713683c [Misc] Move fast prefill logic to separate method (#24013)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 05:40:38 +00:00
Jun-Howie
acc1a6e10a Fix the bug related to loading GPTP INT3 weights. (#23328)
Signed-off-by: JunHowie <JunHowie@aliyun.com>
Co-authored-by: JunHowie <JunHowie@aliyun.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-01 05:39:57 +00:00
Woosuk Kwon
8c742a66d1 [Misc] Avoid redundant copy for encoder-only models (#24012)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-01 04:02:43 +00:00
JartX
183a70967a [BUGFIX] GPTQ quantization compatibility for Qwen3 MOE models (AutoGPTQ and AutoRound-GPTQ) (#23994)
Signed-off-by: JartX <sagformas@epdcenter.es>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-01 03:33:40 +00:00
Or Ozeri
14b4326b94 v1: Support KV events from connectors (#19737)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-01 01:13:21 +00:00
Nick Hill
752d2e1c36 [Minor] Fix some random typos in comments (#24009)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-31 16:42:17 -07:00
Xiaodong Wang
81eea3d348 vllm fix check on max vocab size (#22471)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-31 20:57:05 +08:00
Didier Durand
9701352e4b [Doc]: fix typos in Python comments (#24001)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-31 08:21:59 +00:00
Roger Wang
749be00a98 [Core][Multimodal] Allow passing multi_modal_uuids as multimodal identifiers. (#23394)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-30 18:01:22 -07:00
Gabriel Marinho
5b8077b8ac Fix wrong truncate_prompt_tokens type hint (#22761)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
Signed-off-by: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-30 20:39:38 +00:00
Andy Lo
038e9be4eb [LoRA] Much faster startup when LoRA is enabled (#23777)
Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-30 15:37:39 +00:00
Ning Xie
68a349114f [Misc] enhance type hint for rearrange return value (#23519)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-30 06:43:33 -07:00
Ning Xie
e80bca309e [Refactor] refactor freezing_value/cuda_event initialize outside try finally (#23758)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-30 06:42:25 -07:00
Ning Xie
fb4983e112 [Misc] add reorder_batch AttentionMetadataBuilder (#23798)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-30 06:41:45 -07:00
sadegh.shokatian
379ea2823a Add LoRA support for DeepSeek models (V2, V3, R1-0528) (#23971)
Signed-off-by: sadeghja1070 <sadegh.ja1070@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-30 06:40:02 -07:00
Jiangyun Zhu
3a6acad431 [Model] Enable encoder DP for MiniCPM-V (#23948)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-30 06:31:26 -07:00
Ning Xie
5490d633ce [UT] fix unify_kv_cache_configs when kv cache config needs sort (#23843) 2025-08-30 11:22:14 +00:00
Jee Jee Li
628d00cd7b [Bugfix] Fix test_lora_resolvers.py (#23984)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-30 11:16:11 +00:00
Thomas Parnell
4071c76cf3 [V1] [Hybrid] Move MiniMaxLinearAttention into layers/mamba (#23831)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-30 00:16:15 -07:00
Cyrus Leung
f1bddbd852 [Core] Cleanup TPU model runner for MM (#23894)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-30 00:14:58 -07:00
Yong Hoon Shin
9748c5198b [CI] Fix broken compile tests due to unsupported SiluMul+Nvfp4Quant fusion (#23973)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-08-30 00:14:43 -07:00
Roger Wang
ee52a32705 [CI] Move testing image from remote URL to S3 (#23980)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-29 21:41:25 -07:00
Xin Yang
8fb85b7bb6 Add routed_scaling_factor to MoE grouped topk (#23123)
Signed-off-by: Xin Yang <xyangx@amazon.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-29 21:36:48 -07:00
dubejf
5b31cb1781 [Bugfix] Fix --config arg expansion called from api_server.py (#23944)
Signed-off-by: Jean-Francois Dube <dubejf+gh@gmail.com>
Co-authored-by: Jean-Francois Dube <dubejf+gh@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-29 21:36:39 -07:00
Roger Wang
d660c98c1b [CI] Fix unavailable image remote URL (#23966)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-29 15:40:04 -07:00
Harry Mellor
5674a40366 [Misc] Make download_weights_from_hf more reliable (#23863)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-29 12:37:24 -07:00
Yong Hoon Shin
8c3e199998 Revert gemma3n fast prefill changes (#23897)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-29 12:16:57 -07:00
Thomas Parnell
1c26b42296 [Docs] [V1] [Hybrid] Add new documentation re: contributing mamba-based models (#23824)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-29 18:47:58 +00:00
Michael Goin
b7adf94c4a Tuned H100/H200 triton fp8 block configs for fused_qkv_a_proj (#23939)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-29 10:28:35 -07:00
22quinn
4d7fe40fc0 [RL][BugFix] Fix missing tokenizer error for token-in-token-out (#23904)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-30 01:09:55 +08:00
yzds
0dc9532065 [BUGFIX ] fix undefined silu_and_mul_nvfp4_quant (#23929)
Signed-off-by: hongchao <hongchao@msh.team>
Signed-off-by: Richard Zou <zou3519@gmail.com>
Co-authored-by: hongchao <hongchao@msh.team>
Co-authored-by: Richard Zou <zou3519@gmail.com>
Co-authored-by: Richard Zou <zou3519@users.noreply.github.com>
2025-08-29 09:36:39 -07:00
vllmellm
72a69132dc [CI] Add aiter to matching list of issue auto labeller for rocm tag (#23942)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-08-29 15:29:21 +00:00
Nick Hill
d90d8eb674 [BugFix] Async scheduling and PP compatibility with DP (#23770)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-29 08:17:27 -07:00
Lukas Geiger
0a2f4c0793 [Models] Use in-place adds in Idefics2Vision (#23932)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-08-29 07:42:57 -07:00
EduardDurech
1cf3753b90 [MODEL] Apertus and XIELU (#23068)
Signed-off-by: EduardDurech <39579228+EduardDurech@users.noreply.github.com>
Co-authored-by: AllenHaoHuang <allenhuangdd@gmail.com>
2025-08-29 20:29:18 +08:00
Adit Chawdhary
4f7cde7272 Adds json_count_leaves utility function (#23899)
Signed-off-by: aditchawdhary <aditxy@hotmail.com>
2025-08-29 05:28:13 -07:00
Huy Do
67c14906aa Update PyTorch to 2.8.0 (#20358)
Signed-off-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-29 18:57:35 +08:00
Flora Feng
69f46359dd [Multimodal] Consolidate mm inputs into MultiModalFeatureSpec (#23779)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-08-29 18:36:57 +08:00
wang.yuqi
d9e00dbd1f [Performance] V1 Classify Models E2E Performance Optimization (#23541)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-29 03:12:32 -07:00
Li, Jiang
ad39106b16 [CPU] Enable data parallel for CPU backend (#23903)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-29 02:19:58 -07:00
Maximilien de Bayser
2554b27baa [V0 Deprecation] Remove pooling model support in V0 (#23434)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-29 00:04:02 -07:00
Harry Mellor
934bebf192 Better errors for Transformers backend missing features (#23759)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-29 07:01:40 +00:00
Jiangyun Zhu
885ca6d31d [Misc] Fix warnings for mistral model (#23552)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-08-29 06:58:48 +00:00
Chenheli Hua
2d0afcc9dc [mrope][Qwen2-VL] Fix edge case where getting index of image/video token can potentially throw in default vl mrope implementation. (#23895)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-08-28 23:29:13 -07:00
Jee Jee Li
b4f9e9631c [CI/Build] Clean up LoRA test (#23890)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-28 23:28:35 -07:00
Raghavan
05d839c19e Fix(async): Add support for truncate_prompt_tokens in AsyncLLM (#23800) 2025-08-28 22:55:06 -07:00
wangxiyuan
6597d7a456 [Platform] import activation_quant_fusion for CUDA only (#23882)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-28 22:54:16 -07:00
Jinghui Zhang
5264015d74 [BugFix][AMD][Deepseek] fix a dtype mismatch error for deepseek running on AMD (#23864)
Signed-off-by: Jinghui Zhang <jinghuizhang0804@gmail.com>
2025-08-28 22:54:12 -07:00
Isotr0py
98ac0cb32d [Bugfix] Use ReplicatedLinear for SequenceClassification head (#23836)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-29 04:41:20 +00:00
Russell Bryant
c8b3b299c9 [tests] Improve speed and reliability of test_transcription_api_correctness (#23854)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-08-29 04:25:33 +00:00
Charlie Fu
006477e60b [ROCm][Fix] Fix rocm build caused by #23791 (#23847)
Signed-off-by: charlifu <charlifu@amd.com>
2025-08-28 19:52:27 -07:00
Lukas Geiger
de533ab2a1 [Models] Improve iteration over layers (#19497)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-08-29 09:26:34 +08:00
Chaojun Zhang
235c9db8a7 [XPU] support data parallel for MoE models on XPU (#22887)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
2025-08-29 09:23:04 +08:00
Woosuk Kwon
b668055a11 [V0 Deprecation] Remove V0 Samplers test (#23862) 2025-08-28 18:05:52 -07:00
Wentao Ye
d3d2aad5a2 [Log] Use Debug Once for DeepGEMM E8M0 When not Enabled (#23858) 2025-08-28 22:18:10 +00:00
Yong Hoon Shin
cb293f6a79 [V1] Enable prefill optimization for Gemma3n (#22628)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-28 14:54:30 -07:00
Woosuk Kwon
7ffbf27239 [BugFix][FlashInfer] Fix potential race condition for paged_kv_indptr_cpu (#23737)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-28 14:22:46 -07:00
Simon Mo
27e88cee74 chore: build release image by default (#23852)
Signed-off-by: Codex <codex@openai.com>
2025-08-28 13:17:15 -07:00
elvischenv
16a45b3a28 [NVIDIA] Support SiluMul + NVFP4 quant fusion (#23671)
Signed-off-by: jindih <jindih@nvidia.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: jindih <jindih@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedic <lgovedic@redhat.com>
2025-08-28 19:36:50 +00:00
Jingkai He
57d4ede520 [bugfix] [spec-decoding] fix data race in sample_recovered_tokens_kernel (vLLM v1) (#23829)
Signed-off-by: He-Jingkai <he-jingkai@outlook.com>
2025-08-28 19:05:20 +00:00
Divakar Verma
04d1dd7f4a [ROCm][Aiter] Add triton fp8 bmm kernel for mla (#23264)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
Co-authored-by: ShaoChunLee <Shao-Chun.Lee@amd.com>
2025-08-28 18:18:08 +00:00
Benji Beck
f32a5bc505 Migrate Llama4ImagePatchInputs to TensorSchema (#22021)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-28 17:29:37 +00:00
Jean Schmidt
8805ad9fa9 Add scale_config.yml file for Meta autoscalers for GH Actions (#23840)
Signed-off-by: Jean Schmidt <contato@jschmidt.me>
2025-08-28 09:31:20 -07:00
Jean Schmidt
0583578f42 [ci] breaks down V1 Test into 3 groups of approx 30 minutes runtime (#23757)
Signed-off-by: Jean Schmidt <contato@jschmidt.me>
2025-08-28 08:59:19 -07:00
Angela Yi
db74d60490 [Bugfix] Add fake mode around passes (#23349)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-08-28 11:25:56 -04:00
Po-Han Huang (NVIDIA)
95089607fa [Model][gpt-oss] Support DP+EP for GPT-OSS with FlashInfer trtllm-gen MoE (#23819)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-08-28 06:56:20 -07:00
Thomas Parnell
1f096f9b95 [CI] Fix linting error on main (#23835)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-28 06:52:01 -07:00
YUQI.CHENG
66548f6603 [Bugfix] Fix benchmark_moe.py for blockwise fp8. (#23823)
Signed-off-by: crischeng <420985011@qq.com>
Co-authored-by: cris <grace@guisenbindeMacBook-Pro.local>
2025-08-28 21:44:09 +08:00
Didier Durand
d3da2eea54 [Doc]: fix typos in Python scripts (#23828)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-28 05:37:38 -07:00
Jiangyun Zhu
bfab219648 [Model] [gpt-oss] fix gpt-oss pp support (#23815)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-28 05:36:55 -07:00
Woosuk Kwon
a3432f18fd [BugFix][Spec Decode] Use float64 for uniform_probs (#23803)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-28 12:26:45 +00:00
Li, Jiang
67cee40da0 [CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-28 11:57:05 +00:00
Didier Durand
d99c3a4f7b [Doc]: fix typos in .md files (including those of #23751) (#23825)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-28 04:38:19 -07:00
JartX
3462c1c522 [FIXBUG] Add return_success parameter to moe_wna16_weight_loader function (#22797)
Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-28 09:03:22 +00:00
Isotr0py
c5d004aaaf [Model] Add PP support and VLM backbone compatability for GPT-OSS (#23680)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-28 16:03:28 +08:00
wang.yuqi
11a7fafaa8 [New Model]: Support GteNewModelForSequenceClassification (#23524)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-28 15:36:42 +08:00
yzds
186aced5ff [Kernel] cuda kernels for upcoming decode context parallel feature (#23791)
Co-authored-by: hongchao <hongchao@msh.team>
2025-08-28 15:29:11 +08:00
rongfu.leng
daa1273b14 [Bugfix] when set offline model running error (#23711)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-28 07:27:45 +00:00
Jiangyun Zhu
c07a73317d [CI] enable idefics3 and fuyu-8b test in multimodal test (#23790)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-28 14:51:24 +08:00
Kyle Sayers
22feac8e95 [Transform] [Quantization] Add transforms to compressed tensors (#22486) 2025-08-28 02:43:48 -04:00
Jinheng
c8851a4723 Add deprecation warning for lora_extra_vocab_size (#23635)
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
2025-08-27 22:34:29 -07:00
Alex
f48a9af892 [CI] make all multi-gpu weight loading tests run nightly (#23792)
Signed-off-by: Alex Yun <alexyun04@gmail.com>
2025-08-27 21:27:36 -07:00
Jan Kessler
a11adafdca Gracefully handle edge cases in harmony utils (#23155)
Signed-off-by: Jan Kessler <jakessle@uni-mainz.de>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-27 20:14:00 -07:00
Michael Goin
a781e84ec2 [Perf] Tune configs for triton block fp8 gemm H100/H200 (#23748)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-28 11:12:53 +08:00
Shrey Gupta
1b7b161a09 [Feature] models: pass layer prefix to replace_linear_class for per-layer quantization routing. Addresses #23239 (#23556)
Signed-off-by: Shrey Gupta <shreyg1303@gmail.com>
2025-08-27 20:12:44 -07:00
Benji Beck
a69693e38f Migrate Qwen inputs to TensorSchema (#23473)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-28 10:43:26 +08:00
Hanchenli
5da4f5d857 [Bugfix] Fix for V1 priority scheduling crashes at preemption (#23713)
Signed-off-by: Hanchenli <lihanc2002@gmail.com>
2025-08-28 00:44:52 +00:00
Wentao Ye
321938e9ac [Feature] Add VLLM_DISABLE_PAD_FOR_CUDAGRAPH to Avoid Hang Issue (#23595)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-27 21:52:24 +00:00
Michael Goin
f9ca2b40a0 [Bugfix] Fix Marlin NVFP4 for modelopt (#23659)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-27 17:48:16 -04:00
Yongye Zhu
082cc07ef8 DP/EP Support for gpt-oss with deepep-ht comm kernel on SM100 (#23608) 2025-08-27 17:33:21 -04:00
Asaf Joseph Gardin
853c371fc3 [V1][Mamba] - Enable V1 by default for Mamba Models (#23650)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-08-27 20:53:30 +00:00
Roger Wang
8bf6266a17 [Multimodal] Generate mm_hash based on request metadata when caching is turned off (#23690)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-08-27 20:24:31 +00:00
Harry Mellor
0585a9e73c Disable torch.compile for dynamic rope models in Transformers backend (#23738)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-27 19:03:05 +00:00
Eli Uriegas
3c0ef769ba ci: Add arm64 docker build to release pipeline (#23210)
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Signed-off-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
2025-08-27 10:41:48 -07:00
Hyogeun Oh (오효근)
4e4d017b6f [Docs] Fix warnings in mkdocs build (continued) (#23743)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
2025-08-27 17:17:29 +00:00
Thomas Parnell
dd58932280 [V1] [Hybrid] Enable compile and piecewise CUDA graph for MiniMax-Text models (#22589)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-27 10:05:16 -07:00
Cyrus Leung
52883ed084 [Model] Merge SupportsMultiModalWithRawInput with SupportsMultiModal (#23749)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-27 10:01:50 -07:00
Luka Govedič
4f35be10a9 [BugFix] Fix topk_softmax assert (#19764)
Signed-off-by: Luka Govedic <lgovedic@redhat.com>
2025-08-27 09:47:28 -07:00
Harry Mellor
2b61d2e22f [Docs] Remove in-tree Gaudi install instructions (#23628)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-27 09:22:21 -07:00
Nick Hill
3ce8285d6d [LogitsProcs] Deduplicate built-in LP implementation logic (#23362)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-27 23:11:33 +08:00
Didier Durand
83f555f637 [Doc]: upgrade version of crate-ci tool for improved typo detection (#23755)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-27 07:59:34 -07:00
Isotr0py
841490434a [Model] Enable native HF format InternVL support (#23742)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-27 14:45:17 +00:00
Wentao Ye
3af47c3cc6 [Feature] Add Hopper DeepGEMM E8M0 for DeepSeekV3.1 scale_fmt (#23666)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-08-27 14:09:08 +00:00
Harry Mellor
513c1fe255 Only run get_attr_docs if generating help text (#23723)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-27 13:55:12 +00:00
Cyrus Leung
fe8d7b6f03 [Model] Interface to enable batch-level DP support (#23733)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-27 06:41:22 -07:00
Harry Mellor
16dc4052b0 Fix pre-commit on main (#23747)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-27 06:39:48 -07:00
rebel-hongseok
8dd2baa597 Add vLLM Korea Meetup in the README.md and meetups.md (#23746)
Signed-off-by: rebel-hongseok <hongseok@rebellions.ai>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-27 06:25:49 -07:00
Cyrus Leung
5eeef1b908 [Model] Explicit default_pooling_type interface (#23736)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-27 13:24:09 +00:00
Thomas Parnell
704432af3c [V1] [Hybrid] Disable prefix caching by default for hybrid or mamba-based models (#23716)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-27 12:51:54 +00:00
Nick Hill
a403d0fa41 [Misc] Remove unnecessary _send_reconfig_message() in core_client.py (#23127)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-27 05:50:47 -07:00
cndoit18
8c13820f0b [Bugfix] Fix task field initialization when PYTHONOPTIMIZE is enabled (#23718)
Signed-off-by: cndoit18 <cndoit18@outlook.com>
2025-08-27 12:42:20 +00:00
tc-mb
9d30de4469 [model] Support MiniCPM-V 4.5 (#23586)
Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Signed-off-by: Xin Yang <xyangx@amazon.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Signed-off-by: chzhang <chaojun.zhang@intel.com>
Signed-off-by: Pate Motter <patemotter@google.com>
Signed-off-by: Terrencezzj <terrence@cohere.ai>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Zijing Liu <liuzijing2014@gmail.com>
Signed-off-by: Zijing Liu <liuzijing2014@users.noreply.github.com>
Signed-off-by: jiabin.00 <jiabin.00@bytedance.com>
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: tc-mb <157115220+tc-mb@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Huy Do <huydhn@gmail.com>
Signed-off-by: Matúš Námešný <matus.namesny@ameria.com>
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: oye93 <en.ouyang93@outlook.com>
Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Signed-off-by: Tianyu Li <tianyu.li@arm.com>
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Signed-off-by: Federico <65908512+coval3nte@users.noreply.github.com>
Signed-off-by: Zixuan Zhang <zixuanzhang@bytedance.com>
Signed-off-by: wuhang <wuhang6@huawei.com>
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
Signed-off-by: Wei Wei <wwei6@meta.com>
Signed-off-by: Yiheng Xu <charlesyihengxu@gmail.com>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com>
Co-authored-by: Chaojun Zhang <chaojun.zhang@intel.com>
Co-authored-by: Pate Motter <p@temotter.com>
Co-authored-by: Terrence Zhao <32208165+Terrencezzj@users.noreply.github.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: weiliang <weiliangl@nvidia.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Zijing Liu <liuzijing2014@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Raghavan <oneraghavan@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Co-authored-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Matúš Námešný <matus@namesny.com>
Co-authored-by: Guillaume Calmettes <gcalmettes@scaleway.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: En Ouyang <en.ouyang93@outlook.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: nvjullin <jullin@nvidia.com>
Co-authored-by: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Co-authored-by: TianyuLi0 <116711075+TianyuLi0@users.noreply.github.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Yuekai Zhang <zhangyuekai@foxmail.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: Federico <65908512+coval3nte@users.noreply.github.com>
Co-authored-by: zixuanzhang226 <zixuanzhang@bytedance.com>
Co-authored-by: wuhang <wuhang6@huawei.com>
Co-authored-by: yzds <41983536+youzhedian@users.noreply.github.com>
Co-authored-by: hongchao <hongchao@msh.team>
Co-authored-by: czhu-cohere <conway.zhu@cohere.com>
Co-authored-by: Wei <weiweinpu@gmail.com>
Co-authored-by: Yiheng Xu <charlesyihengxu@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Chenheli Hua <huachenheli@outlook.com>
Co-authored-by: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com>
2025-08-27 05:38:00 -07:00
Michael Yao
1f7a9c95e4 [Docs] Fix a 1-2-3 list and style issues in tpu.md (#23729)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-08-27 05:37:52 -07:00
Fanli Lin
8f0d7eaea8 [XPU] Fix OOM issue for data parallel with Ray backend (#22500)
Signed-off-by: Fanli Lin <fanli.lin@intel.com>
Signed-off-by: Fanli Lin <fanli0116@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-27 19:57:38 +08:00
Jee Jee Li
e03940762b [CI/Build] Reduce LoRA layer test cases (#23721)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-27 10:59:35 +00:00
Woosuk Kwon
11eddf02f0 [FlashInfer] Cache hyper params in metadata builder (#23732)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-27 03:45:04 -07:00
Woosuk Kwon
04ff1e43fb [Misc] Move CpuGpuBuffer to vllm/v1/utils.py (#23728)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-27 03:25:00 -07:00
Woosuk Kwon
6578e87365 Optimize input preparation for FlashInfer [2/N] (#23174)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-27 02:52:45 -07:00
Michael Yao
5bd9f84158 [Docs] Fix an admonition important (#23726)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-08-27 02:50:09 -07:00
Cyrus Leung
91e382c935 [CI/Build] Remove redundant register in model init tests (#23715)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-27 08:11:15 +00:00
Kunshang Ji
6446677839 [XPU]fix cuda event used in XPU model runner (#23708)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-27 07:27:14 +00:00
Cyrus Leung
69244e67e6 [Core] Use key-only cache for BaseMultiModalProcessor (#23018)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-27 14:19:13 +08:00
rongfu.leng
8dbf6ed7be [Bugfix] fix when config.yaml config value is list parse error (#23528)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-27 05:54:39 +00:00
Jee Jee Li
9de25c294b [CI/Build] Remove redundant LoRA model tests (#23706)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-27 05:51:50 +00:00
Kunshang Ji
fce10dbed5 [XPU] Add xpu torch.compile support (#22609)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-27 05:33:27 +00:00
Dipika Sikka
d272415e57 [Quantization] Expand compressed-tensors MoE matching logic to support NFP4 + FP8 MoEs (#22674)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Dipika <dipikasikka1@gmail.com>
2025-08-27 05:00:21 +00:00
Chen Zhang
142ac08030 [Frontend] Optimize beam search performance by limiting concurrency (#23599)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-27 04:59:14 +00:00
Chen Zhang
3210264421 [Frontend] Add --log-error-stack to print stack trace for error response (#22960)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-27 04:58:59 +00:00
CSWYF3634076
644d57d531 [Model] Add Ernie4.5 VL Model Support (#22514)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-08-26 21:02:55 -07:00
Chenheli Hua
c905684cfe [Core] Asynchronous h2d in merge_multimodal_embeddings via pinned memory. (#23686)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-08-26 20:05:34 -07:00
Yiheng Xu
786835807b [Bugfix]: Qwen3 Coder Tool Parser (#23099)
Signed-off-by: Yiheng Xu <charlesyihengxu@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
2025-08-26 19:58:32 -07:00
Wei
fecbb7c782 [Bugfix][gpt-oss] passing the cache config in gpt-oss (#23613)
Signed-off-by: Wei Wei <wwei6@meta.com>
2025-08-27 02:54:23 +00:00
Harry Mellor
6dab89b8ec [Docs] Fix math rendering in docs (#23676)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-26 18:47:08 -07:00
Michael Goin
de02b07db4 [Bugfix] Lazy import gpt_oss_triton_kernels_moe for mxfp4 (#23678)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-27 09:34:57 +08:00
Chen Zhang
eb1995167e [gpt-oss] Enable unit test for response API harmony integration (#23533)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-26 18:23:26 -07:00
czhu-cohere
2c2b140ae8 [quantization] use channel scales for w4a8 + misc fixes (#23570)
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
2025-08-26 18:23:23 -07:00
yzds
c7c80af084 fix pynccl reduce_scatter (#23648)
Co-authored-by: hongchao <hongchao@msh.team>
2025-08-26 18:21:11 -07:00
wuhang
6891205b16 [Feature][Responses API] Support MCP tool in background mode (#23494)
Signed-off-by: wuhang <wuhang6@huawei.com>
2025-08-27 01:06:58 +00:00
zixuanzhang226
b1625dbe9c feat: add triton fused moe config for GLM-4.5-Air-FP8 on B200 (#23695)
Signed-off-by: Zixuan Zhang <zixuanzhang@bytedance.com>
2025-08-26 18:06:10 -07:00
Federico
585e0bde36 [Bugfix] UnboundLocalError when GptOss reasoning specified (#23054)
Signed-off-by: Federico <65908512+coval3nte@users.noreply.github.com>
2025-08-27 00:29:52 +00:00
Wentao Ye
714872f1a9 [Compile] Fix Cmake Warning (#23689)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-26 23:48:32 +00:00
Thomas Parnell
5f1af97f86 [V1] [Hybrid] Enable Full CUDA graph by default for hybrid models in V1 (#22594)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-26 23:28:55 +00:00
Zhonghua Deng
c3b0fd1ee6 [V1][P/D]P2pNcclConnector supports flashinfer (#23536)
Signed-off-by: Abatom <abzhonghua@gmail.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-08-26 22:56:16 +00:00
Harry Mellor
6421b66bf4 [Docs] Move quant supported hardware table to README (#23663)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-26 22:26:46 +00:00
Huzaifa Sidhpurwala
2f13319f47 Enhance the pre-notification policy (#23532)
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
2025-08-26 20:41:36 +00:00
Chen Zhang
d696f86e7b [doc] Hybrid KV Cache Manager design doc (#22688)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-26 20:19:05 +00:00
Isotr0py
9816b81f5f [Model] Enable video support for InternVL3.5 models (#23658)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-26 19:46:52 +00:00
Jiangyun Zhu
c37c0af990 [Misc] Fix comments in tests/kernels/quantization (#23675)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-26 19:31:20 +00:00
Cyrus Leung
9715f7bb0f [Bugfix] Fix incorrect original shape in hashing (#23672)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-08-26 19:01:25 +00:00
Russell Bryant
98aa16ff41 [v1] Add cross-attention KV cache support for encoder-decoder models (#23664)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-08-26 18:49:06 +00:00
Thomas Parnell
227e231b55 [Docs] [V1] [Hybrid] Update docs to remove FlashInfer constraint for hybrid models (#23665)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-26 18:33:16 +00:00
Hyogeun Oh (오효근)
730d0ac8b9 [Docs] Fix warnings in mkdocs build (#23649)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-26 18:19:23 +00:00
Li, Jiang
9b0187003e [Bugfix] Fix cuda event usage with CPU model runner (#23643)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-26 17:10:42 +00:00
vllmellm
44ac25eae2 [CI] [Doc]: Add GH Action for auto labeling issues with rocm tag (#20988)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-26 16:20:13 +00:00
nvjullin
7ea22e42d5 [Misc] Add override for allreduce fusion thresholds (#23639)
Signed-off-by: Julien Lin <jullin@nvidia.com>
2025-08-26 15:53:04 +00:00
Yuekai Zhang
9d4183dd2e [model] support qwen2audio embedding input (#23625)
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-26 23:48:08 +08:00
Yuekai Zhang
513298f1b4 [Bugfix] fix bf16 multimodal model hash (#23623)
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-26 23:47:50 +08:00
Harry Mellor
379f828fba [Docs] Reduce requirements for docs build (#23651)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-26 15:43:28 +00:00
Hongxia Yang
1fdc732419 [ROCm] Starting to add AMD code reviewers for ROCm components (#23496)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
2025-08-26 07:32:37 -07:00
TianyuLi0
f58675bfb3 [CPU] add cpu fused moe pytorch native implementation (#23146)
Signed-off-by: Tianyu Li <tianyu.li@arm.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-08-26 14:09:17 +00:00
Didier Durand
7c04779afa [Doc]: fix various spelling issues in multiple files (#23636)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-26 14:05:29 +00:00
nvjullin
f66673a39d [Kernel] Added flashinfer fp8 per-tensor gemms (#22895)
Signed-off-by: Julien Lin <jullin@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-26 06:54:04 -07:00
En Ouyang
b78bed1bc5 [Hardware][Mac] Fix the installation fail for Apple Silicon (CPU) (#23565)
Signed-off-by: oye93 <en.ouyang93@outlook.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-08-26 13:04:25 +00:00
Harry Mellor
164b2273c8 [Docs] Fix broken links to docs/api/summary.md (#23637)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-26 13:00:18 +00:00
Chen Zhang
2b4fc9bd9b Support FlashAttention Backend for Hybrid SSM Models (#23299)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-26 12:41:52 +00:00
Guillaume Calmettes
ebd5a77bb5 feat: add usage to TranscriptionResponse (text and json response_format) (#23576)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-08-26 05:26:26 -07:00
Matúš Námešný
384dd1b0a8 [Bugfix] Add missing enable_log_outputs parameter to init_app_state function (#23634)
Signed-off-by: Matúš Námešný <matus.namesny@ameria.com>
2025-08-26 12:13:15 +00:00
Jee Jee Li
fdeb3dac13 [Model] fix DeepSeek e_score_correction_bias dtype to fp32 (#23640)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-26 20:09:47 +08:00
Michael Goin
d52358c1e0 [Perf] Remove duplicated NVFP4 blockscales to save memory (#23379)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-26 19:16:33 +08:00
Huy Do
6ace2f72b0 Fix writing benchmark results with tuple keys (#23633)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-08-26 19:16:09 +08:00
Harry Mellor
b00e69f8ca Fix nits from #20059 (#23548)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-26 03:27:20 -07:00
Cyrus Leung
50fede6634 [V1] Enable V1 for compute capability < 8.0 + FP32 (#23614)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-26 03:00:18 -07:00
Roger Wang
b5d34af328 [Bugfix] Fix scheduling when repeated images in one request (#23544)
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
2025-08-26 09:46:28 +00:00
Jee Jee Li
9b5f64238f [Bugfix] Fix Qwen25VL packed_modules_mapping (#23604)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-26 01:09:14 -07:00
Raghavan
ff77764f86 Fix CLI parameter documentation inconsistency in pooling_models.md (#23630) 2025-08-26 01:05:37 -07:00
Harry Mellor
bfc1edc9f5 [Docs] Fix titles for multi-file examples that are rendered in the docs (#23573)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-26 00:16:44 -07:00
Jiangyun Zhu
3ecbb14b81 [Benchmarks] add benchmark for embedding models (#23000)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-25 23:57:08 -07:00
Cyrus Leung
7d67a9d9f9 [mypy] Fix incorrect type hint for EAGLE3 support (#23617)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-25 23:50:17 -07:00
Bin Jia
959783fb99 [fix] fix seed-oss-parser (#23560)
Signed-off-by: jiabin.00 <jiabin.00@bytedance.com>
2025-08-25 23:16:36 -07:00
Cyrus Leung
ce0e9dbd43 [CI/Build] Fix typo in #23561 (#23616)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-25 23:13:03 -07:00
Zijing Liu
b395b3b0a3 [Disagg][Perf] Use CUDA event sync instead of blocking tolist to avoid unintentional copy ops blocking across different CUDA streams, improving disagg TTIT/TTFT (#22760)
Signed-off-by: Zijing Liu <liuzijing2014@gmail.com>
Signed-off-by: Zijing Liu <liuzijing2014@users.noreply.github.com>
2025-08-25 21:06:00 -07:00
Copilot
6fad29b11b Remove graph_pool as member of VllmBackend and argument to CUDAGraphWrapper (#23385)
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-08-25 19:34:15 -07:00
Cyrus Leung
6fd45e7b8a [CI/Build] Use vLLM client's user agent to fetch images (#23561)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-25 19:34:12 -07:00
Wentao Ye
56dcf4e7e9 [Bug] Fix DeepGEMM Env Control (#23591)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-25 18:41:21 -07:00
weiliang
ae067888d6 Update Flashinfer to 0.2.14.post1 (#23537)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-25 18:30:44 -07:00
Michael Goin
906e461ed6 [CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-25 18:29:00 -07:00
Simon Mo
2a97ffc33d [Misc] Add release note draft to PR template (#23598)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-08-25 16:44:51 -07:00
Woosuk Kwon
efc88cf64a [Misc] Simplify FlashInfer attention metadata (#23585)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-08-25 15:42:29 -07:00
Terrence Zhao
7b6a837275 [Docs] Update Documentation of Cohere Command-A Models (#23584)
Signed-off-by: Terrencezzj <terrence@cohere.ai>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com>
2025-08-25 21:53:52 +00:00
Pate Motter
c34c82b7fe [TPU][Bugfix] Fixes prompt_token_ids error in tpu tests. (#23574)
Signed-off-by: Pate Motter <patemotter@google.com>
2025-08-25 14:29:16 -07:00
Chaojun Zhang
8a044754bd [XPU] Delay BF16 check to worker init for spawn compatibility (#22979)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
2025-08-25 13:09:26 -07:00
Zhonghua Deng
9188ae7cb5 [Bugfix][V1][P/D]Fix the issue where repeated requests for the same input produce abnormal outputs for P2pNcclConnector (#23403)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-08-25 12:57:08 -07:00
Xin Yang
8a3cd90af5 [Kernel] Add fused grouped_topk kernel for MoE (#23274)
Signed-off-by: Xin Yang <xyangx@amazon.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-25 11:47:52 -07:00
22quinn
2a167b2eeb [test][RL] Add sleep level 2 test and fix reload with sleep mode (#23521)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-26 00:25:52 +08:00
Woosuk Kwon
0ff902f3b4 [Refactor] Refactor persistent buffers with CpuGpuBuffer (#23515)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-25 08:44:48 -07:00
Isotr0py
a9082a4d14 [Bugfix] Fix Qwen3 MoE GPTQ inference (#23490)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-25 06:40:20 -07:00
Driss Guessous
e0329ed4b4 Updates to Flex + VLLm integration (#21416)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-08-25 09:32:42 -04:00
Cyrus Leung
6879cd80ae [Refactor] Pass tokenizer explicitly instead of binding to prompt update (#23542)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-25 06:31:57 -07:00
Cyrus Leung
e269be2ba2 [Doc] Add caution for API server scale-out (#23550)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-25 06:14:15 -07:00
Ayush Satyam
5c4b6e66fe [Attention] Unify mamba and attention backend selection (#23171)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
2025-08-25 09:09:36 +00:00
youkaichao
d0a4a3f645 [misc] add shanghai meetup (#23535)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-08-25 17:00:03 +08:00
Cyrus Leung
ebafb0936d [Bugfix] Allow dynamic number of patches for llava_onevision (#23525)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-25 08:34:54 +00:00
Breno Baldas Skuk
0cb7b065c3 Feature/benchmark/random mm data/images (#23119)
Signed-off-by: breno.skuk <breno.skuk@hcompany.ai>
2025-08-25 01:28:35 -07:00
ZiTian Zhao
2da02dd0d8 [Fix] DeepSeek V3.1 tool parser error message (#23492)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-25 00:56:39 -07:00
Chenguang Zheng
d765cf01fe [Core][Multimodal] Track encode cache entries by mm_hash and enable embedding sharing between requests (#22711)
Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-08-25 00:41:17 -07:00
Cyrus Leung
712d0f88d8 [Refactor] Dynamic target and content for prompt updates (#23411)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-24 23:39:58 -07:00
Yu Guo
49ab23b3cc [gpt-oss] use reasoning channel for reasoning text in serving_chat (#22920)
Signed-off-by: Yu Guo <yuguo@meta.com>
2025-08-25 06:29:34 +00:00
LIYIFAN_liyifan
c9abb10489 [Bugfix] Fix Dense module loading for sentence-transformers embedding models (simplified V2) (#23408)
Signed-off-by: FFFfff1FFFfff <yifanli0919@gmail.com>
2025-08-25 05:39:24 +00:00
Benji Beck
787cdb3829 Migrate DonutImagePixelInputs to TensorSchema (#23509)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-25 05:02:15 +00:00
Benji Beck
a5203d04df Migrate skyworkr1v inputs to TensorSchema (#23499)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-25 04:43:21 +00:00
Benji Beck
99f8094400 Migrate tarsier inputs to TensorSchema (#23500)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-25 04:42:36 +00:00
Jee Jee Li
170e8ea9ea [Misc] Unified linear print info (#23516)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-24 20:13:51 -07:00
zifeitong
a71e4765cc [Bugfix] Fix Qwen2.5-VL quantized model weights loading (#23512)
Signed-off-by: Zifei Tong <zifeitong@gmail.com>
2025-08-25 10:40:22 +08:00
Noam Gat
39971db3aa Frontend: Adding LM Format Enforcer support to V1 engine (#22564)
Signed-off-by: Noam Gat <noamgat@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-24 19:31:22 -07:00
Ming Yang
504d914314 [Perf] Add Triton config for DeepSeek V3 FP8 EP32 H200 (#23504)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-08-24 18:06:35 -07:00
Didier Durand
47455c424f [Doc: ]fix various typos in multiple files (#23487)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-25 00:04:04 +00:00
Lucia Fang
c7fc6b1354 fix incompatibililty with non cuda platform for nvfp4 (#23478)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-08-24 15:35:41 -07:00
Woosuk Kwon
ad78868450 [Misc] Remove unused slot_mapping buffer (#23502)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-24 14:03:36 -07:00
Cyrus Leung
e2db1164a1 [Model] Enable BLOOM on V1 (#23488)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-24 13:30:47 +00:00
汪志鹏
416f05929a [New Model]Donut model (#23229)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-08-24 12:52:24 +00:00
TeeKen Lau
5e021b4981 (Misc): add missing test for zero truncation size. (#23457)
Signed-off-by: teekenl <teekenlau@gmail.com>
2025-08-24 18:12:47 +08:00
rongfu.leng
1b9b16649c [Misc] update dict parse to EPLBConfig from json dumps to dict unpacking (#23305)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-24 08:06:34 +00:00
czhu-cohere
e76e233540 [kernel] Support W4A8 on Hopper (#23198)
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
2025-08-24 06:18:04 +00:00
Benji Beck
a75277285b Migrate Paligemma inputs to TensorSchema (#23470)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-24 04:56:56 +00:00
22quinn
9dc30b7068 [Bugfix] Add strong reference to CUDA pluggable allocator callbacks (#23477)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Eric Marcus <eric.marcus@kaiko.ai>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-08-24 12:56:17 +08:00
Benji Beck
053278a5dc Migrate Pixtral inputs to TensorSchema (#23472)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-24 04:55:53 +00:00
Jiangyun Zhu
c55c028998 [gpt-oss] Streaming Output for Python Tool (#23409)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-24 04:42:38 +00:00
Jee Jee Li
65197a5fb3 [Misc] Modify CacheConfig import (#23459)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-23 06:05:27 +00:00
Xu Wenqing
b8f17f5d98 Support DeepSeek-V3.1 tool call (#23454)
Signed-off-by: Xu Wenqing <xuwq1993@qq.com>
2025-08-23 05:50:16 +00:00
Aziz
d9a55204ba fix(tests): Correct unreachable assertion in truncation test (#23425)
Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
2025-08-23 05:23:54 +00:00
Cyrus Leung
b4e9fd811f Revert "[PERF] Use faster way of decode in tokenizer: avoid useless list-to-list conversion (#20000)" (#23396)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-23 04:16:48 +00:00
Chenxi Yang
308fa287a8 Add glm4.5v tp2,4 fp8 config on H100_80GB (#23443)
Co-authored-by: Chenxi Yang <cxyang@meta.com>
2025-08-23 02:54:19 +00:00
Daifeng Li
fa78de9dc3 Quantization: support FP4 quantized models on AMD CDNA2/CDNA3 GPUs (#22527)
Signed-off-by: feng <fengli1702@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-22 20:53:21 -06:00
Michael Goin
f6818a92cb [UX] Move Dockerfile DeepGEMM install to tools/install_deepgemm.sh (#23360)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-22 20:52:50 -06:00
WeiQing Chen
23c939fd30 [Model] Support DP for ViT on MiniCPM-V-4 (#23327)
Signed-off-by: ycyaw66 <497410282@qq.com>
Co-authored-by: ycyaw66 <497410282@qq.com>
2025-08-23 02:14:41 +00:00
Nick Hill
add1adfec7 [BugFix] Fix MinPLogitsProcessor.update_states() (#23401)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-23 08:22:11 +08:00
Nick Hill
c80c53a30f [BugFix] Fix batch updates for pooling models (#23398)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-23 08:20:41 +08:00
elvischenv
24d0c9e6ed [NVIDIA][torch.compile] Support Flashinfer TRTLLM FP8-q/kv NVFP4-out Attention Kernel (#22703)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-08-22 22:09:05 +00:00
rasmith
cc7ae5e7ca [BugFix][AMD][Quantization] Fix torch.compile issue where wvSplitKQ not being called when it should when using quantized FP8 model (#22281)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-08-22 21:47:57 +00:00
Ilya Markov
0313cf854d [PERF] PyTorch Symmetric Memory All-Reduce (#20759)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-22 15:39:08 -06:00
Zhewen Li
0483fabc74 [CI/Build] add EP dependencies to docker (#21976)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-08-22 13:34:40 -07:00
Shiyan Deng
da65bec309 add an env var for path to pre-downloaded flashinfer cubin files (#22675) 2025-08-22 19:25:45 +00:00
Isotr0py
4645024d3a [Quantization] Allow GGUF quantization to skip unquantized layer (#23188)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-22 13:04:22 -06:00
Isotr0py
cd7a3df26f [Bugfix] Fix broken Florence-2 model (#23426)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-08-22 17:50:52 +00:00
Isotr0py
32d2b4064f [Model] Add Ovis2.5 PP support (#23405)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-22 17:46:34 +00:00
Didier Durand
22cf679aad [Doc]: fix various typos in multiple files (#23179)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-08-22 10:38:46 -07:00
Yong Hoon Shin
b6d7d34fc6 Add unit tests for batched guided and non-guided requests (#23389)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-22 10:31:24 -07:00
Aziz
341923b982 fix(tests): Ensure reliable CUDA cache clearing in MoE test (#23416)
Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-22 17:20:59 +00:00
bppps
424fb7a5d2 [BugFix] Fix the issue where image embeddings were incorrectly split.… (#23366)
Signed-off-by: bppps <bpppsaka@gmail.com>
Co-authored-by: zouyu.zzx <zouyu.zzx@alibaba-inc.com>
Co-authored-by: bppps <bpppsaka@gmail.com>
2025-08-22 16:56:46 +00:00
PapaGoose
88491c1b6b [Speculators][Speculative Decoding] Fix Qwen 2 Eagle3 Support (#23337) 2025-08-22 16:39:19 +00:00
Martin Hickey
613a23b57f [Bugfix]: Installing dev environment due to pydantic incompatible version (#23353)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
2025-08-22 16:22:29 +00:00
Burkhard Ringlein
51a215300b [Fix] Bump triton version in rocm-build requirements (#21630)
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
2025-08-22 15:13:39 +00:00
Naman Lalit
ebe14621e3 [Bug fix] Dynamically setting the backend variable for genai_perf_tests in the run-nightly-benchmark script (#23375)
Signed-off-by: Naman Lalit <nl2688@nyu.edu>
2025-08-22 15:12:28 +00:00
Ning Xie
325aa3dee9 [Misc] local import code clean (#23420)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-22 14:01:35 +00:00
Chen Zhang
a073be6d87 [Doc] Update the doc for log probs + prefix caching (#23399)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-22 13:20:39 +00:00
杨朱 · Kiki
695e7adcd2 [misc] Remove outdate comment about runai_model_streamer (#23421)
Signed-off-by: carlory <baofa.fan@daocloud.io>
2025-08-22 13:08:53 +00:00
Russell Bryant
281710ef9a [Attention] Allow V1 flash_attn to support cross-attention (#23297)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-08-22 12:10:16 +00:00
Woosuk Kwon
808d2e9aa0 [Misc] Move M-RoPE init logic to _init_mrope_positions (#23422)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-22 03:07:22 -07:00
Jee Jee Li
285178b3b8 [V0 Deprecation] Remove V0 LoRA test (#23418)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-22 09:56:51 +00:00
Li, Jiang
88016c372a [Bugfix] Fix pooling models on CPU backend (#23392)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-22 09:47:17 +00:00
Benji Beck
998720859c Migrate MiniCPMOAudioInputs to TensorSchema (#21847)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-22 16:43:29 +08:00
Guillaume Calmettes
0ba1b54ac6 [gpt-oss] add input/output usage in responses api when harmony context is leveraged (#22667)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-08-22 08:32:24 +00:00
Flora Feng
53415653ff [P/D][Nixl] Make kv cache register compatible with hybrid memory allocator (#23079)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-08-21 22:30:48 -07:00
Chen Zhang
17373dcd93 [Attention] Refactor AttentionMetadata Preparation for Encoder-only Models (#23154)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-22 05:05:59 +00:00
Bin Jia
5964069367 [New Model] Add Seed-Oss model (#23241)
Signed-off-by: jiabin.00 <jiabin.00@bytedance.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-22 04:58:10 +00:00
Philip Chung
de9c085e17 [Misc] Add gemma3 chat template with pythonic-style function calling (#17149)
Signed-off-by: Philip Chung <philip.f.chung@gmail.com>
2025-08-21 21:06:50 -07:00
Arjun Reddy
111692bb8c [CI] Add end-to-end V1 min_tokens test coverage (#22495)
Signed-off-by: Arjun Reddy <189282188+arjunbreddy22@users.noreply.github.com>
Co-authored-by: Arjun Reddy <189282188+arjunbreddy22@users.noreply.github.com>
2025-08-21 22:04:07 -06:00
Wentao Ye
394591e343 [Feature] Enable DeepGEMM Linear on B200; 1.5% E2E throughput improvement (#23351)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-21 21:01:08 -07:00
Isotr0py
3ac849665d [CI/Build] Skip Idefics3 and SmolVLM generation test again (#23356)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-22 03:39:46 +00:00
Benji Beck
0b9cc56fac Migrate MllamaImagePixelInputs to TensorSchema (#22020)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-22 11:28:49 +08:00
Cyrus Leung
8896eb72eb [Deprecation] Remove prompt_token_ids arg fallback in LLM.generate and LLM.embed (#18800)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-22 10:56:57 +08:00
Matthew Bonanni
19fe1a0510 [Kernel] Add FP8 support with FlashMLA backend (#22668)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
2025-08-22 02:26:32 +00:00
22quinn
480bdf5a7b [Core] Support custom executor qualname (#23314)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-22 09:40:54 +08:00
Kebe
5368f76855 [Feature][Responses API] Support logprobs(non-stream) (#23319)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-08-21 23:09:16 +00:00
tvalentyn
8ef6b8a38c Always use cache mounts when installing vllm to avoid populating pip cache in the image. Also remove apt cache. (#23270)
Signed-off-by: Valentyn Tymofieiev <valentyn@google.com>
2025-08-21 18:01:03 -04:00
Michael Goin
3bbe11cc13 [Perf] Small optimizations for silu_mul_fp8_quant_deep_gemm (#23265)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-21 17:56:15 -04:00
Simon Mo
c5041f899f [CI] improve pr comments bot (#23380) 2025-08-21 14:49:03 -07:00
Simon Mo
8b5fe6eb51 [CI] Clean up actions: remove helm, publish workflows and improve pr … (#23377) 2025-08-21 14:29:04 -07:00
Woosuk Kwon
800349c2a5 [Structured Outputs] Refactor bitmask construction into get_grammar_bitmask (#23361)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-21 20:53:33 +00:00
Elvir Crnčević
044931f97b Make sure that vectorize_with_alignment produced vectorized global loads (#23182) 2025-08-21 20:06:54 +00:00
Pavani Majety
1d353b6352 [Core] Always use tensor cores for Flashinfer Decode Wrapper (#23214)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-08-21 16:02:11 -04:00
Ning Xie
3496274663 [Misc] Convert VLLM_TORCH_PROFILER_DIR path to absolute (#23191)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-21 15:49:09 -04:00
Chen Zhang
8a19303173 [BugFix][gpt-oss] Fix Chat Completion with Multiple Output Message (#23318)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-21 10:31:11 -07:00
Nick Hill
603fbbbce0 [Misc] Misc code cleanup/simplification (#23304)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-21 17:22:55 +00:00
Ming Yang
10f535c086 [Bugfix] Fix port conflict by obtaining a list of open ports upfront (#21894)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-08-21 10:22:18 -07:00
Wentao Ye
48bfb0c9b7 [Bug] Fix R1 Accuracy 0 Bug (#23294)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-21 13:11:28 -04:00
Lain
f8ce022948 add tg-mxfp4-moe-test (#22540)
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-21 17:05:47 +00:00
Yi Liu
0278f1ac3a Fix nvfp4 swizzling (#23140)
Signed-off-by: yiliu30 <yi4.liu@intel.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-21 16:54:50 +00:00
Benji Beck
a482e4e769 Migrate MolmoImageInputs to TensorSchema (#22022)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-21 16:54:08 +00:00
youkaichao
e0b056e443 [ci/build] Fix abi tag for aarch64 (#23329)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-08-21 23:32:55 +08:00
Roger Wang
79f05e4436 [Multimodal] Always enable hashing mm data (#23308)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-21 07:23:28 -07:00
jerryzhuang
f8daddcc4c [Bugfix] set system_message in phi4mini chat template (#23309)
Signed-off-by: zhuangqh <zhuangqhc@gmail.com>
2025-08-21 14:22:39 +00:00
Robert Shaw
c8e33c72c6 [V1] Remove unnecessary check for main thread (#23298)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-08-21 14:08:35 +00:00
wang.yuqi
d70a16625d [Performance] V1 Pooling Models E2E Performance Optimization (#23162)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-21 13:26:09 +00:00
Cyrus Leung
5cc54f7c5b [Doc] Fix batch-level DP example (#23325)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-08-21 06:16:38 -07:00
Cyrus Leung
0c6e40bbaa [Refactor] Simplify code for MM budget (#23310)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-21 08:00:16 +00:00
Paul Pak
2e2000f352 [Model] Add LFM2 architecture (#22845)
Signed-off-by: Paul Pak <paulpak58@gmail.com>
2025-08-21 09:35:07 +02:00
Jared O'Connell
31282401b6 [BugFix] Fix Python 3.9 Support (#23306)
Signed-off-by: Jared O'Connell <46976761+jaredoconnell@users.noreply.github.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-20 23:23:56 -07:00
Cyrus Leung
0c31e28e95 [Bugfix] Fix extra whitespace in strings caused by newline (#23272)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-20 22:03:00 -07:00
22quinn
f571ff8eb6 [Sampler] Support returning final logprobs (#22387)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-20 21:28:32 -07:00
Michael Goin
f64ee61d9e [CI] Block the cu126 wheel build while broken (#23285)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-21 04:21:05 +00:00
QiliangCui
8993073dc1 [CI] Delete images older than 24h. (#23291)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-08-20 21:15:20 -07:00
杨奇(yann qi)
655a09f653 [Model][VLM] Support R-4B Model (#23246)
Signed-off-by: yannqi <yannqi@qq.com>
Signed-off-by: 杨奇(yann qi) <51905299+yannqi@users.noreply.github.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: yannqiyang <yannqiyang@tencent.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-21 04:08:52 +00:00
Wentao Ye
f94bf9b924 [Compile] Fix Compile Warning SM100 Cutlass MLA (#23287)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-21 03:09:39 +00:00
Asaf Joseph Gardin
3663870c72 [V1][Mamba1] - Full CUDA and Piecewise CUDA Graphs Support (#23035)
Signed-off-by: asafg <asafg@ai21.com>
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: asafg <asafg@ai21.com>
2025-08-20 20:08:51 -07:00
Cyrus Leung
2461d9e562 [CI/Build] Split out mm processor tests (#23260)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-20 20:05:20 -07:00
Li, Jiang
7be5d113d8 [CPU] Refactor CPU W8A8 scaled_mm (#23071)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-21 09:34:24 +08:00
Woosuk Kwon
b029de9902 [Optimization] Make new_block_ids None if empty (#23262)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-08-20 18:25:56 -07:00
Michael Goin
bbea1cefdd [CI Bugfix] Fix CI by fully removing --enable-prompt-adapter (#23284)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-20 17:18:12 -07:00
Russell Bryant
f5aa307d77 Remove duplicate entry in vllm.attention.__all__ (#23296)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-08-20 17:14:59 -07:00
22quinn
4b795020ed [EP] Add logging for experts map (#22685)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-08-20 23:46:06 +00:00
shixianc
c86af22f31 [Fix] remove is_marlin param in benchmark_moe (#23286) 2025-08-20 22:04:21 +00:00
Matthew Bonanni
10cc12ba66 Feature/mla tests (#23195)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-08-20 21:46:47 +00:00
Matthew Bonanni
a4fbb32fab Remove chunked_prefill_enabled flag in V1 MLA (#23183)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
2025-08-20 21:43:17 +00:00
youkaichao
1b125004be [misc] fix multiple arch wheels for the nightly index (#23110)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-08-20 14:15:34 -07:00
rongfu.leng
4fbda0b20c [Feature] use --eplb_config to set eplb param (#20562)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: rongfu.leng <lenronfu@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-20 14:07:28 -07:00
Russell Bryant
4e51fa8cba Do not use eval() to convert unknown types (#23266)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-08-20 13:28:30 -07:00
Saurabh Misra
bf7c99dfc4 [Perf] Speed up function _convert_tokens_to_string_with_added_encoders by 13.7x (#20413)
Signed-off-by: Saurabh Misra <misra.saurabh1@gmail.com>
Signed-off-by: Aseem Saxena <aseem.bits@gmail.com>
Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
Co-authored-by: Aseem Saxena <aseem.bits@gmail.com>
2025-08-20 13:17:11 -07:00
Chen Zhang
b95697d731 [Frontend] improve error logging of chat completion (#22957)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-20 13:03:37 -07:00
bigmoyan
582bbe6bd7 [Fix] correct tool_id for kimi-k2 when use tool_choice=required (#21259)
Co-authored-by: wangzhengtao <wangzhengtao@msh.team>
2025-08-20 12:59:54 -07:00
Michael Goin
0cdbf5e61c [Kernel/Quant] Remove the original marlin format and qqq (#23204)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-20 15:13:36 -04:00
dongluw
ebe56a0064 Small fix for Command-A-Vision (#23268)
Signed-off-by: donglu <donglu@cohere.com>
2025-08-20 18:15:18 +00:00
Russell Bryant
f77a0802b7 Limit HTTP header count and size (#23267)
Signed-off-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
2025-08-20 17:57:37 +00:00
Benji Beck
c4477f55e5 Migrate Mistral3ImagePixelInputs to TensorSchema (#21945)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-20 17:37:29 +00:00
Yong Hoon Shin
dfd2382039 [torch.compile] Support conditional torch.compile per module (#22269)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-20 16:52:59 +00:00
JartX
3b11b26b50 [FIXBUG ] Allow disabling rocm_aiter_fa backend for ROCm GPUs not compatible with AITER (#22795)
Signed-off-by: JartX <sagformas@epdcenter.es>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-20 09:08:29 -07:00
Woosuk Kwon
d6d13bd49e [Misc] Add max_seq_len to CommonAttentionMetadata (#23216)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-20 09:05:29 -07:00
Cyrus Leung
5efd6905bc [CLI][Doc] Formalize --mm-encoder-tp-mode (#23190)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-20 23:42:28 +08:00
shixianc
b17109beea [Kernel] CUTLASS MoE FP8: Integrate cuda moe permute/unpermute (#23045)
Signed-off-by: Shixian Cui <shixian@amazon.com>
2025-08-20 10:35:26 -04:00
Cyrus Leung
4449235843 [Bugfix] Ensure correctness of HCXVision processing (#23254)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-20 14:19:30 +00:00
rongfu.leng
38217877aa [Fix] fix offline env use local mode path (#22526)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-20 13:34:49 +00:00
Jee Jee Li
c6d80a7a96 [Model] Improve olmo and olmo2 (#23228)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-20 12:47:05 +00:00
xyxinyang
7cd17e22d7 [Model][V1] Support Ernie MTP (#22169)
Signed-off-by: zhouchong <zhouchong03@baidu.com>
Co-authored-by: zhouchong <zhouchong03@baidu.com>
2025-08-20 20:41:55 +08:00
Michael Goin
50df09fe13 Update to flashinfer-python==0.2.12 and disable AOT compile for non-release image (#23129)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-20 08:05:54 -04:00
Cyrus Leung
68fcd3fa73 [Bugfix] Ensure correctness of Cohere2Vision processing (#23245)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-20 11:09:18 +00:00
Xin Yang
83e69a09d6 [Model] Support deepseek with eagle (#21086)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2025-08-20 19:01:31 +08:00
Shiming Zhang
3aa8c10038 Fix missing quotes (#23242)
Signed-off-by: Shiming Zhang <wzshiming@hotmail.com>
2025-08-20 10:46:59 +00:00
Calvin Chen
103f1ec8d3 [Model] use autoWeightsLoader for gptoss (#22446)
Signed-off-by: calvin chen <wen.chen@dynamia.ai>
2025-08-20 10:16:27 +00:00
who who who
d983769c41 fix cuda graph (#22721)
Signed-off-by: fsx950223 <fsx950223@outlook.com>
2025-08-20 06:24:37 +00:00
Nick Hill
8fd920924c [BugFix] Fix stuck stats/metrics after requests are aborted (#22995)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-20 13:50:29 +08:00
Cyrus Leung
de7b67a023 [CI/Build] Sync multimodal tests (#23181)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-20 05:06:42 +00:00
Zhewen Li
f729023272 [CI/Build] Also check DP in benchmarks throughput script (#23038)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-08-20 04:09:27 +00:00
길재은
1a3079a15e chore: support pytorch format in lora (#22790)
Signed-off-by: jaeeun.kil <rha3122@naver.com>
Signed-off-by: 길재은 <rha3122@naver.com>
2025-08-20 04:02:50 +00:00
Louie Tsai
941f56858a Fix a performance comparison issue in Benchmark Suite (#23047)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-08-20 03:14:32 +00:00
Zebing Lin
a634733f67 [Attention] Optimize make_local_attention_virtual_batches for Flash Attention (#23185)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-08-20 02:57:47 +00:00
Cyrus Leung
64ab3c7253 [Doc] Update V1 status of various pooling models (#23189)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-20 10:33:41 +08:00
Chenheli Hua
e58c5a9768 [Core] Add torch profiler CPU traces for AsyncLLM. (#21794)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-08-20 02:32:47 +00:00
Michael Goin
d46d417b58 [CI Perf] Only test bfloat16 for tests/compile/test_fusion_all_reduce.py (#23132)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-19 20:18:52 -06:00
633WHU
0167efe20d [Core] Optimize scheduler request removal for single completions (#21917)
Signed-off-by: chiliu <chiliu@paypal.com>
Signed-off-by: chiliu <cliu_whu@yeah.net>
Co-authored-by: chiliu <chiliu@paypal.com>
2025-08-19 18:25:59 -07:00
Kyle Sayers
c32e6ad1f6 [Quantization] Bump Compressed Tensors Version (#23202)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-20 00:39:28 +00:00
Chenheli Hua
1630cc8d0f [Benchmarks] Add video inputs to ShareGPTDataset. (#23199)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-08-19 23:42:31 +00:00
Lucas Wilkinson
14e2b0730b [BugFix] fix CUTLASS MLA full cudagraph (#23200)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-19 22:17:08 +00:00
Michael Goin
0f4f0191d8 [CI/Build] Replace lm-eval gsm8k tests with faster implementation (#23002)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-19 15:07:30 -07:00
amirkl94
a38b8af4c3 [NVIDIA] Add SM100 Flashinfer Cutlass MoE fp8 backend (#22357)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
2025-08-19 18:01:53 -04:00
Michael Goin
21dce80ea9 [CI/Build] Add support for Python 3.13 (#13164)
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-19 13:49:34 -07:00
Woosuk Kwon
e61bac87ee [Misc] Minor refactoring for FlashInfer backend (#23147)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-19 13:11:51 -07:00
Marko Rosenmueller
80141bbf2f fix: use cache_salt for gpt-oss (#23186)
Signed-off-by: Marko Rosenmueller <5467316+dr75@users.noreply.github.com>
2025-08-19 18:12:25 +00:00
bnellnm
b94faf9d50 [Bugfix] Fix accuracy issue when using flashinfer cutlass moe, TP=1 and modelopt. (#23125)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-19 14:00:51 -04:00
Woosuk Kwon
5b5f350d67 [Misc] Enable yapf for FlashInfer backend (#23193)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-19 10:33:47 -07:00
22quinn
f7cf5b512e [Frontend] Add /collective_rpc API endpoint (#23075)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-19 17:29:32 +00:00
Ruixiang Tan
03d4235fd2 [Misc] Fix the benchmark's README and improve the error messages for the benchmark's argument checks (#22654)
Signed-off-by: tanruixiang <tanruixiang0104@gmail.com>
2025-08-19 10:18:51 -07:00
Isotr0py
d6a1a20973 [CI/Build] Update transformers to v4.55.2 (#23093)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-19 10:06:17 -07:00
Benji Beck
a70d0bd0a3 Migrate LlavaOnevisionMultiInputs to TensorSchema (#21844)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-19 17:02:02 +00:00
Yuge Zhang
24f4d1a224 Add return_token_ids parameter to OpenAI API endpoints (#22587)
Signed-off-by: Yuge Zhang <scottyugochang@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-08-19 09:48:31 -07:00
yiz-liu
4f510bc2a1 [Model] Removes redundant all-reduce operation in Qwen3MoeSparseMoeBlock (#23169)
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-19 16:18:41 +00:00
TJian
1298c67795 [FEAT] [Performance] Enable DP for ViT in Qwen2.5VL (#22742)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-19 15:25:57 +00:00
Jee Jee Li
4d9c61993a [Bugfix] Fix benchmark_moe.py (#23177)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-19 13:39:40 +00:00
myselvess
b87cb97a53 [Model] support new model ovis2.5 (#23084)
Signed-off-by: myselvess <244285088@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-19 13:12:59 +00:00
wang.yuqi
f856c33ce9 [Model] Add multi_label_classification support (#23173)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-19 12:54:30 +00:00
elvischenv
03752dba8f [NVIDIA] Support Flashinfer TRTLLM FP8-q/kv/out Attention Kernel (#21716)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-08-19 08:22:15 -04:00
Woosuk Kwon
40f26734b9 [Misc] Fix seq_lens for graph capture (#23175)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-19 03:58:16 -07:00
Tialo
2c3f557f08 [Doc] use power of 2 (#23172) 2025-08-19 03:16:23 -07:00
Woosuk Kwon
21bcc8263f [Misc] Avoid accessing req_ids inside a loop (#23159)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-19 09:39:38 +00:00
qizixi
5bfe0dea7a [bug fix] Fix llama4 spec decoding (#22691)
Signed-off-by: qizixi <qizixi@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2025-08-19 08:53:24 +00:00
Isotr0py
31fd3265c8 [Bugfix] Fix broken Minimax-01-VL model (#22116)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-19 08:49:29 +00:00
hustxiayang
31436e8b4f [Misc] Add request_id into benchmark_serve.py (#23065)
Signed-off-by: yangxia <yangxiast@gmail.com>
2025-08-19 08:32:18 +00:00
qizixi
4efd43e9b4 Fix GLM-4.5V-FP8 numerical issue (#22949)
Signed-off-by: qizixi <qizixi@meta.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-19 07:56:31 +00:00
Daniel Serebrenik
3c8a787247 [Benchmark] Add flag --served-model-name to benchmark_serving_multi_turn (#22889)
Signed-off-by: daniels <daniels@pliops.com>
2025-08-19 07:48:07 +00:00
Grace Ho
01a08739e0 [misc] split engine_model into json file for nsys profile tool (#23117)
Signed-off-by: Grace Ho <grho@nvidia.com>
Signed-off-by: Grace Ho <146482179+gracehonv@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-19 15:44:53 +08:00
Jiangyun Zhu
fda9537c5e [Model] Support Pipeline Parallelism for moonshotai/Kimi-VL-A3B-Thinking-2506 (#23114)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-19 14:24:31 +08:00
Wentao Ye
90bbe0a5ad [Log] Warning Once for Cutlass MLA (#23137)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-18 23:24:16 -07:00
Benji Beck
e75f342261 Migrate InternVLImagePixelInputs (in nemotron_vl.py) to TensorSchema (#22023)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-19 13:48:26 +08:00
Nikhil Suryawanshi
78dba404ad [Hardware][IBM Z]Enable v1 for s390x and s390x dockerfile fixes (#22725)
Signed-off-by: Nikhil Suryawanshi <suryawanshin74@gmail.com>
2025-08-19 04:40:37 +00:00
Chengji Yao
e9d6a3db69 [TPU] make ptxla not imported when using tpu_commons (#23081)
Signed-off-by: Chengji Yao <chengjiyao@gmail.com>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Chengji Yao <chengjiyao@gmail.com>
2025-08-19 11:46:42 +08:00
Xiao
a4454e9401 chore: disable enable_cpp_symbolic_shape_guards (#23048)
Signed-off-by: Xiao Liu <xiszishu@gmail.com>
2025-08-18 23:08:05 -04:00
Woosuk Kwon
14006840ea [V0 Deprecation] Remove V0 FlashInfer attention backend (#22776)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-18 19:54:16 -07:00
Robert Shaw
6603288736 [CI][V0 Deprecation] Removed V0 Only Chunked Prefill and Prefix Caching Tests (#22871)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-18 17:39:01 -07:00
Thomas Parnell
95e3095136 [Misc] Add @tdoublep as a maintainer of hybrid model and Triton-attention related code (#23122)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-19 08:31:38 +08:00
Woosuk Kwon
c9b38be8aa [Spec Decode] Make propose_draft_token_ids non-blocking for lower TTFT (#23041)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-18 17:20:38 -07:00
Woosuk Kwon
0dd3f4f5ab [Misc] Minor refactoring for prepare_inputs (#23116)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-18 16:58:05 -07:00
Xiang Xu
498259ccce Install tpu_info==0.4.0 to fix core dump for TPU (#23135) 2025-08-18 16:23:33 -07:00
Michael Goin
6d25e3fd6e Use Blackwell FlashInfer MXFP4 MoE by default if available (#23008)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-18 15:25:49 -07:00
Breno Baldas Skuk
ac6eb49de3 fix: OpenAI SDK compat (ResponseTextConfig) (#23126)
Signed-off-by: breno.skuk <breno.skuk@hcompany.ai>
Signed-off-by: Breno Baldas Skuk <breno.skuk@hcompany.ai>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-18 15:22:59 -07:00
Michael Goin
bf756321c7 [CI Bugfix] Pin openai<1.100 to unblock CI (#23118)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-18 12:14:01 -07:00
Raushan Turganbay
0e3bb543f0 [Bugfix] Support compile for Transformers multimodal (#23095)
Signed-off-by: raushan <raushan@huggingface.co>
2025-08-18 13:35:48 +00:00
杨朱 · Kiki
569aefd134 chore: remove unnecessary patch_padding_side for the chatglm model (#23090)
Signed-off-by: carlory <baofa.fan@daocloud.io>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-08-18 12:32:13 +00:00
Cyrus Leung
d3f71f1224 [Refactor] Get prompt updates earlier (#23097)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-18 12:31:53 +00:00
Ning Xie
5a30bd10d8 [Bugfix] fix IntermediateTensors equal method (#23027)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-18 02:58:11 -07:00
Cyrus Leung
27e8d1ea3e [Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-18 09:52:00 +00:00
Kunshang Ji
5c79b0d648 [XPU][CI]add xpu env vars in CI scripts (#22946)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-18 09:47:03 +00:00
Kunshang Ji
5f5664b3e4 [XPU] Fix compile size for xpu (#23069)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-18 00:04:08 -07:00
Roger Wang
89657a557c [Misc] Fix backward compatibility from #23030 (#23070)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-17 23:33:29 -07:00
Ning Xie
08d5f7113a [Misc] refactor function name (#23029)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-17 22:16:21 -07:00
Andy Lo
b2fd0b81e0 [Bugfix][CI] Machete kernels: deterministic ordering for more cache hits (#23055)
Signed-off-by: Andy Lo <andy@mistral.ai>
2025-08-17 22:10:26 -07:00
double7
9f1c642254 [Bugfix] fix Qwen2.5-Omni processor output mapping (#23058)
Signed-off-by: double7 <33449816+DoubleVII@users.noreply.github.com>
Co-authored-by: 杨森 <yangsen.double7@bytedance.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-17 22:09:11 -07:00
Ning Xie
7be3a59d8e [Misc] enhance static type hint (#23059)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-17 22:09:08 -07:00
Woosuk Kwon
8ea0c2753a [Misc] Minor code cleanup for _get_prompt_logprobs_dict (#23064)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-17 18:16:03 -07:00
Simon Mo
0fc8fa751a fix: gptq marlin weight loading failure (#23066)
Some checks failed
Create Release / Create Release (push) Has been cancelled
2025-08-17 15:56:07 -07:00
Calvin Chen
21e39436c8 [XPU] fix xpu to set cudagraph batch sizes (#23044)
Signed-off-by: calvin chen <wen.chen@dynamia.ai>
2025-08-17 21:45:42 +00:00
Woosuk Kwon
6d243efeda [Misc] Convert use_structured_output property into constant (#23060)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-17 12:41:38 -07:00
Woosuk Kwon
c55bc1db26 [Misc] Remove dead return (#23061)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-17 10:36:46 -07:00
Lucas Wilkinson
292084e72a [BugFix] Fix for IMA in FA3 varlen combine (#22967)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-17 08:52:04 -07:00
Kevinzz
16bff144be [Misc] fix typo in the multimodal doc (#23051) 2025-08-17 01:56:20 -07:00
947132885
fe0411fc6f [Bugfix] should use stack instead of concat (#22972)
Signed-off-by: 947132885 <947132885@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-17 08:46:36 +00:00
Jee Jee Li
4d4061b6e7 [Kernel] Add cuda kernel for gpt_oss activation (#22951)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-17 05:03:24 +00:00
Ning Xie
87f48623a5 [Misc] method name typo fix (#23042)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-16 21:49:14 -07:00
Cyrus Leung
5c32143b9d [Refactor] Defer tensor data construction in MultiModalKwargs (#23030)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-16 21:05:50 -07:00
Michael Goin
94096a47c9 [UX] Separate marlin moe config logic from triton moe (#23006) 2025-08-16 22:16:42 -04:00
Jinzhen Lin
a258ad8bcc [Bugfix] fix qwen3 moe fp8 accuracy issue (#23031)
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
2025-08-16 17:41:23 -07:00
afeldman-nm
bf7f470b22 [V1] Logits processors extensibility (#19912)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Signed-off-by: Andrew Feldman <afeld2012@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Andrew Feldman <afeld2012@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-16 12:59:17 -07:00
Michael Goin
4fc722eca4 [Kernel/Quant] Remove AQLM (#22943)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-16 19:38:21 +00:00
Michael Goin
3253ae765e [Flaky CI] Increase timeout tolerance for test_mp_crash_detection+test_default_mm_lora_chat_completions (#23028)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-16 18:33:08 +00:00
Michael Goin
000cceca8c [Bugfix gpt-oss] Fix float32 convert for flashinfer sink support (#23016)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-16 11:16:00 -07:00
Woonggi Min
68373d3126 [Frontend] Added support for HermesToolParser for models without special tokens (#16890)
Signed-off-by: minpeter <kali2005611@gmail.com>
2025-08-16 17:38:42 +00:00
Maximilien de Bayser
52ce1420e9 Fix handling of max_num_batched_tokens for pooling tasks (#23004)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-16 17:36:30 +00:00
汪志鹏
829bbd7882 [New Model]mBART model (#22883)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-08-16 12:16:58 +00:00
Cyrus Leung
4dff91c93d [Refactor] Allow optional MultiModalKwargsItem in IPC (#23022)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-16 11:30:49 +00:00
Seiji Eicher
de9cb61763 Add docs for PrefixRepetitionDataset + enable usage with vllm bench throughput (#23012)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-16 10:21:20 +00:00
Isotr0py
2dbccce8a6 [CI][Bugfix] Skip Ovis2 generation test because of broken remote code (#22954)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-16 09:44:19 +00:00
Chengji Yao
933f45334a [Core] Make cudagraph check cuda platform only (#23005)
Signed-off-by: Chengji Yao <chengjiyao@gmail.com>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Chengji Yao <chengjiyao@gmail.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-08-16 07:46:00 +00:00
Isotr0py
cc826a202b [Multimodal] Update Tensor schema test to cover arbitrary shape mm inputs (#22867)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-16 00:44:50 -07:00
Jee Jee Li
6d3da472bc [Misc] Add --save-dir option to benchmark_moe (#23020)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-16 07:26:10 +00:00
Andrew Sansom
78863f8c5c [BugFix] Add support for loading prompt embeds tensors serialized on unavailable devices and sparse tensors (#22962)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-08-16 06:25:10 +00:00
Lucas Wilkinson
5157827cfc [Build] Env var to disable sccache (#22968)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-16 05:36:27 +00:00
Kunshang Ji
7caec10e7b [XPU]avoid circular import during XPU init (#23017)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-16 05:16:34 +00:00
Grace Ho
1f83e7d849 [misc] nsys profile output kernel classifier and visualizer (#22971)
Signed-off-by: Grace Ho <grho@nvidia.com>
2025-08-16 02:52:51 +00:00
Calvin Chen
e4e37ded56 [V1] support min_tokens for detokener (#22014)
Signed-off-by: calvin chen <wen.chen@dynamia.ai>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-08-16 02:28:10 +00:00
Nick Hill
f6b5040590 [Frontend] Avoid list copies in serving_chat.py (#22947)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-16 02:06:30 +00:00
Benjamin Chislett
fbd88728b3 [Bugfix] Fix DeepSeek MTP (#22934)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-08-16 01:25:06 +00:00
Nicolò Lucchesi
070da660c1 [Kernel] Simplify get_kv_cache_layout and cache use_trtllm_attention env-dependent bit (#22735)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-16 00:14:08 +00:00
Nick Hill
ad0297d113 [Misc] Support passing multiple request ids at once to AsyncLLM.abort() (#22944)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-15 17:00:36 -07:00
Yichen Yan
236b864e4f [BugFix] Make run_once thread-safe (#22978)
Signed-off-by: <wenji.yyc@alibaba-inc.com>
Signed-off-by: Yichen Yan <wenji.yyc@alibaba-inc.com>
2025-08-15 16:56:17 -07:00
Yong Hoon Shin
3e2f7985a2 Support multiple attention groups for KV sharing (#22672)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-15 16:54:10 -07:00
Or Ozeri
c280066f9d [v1] Move block_hashes from KVCacheManager to Request.block_hashes (#19728)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-08-15 16:52:52 -07:00
Nick Hill
b9dc9d2607 [BugFix] Handle case where async utility call is cancelled (#22996)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Yinghai Lu <yinghai@thinkingmachines.ai>
2025-08-15 17:38:42 -06:00
rishitdholakia13
1fc375dc05 [Structured Outputs] [Bug] Fix misalignment in apply_grammar_bitmask causing unintended masking and NaN logits (#22963)
Signed-off-by: rishitdholakia13 <rishit+github@cohere.com>
2025-08-15 23:25:05 +00:00
Eli Uriegas
76144adf76 ci: Add CUDA + arm64 release builds (#21201)
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
2025-08-15 23:16:23 +00:00
Thomas Parnell
f5d412bafb [BugFix] Fix regression caused by mamba state dtype PR (#22998)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-15 22:55:26 +00:00
Lucas Wilkinson
177e55e3bd [Attention] FA3 Attention Sinks Perf Boost (#22478)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-15 17:41:07 -04:00
eigen
1723ef1aae minor: zero workspace buffer init for flashinfer trtllm-gen attn (#22603) 2025-08-15 21:38:10 +00:00
Seiji Eicher
00d6cba0cf Add PrefixRepetitionRandomDataset to vllm bench serve datasets (#20638)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-08-15 14:09:23 -07:00
shixianc
7f89ed248f [Fix] enable swap_ab for pplx problem size computation (#22991)
Signed-off-by: Shixian Cui <shixian@amazon.com>
Co-authored-by: Shixian Cui <shixian@amazon.com>
2025-08-15 14:02:12 -07:00
Michael Goin
8a87cd27d9 [CI] Speed up Whisper tests by reusing server (#22859)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-15 16:56:31 -04:00
Michael Goin
a344a1a7da Use regex in convert-results-json-to-markdown.py (#22989)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-08-15 20:54:20 +00:00
nvjullin
79899b63f6 [Bugfix] Added more env vars to hash (#22449)
Signed-off-by: Julien Lin <jullin@nvidia.com>
2025-08-15 20:08:37 +00:00
Zebing Lin
6e670778cd [Core] direct indexing on self.block_table_np in compute_slot_mapping (#22940)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-08-15 12:12:12 -07:00
Wentao Ye
df5afa82e5 [Log] Debug Once for Randomizing dummy data for DP Rank (#22860)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-15 11:51:50 -07:00
Chih-Chieh Yang
6cd69f51bf [Model] Granite-4 support loading quantized checkpoint (#22925)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-08-15 18:47:56 +00:00
bnellnm
8ad7285ea2 [Kernels] Clean up FusedMoeMethodBase and modular kernel setup. Remove extra arguments from modular kernel methods. (#22035)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-15 14:46:00 -04:00
Shanshan Shen
48b01fd4d4 [Structured Output] Make the output of structured output example more complete (#22481)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-08-15 18:29:25 +00:00
Chenheli Hua
993d3d122b [Benchmarks] Include image data when ShareGPT4V dataset is used. (#22955)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-08-15 18:23:06 +00:00
JartX
68af77e51c [FIXBUG] Correctly Apply Grammar Bitmask in Mixed Batches (#22896)
Signed-off-by: JartX <sagformas@epdcenter.es>
2025-08-15 17:42:49 +00:00
sstamenk
6b04039a72 [BugFix] Skip the Q component for QKVParallelLinear in the case of QKVCrossParallelLinear since its width is 0 (#22369)
Signed-off-by: sstamenk <sstamenk@amd.com>
2025-08-15 17:17:31 +00:00
Woosuk Kwon
1c859a1387 [V0 Deprecation] Remove advance_step (#22969)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-15 08:22:31 -07:00
fhl2000
74f441f4b5 [Core] Allow full cudagraph with separate attention routines and orthogonal to compilation, add support for FA2 and FlashInfer (#20059)
Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
2025-08-15 10:01:39 -04:00
Csrayz
a0632a3e03 [Frontend] Expose do_log_stats interval to env (#22905)
Signed-off-by: Csrayz <jover@cmbchina.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-15 13:00:20 +00:00
Harry Mellor
e8b40c7fa2 [CI] Remove duplicated docs build from buildkite (#22924)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-15 05:58:06 -07:00
Jee Jee Li
48f4636927 [Misc] Ignore ep_kernels_workspace (#22807)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-15 05:58:03 -07:00
Thomas Parnell
75531a6c13 [V1] [Hybrid] Support using float32 for state in Hybrid Models (Mamba2, Mamba1, Minimax) (#22928)
Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Daniel Afrimi <danielafrimi8@gmail.com>
Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-08-15 12:57:06 +00:00
Staszek Paśko
22341b996e Improve multimodal hasher performance for re-used Image prompts (#22825)
Signed-off-by: Staszek Pasko <staszek@gmail.com>
2025-08-15 12:32:56 +00:00
Roger Wang
49252cf59e [MM] Allow skipping memory profiling for multimodal models. (#22950)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-15 11:41:38 +00:00
Jinzhen Lin
3e6dd40016 [Bugfix] fix cuda 12.6 and 11.8 build (#22952)
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
2025-08-15 10:10:22 +00:00
Sayandip Dutta
aa300c438d [Bugfix] Unquote file uri before reading image (#22912)
Signed-off-by: Sayandip Dutta <sayandip199309@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-15 09:28:00 +00:00
amirai21
fe91ce9591 [V1] - Split Prefill and Decode for Mamba1 models (#22653)
Signed-off-by: amirk <amirk@ai21.com>
Signed-off-by: asafg <asafg@ai21.com>
Co-authored-by: asafg <asafg@ai21.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
2025-08-15 08:59:52 +00:00
wang.yuqi
5406ebf5c9 [CI] Pooling models mteb test uses enforce_eager (#22878)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-15 01:16:15 -07:00
frankie
b2c06509e5 [P/D]Provide bucket algorithm rate limiter for proxy_server (#22643)
Signed-off-by: frankie-ys <yongshengwang@cmbchina.com>
Signed-off-by: frankie <wangyongsheng686@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Kuntai Du <kuntai@uchicago.edu>
2025-08-15 07:01:48 +00:00
TJian
b2f6c247a9 Revert "[ROCm][AITER] Support AITER Rope ops in RotaryEmbedding Module." (#22956)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-08-15 06:39:19 +00:00
Asaf Joseph Gardin
3d232dbd19 [Mamba] - refactor: Renamed mamba_attn to mamba2_attn (#22818)
Signed-off-by: asafg <asafg@ai21.com>
Co-authored-by: asafg <asafg@ai21.com>
2025-08-15 06:38:05 +00:00
Wentao Ye
5c3fbfe46b [Feature] Full Cuda Graph Support for Cutlass MLA and 6% E2E Throughput Improvement (#22763)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-15 06:27:30 +00:00
amirkl94
b4cef5e6c7 refactor: Change scaling factors calculation for flashinfer FusedMoE (#22812)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-15 06:19:31 +00:00
Michael Goin
0fe85087a9 [CI Perf] Prune tests in tests/kernels/attention/ (#22936)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-14 21:34:53 -06:00
Michael Goin
d2b0e97ea6 [CI Perf] Prune tests in tests/kernels/moe/ (#22939)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-14 21:33:42 -06:00
Michael Goin
590bddbfc5 [CI Perf] Prune tests in tests/kernels/quantization/ (#22942)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-14 21:25:34 -06:00
Nick Hill
ae05a6d83d [BugFix] Fix port lookup in internal DP LB tests (#22252)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-15 11:17:11 +08:00
Nick Hill
0933f9d518 [BugFix][KVConn] Fix use of get_required_kvcache_layout (#22734)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-15 01:39:43 +00:00
Simon Mo
f1f0d2fab8 Revert "[Kernel] Add cuda kernel for gpt_oss activation" (#22948) 2025-08-14 17:38:10 -07:00
Jee Jee Li
81f4b96481 [Kernel] Add cuda kernel for gpt_oss activation (#22538)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-14 17:21:29 -07:00
Yongye Zhu
39cd09dc86 [Bugfix] use flash attn on sm90 (#22933)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-14 16:37:22 -07:00
Nick Hill
919234fe17 [BugFix] Fix initial DP request load imbalance (#22910)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-14 15:20:28 -07:00
Nick Hill
ebcce2cd36 [Core] Return final response for aborted requests from AsyncLLM.generate (#22283)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-14 14:49:02 -07:00
Dipika Sikka
4121de512e [Quantization]: Support compressed-tensors mixed-precision model loading (#22468)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2025-08-14 17:32:09 -04:00
nvjullin
279a5f31b3 [Kernel] Add nvfp4 gemm flashinfer backends (#22346)
Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-14 16:03:55 -04:00
Lucas Wilkinson
b8ff05361a [CI] Temporarily disable flaky test (#22930)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-14 19:59:16 +00:00
Nir
637093ae26 docs: update fastsafetensors usage instructions (#22891)
Signed-off-by: Nir Levy <bhr166@gmail.com>
2025-08-14 19:56:54 +00:00
Jinzhen Lin
33c63e9547 [Kernel] [Quantization] Add MXFP4 and bias support for marlin kernel (#22428)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Animesh Jain <anijain@umich.edu>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: yan <yan.ma@intel.com>
Signed-off-by: Yan Ma <yan.ma@intel.com>
Signed-off-by: Xiao Liu <xiszishu@gmail.com>
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Andy Xie <andy.xning@gmail.com>
Signed-off-by: Haibin Lin <haibin.lin@bytedance.com>
Signed-off-by: David Ben-David <davidb@pliops.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: huangweixiao <huangweixiao@msh.team>
Signed-off-by: alyosha-swamy <raghav@arcee.ai>
Signed-off-by: Eric Hanley <ericehanley@google.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Signed-off-by: CLFutureX <775523362@qq.com>
Signed-off-by: Linkun Chen <github@lkchen.net>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: tlipoca9 <tlipoca9@gmail.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Benji Beck <benjibeck@meta.com>
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Zhang Jason <ning.zhang2@amd.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: asafg <asafg@ai21.com>
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lain <fusiyuan2000@hotmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: QscQ <qscqesze@gmail.com>
Signed-off-by: qingjun <qingjun@minimaxi.com>
Signed-off-by: Syed Muhammad Bin Asif <syedmba7@connect.hku.hk>
Signed-off-by: Lionel Villard <villard@us.ibm.com>
Signed-off-by: ycyaw66 <497410282@qq.com>
Signed-off-by: David Chen <530634352@qq.com>
Signed-off-by: Linkun <github@lkchen.net>
Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
Signed-off-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
Signed-off-by: Andrew Chan <andrewkchan.akc@gmail.com>
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: XIn Li <xinli@nvidia.com>
Signed-off-by: Junhao Li <junhao@ubicloud.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Signed-off-by: <zyy1102000@gmail.com>
Signed-off-by: Guy Stone <guys@spotify.com>
Signed-off-by: <yyweiss@gmail.com>
Signed-off-by: yyw <yyweiss@gmail.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Animesh Jain <jainanimesh2305@yahoo.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: XiongfeiWei <isaacwxf23@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: JartX <sagformas@gmail.com>
Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: tjtanaavllm <tunjian.tan@amd.com>
Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Yan Ma <yan.ma@intel.com>
Co-authored-by: Xiao <xiszishu@gmail.com>
Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: Ning Xie <andy.xning@gmail.com>
Co-authored-by: H <linhaibin.eric@gmail.com>
Co-authored-by: David Ben-David <sdavidbd@gmail.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: TankNee <nee@tanknee.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: ZiTian.Zhao <zitian.zhao@tencentmusic.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Abirdcfly <fp544037857@gmail.com>
Co-authored-by: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com>
Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu>
Co-authored-by: Chenxi Yang <cxyang@meta.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Weixiao Huang <hwx.simle@gmail.com>
Co-authored-by: Raghav Ravishankar <113712354+alyosha-swamy@users.noreply.github.com>
Co-authored-by: ericehanley <ericehanley@google.com>
Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com>
Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com>
Co-authored-by: PiteXChen <44110731+CLFutureX@users.noreply.github.com>
Co-authored-by: lkchen <github@lkchen.net>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: tlipoca9 <160737620+tlipoca9@users.noreply.github.com>
Co-authored-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Zhang Jason <ning.zhang2@amd.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: asafg <asafg@ai21.com>
Co-authored-by: Lain <siyuanf@nvidia.com>
Co-authored-by: tc-mb <157115220+tc-mb@users.noreply.github.com>
Co-authored-by: imning3 <hbning@pku.edu.cn>
Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: qscqesze <qingjun@minimaxi.com>
Co-authored-by: Syed Muhammad Bin Asif <92625830+syedmba@users.noreply.github.com>
Co-authored-by: Lionel Villard <villard@us.ibm.com>
Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: ycyaw66 <497410282@qq.com>
Co-authored-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Co-authored-by: Ming Yang <minos.future@gmail.com>
Co-authored-by: Adrián García García <adrigarvk8@gmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: JaceyShao <65159281+JaceyShao@users.noreply.github.com>
Co-authored-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Co-authored-by: Ricardo Decal <crypdick@users.noreply.github.com>
Co-authored-by: Andrew Chan <andrewkchan.akc@gmail.com>
Co-authored-by: fxmarty-amd <felmarty@amd.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: Zhiyu <zhiyuc@nvidia.com>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: XIn Li <xinli@nvidia.com>
Co-authored-by: Junhao Li <streaver91@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: Hong Hanh <hanh.usth@gmail.com>
Co-authored-by: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Guy Stone <guys@spotify.com>
Co-authored-by: yyweiss <70619747+yyweiss@users.noreply.github.com>
Co-authored-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-08-14 11:23:22 -07:00
Thomas Parnell
ab9f2cfd19 [CI] [Hybrid] Bump min transformers version for Bamba and Jamba (#22908)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-14 11:01:16 -07:00
Cyrus Leung
dbe298046c [Bugfix] Fix parsing of --disable-mm-preprocessor-cache (#22909)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-14 08:09:44 -07:00
Jiangyun Zhu
625ccd1c4d [Bugfix] Replace custom Encoding class with BatchEncoding in MistralTokenizer (#22786)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-14 08:09:27 -07:00
Jee Jee Li
92ff41abea [Model] Modify the gate implementation of glm4_moe (#22832)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-14 05:28:50 -07:00
Lucas Wilkinson
829b9a62d0 [Perf] Dont create unnecessary pooling params (#22876)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-14 05:28:09 -07:00
Nicolò Lucchesi
540d54ca8d [CI] Re-enable transcriptions test_long_audio_request (#22890)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-14 11:34:34 +00:00
Daniele
0783f13960 [Doc] fix dead link (#22898)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
2025-08-14 04:06:13 -07:00
iAmir97
7655dc3e45 [Bugfix] Add reset prefix cache for online serving (#22726)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-14 04:04:18 -07:00
Harry Mellor
f4efda821d Remove Phi 4 Flash configuration workaround (#22723)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-14 04:03:49 -07:00
Nick Hill
eb08487b18 [BugFix] Threadsafe close async zmq sockets (#22877)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-14 03:44:29 -07:00
Isotr0py
7c3a0741c6 [Bugfix] Fix PixtralHFImagePixelInputs dynamic shape check (#22827)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-14 02:35:43 -07:00
Louie Tsai
00e3f9da46 vLLM Benchmark suite improvement (#22119)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-08-14 07:12:17 +00:00
Robert Shaw
a353bd083d [CI] remove flaky v0 test (#22864)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-08-13 21:41:51 -07:00
Ilya Markov
1d20c34717 [CI] Fix tests/distributed/test_ca_buffer_sharing.py (#22849)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-13 20:09:30 -07:00
Will Eaton
b6af24fba7 [CI][Entrypoints]: add filter to generation to filter out invalid tool calls (#22826)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-08-13 20:09:07 -07:00
Cyrus Leung
0ca2393b47 [CI/Build] Increase pooling tolerance to pass CI (#22844)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-13 18:52:48 -04:00
Jialin Ouyang
31a500c86f [Core] [N-gram SD Optimization][1/n] Propose tokens with a single KMP (#22437)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-08-13 14:44:06 -07:00
Luka Govedič
4e8614e88b Move checklist in PR template (#22852)
Signed-off-by: Luka Govedic <lgovedic@redhat.com>
2025-08-13 21:38:35 +00:00
kliuae
c6cd5ca3d3 [ROCm][Bugfix] Fix compilation error in topk softmax fused kernel (#22819)
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
2025-08-13 13:45:03 -07:00
Isotr0py
df0e0f023e [CI/Build] Skip gpt_big model test because of broken HF model (#22848)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-13 20:36:28 +00:00
Cyrus Leung
b4b78d6317 [CI/Build] Fix param mismatch in test_eagle_correctness (#22847)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 10:55:25 -07:00
Nicolò Lucchesi
12817a8ac7 [CI] Fix tests/v1/e2e/test_kv_sharing_fast_prefill.py import on test (#22815)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-13 10:35:50 -07:00
Cyrus Leung
c9232d41f4 [CI/Build] Update VLM common tests (#22841)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 10:03:05 -07:00
HWH
9bd9294f0e [Bugfix] Fix MiniCPMV Image input inference failed (#22813)
Signed-off-by: HWH <67449739+jio-H@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-13 09:41:41 -07:00
Roger Wang
da2705198f [Misc] clear and separate error messages for input too long and input + max-tokens too long (#22803)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-08-13 07:22:56 -07:00
Cyrus Leung
19b927e52d [Core] Use individual MM items in P0/P1 cache and model runner (#22570)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 07:18:07 -07:00
milesial
20d65aa755 [Frontend] Multithreaded async multimodal load_bytes (#22710)
Signed-off-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com>
Co-authored-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com>
2025-08-13 06:09:26 -07:00
Gh0u1L5
b159c0a67a Fix GGUF loader for Qwen3 MoE. (#22785)
Signed-off-by: Gh0u1L5 <Gh0u1L5@outlook.com>
2025-08-13 06:08:23 -07:00
Yuanyuan Chen
6772bb0f7d Remove unnecessary CUDA sync of qwen image and video preprocess (#22792)
Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-13 06:07:28 -07:00
Chen Zhang
fceafaf582 [Bugfix][mamba] Fix type annotation of Mamba2Metadata (#22787)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-13 06:07:09 -07:00
Nicolò Lucchesi
6b794c756c [Nixl][CI] Fix tests (#22806)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-13 06:03:53 -07:00
Chi Zhang
98deac3879 [FEATURE] support custom vllm tuned config path for fused moe triton kernels (#22791)
Signed-off-by: Chi Zhang <zhangchi.usc1992@bytedance.com>
2025-08-13 20:27:25 +08:00
Kdump
653124bd46 [Frontend] Add chunked processing to handle long inputs in embedding models (#22280)
Signed-off-by: x22x22 <wadeking@qq.com>
Signed-off-by: Kdump <rootshellexp@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 04:14:24 -07:00
wangxiyuan
0b1bdac6af [Platform] Custom ops support for FusedMoe (#22509)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-13 04:12:00 -07:00
Giancarlo Delfin
d94e3026de [V1] Add tree drafting tests for eagle spec decoding (#22705)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
2025-08-13 04:11:28 -07:00
633WHU
3f52738dce [Doc] Add max_lora_rank configuration guide (#22782)
Signed-off-by: chiliu <cliu_whu@yeah.net>
2025-08-13 04:10:07 -07:00
Duc-Viet Hoang
a01e0018b5 [Bugfix] Fix Nemotron VL image processing (#22739)
Co-authored-by: ducviet00-h2 <viet.d.hoang@h2corporation.jp>
2025-08-13 03:11:36 -07:00
Yuxuan Zhang
9e7e5baaa8 [Model] Add missing prefix to glm4_1v (#22716)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-08-13 01:23:33 -07:00
zzh142857
d16aa3dae4 [Model] Add option to run Step3VisionEncoder in DP (#22697)
Signed-off-by: zzh142857 <chaorenzhaozhenghao@gmail.com>
2025-08-13 00:09:13 -07:00
Chen Zhang
6807af8f46 [gpt-oss] upgrade gpt-oss to v0.0.3 and add version check (#22768)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-12 21:37:26 -07:00
shixianc
4c558cf62e [Perf] Support topk softmax fused kernel for broader num_experts (#22211)
Signed-off-by: Shixian Cui <shixian@amazon.com>
Co-authored-by: Shixian Cui <shixian@amazon.com>
2025-08-12 21:34:47 -07:00
Wentao Ye
77a6bf07ae [Bug] Fix Unexpected Keyword Argument 'w1_bias' (#22757)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-12 21:31:47 -07:00
Michael Goin
4082338a25 Remove unneeded ROCm platform import when using CUDA (#22765)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-12 21:26:38 -07:00
Michael Goin
c6b928798e Force TRTLLM attention for gpt-oss on SM100 (#22678)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-12 21:22:16 -07:00
Michael Goin
b1361c7273 [Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-12 21:22:05 -07:00
Po-Han Huang (NVIDIA)
4f0f844b16 Fix cuda illegal mem access with Llama4 TP8 + rms_norm custom op (#22701)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-08-12 21:21:50 -07:00
Woosuk Kwon
c5830381af [V0 Deprecation] Remove args for multi-step scheduling (#22779)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-08-12 20:38:18 -07:00
Woosuk Kwon
d31f97cf57 [Misc] Remove tests/multi_step/__init__.py (#22778)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-08-12 20:21:18 -07:00
Woosuk Kwon
71683ca6f6 [V0 Deprecation] Remove multi-step scheduling (#22138)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-08-12 20:18:39 -07:00
Michael Goin
e18859298d Add hardware plugins to installation doc (#22732)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 17:14:46 -07:00
Jee Jee Li
fde0b611a3 [Model] Decouple glm4v (#22751)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-12 17:13:17 -07:00
Harry Mellor
d0a6301588 Fix Transformers backend tensor parallel for multimodal models (#22673)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 17:12:30 -07:00
Harry Mellor
45c3936e94 [Docs] Hide the navigation and toc sidebars on home page (#22749)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 17:12:26 -07:00
Frank Wang
ba81acbdc1 [Bugfix] Bump DeepGEMM Version to Fix SMXX Layout Issues (#22606)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
2025-08-12 15:43:06 -07:00
RUTHLESS-BOT
53c730286c [Misc] parametrize 'dtype' in test_flash_mla (#22641)
Signed-off-by: RUTHLESS-BOT <wujiafeng@cmbchina.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-12 16:31:48 -04:00
zifeitong
6534d2fc97 Fix torch version check for SM100 mxfp4 (#22535)
Signed-off-by: Zifei Tong <zifeitong@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-12 12:54:42 -07:00
Nicolò Lucchesi
422f22e012 [CI][Nixl] Check kv cache layout during handshake (#22745)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-12 12:53:52 -07:00
Xiaozhu Meng
6bd8ebf026 [Kernel][AMD] Avoid D2H copy and cumsum kernel (#22683)
Signed-off-by: Xiaozhu <mxz297@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-12 12:53:36 -07:00
Wentao Ye
dab4f9f764 [Chore] Update CODEOWNERS to include @yewentao256 for CUDA kernels, attention backends, quantization, and related tests (#22741)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-13 00:50:31 +08:00
TeeKen Lau
c42fe0b63a Add more test scenario for tensor schema (#22733)
Signed-off-by: teekenl <teekenlau@gmail.com>
2025-08-12 16:34:41 +00:00
Rahul Tuli
5a4b4b3729 Add: SupportsEagle3 interface for explicit EAGLE3 support (#22642)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-08-12 09:24:52 -07:00
Daniel Serebrenik
e5d3d63c42 [Benchmark] Fix terminal colors in benchmark_serving_multi_turn (python 3.12) (#22730)
Signed-off-by: daniels <daniels@pliops.com>
2025-08-12 14:41:37 +00:00
Nicolò Lucchesi
3d9d40efde [Bugfix][CI] Fix test_remote_decode_lifecycle.py::test_short_prompt_lifecycle (#22727)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-12 07:30:17 -07:00
Po-Han Huang (NVIDIA)
67c153b88a Fix Llama4 FlashInfer FP4 MoE issues (#22511)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-08-12 05:50:59 -07:00
wang.yuqi
f7ad6a1eb3 [CI Failure] fix tests/entrypoints/openai/test_skip_tokenizer.py (#22708)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-12 05:42:58 -07:00
Harry Mellor
80bb1e8afe Officially support SmolLM3 using the Transformers backend (#22665)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 05:38:48 -07:00
Nicolò Lucchesi
d030b01548 [BugFix][Nixl][PD] Fix heterogenous TP (#22663)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-08-12 05:37:30 -07:00
Harry Mellor
767e63b860 [Docs] Improve docs navigation (#22720)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 04:25:55 -07:00
Yongye Zhu
007dd90859 [gpt-oss] Enable gpt-oss on ampere (#22714)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-12 03:21:44 -07:00
Jee Jee Li
b8a9d0e429 [Misc] remove GH discussions link (#22722)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-12 03:15:33 -07:00
zejunchen-zejun
50f2aae1b4 [LMCache][Example] Align the PYTHONHASHSEED for prefillers and decoders for KV chunks hashing (#21161)
Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
2025-08-12 02:05:14 -07:00
RishiAstra
46ae7f6666 [Bugfix] Mamba2 SSD varlen bug fix initstates decay, improve test, assert chunk pwr 2 (#21783)
Signed-off-by: Rishi Astra <40644327+RishiAstra@users.noreply.github.com>
2025-08-12 02:04:37 -07:00
Jun-Howie
1ece7f30ba Fix: AWQ Marlin get_quant_method does not recognize "modules_to_not_convert" (#21888)
Signed-off-by: JunHowie <JunHowie@aliyun.com>
Co-authored-by: JunHowie <JunHowie@aliyun.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-12 02:03:53 -07:00
phantomlei
bc8372efc3 [Bugfix] Fix erroneous randomly generated cases in bad word testing (#22170)
Signed-off-by: phantomlei <phantomlei3@gmail.com>
2025-08-12 02:03:22 -07:00
Sugar-zsg
8d17fa633e [V0] Correct CUDA Graph capture for encoder-decoder models (#22630) 2025-08-12 02:01:08 -07:00
dongluw
9f909b8996 [New Model] Support Command-A-Vision (#22660)
Signed-off-by: donglu <donglu@cohere.com>
2025-08-12 01:39:54 -07:00
Chendi.Xue
59f3b93636 [DOC] update v1_guide with INTEL HW (#22679)
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
2025-08-12 01:22:49 -07:00
Harry Mellor
78077d5417 Move SchedulerConfig from config/__init__.py to config/scheduler.py (#22626)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 00:23:49 -07:00
wang.yuqi
6d729c43fb [Bugfix] Fix ModernBert load & Enable sliding window attention for bidirectional attention. (#22637)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-12 00:23:17 -07:00
Sooraj S
2f4657952b [doc] Update x86 CPU-inference installation doc to reflect optionality of AVX512f (#22707)
Signed-off-by: Sooraj S <94284954+sooraj-satheesh@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-08-12 00:21:08 -07:00
Hongsheng Liu
3a7e3bbdd2 [Doc] Added unmentioned required option "method" in the usage of EAGLE-3 based models (#21737)
Signed-off-by: Dilute-l <dilu2333@163.com>
Co-authored-by: Dilute-l <dilu2333@163.com>
2025-08-12 00:14:51 -07:00
Harry Mellor
4fbd8bb597 Fix passing SpeculativeConfig from the CLI (#22652)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-11 22:13:32 -07:00
Chen Zhang
ad344ef552 [gpt-oss] Small bug fixes for frontend (#22512)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-11 22:04:38 -07:00
Chen Zhang
bbaf9e9cb1 [gpt-oss] Fix mxfp4 support (#22700)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-11 21:22:26 -07:00
Benji Beck
4678503476 Migrate MiniCPMVImageInputs to TensorSchema (#21939)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-11 20:43:37 -07:00
Michael Goin
93d0652433 [CI] Increase timeout for test_completion_with_image_embeds (#22670)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-11 20:31:36 -07:00
Michael Goin
ea1292ad3e [CI Failure] Use float32 for tests/entrypoints/openai/test_audio.py (#22686)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-11 20:20:42 -07:00
Po-Han Huang (NVIDIA)
dc5e4a653c Upgrade FlashInfer to v0.2.11 (#22613)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-11 19:58:41 -07:00
Harry Mellor
839ab00349 Re-enable Xet on TPU tests now that hf_xet has been updated (#22666)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-11 19:54:40 -07:00
Andy Chen
9b94d6ec8f Enable 4bit bnb prequant MOE (#21548)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-11 19:02:14 -07:00
Chen Zhang
1891a265d3 [gpt-oss] Add test for response API + harmony (but skipped) (#22554)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-11 17:47:24 -07:00
Chen Zhang
95a935fc48 [gpt-oss] Support streaming in response API (#22431)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-11 17:46:59 -07:00
Harry Mellor
458e74eb90 Support more parallel styles in Transformers backend TP (#22651)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-11 10:42:48 -07:00
TJian
65abe111a3 [CI] Skip Tree Attn Test in test_max_len.py to unblock CI (#22664)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-11 10:36:05 -07:00
22quinn
807d21b80d [BugFix] [Spec Decode] Remove LlamaForCausalLMEagle3 to fix CI (#22611)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-11 10:31:36 -07:00
Isotr0py
c90fb03df5 [CI/Build] Skip Mllama HF runner tests with Transformers v4.55.0 (#22659)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-11 10:00:58 -07:00
wang.yuqi
84cf78acee [Model] Pooling models default to using chunked prefill & prefix caching if supported. (#20930)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-11 09:41:37 -07:00
GuanLuo
16fb668b61 fix: NIXL connector transfers partial block to pass full multi-modal context (#21074)
Signed-off-by: GuanLuo <gluo@nvidia.com>
2025-08-11 09:40:55 -07:00
Wentao Ye
f7dcce7a4a [Feature] Add VLLM_USE_DEEP_GEMM_E8M0 Env to Control E8M0 Scale (#21968)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-11 09:39:08 -07:00
Isotr0py
8e13d9fe6d [Misc] Further clean up some redundant config definitions (#22649)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-11 09:22:25 -07:00
Eric Curtin
3fa5b25845 Document aarch64 CPU support works (#22646)
Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-08-11 07:22:45 -07:00
danielafrimi
14a5d903ab [Model] NemotronH Support (#22349)
Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com>
2025-08-11 04:09:24 -07:00
Cyrus Leung
951b038298 [Misc] Move jsontree to utils (#22622)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-11 03:49:32 -07:00
Cyrus Leung
ebf7605b0d [Misc] Move tensor schema tests (#22612)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-11 00:15:27 -07:00
Harry Mellor
bc1d02ac85 [Docs] Add comprehensive CLI reference for all large vllm subcommands (#22601)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-11 00:13:33 -07:00
JartX
1e55dfa7e5 [BUGFIX] KeyError 'layers.14.mlp.gate.g_idx' for Qwen3-MoE with GPTQ on ROCm (#22017) 2025-08-11 00:13:30 -07:00
Jee Jee Li
384a052971 [Misc] benchmark_moe supports expert parallel (#22251)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-11 00:13:27 -07:00
Maximilien de Bayser
39052dbca8 Support token_type_ids in V1 with less code changes (#21985)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-10 22:54:59 -07:00
vllmellm
9c97a1c349 [ROCm][AITER] Support AITER Rope ops in RotaryEmbedding Module. (#22521)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-08-10 22:52:34 -07:00
Eugene Cheah
f919d4cb8f [BugFix] Fix logits repetition penalty cuda check (#22592) 2025-08-10 22:52:31 -07:00
Zhewen Li
afa5b7ca0b [Misc][gpt-oss] guard import when triton kernel when not up to date (#22584)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-08-10 21:29:35 -07:00
Lifans
1b99028069 [Misc][gpt-oss] Add rules to label gpt-oss related PRs (#22600)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-08-10 19:49:51 -07:00
Nick Hill
5898b135ab [BugFix] Fix KVConnectorOutput TPU breakage (#22598)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-10 19:33:48 -07:00
22quinn
b799f4b9ea [CI/Build] Fix tensorizer test for load_format change (#22583)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-10 19:30:00 -07:00
Benji Beck
06da44f0cb Migrate LlavaImageInputs to TensorSchema (#21770)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-10 19:29:19 -07:00
Benji Beck
a554991748 Migrate LlavaNextVideoPixelInputs to TensorSchema (#21843)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-10 19:29:16 -07:00
Doug Smith
d1af8b7be9 enable Docker-aware precompiled wheel setup (#22106)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-08-10 16:29:02 -07:00
Benji Beck
68b254d673 Fix TensorSchema validation test for symbolic dims (#22366)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-10 17:16:44 +00:00
ZiTian Zhao
8c50d62f5a Remove redundant row_indices unsqueeze operation in MiniCPMO (#22528)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-10 09:20:00 -07:00
Benji Beck
b4e2916721 Migrate LlavaNextImageInputs to TensorSchema (#21774)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-10 09:05:21 -07:00
Breno Baldas Skuk
65a7917be4 Fix(benchmarks): allow multiple mm contents in OpenAI Chat Completion Benchmarks (#22534)
Signed-off-by: breno.skuk <breno.skuk@hcompany.ai>
2025-08-10 09:03:15 -07:00
Isotr0py
b76753f0b5 [Bugfix][Kernel] Support partial rotary embedding for MRoPE triton kernel (#22593)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-10 09:00:36 -07:00
youkaichao
b81fe83b2c [doc] add alibaba cloud as sponsor (#22597)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-08-10 23:13:47 +08:00
youkaichao
0757551c96 [doc] add beijing meetup links (#22596)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-08-10 22:51:36 +08:00
Harry Mellor
8290d15d2c Move CacheConfig from config/__init__.py to config/cache.py (#22586)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-10 07:36:40 -07:00
Isotr0py
049c245143 [Misc] Replace flaky image urls in pixtral test (#22574)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-10 06:18:21 -07:00
Harry Mellor
00976db0c3 [Docs] Fix warnings in docs build (#22588)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-10 05:49:51 -07:00
Cyrus Leung
d411df0296 [Misc] Further refine type annotations in parallel state (#22499)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-10 05:49:48 -07:00
22quinn
010e0e39ea [Doc] Fix API doc link in side navigation (#22585)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-10 01:35:22 -07:00
Ning Xie
326976291b [Misc] code clean duplicate set_current_vllm_config in _set_vllm_config (#22566)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-10 00:08:48 -07:00
Isotr0py
7e8d685775 [Minor] Fix pre-commit error on main (#22579)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-10 00:08:23 -07:00
Harry Mellor
c49848396d Refactor sliding window configuration to Transformers best practice (#21927)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-09 20:50:48 -07:00
Chengji Yao
2a84fb422f [TPU] kv cache update kernel doesn't need to be padded slices to multiple of num_slices_per_block (#22394)
Signed-off-by: Chengji Yao <chengjiyao@gmail.com>
Co-authored-by: Chengji Yao <chengjiyao@gmail.com>
2025-08-09 20:49:04 -07:00
ZiTian Zhao
534c45b962 Improve fast_topk function with type hints and documentation (#22530)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-09 20:25:42 -07:00
Le Chen
3d7363e61c [Config] add "qwen" as a native eagle3 target supported model (#22333)
Signed-off-by: lechen <lecself@163.com>
Signed-off-by: LeChen <lecself@163.com>
2025-08-09 20:21:05 -07:00
Jee Jee Li
0c5254b82a [oss] Init gpt-oss bf16 support (#22508)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-09 20:19:13 -07:00
Thomas Parnell
61f67d8acd [V1] [Hybrid] Enable Full CUDA Graph (decode-only) for Mamba layers (#21401)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-09 20:16:11 -07:00
TJian
42172ad18f [FEAT] [Performance] Add triton mrope to replace the torch code path (#22375)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-09 11:50:03 -07:00
Isotr0py
fbd8595c5c [Bugfix] Fix basic models tests hanging due to mm processor creation (#22571)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-09 11:42:21 -07:00
Nicolò Lucchesi
5a16fa614c [Model] Gemma3n MM (#20495)
Signed-off-by: ShriKode <shrikode@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: ShriKode <shrikode@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-09 09:56:25 -07:00
Harry Mellor
2d18256e47 Move ParallelConfig from config/__init__.py to config/parallel.py (#22565)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-09 08:33:46 -07:00
Harry Mellor
56186474f6 [Docs] Reduce noise in docs and --help from the JSON tip (#22567)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-09 08:31:32 -07:00
Thomas Parnell
1bf5e1f25b [CI] [Hybrid] Speed up hybrid models test by removing large models (#22563)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-09 02:04:42 -07:00
Yuxuan Zhang
a6022e6fbc GLM-4.5V with new class name at transformers (#22520)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-09 00:50:21 -07:00
Thomas Parnell
2be07a0db1 Update docs for Minimax-Text support (#22562)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-09 00:18:18 -07:00
Jee Jee Li
0edc0cd52b [Bugfix] Fix CI moe kernel failure (#22556)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-09 00:03:29 -07:00
Isotr0py
7920e9b1c5 [Bugfix] Fix failing GPT-OSS initialization test (#22557)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-09 00:03:26 -07:00
Charlie Fu
b7c0942b65 [ROCm][Misc] Rename the context_len to seq_len in ROCm custom paged attention kernel (#22097)
Signed-off-by: charlifu <charlifu@amd.com>
2025-08-08 23:15:06 -07:00
Kyuyeun Kim
9a0c5ded5a [TPU] Add support for online w8a8 quantization (#22425)
Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com>
2025-08-08 23:12:54 -07:00
Eldar Kurtić
10a02535d4 Fix loading of quantized BigCode models (#22463)
Signed-off-by: Eldar Kurtic <eldar@neuralmagic.com>
2025-08-08 23:12:12 -07:00
Cyrus Leung
65552b476b [Misc] Use config definitions from Transformers library (#21913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-08 23:10:51 -07:00
Or Ozeri
7ad7adb67f v1: Pass KVConnectorOutput to scheduler-side (#22157)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-08-08 23:09:51 -07:00
Thomas Parnell
6ade99eafa [V1] [Hybrid] Support Minimax-Text-01 in V1 (#22151)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-08 23:08:48 -07:00
Wentao Ye
3157aebb63 [Log] Add Warning for Deprecation of DeepGEMM old version (#22194)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-08 23:07:48 -07:00
Thomas Parnell
8a0ffd6285 Remove mamba_ssm from vLLM requirements; install inside test container using --no-build-isolation (#22541)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-08 23:05:32 -07:00
Roger Wang
23472ff51c [Doc] Add usage of implicit text-only mode (#22561)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Flora Feng <4florafeng@gmail.com>
2025-08-08 23:04:19 -07:00
Roger Wang
08b751ba74 Implicit language-model-only mode via limit-mm-per-prompt (#22299)
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Andy Xie <andy.xning@gmail.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: XIn Li <xinli@nvidia.com>
Signed-off-by: Junhao Li <junhao@ubicloud.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Signed-off-by: Linkun <github@lkchen.net>
Co-authored-by: Ning Xie <andy.xning@gmail.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: Zhiyu <zhiyuc@nvidia.com>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: XIn Li <xinli@nvidia.com>
Co-authored-by: Junhao Li <streaver91@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: ZiTian Zhao <zitian.zhao@tencentmusic.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com>
Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Hong Hanh <hanh.usth@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: lkchen <github@lkchen.net>
2025-08-08 22:21:40 -07:00
Isotr0py
429e4e2d42 [Bugfix] Fix ModernBert cuda graph capturing in v1 (#21901)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-08 22:17:22 -07:00
Pradyun92
35afe1b30b [BugFix] [P/D] Handle lookahead token count edge-case with Eagle Spec Decoding and P/D (#22317)
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-08-08 17:04:15 -07:00
Kunshang Ji
81c57f60a2 [XPU] upgrade torch 2.8 on for XPU (#22300)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-08 17:03:45 -07:00
Russell Bryant
311d875614 Drop flaky test_healthcheck_response_time (#22539)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-08-08 16:56:47 -07:00
Harry Mellor
e3edc0a7a8 Extract CompilationConfig from config.py (#22524)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 16:34:25 -07:00
yyweiss
baece8c3d2 [Frontend] Add unix domain socket support (#18097)
Signed-off-by: <yyweiss@gmail.com>
Signed-off-by: yyw <yyweiss@gmail.com>
2025-08-08 16:23:44 -07:00
Guy Stone
2fcf6b27b6 [Docs] fix broken links in metrics.md (#22315)
Signed-off-by: Guy Stone <guys@spotify.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 16:22:35 -07:00
Harry Mellor
41b9655751 Skip Qwen 1 in CI because remote code is no longer compatible with Transformers (#22536)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 16:20:58 -07:00
Thomas Parnell
bd875d2eb7 [Bugfix] Update FA commit hash (#22546)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-08 16:10:25 -07:00
Varun Sundar Rabindranath
f703b923f3 [Misc] DeepGEMM : Avoid JIT generation in the hot-path (#22215)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-08-08 16:09:59 -07:00
Lucas Wilkinson
cd9b9de1fb [BugFix] Fix IMA FlashMLA full cuda-graph and DP + Update FlashMLA (#21691)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-08 16:09:42 -07:00
Chen Zhang
fe6d8257a1 [gpt-oss] Support tool call and implement MCP tool server (#22427)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-08 15:06:37 -07:00
Ricardo Decal
e290594072 [Docs] Rename “Distributed inference and serving” to “Parallelism & Scaling” (#22466)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-08-08 19:26:21 +00:00
Yongye Zhu
f756a682d9 [gpt-oss] guard import when triton kernel is not installed (#22529)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-08 11:18:33 -07:00
Daniel Serebrenik
f0964e29cb [Benchmark] Add benchmark tool for multi turn conversations (#20267) 2025-08-08 10:28:50 -07:00
Yongye Zhu
e789cad6b8 [gpt-oss] triton kernel mxfp4 (#22421)
Signed-off-by: <zyy1102000@gmail.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-08 08:24:07 -07:00
Harry Mellor
e5ebeeba53 Remove exception for Python 3.8 typing from linter (#22506)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 03:06:46 -07:00
Harry Mellor
7be7f3824a [Docs] Improve API docs (+small tweaks) (#22459)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 03:02:51 -07:00
Nick Hill
ccdae737a0 [BugFix] Don't cancel asyncio tasks directly from destructors (#22476)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-08 01:13:18 -07:00
rongfu.leng
904063907c [Misc] fix openai version (#22485)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-08 01:12:54 -07:00
Cyrus Leung
43c4f3d77c [Misc] Begin deprecation of get_tensor_model_*_group (#22494)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-08 01:11:54 -07:00
Cyrus Leung
1712543df6 [CI/Build] Fix multimodal tests (#22491)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-08 00:31:19 -07:00
lkchen
808a7b69df [bench] Fix benchmark/serve.py to ignore unavailable results (#22382)
Signed-off-by: Linkun <github@lkchen.net>
2025-08-07 23:15:50 -07:00
iAmir97
099c046463 [Doc] Sleep mode documentation (#22310)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Hong Hanh <hanh.usth@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-08-08 12:25:18 +08:00
Po-Han Huang (NVIDIA)
af473f0a85 [bugfix] Fix Llama3/4 issues caused by FlashInfer 0.2.10 (#22426)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-08-07 20:25:01 -07:00
Cyrus Leung
157f9c1368 Fix pre-commit (#22487)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-07 20:21:54 -07:00
ZiTian Zhao
6f287915d8 Optimize MiniCPMO mask creation with vectorized implementation (#22464)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-08-07 20:18:50 -07:00
Yuxuan Zhang
c152e2a8a0 not tie_word_embeddings for glm-4.5 and glm-4.5v (#22460)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-08-07 19:37:23 -07:00
Chauncey
17eaaef595 [Bugfix] Fix RuntimeError: Index put requires the source and destination dtypes match (#22065)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-08-07 19:20:21 -07:00
Junhao Li
3303f134e0 [Kernel] Add support for block FP8 on SM120 (NVIDIA 5090 and RTX PRO 6000) (#22131)
Signed-off-by: Junhao Li <junhao@ubicloud.com>
2025-08-07 19:18:28 -07:00
Shu Wang
b2c8ce57c6 Fix Flashinfer CUTLASS MOE Allgather (#21963)
Signed-off-by: Shu Wang <shuw@nvidia.com>
2025-08-07 19:18:25 -07:00
Shu Wang
a3b9c17b56 Support Tensorrt-LLM MoE fp4 for low-latency (#21331)
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: XIn Li <xinli@nvidia.com>
Co-authored-by: XIn Li <xinli@nvidia.com>
2025-08-07 19:18:22 -07:00
Zhiyu
d57dc2364e Add ModelOpt Qwen3 nvfp4 support (#20101)
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
2025-08-07 19:18:19 -07:00
Andrew Sansom
e2c8f1edec [PERF] Use pybase64 to more quickly decode prompt embeddings (#22469)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-08-07 19:15:32 -07:00
TJian
1ee5ead5f8 [ROCm] [V1] [SpecDec] Enable Speculative Decoding on ROCm V1 Engine (#21496)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-07 19:13:17 -07:00
Ning Xie
acf8aeb79e [Misc] normalize multiprocessing Queue usage (#22371)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-08 01:57:27 +00:00
Harry Mellor
7e3a8dc906 Remove from_dict from SpeculativeConfig (#22451)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-07 10:13:04 -07:00
Cyrus Leung
139d155781 [Frontend] Use engine argument to control MM cache size (#22441)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-07 09:47:10 -07:00
Cyrus Leung
8c9da6be22 [Core] Simplify mm processing cache (#22457)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-07 09:47:07 -07:00
Woosuk Kwon
399d2a10e2 Fix pre-commit error in main (#22462)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-07 08:54:39 -07:00
Chen Zhang
4815b00f54 [gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-07 08:33:25 -07:00
Chen Zhang
4da8bf20d0 [Tool] Fix auto tool call (#22434)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-07 07:03:38 -07:00
fxmarty-amd
7e0b121812 [Bugfix] Add missing packed_modules_mapping to DeepseekV2ForCausalLM (#22352)
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
2025-08-07 06:30:48 -07:00
Cyrus Leung
766bc8162c [Core] Store only the keys for multi-modal data in P0 (#22198)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-07 01:45:04 -07:00
WeiQing Chen
289b18e670 [Docs] Update features/disagg_prefill, add v1 examples and development (#22165)
Signed-off-by: David Chen <530634352@qq.com>
2025-08-07 00:59:23 -07:00
Andrew Chan
35171b1172 [Doc] update docs for nightly benchmarks (#12022)
Signed-off-by: Andrew Chan <andrewkchan.akc@gmail.com>
2025-08-07 00:29:45 -07:00
Ricardo Decal
a2c6696bfe [Docs] Factor out troubleshooting to its own guide; add section for Ray Observability (#21578)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-08-07 00:29:13 -07:00
Yong Hoon Shin
5e8398805e [Doc] Fix link to prefix caching design (#22384)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-07 00:28:15 -07:00
Woosuk Kwon
136825de75 [Misc] Enhance code formatting in mxfp4.py (#22423)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-07 00:26:24 -07:00
JaceyShao
c2dba2dba8 Add H20-3e fused MoE kernel tuning configs for GLM-4.5 (#22433)
Signed-off-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Co-authored-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
2025-08-07 00:24:47 -07:00
Harry Mellor
434d2f3f7a [Docs] Add missing dependency for docs build (#22435)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-07 00:22:07 -07:00
Adrián García García
8e8e0b6af1 feat: Add --enable-log-outputs flag for logging model generations (#20707)
Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
2025-08-06 23:10:13 -07:00
Ming Yang
82216dc21f [Misc] Support routing logic simulation (#21990)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-06 23:06:20 -07:00
Moritz Sanft
370661856b [Frontend] Update OpenAI error response to upstream format (#22099)
Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
2025-08-06 23:06:00 -07:00
vllmellm
cbc8457b26 [Model] Switch to Fused RMS norm in Qwen2.5_VL model. (#22184)
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
2025-08-06 23:05:24 -07:00
lkchen
4d4297e8fe [Bench] Split serve.py:main into async/async versions (#22405)
Signed-off-by: Linkun <github@lkchen.net>
2025-08-06 23:05:07 -07:00
wang.yuqi
2a4c825523 [CI] Skip the pooling models that do not support transformers v4.55 (#22411)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-06 23:05:03 -07:00
WeiQing Chen
4be02a3776 [Bugfix] EPLB load statistics problem (#22167)
Signed-off-by: ycyaw66 <497410282@qq.com>
Signed-off-by: David Chen <530634352@qq.com>
Co-authored-by: ycyaw66 <497410282@qq.com>
2025-08-07 04:07:54 +00:00
Chen Zhang
f6278b6243 [gpt-oss] Convert user input to harmony format (#22402)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-06 20:56:02 -07:00
Lionel Villard
ad6c655dde preload heavy modules when mp method is forkserver (#22214)
Signed-off-by: Lionel Villard <villard@us.ibm.com>
2025-08-06 20:33:24 -07:00
ZiTian.Zhao
14bcf93a6a Optimize logger init performance by using module-level constants (#22373)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-06 20:32:19 -07:00
Harry Mellor
ecbea55ca2 Update hf_xet pin to resolve hangs (#22356)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-06 20:31:41 -07:00
Syed Muhammad Bin Asif
609b533cb6 [Bugfix] Add proper comparison for package versions (#22314)
Signed-off-by: Syed Muhammad Bin Asif <syedmba7@connect.hku.hk>
2025-08-06 20:31:03 -07:00
qscqesze
5e9455ae8f [Bugfix]: Fix the streaming output for function calls in the minimax (#22015)
Signed-off-by: QscQ <qscqesze@gmail.com>
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-08-06 20:30:27 -07:00
Michael Goin
a00d8b236f Use float32 for test_completion.py (#22385)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-08-07 11:07:47 +08:00
Cyrus Leung
04cf435d95 [Bugfix] Fix wrong method name in Intern-S1 image processor (#22417)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-06 20:05:20 -07:00
Tao He
7377131a2c [Qwen3] Enable dual-chunk-attention support for Qwen3 models. (#21924)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-08-06 19:58:08 -07:00
Kunshang Ji
6b47ef24de [XPU]Fix flash_attn_varlen_func interface on xpu (#22350)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-06 19:28:11 -07:00
Lucas Wilkinson
1dc8a70b6d [Attention] Support multiple attention metadata builders per kv_cache_spec + proper local attention no hybrid kv cache fix (#21588)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-06 18:40:52 -07:00
Maximilien de Bayser
f825c6bd22 Support encoder_only attention for FlexAttention (#22273)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-06 18:37:14 -07:00
tc-mb
41b67f4263 [model] Support MiniCPM-V 4.0 (#22166)
Co-authored-by: imning3 <hbning@pku.edu.cn>
2025-08-06 18:35:46 -07:00
Michael Goin
e8961e963a Update flashinfer-python==0.2.10 (#22389)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-06 18:10:24 -07:00
Lain
9a3835aaa9 Fix trtllm-gen attention env and add attention sink (#22378)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lain <fusiyuan2000@hotmail.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 18:07:41 -07:00
Yongye Zhu
5c7cc33f4d [gpt-oss] fix model config with hf_config (#22401)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 18:04:04 -07:00
Chen Zhang
19c9365aa4 [gpt-oss] add demo tool server (#22393)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-06 17:47:14 -07:00
Wentao Ye
eec890c1c1 [Bug] Fix B200 DeepGEMM E8M0 Accuracy Issue (#22399)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-06 17:03:53 -07:00
Asaf Joseph Gardin
46a13949d5 [v1] - Mamba1 Attention Metadata (#21249)
Signed-off-by: asafg <asafg@ai21.com>
Co-authored-by: asafg <asafg@ai21.com>
2025-08-06 17:03:42 -07:00
Yongye Zhu
31f09c615f [gpt-oss] flashinfer mxfp4 (#22339)
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
2025-08-06 12:37:27 -07:00
Yongye Zhu
31f5dc5b2a [gpt-oss] Enhance error msg on attention sink init (#22335)
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
2025-08-06 11:41:42 -07:00
Woosuk Kwon
ec7cb19224 [gpt-oss] Add loop for built-in tool call (#22374)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 10:32:21 -07:00
Gregory Shtrasberg
2435ea7ed5 [Bugfix] Make condition in triton kernel constexpr (#22370)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-08-06 10:00:58 -07:00
Lucas Wilkinson
4a6b72c2ab [BugFix] Fix triton compile error in kernel_unified_attention_2/3d caused by attention sinks (#22368)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-08-06 09:47:38 -07:00
Zhang Jason
b4b9813b5e add the codes to check AMD Instinct GPU number (#22367)
Signed-off-by: Zhang Jason <ning.zhang2@amd.com>
2025-08-06 08:58:38 -07:00
Lucas Wilkinson
2cb6ef8996 [BugFix] Fix FA2 RuntimeError when sinks is provided (#22365)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-08-06 08:03:03 -07:00
Woosuk Kwon
9edd1db02b [Minor] Fix type (#22347)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-06 02:22:03 -07:00
Woosuk Kwon
f263a4b53f [gpt-oss] Support chat completion api (#22342) 2025-08-06 01:57:39 -07:00
Roger Wang
54991c548a [gpt-oss] add model to supported models doc (#22336)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-08-06 01:49:44 -07:00
Woosuk Kwon
178d03fbd6 [gpt-oss] Add Tool/ConversationContext classes and harmony_utils (#22340)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 01:08:49 -07:00
Isotr0py
fa00c5d75b [Misc] Clean up duplicated hf overrides (#22311)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-06 07:50:25 +00:00
Woosuk Kwon
134a8ee8fd [gpt-oss] Add openai-harmony as default dependency (#22332)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 00:10:14 -07:00
Yongye Zhu
90ec006937 [gpt-oss] flashinfer attention sink init (#22330)
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
2025-08-05 23:48:19 -07:00
Chen Zhang
a47e6ffe93 [GptOss] Add GptOss reasoning parser to support structure output (#22322)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-05 23:39:13 -07:00
Woosuk Kwon
98a3a81024 [ROCm] Add attention sink to use_rocm_custom_paged_attention (#22329)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-05 23:30:38 -07:00
Woosuk Kwon
de98252f49 Add GPT-OSS model code and config [1/N] (#22327)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-05 23:26:00 -07:00
Harry Mellor
796bae07c5 Update transformers to v4.55 (#21931)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-05 22:56:14 -07:00
Woosuk Kwon
6e20924350 Add attention sink in attention backends (#22320)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-05 22:37:21 -07:00
Woosuk Kwon
dd16bdc798 Increase openai-python version (#22316)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-05 21:43:21 -07:00
Woosuk Kwon
e3c876dca3 Upgrade FA3 for attention sink (#22313)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-05 21:36:21 -07:00
Gregory Shtrasberg
5d5d419ca6 [Bugfix][CI/Build][ROCm] Make sure to use the headers from the build folder on ROCm (#22264)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-08-05 20:39:32 -07:00
Rui Qiao
302962e806 [Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#22275)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-08-05 20:35:32 -07:00
Benjamin Chislett
7e6544c797 [Perf] Parallelize fill_bitmask to accelerate high-throughput guided decoding (#21862)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-08-05 19:57:49 -07:00
Jee Jee Li
8e6c7e873f [Bugfix] Fix MoE BNB version (#22260)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-05 19:56:22 -07:00
Michael Goin
6a51530437 [Bugfix] Fix 3D input passed into cutlass_scaled_mm (#22278)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-06 10:35:20 +08:00
Michael Goin
35509fc5be [Bugfix] Remove faulty test for oot attention backend (#22286)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-06 00:05:40 +00:00
Siyuan Liu
4b29d2784b [CI][TPU] Fix docker clean up (#22271)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-08-05 23:54:56 +00:00
youkaichao
59a0b8554b [bugfix] fix blackwell deepep installation (#22255) 2025-08-06 01:26:09 +08:00
Giancarlo Delfin
469b3ffaaa [V1] port xformers backend to v1 (#21342)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
2025-08-05 10:04:46 -07:00
Wentao Ye
ae87ddd040 [Refactor] Remove Unused Environment Variable VLLM_NO_DEPRECATION_WARNING (#22199)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-05 09:40:23 -07:00
Michael Goin
a7cb6101ca [CI/Build] Update flashinfer to 0.2.9 (#22233)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-05 09:39:38 -07:00
Michael Goin
c494f96fbc Use UV_LINK_MODE=copy in Dockerfile to avoid hardlink fail (#22128)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-05 06:57:10 -07:00
Nicolò Lucchesi
0c275ad5ad [V0 Deprecation][TPU] Remove V1 flag check from tests (#22248)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-05 06:53:23 -07:00
Ning Xie
74333ae2f6 [Misc] correct static type check for GroupCoordinator (#21946)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-05 03:17:46 -07:00
elvischenv
83156c7b89 [NVIDIA] Support Flashinfer TRT-LLM Prefill Attention Kernel (#22095)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-08-05 02:45:34 -07:00
Wentao Ye
4771df7b2b [Feature] Non-contiguous Support for FP8 Quantization (#21961)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-05 02:36:43 -07:00
Benji Beck
05fae02175 Migrate KimiVLImagePixelInputs to TensorSchema (#21769)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-08-05 02:36:18 -07:00
Nicolò Lucchesi
d1bf1b9711 [Docs][TPU] Highlight TPU Software version selection (#22242)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-05 02:33:46 -07:00
wang.yuqi
586f286789 [Model] Pooling model activation supports per request control by PoolingParams (#20538)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-05 00:37:00 -07:00
Cyrus Leung
811ac13d03 [Core] Factor out common logic for MM budget calculation (#22228)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-04 23:54:55 -07:00
Michael Goin
e79a12fc3a [UX] Fail if an invalid attention backend is specified (#22217)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-08-04 23:54:52 -07:00
Cyrus Leung
cdfd6871a5 [Bugfix] Misaligned params in TreeAttentionImpl (#22226)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-04 22:40:09 -07:00
ZiTian.Zhao
4b3e4474d7 Optimize configuration access with LRU cache in custom ops (#22204)
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-08-04 21:43:24 -07:00
Ning Xie
bd3db7f469 [Misc] log more detailed message for ensure_model_parallel_initialized (#22144)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-04 19:36:55 -07:00
Ning Xie
29b97c0995 [Doc] add backend to doc string of initialize_model_parallel (#22142)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-04 19:36:20 -07:00
elvischenv
7b455cf1c0 [Misc] Remove pass_config from CompilationConfig dump_json excluded (#21911)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-08-04 19:17:18 -07:00
tlipoca9
8a6e108e76 fix: kimi_k2 return empty tool call list (#22149)
Signed-off-by: tlipoca9 <tlipoca9@gmail.com>
2025-08-04 19:15:31 -07:00
Wentao Ye
d7b28f3415 [Log] DeepGEMM Update Log for Unaligned Problem Size (#22208)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-04 19:13:19 -07:00
Yuxuan Zhang
6fa41e0c32 self.gate dtype update for GLM-4.5 (#22203)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-08-04 19:12:38 -07:00
Gregory Shtrasberg
031ca762d7 [ROCm][Bugfix] Compilation passes fix (#22202)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-08-04 19:12:28 -07:00
TJian
6ad6b8e115 [FEAT] Refactor ROPE into module (#22192)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-04 19:12:16 -07:00
lkchen
f4f4e7ef27 [V0 deprecation][P/D] Deprecate v0 KVConnectorBase code (1/2) (#21785)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-08-04 19:11:33 -07:00
Giancarlo Delfin
5ea71ff46f [V1] reduce block size for tree attention correctness test to fix 'ou… (#22207)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
2025-08-04 19:11:06 -07:00
Woosuk Kwon
7175817637 Revert "[Bugfix] V1 Fix the cursor leakage issue during request scheduling." (#22223) 2025-08-04 18:37:06 -07:00
PiteXChen
2dffac464c [Bugfix] V1 Fix the cursor leakage issue during request scheduling. (#21173)
Signed-off-by: CLFutureX <775523362@qq.com>
2025-08-04 18:34:10 -07:00
Po-Han Huang (NVIDIA)
bdcb42e45d [NVIDIA] Auto detect modelopt quant and fix DSR1-FP4 weight loading (#22073) 2025-08-04 21:02:55 -04:00
Zhonghua Deng
c09efff976 [Bugfix][V1][P/D]Fix the uneven polling issue in the toy proxy for P2pNcclConnector (#21819)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-08-04 20:17:05 +00:00
ericehanley
309c1bb822 [Bug] Update auto_tune.sh to separate benchmarking and profiling. (#21629)
Signed-off-by: Eric Hanley <ericehanley@google.com>
2025-08-04 15:12:06 +00:00
Woosuk Kwon
9af654cc38 [Responses API] Ignore store=True and process the request by default (#22185)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-04 05:12:48 -07:00
Raghav Ravishankar
a5fff3bd49 Fix Arcee model weight loading: Add custom load_weights (#21725)
Signed-off-by: alyosha-swamy <raghav@arcee.ai>
2025-08-04 04:09:56 -07:00
Cyrus Leung
1539ced93a [Doc] Update pooling model docs (#22186)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-04 03:37:06 -07:00
22quinn
54de71d0df [Sampler] Support returning all logprobs or logits (#21792)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-04 03:04:12 -07:00
Isotr0py
fed5849d3f [Bugfix] Fix failing GGUF models test (#22174)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-04 01:27:02 -07:00
Weixiao Huang
c1b4eb048a [feat] move WEIGHT_SCALE_SUPPORTED into raise block to accelerate RLHF weight loading (#21164)
Signed-off-by: huangweixiao <huangweixiao@msh.team>
2025-08-04 15:43:06 +08:00
Jee Jee Li
a7b8788d2c [Misc] Modify the organization of GLM series (#22171)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-03 23:51:20 -07:00
Tyler Michael Smith
8ecb3e9e93 [CI Bugfix] Fix wNa16 kernel not found for test_shared_storage_connector_hashes (#22163)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-08-03 22:19:04 -07:00
Chenxi Yang
e5949e5ae0 Remove index_put from MM embeddings merging (#22105)
Co-authored-by: Chenxi Yang <cxyang@meta.com>
2025-08-03 22:15:14 -07:00
ZiTian.Zhao
49bcd893e7 [refactor] improve ConstantList exception specificity (#22156)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-03 22:14:49 -07:00
Giancarlo Delfin
aa7012eb6d Add tree attention backend for v1 (part 1) (#20401)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
2025-08-03 22:13:26 -07:00
Ning Xie
c2e75b3c11 remove duplicate code within cleanup_dist_env_and_memory (#22147)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-03 20:03:58 -07:00
Abirdcfly
0d7db16a92 [PD] add test for chat completions endpoint (#21925)
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
2025-08-03 19:57:03 -07:00
22quinn
845420ac2c [RLHF] Fix torch.dtype not serializable in example (#22158)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-04 02:43:33 +00:00
ZiTian.Zhao
e27d25a0dc [fix] fix correct assertion syntax error in attention utils. (#22154)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-03 19:24:02 -07:00
Seiji Eicher
6f5478298d Use aiohttp connection pool for benchmarking (#21981)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-08-03 19:23:32 -07:00
Isotr0py
6a39ba85fe [Bugfix] Fix failing multimodal standard test (#22153)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-03 19:04:38 +00:00
Yuxuan Zhang
d3c18c9cb0 fuse fp32 for GLM-4.5 e_score_correction_bias (#22143)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-08-03 09:04:54 -07:00
TankNee
83f7bbb318 Add chat doc in quick start (#21213)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-03 07:47:55 -07:00
Li, Jiang
b5dfb94fa0 [CI/Build][Bugfix] Fix Qwen2.5 tests in CPU CI via fallback silu_and_mul to torch native implementation (#22145)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-03 05:34:04 -07:00
Woosuk Kwon
6d98843b31 [Responses API] Disable response store by default (#22137)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-03 04:04:21 -07:00
David Ben-David
aefeea0fde [V1] [P/D] Refactor KV Connector Path (#21980)
Signed-off-by: David Ben-David <davidb@pliops.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
2025-08-03 04:03:40 -07:00
H
24d1dffbeb [executor] feat: add supports_pp attr to executors (#21786)
Signed-off-by: Haibin Lin <haibin.lin@bytedance.com>
2025-08-03 18:04:45 +08:00
Ning Xie
7de45db9a5 [Misc] update doc comment for send (#22026)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-03 00:55:20 -07:00
Roberto L. Castro
789562c28c Support CUTLASS NVFP4 (w4a4) for Blackwell Geforce GPUs (SM120) (#21309)
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
2025-08-03 00:54:22 -07:00
Ye (Charlotte) Qi
3f36c325fa [Benchmark] Support ready check timeout in vllm bench serve (#21696)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-03 00:52:38 -07:00
Isotr0py
3dddbf1f25 [Misc] Add tensor schema test coverage for multimodal models (#21754)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-03 00:52:14 -07:00
jiahanc
337eb23bcc [Fix] Fix llama4 modelopt weight loading error (#22107)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-03 00:50:34 -07:00
Rui Qiao
2ff46b8826 [Misc] Bump ray to 2.48.0 (#22123)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-08-02 19:42:00 -07:00
Xiao
554df8a6a2 Revert "[compile][startup] Disable C++ compilation of symbolic shapes" (#22122)
Signed-off-by: Xiao Liu <xiszishu@gmail.com>
2025-08-02 09:03:30 -07:00
Yan Ma
73e1b9b1d4 [xpu]support moe models on XPU platform (#21643)
Signed-off-by: yan <yan.ma@intel.com>
Signed-off-by: Yan Ma <yan.ma@intel.com>
2025-08-02 07:49:08 -07:00
Thomas Parnell
4abfd8796f [V1] [Hybrid] Validate compatibility of attention backend batch reordering at init time (#21557)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-02 05:29:40 -07:00
Cyrus Leung
f5d0f4784f [Frontend] Improve error message for too many mm items (#22114)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-02 02:20:38 -07:00
Chih-Chieh Yang
b690e34824 [Model] Mamba2 preallocate SSM output tensor to avoid d2d copy overhead (#21075)
Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-08-02 01:59:34 -07:00
Yuxuan Zhang
25373b6c6c for glm-4.1V update (#22000)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-08-02 01:46:57 -07:00
Vadim Gimpelson
58eee5f2e0 [PERF] Use faster way of decode in tokenizer: avoid useless list-to-list conversion (#20000)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-08-02 01:43:52 -07:00
Roger Wang
067c34a155 docs: remove deprecated disable-log-requests flag (#22113)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-08-02 00:19:48 -07:00
Chih-Chieh Yang
c64861d63c [Bugfix] Mamba2 remove bugged initial state condition in chunk scan (#22034)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-08-01 23:55:57 -07:00
Yong Hoon Shin
8564dc9448 Fix test_kv_sharing_fast_prefill flakiness (#22038)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-01 23:55:34 -07:00
Rui Qiao
4ac8437352 [Misc] Getting and passing ray runtime_env to workers (#22040)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-08-01 23:54:40 -07:00
vllmellm
d3a6f2120b [FEAT][ROCm] Enable running Flash Attention as ViT attn backend for Qwen-VL models on ROCm platform. (#22069)
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: tjtanaavllm <tunjian.tan@amd.com>
2025-08-01 23:53:18 -07:00
Sage Moore
0edaf752d7 [Attention][DBO] Add support for "splitting" the CommonAttentionMetadata (#21153)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-08-01 19:47:53 -07:00
Wentao Ye
6e8d8c4afb [Test] Add Unit Test for Batched DeepGEMM (#21559)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-02 10:45:46 +08:00
Nick Hill
8d524ce79f [BugFix] Improve internal DP load balancing (#21617)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-01 19:45:27 -07:00
Dipika Sikka
9f9c38c392 [Speculators][Speculative Decoding] Add Qwen Eagle3 Support (#21835)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2025-08-01 19:43:37 -07:00
Varun Sundar Rabindranath
a65f46be5e [Misc] DeepGemmExperts : Avoid JIT generation in the hot-path (#21955)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-08-01 19:42:03 -07:00
Nicolò Lucchesi
57393715e8 [Misc] VLLM_TARGET_DEVICE.lower() (#22101)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-01 19:41:40 -07:00
vllmellm
ee2eb6ecd8 [Model] Qwen2.5 VL SiLU-and-Mul (#22066)
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
2025-08-01 19:34:37 -07:00
fhl2000
23322431c8 [V1][CUDA] Full cudagraph support for FlashInfer (#21367) 2025-08-01 21:49:34 -04:00
JartX
3654847db5 feat: Add Support GPTQ Quantization MOE on ROCM vllm serve (#21733) 2025-08-01 21:12:19 -04:00
Wentao Ye
eefbf4a68b [Perf] Optimize reshape_and_cache_flash CUDA Kernel (#22036)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-01 19:18:51 -04:00
Michael Goin
88faa466d7 [CI] Initial tests for SM100 Blackwell runner (#21877)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-01 16:18:38 -07:00
Nick Hill
881e1af43a [BugFix] Harden distributed DP startup (#21538)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-01 21:40:45 +00:00
XiongfeiWei
d84b97a3e3 Add lora test for tp>1 case for TPU. (#21970)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-08-01 18:56:08 +00:00
Rui Qiao
d331759488 Introduce RayPPCommunicator for ray-based PP (#21660)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-08-01 11:50:58 -07:00
Animesh Jain
9659bc7f27 [compile][startup] Disable C++ compilation of symbolic shapes (#20836)
Signed-off-by: Animesh Jain <anijain@umich.edu>
2025-08-01 10:38:52 -07:00
Michael Goin
3277e8f9e1 Fix pre-commit failure for SECURTIY.md (#22102)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-01 10:36:07 -07:00
Jee Jee Li
8d705996df [Misc] Minor enhancement of benchmark_moe (#22068)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-02 01:35:30 +08:00
Harry Mellor
38c8bce8b6 Enable headless models for pooling in the Transformers backend (#21767)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 10:31:29 -07:00
Varun Sundar Rabindranath
ac45c44d98 [Bugfix] [Performance] DeepEPHighThroughput + DeepSeek : Quant before Dispatch (#21837)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-08-01 10:14:38 -07:00
Huzaifa Sidhpurwala
d6664664b4 security policy: take 1 (#21119)
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-08-01 10:09:49 -07:00
rongfu.leng
b879ecd6e2 [Bugfix] fix when skip tokenizer init (#21922)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-01 10:09:36 -07:00
Isotr0py
3f8e952179 [Bugfix] Fix glm4.1v video inference issue (#22067)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-01 09:33:30 -07:00
Harry Mellor
326a1b001d Improve documentation of ModelConfig.try_get_generation_config to prevent future confusion (#21526)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 09:32:27 -07:00
Harry Mellor
2d7b09b998 Deprecate --disable-log-requests and replace with --enable-log-requests (#21739)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 17:16:37 +01:00
David Xia
97608dc276 [Docs] use uv in CPU installation docs (#22089)
Signed-off-by: David Xia <david@davidxia.com>
2025-08-01 07:55:55 -07:00
Nick Hill
3146519add [BugFix] Don't change title of top-level process (#22032)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-01 07:37:55 -07:00
Richard Zou
8026a335a1 [BugFix] Update AttnFusionPass cache key (#21947)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-08-01 07:11:29 -07:00
Wentao Ye
a59cd9d9f7 [Refactor] Fix Compile Warning #1444-D (#21462)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-01 06:10:30 -07:00
Abirdcfly
5c54d9759d [Bugfix][PD] set max_completion_tokens=1 if req has this value (#21841)
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
2025-08-01 06:08:45 -07:00
Gamhang
0a6d305e0f feat(multimodal): Add customizable background color for RGBA to RGB conversion (#22052)
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
Co-authored-by: Jinheng Li <ahengljh@gmail.com>
2025-08-01 06:07:33 -07:00
Michael Goin
f81c1bb055 [Bugfix] Check NVIDIA artifactory is accessible before using flashinfer cubin kernels (#21893) 2025-08-01 08:28:45 -04:00
Harry Mellor
fb0e0d46fc Fix get_kwargs for case where type hint is list[Union[str, type]] (#22016)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 05:26:42 -07:00
TJian
26b5f7bd2a [BUG] [ROCm] Fix import bug on ROCm (#22083)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-01 05:25:20 -07:00
Dipika Sikka
dfbc1f8880 [Speculative Decoding] Add speculators config support (#21345) 2025-08-01 08:25:18 -04:00
Harry Mellor
87c94bc879 Revert "Update sampling_metadata.py (#21937)" (#22088)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 05:24:46 -07:00
Jee Jee Li
28b18cc741 [Quantization] Enable BNB support for InternS1 (#21953)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-01 11:09:54 +00:00
WeiQing Chen
4931486988 [Doc] Added warning of speculating with draft model (#22047)
Signed-off-by: Dilute-l <dilu2333@163.com>
Co-authored-by: Dilute-l <dilu2333@163.com>
2025-08-01 02:11:56 -07:00
Woosuk Kwon
0f81b310db [Misc] Remove upper bound in openai package version (#22060)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-01 02:11:40 -07:00
wuhang
e6680f9e25 [Bugfix] Add log prefix in non-dp mode engine core (#21889)
Signed-off-by: wuhang <wuhang6@huawei.com>
2025-08-01 09:04:16 +00:00
Roger Wang
27a145e893 [Doc] Add example for Step3-VL (#22061)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-08-01 08:35:49 +00:00
Simon Mo
da31f6ad3d Revert precompile wheel changes (#22055) 2025-08-01 08:26:24 +00:00
Sungyoon Jeong
98df153abf [Frontend] Align tool_choice="required" behavior with OpenAI when tools is empty (#21052)
Signed-off-by: Sungyoon Jeong <sungyoon.jeong@furiosa.ai>
2025-08-01 07:54:17 +00:00
Zebing Lin
e0f63e4a35 [Core] Avoid repeated len(block_token_ids) check in hash_request_tokens (#21781)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-08-01 00:23:29 -07:00
Cyrus Leung
b4e081cb15 [Bugfix] Disable multi-modal preprocessor cache for DP (#21896)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-01 08:03:56 +01:00
Hongsheng Liu
79731a79f0 [Doc] Fix a syntax error of example code in structured_outputs.md (#22045)
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
2025-08-01 00:01:22 -07:00
Aviad Rossmann
53d7c39271 Update sampling_metadata.py (#21937)
Signed-off-by: Aviad Rossmann <aviadr@neureality.ai>
2025-07-31 23:23:18 -07:00
Cyrus Leung
61dcc280fa [Doc] Add Voxtral to Supported Models page (#22059)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-31 23:10:56 -07:00
Kyle Sayers
0f46a780d4 [Model] [Quantization] Support quantization for Gemma3n (#21974)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-07-31 22:45:15 -07:00
Mickaël Seznec
e1a7fe4af5 [BugFix] fix: aot passes kvcache dtype information (#19750)
Signed-off-by: Mickael Seznec <mickael@mistral.ai>
2025-08-01 05:45:02 +00:00
Cyrus Leung
82de9b9d46 [Misc] Automatically resolve HF processor init kwargs (#22005)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-31 22:44:10 -07:00
Charent
ad57f23f6a [Bugfix] Fix: Fix multi loras with tp >=2 and LRU cache (#20873)
Signed-off-by: charent <19562666+charent@users.noreply.github.com>
2025-07-31 19:48:13 -07:00
Wentao Ye
3700642013 [Refactor] Remove Duplicate per_block_cast_to_fp8, Remove Dependencies of DeepGEMM (#21787)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-01 01:13:27 +00:00
Michael Goin
0bd409cf01 Move flashinfer-python to optional extra vllm[flashinfer] (#21959)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-31 18:02:11 -07:00
Matthew Bonanni
e360316ab9 Add DeepGEMM to Dockerfile in vllm-base image (#21533)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-31 18:01:55 -07:00
Wentao Ye
c3e0e9337e [Feature] Add Flashinfer MoE Support for Compressed Tensor NVFP4 (#21639)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-31 15:26:11 -07:00
Ilya Markov
6e672daf62 Add FlashInfer allreduce RMSNorm Quant fusion (#21069)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
2025-07-31 13:58:38 -07:00
Benjamin Chislett
2dff2e21d9 [Bugfix] Fix MTP weight loading (#21941) 2025-07-31 16:33:53 -04:00
Yong Hoon Shin
71470bc4af [Misc] Add unit tests for chunked local attention (#21692)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-31 11:39:16 -07:00
zhiweiz
9e0726e5bf [Meta] Official Eagle mm support, first enablement on llama4 (#20788)
Signed-off-by: morgendave <morgendave@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-07-31 10:35:07 -07:00
XiongfeiWei
53c21e492e Update torch_xla pin to 20250730 (#21956)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-07-31 17:26:43 +00:00
Alexei-V-Ivanov-AMD
0780bb5783 Removing amdproduction Tests (#22027)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-07-31 09:53:27 -07:00
Doug Smith
58bb902186 fix(setup): improve precompiled wheel setup for Docker builds (#22025)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-07-31 09:52:48 -07:00
Zhengxu Chen
7349d5268b [ez] Remove a trailing space from compilation/decorators.py (#22028) 2025-07-31 09:46:07 -07:00
Song
9484641616 [Model] Add step3 vl (#21998)
Signed-off-by: oliveryuan <yuansong@step.ai>
Co-authored-by: oliveryuan <yuansong@step.ai>
2025-07-31 23:19:06 +08:00
amirkl94
207b750e19 [NVIDIA] Add SM100 Flashinfer MoE per tensor scale fp8 backend (#21458)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-31 06:00:01 -07:00
Nick Hill
5daffe7cf6 [BugFix] Fix case where collective_rpc returns None (#22006)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-31 12:51:37 +00:00
wang.yuqi
2836dd73f1 [Model][CI] Let more pooling models support v1 (#21747)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-31 01:51:15 -07:00
Daniele
d2aab336ad [CI/Build] get rid of unused VLLM_FA_CMAKE_GPU_ARCHES (#21599)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
2025-07-31 15:00:08 +08:00
Cyrus Leung
9532a6d563 [Deprecation] Remove deprecated args and methods (#21907)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 23:46:38 -07:00
Ning Xie
3e36fcbee6 [Bugfix]: fix metadata file copy in test_sharded_state_loader (#21830)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-31 06:22:11 +00:00
Michael Goin
055bd3978e [CI Bugfix] Fix CI OOM for test_shared_storage_connector_hashes (#21973)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-31 11:45:29 +08:00
Jee Jee Li
0f7919fca0 [Misc] Expand SUPPORTED_HIDDEN_SIZES for DeepEP low-latency kernels (#21818)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-30 20:41:12 -07:00
Michael Goin
61445453df [UX] Rename CUTLASS_MLA_VLLM_V1 to CUTLASS_MLA (#21966)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-30 20:40:34 -07:00
Sanchit Gandhi
ec02e536df [Bugfix] Relax lang pin for voxtral (#21833)
Signed-off-by: Sanchit Gandhi <sgandhi3141@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-30 20:38:52 -07:00
Michael Goin
9cb497bfa3 [Example] Add async_llm_streaming.py example for AsyncLLM streaming in python (#21763)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-30 18:39:46 -06:00
Zebing Lin
ca9e2be3ed [Core] Move EngineCoreRequest to Request conversion out of EngineCore (#21627)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-07-30 15:00:54 -07:00
Bram
601f856d56 [Bugfix] Fix None value handling in trace span creation for cancelled requests (#20272) 2025-07-30 14:44:02 -07:00
cascade
287f527f54 [Feature] Add async tensor parallelism for scaled mm (#20155)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-07-30 17:23:41 -04:00
Ming Yang
f12d9256b3 [Misc] Use dracut on CentOS and skip clone if repo exists for EP kernel installation (#21635)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-07-30 13:15:06 -07:00
Doug Smith
b9b753e7a7 For VLLM_USE_PRECOMPILED, only compiled .so files should be extracted (#21964) 2025-07-30 13:04:40 -07:00
Nick Hill
56bd537dde [Misc] Support more collective_rpc return types (#21845)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-30 10:20:20 -07:00
wenxindongwork
8f0d516715 [TPU] Support Pathways in vLLM (#21417)
Signed-off-by: wenxindongwork <wenxindong@google.com>
2025-07-30 10:02:12 -07:00
wxsm
f4135232b9 feat(distributed): add get_required_kvcache_layout class method to kv connector api (#20433)
Signed-off-by: wxsm <wxsms@foxmail.com>
2025-07-30 16:41:51 +00:00
Chenguang Zheng
4904e53c32 [Bugfix] SharedStorage Connector for V1 PD multimodal (#21611)
Signed-off-by: fake0fan <645327136@qq.com>
Signed-off-by: herotai214 <herotai214@gmail.com>
Co-authored-by: herotai214 <herotai214@gmail.com>
2025-07-30 09:18:37 -07:00
Cyrus Leung
004203e953 [CI/Build] Fix registry tests (#21934)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 09:10:41 -07:00
633WHU
5c765aec65 [Bugfix] Fix TypeError in scheduler when comparing mixed request_id types (#21816)
Signed-off-by: chiliu <chiliu@paypal.com>
Co-authored-by: chiliu <chiliu@paypal.com>
2025-07-30 08:54:44 -07:00
Yong Hoon Shin
ad510309ee Override attention metadata for fast prefill in some KV sharing setups (#21590)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-30 08:54:15 -07:00
Cyrus Leung
366f6b3a4d [Bugfix] Fix multi-api server not working for text models (#21933)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 08:42:05 -07:00
Isotr0py
6e599eebe8 [Bugfix] Fix OOM tests in initialization test (#21921)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-30 07:35:47 -07:00
Harry Mellor
88edf5994c [Docs] Reduce the size of the built docs (#21920)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-30 07:35:08 -07:00
Po-Han Huang (NVIDIA)
ff08e51940 [NVIDIA] Fix Llama4 Scout FP4 functionality issues (#21499)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-07-30 07:33:40 -07:00
Ruixiang Tan
8f4a1c9a04 [Misc] Improve code readability of KVCacheManager (#21673)
Signed-off-by: tanruixiang <tanruixiang0104@gmail.com>
Signed-off-by: Ruixiang Tan <819464715@qq.com>
Signed-off-by: GitHub <noreply@github.com>
2025-07-30 07:20:43 -07:00
Harry Mellor
36ede45989 Reduce time wasted in GitHub Actions using concurrency (#21919)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-30 07:18:02 -07:00
Cyrus Leung
0e40b26073 [CI/Build] Only run markdownlint in CI (#21892)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-30 07:17:14 -07:00
Wentao Ye
0271c2ff2f [Test] Add Benchmark and Unit Test for per_token_group_quant (#21860)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-30 07:15:02 -07:00
youkaichao
e91d3c9cda [misc] skip p2p check by default (#21904) 2025-07-30 22:05:04 +08:00
Yan Pashkovsky
bf668b5bf5 [Feature] Support multiple api keys in server (#18548)
Signed-off-by: Yan Pashkovsky <yanp.bugz@gmail.com>
2025-07-30 07:03:23 -07:00
rongfu.leng
da3e0bd6e5 [Bugfix] we should use metavar is not choices (#21902)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-30 06:51:58 -07:00
Cyrus Leung
fcfd1eb9c5 [Doc] Remove vLLM prefix and add citation for PagedAttention (#21910)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 06:36:34 -07:00
aladerran
d979dd6beb [Feature][EPLB] Add eplb support for Qwen3 (#20815)
Signed-off-by: aladerran <aladerran@gmail.com>
2025-07-30 06:27:57 -07:00
Eric Curtin
b876860c62 [Hardware][CPU] Build fix for ARM without BF16 (#21848)
Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-07-30 06:22:00 -07:00
Patrick von Platen
13986365a9 Add @patrickvonplaten as maintainer of mistral's related files. (#21928)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-07-30 20:42:51 +08:00
Hongsheng Liu
5c8fe389d6 [Docs] Fix the example code of streaming chat completions in reasoning (#21825)
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: Zi Wang <66560864+BruceW-07@users.noreply.github.com>
2025-07-30 12:11:58 +00:00
Cyrus Leung
5bbaf492a6 [Doc] Update partial support (#21916)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 01:32:39 -07:00
Peter Pan
533db0935d [benchmark] add max-concurrency in result table (#21095)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-07-30 01:15:43 -07:00
Jee Jee Li
fc91da5499 [Model] Remove DSV2 unused code (#21903)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-30 00:55:03 -07:00
Varun Vinayak Shenoy
547795232d [Tests] Fixing bug inside MultiModalProfiler. (#21842)
Signed-off-by: Varun Shenoy <varun.vinayak.shenoy@oracle.com>
2025-07-30 00:44:15 -07:00
Kebe
30ef30ed5a [CI] rollback lint-and-deploy pipeline using amd machine (#21912)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-07-30 00:37:59 -07:00
Jee Jee Li
02f82fe438 [Doc] Update Intern-S1 info (#21908)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-29 23:58:57 -07:00
Cyrus Leung
2ca5f82c2a [Misc] Remove redundant config definitions (#21891)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-29 23:54:18 -07:00
Louie Tsai
6f8d261882 Update vLLM Benchmark Suite for Xeon based on 0.9.2 release (#21486)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-07-30 05:57:03 +00:00
Ricardo Decal
4cd7fe6cea [Docs] Expand introduction to Ray in Multi-node deployment section (#21584)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-29 22:07:28 -07:00
Cyrus Leung
16f3250527 [CI/Build] Fix pre-commit failure in docs (#21897)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-29 21:53:08 -07:00
Tao He
e3bc17ceea Add @sighingnow as maintainer of qwen's related files. (#21895)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-07-29 21:30:44 -07:00
Kunshang Ji
05cbbe20c5 [XPU] use ZE_AFFINITY_MASK for device select on xpu (#21815)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-07-30 03:56:14 +00:00
wang.yuqi
65f311ce59 [Frontend] Add LLM.reward specific to reward models (#21720)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-29 20:56:03 -07:00
Wentao Ye
1b0a155534 [Perf] Using __nv_fp8_e4m3 instead of c10::e4m3 for per_token_group_quant (#21867)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-29 21:50:46 -06:00
Cyrus Leung
44bc46da60 [Bugfix] Actually disable processing cache when API server is scaled out (#21839)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-29 20:36:04 -07:00
MingzhenHan
b7b23da4d2 [Bugfix] Fix comment typo of get_num_common_prefix_blocks() (#21827)
Signed-off-by: MingzhenHan <hanmingzhen2002@outlook.com>
2025-07-29 20:35:33 -07:00
Areeb Syed
fdde18229e [Bugfix] Fix shape mismatch assertion error when loading Gemma3n model with BitsAndBytes quantization (#21808)
Signed-off-by: sydarb <areebsyed237@gmail.com>
2025-07-30 11:35:21 +08:00
Csrayz
b917da442b Expose PyTorch profiler configuration to environment variables (#21803)
Signed-off-by: Csrayz <33659823+Csrayz@users.noreply.github.com>
2025-07-29 19:46:31 -07:00
Michael Goin
fb58e3a651 [Docs] Update docker.md with HF_TOKEN, new model, and podman fix (#21856) 2025-07-29 19:45:41 -07:00
Chen Zhang
76080cff79 [DOC] Fix path of v1 related figures (#21868)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-29 19:45:18 -07:00
Harry Mellor
ba5c5e5404 [Docs] Switch to better markdown linting pre-commit hook (#21851)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 19:45:08 -07:00
Chen Zhang
555e7225bc [v1][attention] Support Hybrid Allocator + FlashInfer (#21412)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-07-30 01:45:29 +00:00
milesial
0e36abf993 [Bugfix] Correct max tokens for non-contiguous embeds (#21798)
Signed-off-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com>
Co-authored-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com>
2025-07-30 01:16:25 +00:00
Simon Mo
452b2a3180 [ci] mark blackwell test optional for now (#21878) 2025-07-29 18:03:27 -07:00
Simon Mo
0d0cc9e150 [ci] add b200 test placeholder (#21866)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-07-29 17:11:50 -07:00
Yong Hoon Shin
9266d98048 [BugFix] Fix interleaved sliding window not set for Gemma3n (#21863)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-29 16:34:19 -07:00
Gregory Shtrasberg
176bbce1db Revert "[AMD][CI/Build] Fix the AMD issue caused by inappropriate of symbol exposure (#21647)" (#21850)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-29 21:56:29 +00:00
Doug Smith
a1873db23d docker: docker-aware precompiled wheel support (#21127)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-07-29 14:45:19 -07:00
Michael Goin
a33ea28b1b Add flashinfer_python to CUDA wheel requirements (#21389)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-29 12:51:58 -07:00
David Xia
7b49cb1c6b [Doc] update Contributing page's testing section (#18272)
Signed-off-by: David Xia <david@davidxia.com>
2025-07-29 10:32:46 -07:00
Varun Sundar Rabindranath
f03e9cf2bb [Doc] Add FusedMoE Modular Kernel Documentation (#21623)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-29 10:32:30 -07:00
David Xia
37f86d9048 [Docs] use uv in GPU installation docs (#20277)
Signed-off-by: David Xia <david@davidxia.com>
2025-07-29 10:32:06 -07:00
elvischenv
58b11b24a6 [Bugfix] Fix workspace buffer None issue for Flashinfer TRTLLM Backend (#21525)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-07-29 10:34:00 -04:00
Wenhua Cheng
ad341c5194 [Bugfix]fix mixed bits and visual language model quantization in AutoRound (#21802)
Signed-off-by: Wenhua Cheng <wenhua.cheng@intel.com>
2025-07-29 07:26:31 -07:00
Brittany
759b87ef3e [TPU] Add an optimization doc on TPU (#21155)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 07:23:19 -07:00
Harry Mellor
f693b067a2 [Docs] Merge design docs for a V1 only future (#21832)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 07:22:50 -07:00
Richard Zou
04e38500ee [Bugfix] VLLM_V1 supports passing other compilation levels (#19340)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-07-29 09:35:58 -04:00
Cyrus Leung
ab714131e4 [Doc] Update compatibility matrix for pooling and multimodal models (#21831)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-29 06:29:51 -07:00
Chen Zhang
755fa8b657 [KVCache] Make KVCacheSpec hashable (#21791)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-07-29 19:58:29 +08:00
Kay Yan
2470419119 [Docs] Fix the outdated URL for installing from vLLM binaries (#21523)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 04:56:27 -07:00
Jee Jee Li
61a6905ab0 [Model] Refactor JambaForCausalLM (#21394)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-29 18:25:07 +08:00
Reza Barazesh
37efc63b64 [V0 deprecation] Guided decoding (#21347)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 03:15:30 -07:00
Isotr0py
a4528f0cac [Model]: Fused MoE for nomic-embed-text-v2-moe (#18321)
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-29 03:13:27 -07:00
Cyrus Leung
a2480251ec [Doc] Link to RFC for pooling optimizations (#21806)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 23:53:18 -07:00
Nick Hill
7234fe2685 [Misc] Rework process titles (#21780)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-29 05:14:47 +00:00
Benji Beck
f1e2c095ec Migrate InternVLImageInputs and InternVLVideoInputs to TensorSchema (#21684)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-28 22:09:45 -07:00
Gregory Shtrasberg
12a223ef9b [AMD][CI/Build][Bugfix] Guarding CUDA specific functions by ifndef ROCM (#21766)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-29 03:35:37 +00:00
Calvin Chen
e18f085103 skip fusedmoe layer for start_load_kv (#21378)
Signed-off-by: calvin chen <wen.chen@dynamia.ai>
2025-07-28 18:59:44 -07:00
Michael Goin
afa2607596 [CI] Parallelize Kernels MoE Test (#21764)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-28 18:56:24 -07:00
Wentao Ye
48b763d6b5 [Refactor] Merge Compressed Tensor FP8 CompressedTensorsW8A8Fp8MoEMethod and CompressedTensorsW8A8Fp8MoECutlassMethod (#21775)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-28 19:47:21 -06:00
Michael Goin
947e982ede [Docs] Minimize spacing for supported_hardware.md table (#21779) 2025-07-28 18:46:39 -07:00
lyrisz
c6c9122d50 [Kernel] SM90 CUTLASS FP8 GEMM: add support for swap AB + kernel tuning (#20396)
Signed-off-by: Faqin Zhong <faqin.zhong@gmail.com>
Co-authored-by: Duncan Moss <djm.moss@gmail.com>
2025-07-28 23:13:58 +00:00
Lucas Wilkinson
8aa1485fcf [Perf] Disable chunked local attention by default with llama4 (#21761)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-28 18:49:04 -04:00
Nikhil Gupta
89ac266b26 [Feat]: Add support for Dynamic Quant 4 bit CPU kleidiai kernels (#17112)
Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-28 20:55:15 +00:00
Clayton Coleman
c6f36cfa26 [Bugfix] DeepGEMM is not enabled on B200 due to _lazy_init() (#21472)
Signed-off-by: Clayton Coleman <smarterclayton@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-28 20:51:22 +00:00
Kuntai Du
b18b417fbf Revert "[V1] Exception Handling when Loading KV Cache from Remote Store" (#21778)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2025-07-28 20:15:18 +00:00
Lu Fang
9ba1c88a93 [AMD][CI/Build] Fix the AMD issue caused by inappropriate of symbol exposure (#21647)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-07-28 20:11:16 +00:00
Wentao Ye
e0e58f9729 [Bug] Enforce contiguous input for dynamic_scaled_fp8_quant and static_scaled_fp8_quant (#21773)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-28 19:55:48 +00:00
rasmith
b361f14e39 [AMD][BugFix] Fix omission of wvSplitK kernel for small batch sizes (1-4) due to torch.compile (#21350)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-07-28 15:38:20 -04:00
weiliang
01c753ed98 update flashinfer to v0.2.9rc2 (#21701)
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
2025-07-28 19:31:47 +00:00
Harry Mellor
94b71ae106 Use metavar to list the choices for a CLI arg when custom values are also accepted (#21760)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-28 19:31:10 +00:00
Nick Hill
7d44c691b0 [P/D] Log warnings related to prefill KV expiry (#21753)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-28 18:40:53 +00:00
Cyrus Leung
e17a4d3bf9 [Bugfix] Fix granite speech shape validation (#21762)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 14:19:21 -04:00
Chaojun Zhang
ec261b0291 [XPU] IPEX-optimized Punica Wrapper on XPU (#21703)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-28 16:43:37 +00:00
Cyrus Leung
04fe61aa3d [CI/Build] Fix plugin tests (#21758)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 15:08:05 +00:00
Michard Hugo
25708d317a [Bugfix] Mistral crashes on tool with no description (#21167)
Signed-off-by: HugoMichard <hugo@harfanglab.fr>
2025-07-28 08:03:35 -07:00
Cyrus Leung
0e18a5d058 [Misc] Reduce logs for model resolution (#21765)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 07:59:56 -07:00
Michael Goin
34a20c49b3 [Logs] Change flashinfer sampler logs to once (#21759)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-28 06:59:51 -07:00
Isotr0py
31084b3b1f [Bugfix][CI/Build] Update peft version in test requirement (#21729)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-28 06:17:43 -07:00
wuhang
bccc43c033 [Bugfix]check health for engine core process exiting unexpectedly (#21728)
Signed-off-by: wuhang <wuhang6@huawei.com>
2025-07-28 06:17:31 -07:00
Harry Mellor
1395dd9c28 [Docs] Add revision date to rendered docs (#21752)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-28 06:12:46 -07:00
Keyang Ru
9ace2eaf35 [Bugfix] Improve JSON extraction in LlamaToolParser (#19024)
Signed-off-by: keru <keyang.ru@oracle.com>
Co-authored-by: keru <keyang.ru@oracle.com>
2025-07-28 12:36:58 +00:00
Anton Vlasjuk
656c24f1b5 [Ernie 4.5] Name Change for Base 0.3B Model (#21735)
Signed-off-by: vasqu <antonprogamer@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 12:22:32 +00:00
Chauncey
63fe3a700f [PD] let p2p nccl toy proxy handle /chat/completions (#21734)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-28 11:45:50 +00:00
Isotr0py
0ae970ed15 [Bugfix] Fix glm4.1v video_grid_thw tensor shape scheme (#21744)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-28 04:26:49 -07:00
Li, Jiang
65e8466c37 [Bugfix] Fix environment variable setting in CPU Dockerfile (#21730)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-28 11:02:39 +00:00
Jee Jee Li
1b769dccf3 [Bugfix] Fix Ernie4_5_MoeForCausalLM shared experts (#21717)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-28 11:02:25 +00:00
rongfu.leng
2cc571199b [feature] add log non default args in LLM (#21680)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-28 02:21:22 -07:00
Cyrus Leung
a4ed731546 [Model] Prioritize Transformers fallback over suffix matching (#21719)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 02:15:31 -07:00
Benji Beck
d128d0d554 Migrate KeyeImageInputs and KeyeVideoInputs to TensorSchema (#21686)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-28 01:16:35 -07:00
Asaf Joseph Gardin
a6c050286a [v1][mamba] Added mamba_type into MambaSpec (#21715)
Signed-off-by: asafg <asafg@ai21.com>
Co-authored-by: asafg <asafg@ai21.com>
2025-07-28 08:15:55 +00:00
Lucas Wilkinson
139a7f07bd [BugFix] Fix ChunkedLocalAttention when the hybrid kv-cache is disabled (#21707)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-28 07:18:47 +00:00
Ning Xie
150d9e6337 [Bugfix] fix max-file-size type from str to int (#21675)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-28 00:06:52 -07:00
Cyrus Leung
139a97ec56 [Bugfix] Fix shape checking for Fuyu (#21709)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 00:05:56 -07:00
rongfu.leng
18cc33dd60 [bugfix] fix profile impact benchmark results (#21507)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-27 22:44:24 -07:00
Hongsheng Liu
7656cf4cf3 [Bugfix] [issue-21565] Fix the incompatibility issue with stream and named function calling when Thinking is disabled (#21573)
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
2025-07-27 22:43:50 -07:00
Benji Beck
3ea57a56d9 Migrate Idefics3ImagePixelInputs and Idefics3ImageEmbeddingInputs to … (#21683)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-27 22:37:23 -07:00
Benji Beck
75856bc2cb Migrate GraniteSpeechAudioInputs to TensorSchema (#21682)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-07-27 22:37:20 -07:00
Benji Beck
304dcdf575 Migrate GLMVImagePixelInputs to TensorSchema (#21679)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-27 22:36:11 -07:00
Benji Beck
88e46c7c8d Migrate Glm4vImageInputs, Glm4vVideoInputs to TensorSchema (#21678)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-07-27 22:36:08 -07:00
Benji Beck
d8937de4c8 Migrate Gemma3ImagePixelInputs to TensorSchema (#21676)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-27 22:36:05 -07:00
TJian
e626d286f5 [FEAT] [ROCm] [AITER]: Add AITER HIP block quant kernel (#21242) 2025-07-28 05:07:06 +00:00
Shinichi Hemmi
c7ffe93d9c [Model] Support TP/PP/mamba2 kernel for PLaMo2 (#19674)
Signed-off-by: Shinichi Hemmi <shemmi@preferred.jp>
Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Co-authored-by: Calvin Metzger <metzger@preferred.jp>
Co-authored-by: Sixue Wang <cecilwang@preferred.jp>
2025-07-28 05:00:47 +00:00
Adeline
15a72ac478 [V1] Exception Handling when Loading KV Cache from Remote Store (#21534)
Signed-off-by: liuyumoye <adeline_ly2023@outlook.com>
Co-authored-by: liuyumoye <adeline_ly2023@outlook.com>
2025-07-27 20:34:17 -07:00
Jee Jee Li
04ff4be310 [Misc] Add fused_moe configs for Qwen3-Coder-480B-A35B-Instruct-FP8 (#21700)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-27 20:12:18 -07:00
Yuxuan Zhang
93269bb43e Fix GLM tool parser (#21668)
Co-authored-by: Chenhui Zhang <zhang.chenhui@outlook.com>
2025-07-28 10:46:38 +08:00
Joachim Studnia
82acf2184d Fix typo for limit-mm-per-prompt in docs (#21697)
Signed-off-by: Joachim Studnia <joachim@mistral.ai>
2025-07-27 19:45:37 -07:00
Cyrus Leung
86ae693f20 [Deprecation][2/N] Replace --task with --runner and --convert (#21470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-27 19:42:40 -07:00
Alexander Matveev
8f605ee309 [Attention] Make CutlassMLA the default backend for SM100 (blackwell) (#21626)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-27 20:13:00 +00:00
Ning Xie
a9b2a1d704 [Misc] Refactor vllm config str (#21666) 2025-07-27 09:51:44 -07:00
Caleb_Du
57c22e57f9 Fix CUDA permute/unpermute for use with DeepGemm Moe (#17934)
Signed-off-by: Caleb_Du <Caleb_Du@zju.edu.cn>
2025-07-27 07:08:00 -07:00
Wentao Ye
bda9d0535f [Refactor] Refactor MOE NVFP4 Code Base: ModelOpt + Compressed Tensor (#21631)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-27 05:25:21 -07:00
Isotr0py
3d847a3125 [VLM] Add video support for Intern-S1 (#21671)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-27 11:49:43 +00:00
Benji Beck
5f8c9a425e Migrate Florence2ImagePixelInputs to TensorSchema (#21663)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-27 02:43:02 -07:00
Ning Xie
1cbf951ba2 [Misc] add default value for file pattern arg (#21659)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-27 05:14:51 +00:00
ZiTian.Zhao
a8936e5193 Refactor: Remove numpy dependency from LoggingStatLogger (#20529)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-07-27 04:06:21 +00:00
Ye (Charlotte) Qi
01a395e9e7 [CI/Build][Doc] Clean up more docs that point to old bench scripts (#21667)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-07-27 04:02:12 +00:00
Huy Do
971948b846 Handle non-serializable objects in vllm bench (#21665) 2025-07-27 03:35:22 +00:00
Isotr0py
eed2f463b2 [VLM] Support HF format Phi-4-MM model (#17121)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-26 20:07:57 -07:00
Benji Beck
20950b29fb Migrate ChameleonImagePixelInputs to TensorSchema (#21657)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-26 19:34:25 -07:00
Benji Beck
3339cba3ff Migrate FuyuImagePatchInputs to TensorSchema (#21662)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-26 19:34:14 -07:00
Benji Beck
0b8caf9095 Migrate DeepseekVL2ImageInputs to TensorSchema (#21658)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-26 19:34:11 -07:00
Benji Beck
ccf27cc4d4 Migrate Blip2ImagePixelInputs and Blip2ImageEmbeddingInputs to TensorSchema (#21656)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-27 10:33:52 +08:00
Jinzhen Lin
c657369841 support torch.compile for bailing moe (#21664) 2025-07-26 23:54:32 +00:00
Wenchen Lo
6c66f28fa5 Remove xformers requirement for Mistral-format Pixtral and Mistral3 (#21154)
Signed-off-by: Wenchen Lo <charles761013@gmail.com>
2025-07-26 17:20:29 -06:00
Kaixi Hou
de509ae8eb [NVIDIA] Explicitly disable shuffled weights for flashinfer blockscale moe fp8 kernels (#21411)
Signed-off-by: kaixih <kaixih@nvidia.com>
2025-07-26 07:10:36 -07:00
Ye (Charlotte) Qi
e7c4f9ee86 [CI/Build][Doc] Move existing benchmark scripts in CI/document/example to vllm bench CLI (#21355)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-07-26 07:10:14 -07:00
Yeju Zhou
9094d11c5d [Bugfix][Apple Silicon] fix missing symbols when build from source on Mac with Apple Silicon (#21380)
Signed-off-by: Yeju Zhou <yejuzhou@outlook.com>
2025-07-26 07:09:57 -07:00
Wentao Ye
56e544f24b [Refactor] Remove moe_align_block_size_triton (#21335)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-26 07:08:29 -07:00
WeiQing Chen
97d6c30cc9 [BugFix] Fix shared storage connector load kv only load attention layer (#21428)
Signed-off-by: David Chen <530634352@qq.com>
2025-07-26 07:07:40 -07:00
Ye (Charlotte) Qi
a40a8506df [Misc] Improve memory profiling debug message (#21429)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-07-26 07:07:21 -07:00
Wentao Ye
c215f5c877 [Bug] Fix has_flashinfer_moe Import Error when it is not installed (#21634)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-26 07:06:14 -07:00
Maximilien de Bayser
1cd6eaba54 Support encoder-only models without KV-Cache (#21270)
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-07-26 21:09:52 +08:00
Isotr0py
f27fdfc3ed [Bugfix] Investigate Qwen2-VL failing test (#21527)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-26 06:09:29 -07:00
Benji Beck
de10ff0b7c Migrate AyaVisionImagePixelInputs to TensorSchema for shape validation (#21622)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-26 06:08:18 -07:00
Benji Beck
9d197280fa Migrate AriaImagePixelInputs to TensorSchema for shape validation (#21620)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-26 06:08:15 -07:00
Huy Do
e98def439c [Take 2] Correctly kill vLLM processes after benchmarks (#21646)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-07-26 06:06:05 -07:00
Reid
05c1126f29 [Misc] remove unused try-except in pooling config check (#21618)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-26 12:20:03 +00:00
Lyu Han
875af38e01 Support Intern-S1 (#21628)
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-26 19:14:04 +08:00
QiliangCui
7728dd77bb [TPU][Test] Divide TPU v1 Test into 2 parts. (#21431) 2025-07-26 06:20:30 +00:00
Alexandre JUAN
2f6e6b33fb [Bugfix] Fix isinstance check for tensor types in _load_prompt_embeds to use dtype comparison (#21612)
Signed-off-by: Alexandre Juan <a.juan@netheos.net>
2025-07-25 20:11:10 -07:00
Huy Do
a55c95096b Correctly kill vLLM processes after finishing serving benchmarks (#21641)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-07-25 19:06:21 -07:00
WeiQing Chen
97349fe2bc [Docs] add offline serving multi-modal video input example Qwen2.5-VL (#21530)
Signed-off-by: David Chen <530634352@qq.com>
2025-07-25 18:37:32 -07:00
Farzad Abdolhosseini
62965de5fe [Model] Ultravox: Support Llama 4 and Gemma 3 backends (#17818)
Signed-off-by: Farzad Abdolhosseini <farzad@fixie.ai>
Signed-off-by: Patrick Li <patrick8289@gmail.com>
Co-authored-by: Patrick Li <patrick8289@gmail.com>
2025-07-25 18:12:31 -07:00
Alex Kogan
7ae75fa6d0 [Feature] Add support for MoE models in the calibration-free RTN-based quantization (#20766)
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
2025-07-25 18:09:34 -07:00
Chengji Yao
f1b286b2fb [TPU] Update ptxla nightly version to 20250724 (#21555)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-25 17:09:00 -07:00
Rui Qiao
c7742d6113 [Bugfix] Always set RAY_ADDRESS for Ray actor before spawn (#21540)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-25 17:08:30 -07:00
Rui Qiao
cea96a0156 [Bugfix] Fix sync_and_slice_intermediate_tensors (#21537)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-25 17:07:58 -07:00
Yong Hoon Shin
2eddd437ba Add interleaved RoPE test for Llama4 (Maverick) (#21478)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-25 17:07:26 -07:00
Wentao Ye
75d29cf4e1 [Perf] Cuda Kernel for Int8 Per Token Group Quant (#21476)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-25 17:07:07 -07:00
Daniel Han
41d3082c41 Add Unsloth to RLHF.md (#21636) 2025-07-25 17:06:48 -07:00
QiliangCui
7cfea0df39 [TPU][Test] Rollback PR-21550. (#21619)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-25 13:22:01 -07:00
Wenhua Cheng
5ac3168ee3 [Docs] add auto-round quantization readme (#21600)
Signed-off-by: Wenhua Cheng <wenhua.cheng@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-25 08:52:42 -07:00
Kebe
396ee94180 [CI] Unifying Dockerfiles for ARM and X86 Builds (#21343)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-07-25 07:33:56 -07:00
mgazz
e189b50f53 Add support for Prithvi in Online serving mode (#21518)
Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-25 07:01:27 -07:00
czhu-cohere
136d750f5f [Kernel] Improve machete memory bound perf (#21556)
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
2025-07-25 06:53:21 -07:00
who who who
b3caeb82e7 [ROCm][AITER] Enable fp8 kv cache on rocm aiter backend. (#20295)
Signed-off-by: fsx950223 <fsx950223@outlook.com>
Signed-off-by: amd-ruitang3 <Rui.Tang2@amd.com>
Co-authored-by: amd-ruitang3 <Rui.Tang2@amd.com>
2025-07-25 06:50:21 -07:00
Chih-Chieh Yang
eab2f3980c [Model] Replace Mamba2 RMSNorm Gated with Fused Triton Kernel (#20839)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: Yu Chin Fabian Lim <fabian.lim@gmail.com>
Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Yu Chin Fabian Lim <fabian.lim@gmail.com>
2025-07-25 06:49:36 -07:00
kourosh hakhamaneshi
9fe98d4250 [Frontend] Add request_id to the Request object so they can be controlled better via external load balancers (#21009)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-07-25 06:49:11 -07:00
bigshanedogg
29c6fbe58c [MODEL] New model support for naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B (#20931)
Signed-off-by: bigshanedogg <bigshane319@gmail.com>
2025-07-25 06:05:42 -07:00
xyxinyang
c72f049cb4 [Model] Fix Ernie4.5MoE e_score_correction_bias parameter (#21586)
Signed-off-by: zhouchong <zhouchong03@baidu.com>
Co-authored-by: zhouchong <zhouchong03@baidu.com>
2025-07-25 06:02:53 -07:00
Mengqing Cao
f3a683b7c9 [Bugfix][Logprobs] Fix logprobs op to support more backend (#21591)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-25 05:53:07 -07:00
Cyrus Leung
46d81d6951 [V1] Get supported tasks from model runner instead of model config (#21585)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-25 05:36:45 -07:00
Jee Jee Li
5c3f2628d5 [Quantization] Enable BNB support for more MoE models (#21370)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-25 03:57:34 -07:00
Kebe
7311f74468 [Bugfix] GGUF: fix AttributeError: 'PosixPath' object has no attribute 'startswith' (#21579)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-07-25 03:42:23 -07:00
Xu Wenqing
8ed01e32f7 Add H20-3e fused MoE kernel tuning configs for Qwen3-Coder-480B-A35B-Instruct (#21598)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-07-25 02:36:55 -07:00
Nick Hill
e38e96a3c0 [Tests] Harden DP tests (#21508)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-25 02:27:24 -07:00
Chengji Yao
40d86ee412 [TPU][Bugfix] fix OOM issue in CI test (#21550)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-24 23:01:53 -07:00
Yang Chen
85d051f026 [Misc] Removed undefined cmake variables MOE_PERMUTE_ARCHS (#21262)
Signed-off-by: Yang Chen <yangche@fb.com>
2025-07-24 22:54:23 -07:00
Ignacio Sica
5140f54b89 [CI/Build] fix cpu_extension for apple silicon (#21195)
Signed-off-by: ignaciosica <mignacio.sica@gmail.com>
2025-07-24 22:53:59 -07:00
Chengji Yao
947edd099e [Misc][Tools] make max-model-len a parameter in auto_tune script (#21321)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-24 22:46:43 -07:00
hfan
fde60ee775 [Model] Fix a check for None but the return value was empty list in Gemma3 MM vision_embeddings (#21479)
Signed-off-by: Hongmin Fan <fanhongmin@google.com>
2025-07-25 13:46:06 +08:00
Jason Gu
b38bc652ac [Model] Support tensor parallel for timm ViT in Deepseek_vl2 (#21494)
Signed-off-by: wzqd <1057337859@qq.com>
2025-07-24 22:45:16 -07:00
Ning Xie
adaf2c6d4f [Bugfix] fix modelscope snapshot_download serialization (#21536)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-24 22:44:38 -07:00
Li, Jiang
42343f1f89 [CI] Update CODEOWNERS for CPU and Intel GPU (#21582)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-24 21:58:03 -07:00
Benji Beck
965bc71b04 Integrate TensorSchema with shape validation for Phi3VImagePixelInputs (#21232)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-24 21:43:52 -07:00
Zhou Fang
807a328bb6 [Docs] Add requirements/common.txt to run unit tests (#21572)
Signed-off-by: Zhou Fang <fang.github@gmail.com>
2025-07-24 20:51:15 -07:00
QiliangCui
e0be2c4d09 [TPU][Test] Temporarily suspend this MoE model in test_basic.py. (#21560)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-24 20:44:50 -07:00
Nick Hill
9c8b2c2a8a [DP] Support api-server-count > 0 in hybrid DP LB mode (#21510)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-24 20:18:16 -07:00
Varun Sundar Rabindranath
2212cd6cfb [Bugfix] DeepGemm utils : Fix hardcoded type-cast (#21517)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-24 20:17:29 -07:00
Burkhard Ringlein
ce3a9b1378 [Kernel] adding fused_moe configs for upcoming granite4 (#21332)
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-24 20:16:59 -07:00
Yuxuan Zhang
2ce90e5b01 Fix GLM-4 PP Missing Layer When using with PP. (#21531)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-07-24 20:07:38 -07:00
Wentao Ye
633f6e804b [Bug] Fix DeepGemm Init Error (#21554)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-24 20:07:22 -07:00
Harry Mellor
b57296bb9a [Docs] Fix site_url for RunLLM (#21564)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-24 20:05:58 -07:00
Cyrus Leung
34ddcf9ff4 [Frontend] run-batch supports V1 (#21541)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-24 20:05:55 -07:00
Woosuk Kwon
fe56180c7f [MoE] More balanced expert sharding (#21497)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-07-24 15:56:08 -07:00
QiliangCui
07d80d7b0e [TPU][TEST] HF_HUB_DISABLE_XET=1 the test 3. (#21539)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-24 15:33:04 -07:00
weiliang
2dd72d23d9 update flashinfer to v0.2.9rc1 (#21485)
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
2025-07-24 14:06:11 -07:00
Simon Mo
a6c7fb8cff [Docs] Add Expert Parallelism Initial Documentation (#21373)
Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-24 12:36:06 -07:00
Ricardo Decal
a7272c23d0 [Docs][minor] Fix broken gh-file link in distributed serving docs (#21543)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-24 10:36:56 -07:00
Juncheng Gu
6066284914 [P/D] Support CPU Transfer in NixlConnector (#18293)
Signed-off-by: Juncheng Gu <juncgu@gmail.com>
Signed-off-by: Richard Liu <ricliu@google.com>
Co-authored-by: Richard Liu <39319471+richardsliu@users.noreply.github.com>
Co-authored-by: Richard Liu <ricliu@google.com>
2025-07-24 17:58:42 +01:00
Rui Qiao
1e9ea8e69d [P/D] Move FakeNixlWrapper to test dir (#21328)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-24 08:53:45 -07:00
Chaojun Zhang
d9f9a3fd96 [XPU] Conditionally import CUDA-specific passes to avoid import errors on xpu platform (#21036)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
2025-07-24 23:23:36 +08:00
Shu Wang
1b25f1fe75 Update flashinfer CUTLASS MoE Kernel (#21408)
Signed-off-by: Shu Wang. <shuw@nvidia.com>
2025-07-24 08:13:31 -07:00
Wentao Ye
e8cb0d0495 [Bug] Fix Compressed Tensor NVFP4 cutlass_fp4_group_mm illegal memory access (#21465)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-24 08:13:24 -07:00
Ricardo Decal
684174115d [Docs] Rewrite Distributed Inference and Serving guide (#20593)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-24 08:13:05 -07:00
Sanger Steel
cdb79ee63d [Docs] Update Tensorizer usage documentation (#21190)
Signed-off-by: Sanger Steel <sangersteel@gmail.com>
Signed-off-by: William Goldby <willgoldby@gmail.com>
Co-authored-by: William Goldby <willgoldby@gmail.com>
2025-07-24 06:56:18 -07:00
elvischenv
5a19a6c670 [Fix] Update mamba_ssm to 2.2.5 (#21421)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-07-24 03:25:41 -07:00
Ming Yang
2ded067fd2 [Bugfix] Fix CUDA arch flags for MoE permute (#21426)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-07-24 03:23:59 -07:00
Harry Mellor
13abd0eaf9 [Model] Officially support Emu3 with Transformers backend (#21319)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-24 03:22:12 -07:00
Lucas Wilkinson
61b8cea3b4 [Attention] Optimize FlashInfer MetadataBuilder Build call (#21137)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-24 03:21:46 -07:00
cjackal
526078a96c bump flashinfer to v0.2.8 (#21385)
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
2025-07-24 03:20:38 -07:00
Chauncey
6da0078523 [Feat] Allow custom naming of vLLM processes (#21445)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-24 03:15:23 -07:00
Rui Qiao
73e3949d07 [Misc] Improve comment for DPEngineCoreActor._set_cuda_visible_devices() (#21501)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-24 03:13:40 -07:00
Shintarou Okada
6eca337ce0 Replace --expand-tools-even-if-tool-choice-none with --exclude-tools-when-tool-choice-none for v0.10.0 (#20544)
Signed-off-by: okada <kokuzen@gmail.com>
Signed-off-by: okada shintarou <okada@preferred.jp>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-24 02:56:36 -07:00
Yuxuan Zhang
85bda9e7d0 remove GLM-4.5 quantization wrong Code (#21435) 2025-07-24 01:52:43 -07:00
22quinn
610852a423 [Core] Support model loader plugins (#21067)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-07-24 01:49:44 -07:00
Nick Hill
f0f4de8f26 [Misc] Fix duplicate FusedMoEConfig debug messages (#21455)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-24 01:27:30 -07:00
Zhou Fang
fc5f756db4 [v1][Core] Clean up usages of SpecializedManager (#21407)
Signed-off-by: Zhou Fang <fang.github@gmail.com>
2025-07-24 00:40:11 -07:00
Chengji Yao
e74bfc70e4 [TPU][Bugfix] fix moe layer (#21340)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-07-24 00:38:39 -07:00
Gregory Shtrasberg
90eeea8f85 [Bugfix][ROCm] Fix for warp_size uses on host (#21205)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-24 00:37:19 -07:00
Harry Mellor
dde295a934 Deduplicate Transformers backend code using inheritance (#21461)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-24 00:16:23 -07:00
Julien Denize
6d8d0a24c0 Add think chunk (#21333)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
2025-07-23 21:51:32 -07:00
Yinghai Lu
11ef7a611e [BugFix] Set CUDA_VISIBLE_DEVICES before spawning the subprocesses (#21211)
Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-23 21:44:04 -07:00
Woosuk Kwon
dc2f159f8a Dump input metadata on crash for async scheduling (#21258)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-23 21:10:30 -07:00
Robert Shaw
d5b981f8b1 [DP] Internal Load Balancing Per Node [one-pod-per-node] (#21238)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-07-23 20:57:32 -07:00
Nick Hill
eec6942014 [BugFix] Fix KVConnector TP worker aggregation (#21473)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-23 20:56:49 -07:00
KazusatoOoko
fd48d99ffd [BugFix]: Batch generation from prompt_embeds fails for long prompts (#21390)
Signed-off-by: KazusatoOko <kazusto.oko@sakana.ai>
Co-authored-by: KazusatoOko <kazusto.oko@sakana.ai>
2025-07-23 20:43:17 -07:00
WeiQing Chen
f8c15c4efb [Bugfix] Fix example disagg_example_p2p_nccl_xpyd.sh zombie process (#21437)
Signed-off-by: David Chen <530634352@qq.com>
2025-07-23 20:42:11 -07:00
Matthew Bonanni
aa08a954f9 [Bugfix] Fix casing warning (#21468)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-07-23 20:41:23 -07:00
Liangliang Ma
13e4ee1dc3 [XPU][UT] increase intel xpu CI test scope (#21492)
Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com>
2025-07-23 20:24:04 -07:00
Ming Yang
772ce5af97 [Misc] Add dummy maverick test to CI (#21324)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-23 20:22:42 -07:00
deven-labovitch
63d92abb7c [Frontend] Set MAX_AUDIO_CLIP_FILESIZE_MB via env var instead of hardcoding (#21374)
Signed-off-by: Deven Labovitch <deven@videa.ai>
2025-07-23 20:22:19 -07:00
Hardik Gupta
11599b0e1f feat(gguf_loader): accept HF repo paths & URLs for GGUF (#20793)
Signed-off-by: Hardik <hardikgupta1999@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-23 20:21:02 -07:00
Michael Goin
f3137cdd81 [Core] Freeze gc during cuda graph capture to speed up init (#21146)
Signed-off-by: Codex <codex@openai.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-23 17:20:14 -07:00
Michael Goin
82ec66f514 [V0 Deprecation] Remove Prompt Adapters (#20588)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-23 16:36:48 -07:00
Yong Hoon Shin
78c13e30e1 [V1] Fix local chunked attention always disabled (#21419)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-23 15:59:30 -07:00
22quinn
5c9b807b34 [Core] Add reload_weights RPC method (#20096)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-07-23 14:24:52 -07:00
QiliangCui
14bf19e39f [TPU][TEST] Fix the downloading issue in TPU v1 test 11. (#21418)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-23 11:29:36 -07:00
Yong Hoon Shin
4ac7713e32 Add test case for compiling multiple graphs (#21044)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-23 11:00:47 -07:00
Christian Pinto
8560a5b258 [Core][Model] PrithviMAE Enablement on vLLM v1 engine (#20577)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
2025-07-23 11:00:23 -07:00
Nick Hill
316b1bf706 [Tests] Add tests for headless internal DP LB (#21450)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-23 07:49:25 -07:00
Tao He
7c734ee09b [Bugfix][Qwen][DCA] fixes bug in dual-chunk-flash-attn backend for qwen 1m models. (#21364)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-07-23 06:34:37 -07:00
Cyrus Leung
f59ec35b7f [V1] Check all pooling tasks during profiling (#21299)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-23 05:53:26 -07:00
Asher
2671334d45 [Model] add Hunyuan V1 Dense Model support. (#21368)
Signed-off-by: Asher Zhang <asherszhang@tencent.com>
2025-07-23 03:54:08 -07:00
Michael Yao
2cc5016a19 [Docs] Clean up v1/metrics.md (#21449)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-07-23 03:37:25 -07:00
Yang Chen
6929f8b437 [Misc] fixed nvfp4_moe test failures due to invalid kwargs (#21246)
Signed-off-by: Yang Chen <yangche@fb.com>
2025-07-23 01:41:43 -07:00
Yu Chin Fabian Lim
32ec9e2f2a Mamba V2 Test not Asserting Failures. (#21379)
Signed-off-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
2025-07-23 01:40:27 -07:00
Lu Fang
accac82928 [Sampler] Introduce logprobs mode for logging (#21398)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-07-23 01:39:25 -07:00
Michael Yao
23637dcdef [Docs] Fix bullets and grammars in tool_calling.md (#21440)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-07-23 01:23:20 -07:00
Sergio Paniego Blanco
6364af92f8 Fixed typo in profiling logs (#21441) 2025-07-23 01:18:54 -07:00
Guillaume Calmettes
7aaa2bd5a8 [Bugfix] ensure tool_choice is popped when tool_choice:null is passed in json payload (#19679)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-07-23 00:30:05 -07:00
youkaichao
2f5c14de6a add clear messages for deprecated models (#21424)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-07-23 00:03:16 -07:00
Michael Goin
f002e9a870 [Cleanup] Only log MoE DP setup warning if DP is enabled (#21315)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-23 00:02:48 -07:00
Jialin Ouyang
a1f3610fc6 [Core] Add basic unit test for maybe_evict_cached_block (#21400)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-07-23 00:02:02 -07:00
Isotr0py
4ecedd1806 [Bugfix] Fix nightly transformers CI failure (#21427)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-23 00:01:01 -07:00
Alexei-V-Ivanov-AMD
107111a859 Changing "amdproduction" allocation. (#21409)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-07-22 20:48:31 -07:00
elvischenv
2dec7c1a5d [Bugfix][CUDA] fixes CUDA FP8 kv cache dtype supported (#21420)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-07-22 20:34:50 -07:00
Chendi.Xue
08d2bd78da [BUGFIX] deepseek-v2-lite failed due to fused_qkv_a_proj name update (#21414)
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
2025-07-22 20:33:57 -07:00
ericehanley
4f76a05f4f [BugFix] Update python to python3 calls for image; fix prefix & input calculations. (#21391)
Signed-off-by: Eric Hanley <ericehanley@google.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-22 20:33:00 -07:00
Harry Mellor
f154bb9ff0 Simplify weight loading in Transformers backend (#21382)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-22 20:29:43 -07:00
Gregory Shtrasberg
3ec7170ff1 [Bugfix][ROCm][Build] Fix build regression on ROCm (#21393)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-22 20:27:41 -07:00
Cyrus Leung
c401c64b4c [CI/Build] Fix model executor tests (#21387)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-22 20:25:37 -07:00
Joe Runde
b77c7d327f [BugFix] Fix ray import error mem cleanup bug (#21381)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
2025-07-22 16:19:55 -07:00
Rui Qiao
35bc8bd5fb [Misc] Copy HF_TOKEN env var to Ray workers (#21406)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-22 16:18:42 -07:00
Yiheng Xu
4594fc3b28 [Model] Add Qwen3CoderToolParser (#21396)
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: simon-mo <xmo@berkeley.edu>
2025-07-22 15:05:57 -07:00
Xin Li
ae268b6326 Fix Flashinfer Allreduce+Norm enable disable calculation based on fi_allreduce_fusion_max_token_num (#21325)
Signed-off-by: XIn Li <xinli@nvidia.com>
2025-07-22 12:42:31 -07:00
Cyrus Leung
35366ae57c [CI/Build] Fix test failure due to updated model repo (#21375)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-22 08:39:35 -07:00
Aritra Roy Gosthipaty
2226d5bd85 [Bugfix] Decode Tokenized IDs to Strings for hf_processor in llm.chat() with model_impl=transformers (#21353)
Signed-off-by: ariG23498 <aritra.born2fly@gmail.com>
2025-07-22 08:27:28 -07:00
Wang Yijun
44554a0068 Add tokenization_kwargs to encode for embedding model truncation (#21033) 2025-07-22 08:24:00 -07:00
Wentao Ye
226b452a20 Revert "[Refactor] Fix Compile Warning #1444-D (#21208)" (#21384)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-22 08:22:10 -07:00
Raushan Turganbay
f38ee34a0a [feat] Enable mm caching for transformers backend (#21358)
Signed-off-by: raushan <raushan@huggingface.co>
2025-07-22 08:18:46 -07:00
Benjamin Bartels
b194557a6c Adds parallel model weight loading for runai_streamer (#21330)
Signed-off-by: bbartels <benjamin@bartels.dev>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-22 08:15:53 -07:00
Wentao Ye
774d0c014b [Perf] Cuda Kernel for Per Token Group Quant (#21083)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-22 07:27:15 -07:00
Duncan Moss
2c8db17cfd [feat]: add SM100 support for cutlass FP8 groupGEMM (#20447)
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-22 07:27:12 -07:00
Mickaël Seznec
4fb56914c5 [perf] Add fused MLA QKV + strided layernorm (#21116)
Signed-off-by: Mickael Seznec <mickael@mistral.ai>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-22 07:07:44 -07:00
Ning Xie
0df4d9b06b [Misc] unify variable for LLM instance v2 (#21356)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-22 06:32:36 -07:00
Jialin Ouyang
ed25054577 [Core] Introduce popleft_n and append_n in FreeKVCacheBlockQueue to further optimize block_pool (#21222)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-07-22 06:17:47 -07:00
Jialin Ouyang
10904e6d75 [benchmark] Port benchmark request sent optimization to benchmark_serving (#21209)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-07-22 05:28:00 -07:00
Jialin Ouyang
a32237665d [Core] Optimize update checks in LogitsProcessor (#21245)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-07-22 05:27:18 -07:00
Kebe
bc8a8ce5ec [Misc] Remove deprecated args in v0.10 (#21349)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-07-22 05:26:39 -07:00
Simon Mo
32142b3c62 [Bugfix] Fix eviction cached blocked logic (#21357)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-07-22 01:18:40 -07:00
Raghav Ravishankar
82b8027be6 Add arcee model (#21296)
Signed-off-by: alyosha-swamy <raghav@arcee.ai>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-22 00:57:43 -07:00
rongfu.leng
3779eb8c81 [Feature][eplb] add verify ep or tp or dp (#21102)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-21 23:41:14 -07:00
Shu Wang
9e23ad9655 Update fp4 quantize API (#21327)
Signed-off-by: Shu Wang <shuw@nvidia.com>
2025-07-21 23:40:21 -07:00
Wentao Ye
e69a92a1ce [Bug] DeepGemm: Fix Cuda Init Error (#21312)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-21 23:36:18 -07:00
Varun Sundar Rabindranath
8425f785ad [Misc] DeepEPHighThroughtput - Enable Inductor pass (#21311)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-21 23:35:45 -07:00
Konrad Zawora
c17231e827 Fix kv_cache_dtype handling for out-of-tree HPU plugin (#21302)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Chendi.Xue <chendi.xue@intel.com>
2025-07-21 23:35:14 -07:00
Wentao Ye
6e5b5ca580 [Refactor] Fix Compile Warning #1444-D (#21208)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-21 23:33:51 -07:00
Thomas Parnell
488d8a986a [V1] [Hybrid] Add new test to verify that hybrid views into KVCacheTensor are compatible (#21300)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-21 23:31:18 -07:00
Jialin Ouyang
af376ca19d [Core] Minimize number of dict lookup in _maybe_evict_cached_block (#21281)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-07-21 22:37:34 -07:00
Ming Yang
e7b2042681 Revert "[Performance] Performance improvements in non-blockwise fp8 CUTLASS MoE (#20762) (#21334)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-07-21 21:49:01 -07:00
Ratnam Parikh
90f1e55421 [Intel GPU] Ray Compiled Graph avoid NCCL for Intel GPU (#21338)
Signed-off-by: ratnampa <ratnam.parikh@intel.com>
2025-07-21 21:48:27 -07:00
Li, Jiang
5e70dcd6e6 [Doc] Fix CPU doc format (#21316)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-21 21:47:49 -07:00
Chaojun Zhang
25d585ab7b [XPU] Enable external_launcher to serve as an executor via torchrun (#21021)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
2025-07-21 21:47:35 -07:00
Lu Fang
8d0a01a5f2 [v1][sampler] Inplace logprobs comparison to get the token rank (#21283)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-07-21 13:47:47 -07:00
Himanshu Jaju
0ec82edda5 [perf] Speed up align sum kernels (#21079)
Signed-off-by: Himanshu Jaju <hj@mistral.ai>
2025-07-21 11:19:23 -07:00
Michael Goin
005ae9be6c Fix bad lm-eval fork (#21318) 2025-07-21 10:47:51 -07:00
Robert Shaw
29d1ffc5b4 [DP] Fix Prometheus Logging (#21257)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-07-21 09:11:35 -07:00
Lucas Wilkinson
304dce7ec0 [Attention] Clean up iRoPE in V1 (#21188)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-07-21 09:10:30 -07:00
Ming Yang
6ece16c4fe [Misc] Add dummy maverick test (#21199)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-21 09:08:09 -07:00
simpx
a0e827e07c [BugFix] make utils.current_stream thread-safety (#21252) (#21253)
Signed-off-by: simpx <simpxx@gmail.com>
2025-07-21 09:07:36 -07:00
Li, Jiang
a15a50fc17 [CPU] Enable shared-memory based pipeline parallel for CPU backend (#21289)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-21 09:07:08 -07:00
Woosuk Kwon
6dda13c86b [Misc] Add sliding window to flashinfer test (#21282)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-21 08:37:49 -07:00
Zhiyu
6b46c4b653 Add Nvidia ModelOpt config adaptation (#19815)
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
2025-07-21 10:02:58 -04:00
Ning Xie
d97841078b [Misc] unify variable for LLM instance (#20996)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-21 12:18:33 +01:00
Harry Mellor
e6b90a2805 [Docs] Make tables more space efficient in supported_models.md (#21291)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-21 02:25:02 -07:00
Harry Mellor
be54a951a3 [Docs] Fix hardcoded links in docs (#21287)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-21 02:23:57 -07:00
Cyrus Leung
042af0c8d3 [Model][1/N] Support multiple poolers at model level (#21227)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-21 02:22:21 -07:00
Cyrus Leung
378d33c392 [Bugfix] Fix missing placeholder in logger debug (#21280)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-20 22:50:06 -07:00
Huy Do
940af1f03a Add the instruction to run e2e validation manually before release (#21023)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-07-20 22:29:18 -07:00
Simon Mo
92615d7fe8 [Docs] Add RFC Meeting to Issue Template (#21279)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-07-20 21:58:07 -07:00
Kay Yan
8188196a1c [CI] Cleanup modelscope version constraint in Dockerfile (#21243)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-07-20 20:13:02 -07:00
Jiayi Yan
7ba34b1241 [bugfix] fix syntax warning caused by backslash (#21251) 2025-07-20 17:12:10 +00:00
Raushan Turganbay
9499e26e2a [Model] Support VLMs with transformers backend (#20543)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-20 13:25:50 +00:00
Calvin Chen
51ba839555 [Model] use AutoWeightsLoader for bart (#18299)
Signed-off-by: calvin chen <120380290@qq.com>
2025-07-20 08:15:50 +00:00
Seiji Eicher
d1fb65bde3 Enable v1 metrics tests (#20953)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-07-20 03:22:02 +00:00
Chengji Yao
3a1d8940ae [TPU] support fp8 kv cache quantization (#19292)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-20 03:01:00 +00:00
Thomas Parnell
2b504eb770 [Docs] [V1] Update docs to remove enforce_eager limitation for hybrid models. (#21233)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-19 16:09:58 -07:00
Yuxuan Zhang
10eb24cc91 GLM-4 Update (#20736)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Lu Fang <fanglu@fb.com>
2025-07-19 22:40:31 +00:00
fhl2000
2e8cbb58f3 [BugFix] Fix full cuda graph slot_mapping (#21228)
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
2025-07-19 14:13:18 -07:00
Woosuk Kwon
752c6ade2e [V0 Deprecation] Deprecate BlockSparse Attention & Phi3-Small (#21217)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-19 13:53:17 -07:00
Thomas Parnell
881e3cbe3b [V1] [Hybrid] Enable piecewise CUDA Graph for mamba layers (#21194)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-19 19:27:21 +00:00
kourosh hakhamaneshi
9f414a12ad [BugFix] Make PD work with Ray (#21072)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-07-19 08:46:50 -07:00
Jiayi Yan
6a971ed692 [Docs] Update the link to the 'Prometheus/Grafana' example (#21225) 2025-07-19 06:58:07 -07:00
Sungjae Lee
da6579bf41 [CI/CD][bugfix]fix: error argument to loads has incompatible type (#21223)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Signed-off-by: Sungjae Lee <sung-jae.lee@navercorp.com>
2025-07-19 05:16:48 -07:00
Rabi Mishra
c81259d33a Fix/remove some broken model executor tests (#21224)
Signed-off-by: Rabi Mishra <ramishra@redhat.com>
2025-07-19 12:15:07 +00:00
Li, Jiang
e3a0e43d7f [bugfix] Fix auto thread-binding when world_size > 1 in CPU backend and refactor code (#21032)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-19 05:13:55 -07:00
22quinn
b3d82108e7 [Bugfix][Frontend] Fix openai CLI arg middleware (#21220)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-07-19 02:40:38 -07:00
Kaixi Hou
6d0734c562 [NVIDIA] Add SM100 Flashinfer MoE blockscale fp8 backend for low latency (#20645)
Signed-off-by: kaixih <kaixih@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-19 02:33:01 -07:00
shixianc
7d94577138 Add torch golden impl for moe_align_block_size kernel test (#20653)
Signed-off-by: Shixian Cui <shixian@amazon.com>
Co-authored-by: Shixian Cui <shixian@amazon.com>
2025-07-19 02:32:36 -07:00
Lucas Wilkinson
59f935300c [BugFix] Fix potential cuda-graph IMA (#21196)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-19 02:18:47 -07:00
Isotr0py
18e519ec86 [Bugfix] Fix ndarray video color from VideoAsset (#21064)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-19 02:17:16 -07:00
Jee Jee Li
1eaff27815 [V0 deprecation] Remove long context LoRA (#21169)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-19 02:15:41 -07:00
Huy Do
cf8cc32674 Fix a couple of Voxtral tests (#21218)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-07-19 09:13:41 +00:00
Chenyaaang
3a2cb2649d [Misc][Tools][Benchmark] Add readme file for auto_tune script (#20779)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-07-19 09:06:59 +00:00
김종곤
3e04107d97 [Model] EXAONE 4.0 model support (#21060)
Signed-off-by: Deepfocused <rlawhdrhs27@gmail.com>
Signed-off-by: woongsik <rlawhdrhs27@gmail.com>
2025-07-19 14:25:44 +08:00
Wentao Ye
37bd8d6e4c [Bug] DeepGemm: Fix TypeError: per_block_cast_to_fp8() missing 1 required positional argument: 'use_ue8m0' for SM100 (#21187)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-18 23:25:22 -07:00
Lucas Wilkinson
468e2400fe [BugFix][CPU] Fix TorchSDPABackendImpl doesn't have use_irope (#21200)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-18 23:18:48 -07:00
Varun Sundar Rabindranath
dcc6cfb991 [Kernel][Performance] Tweak MoE Batched silu_mul_fp8_quant_deep_gemm kernel (#21193)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-18 23:09:51 -07:00
Woosuk Kwon
dd572c0ab3 [V0 Deprecation] Remove V0 Spec Decode workers (#21152)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-18 21:47:50 -07:00
Varun Sundar Rabindranath
9ffe905a41 [Bugfix][Model] Fix LoRA for Mistral-Small-3.1-24B-Instruct-2503 (#21183)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-07-18 21:15:03 -07:00
Lucia Fang
9a9fda1423 [Core] Support Local Chunked Attention for Hybrid KV Cache (#19351)
Signed-off-by: Lucia Fang <fanglu@fb.com>
Signed-off-by: Lu Fang <fanglu@meta.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lu Fang <fanglu@meta.com>
2025-07-18 20:48:38 -07:00
Jee Jee Li
466e878f2a [Quantization] Enable BNB support for more MoE models (#21100)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-18 17:52:02 -07:00
Rui Qiao
217937221b Elastic Expert Parallel Initial Support (#20775)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-18 17:46:09 -07:00
hax0r31337
5782581acf [Bugfix] Voxtral on Blackwell GPUs (RTX 50 series) (#21077)
Signed-off-by: hax0r31337 <liulihaocaiqwq@gmail.com>
2025-07-18 18:40:18 -04:00
JialinOuyang-Meta
0f199f197b [Core] Avoid KVCacheBlock.__eq__ invocations in FreeKVCacheBlockQueue (#21005)
Signed-off-by: Jialin Ouyang <jialino@meta.com>
2025-07-18 12:34:40 -07:00
Richard Zou
b2eb2b5ad7 [Kernel] Apply torch.Tag.needs_fixed_stride_order only for torch==2.6.0 (#19346)
Signed-off-by: rzou <zou3519@gmail.com>
2025-07-18 14:10:21 -04:00
Richard Zou
21274ab476 [CI] Update CODEOWNERS for vllm/compilation (#21185)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-07-18 06:51:12 -07:00
Thomas Parnell
ed8cbfedf8 Let GraniteMoeAttention use YaRN (#21174)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-18 05:52:52 -07:00
Cyrus Leung
45badd05d0 [Core] Set pooling params based on task and model (#21128)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-18 05:41:17 -07:00
ElizaWszola
4adc66f64d [Bugfix] Allocate less memory in non-batched CUTLASS MoE (#21121)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
2025-07-18 18:55:52 +08:00
Cyrus Leung
55ad648715 [Doc] Fix typo in model name (#21178)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-18 03:55:10 -07:00
wang.yuqi
5895afd780 [Bugfix] The special_tokens in tokenizer should also be controlled by do_lower_case in encoder_config. (#20750)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-18 09:10:47 +00:00
wang.yuqi
ca4eb82bcb [Model] Re-add the implicit conversion feature for as_seq_cls_model (#21103)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-18 07:15:07 +00:00
Roger Wang
ba2dfbb0c2 [Misc] Make MM embedding merge interface explicit in model runner (#21147)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-18 07:13:57 +00:00
Jialin Ouyang
1bf65138f6 [benchmark] Sending request strictly follows the random intervals (#21108)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-07-18 06:22:08 +00:00
Woosuk Kwon
54cf1cae62 [Misc] Do not print async output warning for v1 (#21151)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-17 21:57:02 -07:00
shixianc
5780121c95 [Perf] Add swap_ab to SM90 FP8 non-block CUTLASS moe grouped gemm (#20911)
Signed-off-by: Shixian Cui <shixian@amazon.com>
Co-authored-by: Shixian Cui <shixian@amazon.com>
2025-07-18 04:34:43 +00:00
Shu Wang
c7d8724e78 [Core] FlashInfer CUTLASS fused MoE backend (NVFP4) (#20037)
Signed-off-by: shuw <shuw@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-17 21:32:45 -07:00
22quinn
b38baabcf9 [Doc] Add inplace weights loading example (#19640)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-07-17 21:12:23 -07:00
Lucas Wilkinson
89cab4d01f [Attention] Make local attention backend agnostic (#21093) 2025-07-18 00:10:42 -04:00
Lucia Fang
b9a21e9173 [Docs] Update supported models documentation with missing models (#20844)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-07-17 20:12:13 -07:00
Ricardo Decal
c4e3b12524 [Docs] Add minimal demo of Ray Data API usage (#21080)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-17 20:09:19 -07:00
elvischenv
8dfb45ca33 [Bugfix] Fix the tensor non-contiguous issue for Flashinfer TRT-LLM backend attention kernel (#21133) 2025-07-18 00:35:58 +00:00
Wentao Ye
8a8fc94639 [Log] Debugging Log with more Information (#20770)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-18 00:19:46 +00:00
Woosuk Kwon
4de7146351 [V0 deprecation] Remove V0 HPU backend (#21131)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-17 16:37:36 -07:00
Eric Curtin
ac9fb732a5 On environments where numa cannot be detected we get 0 (#21115)
Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-07-17 18:52:17 +00:00
Jee Jee Li
a3a6c695f4 [Misc] Qwen MoE model supports LoRA (#20932)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-17 18:32:52 +00:00
Cyrus Leung
90bd2ab6e3 [Model] Update pooling model interface (#21058)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-17 16:05:40 +00:00
ElizaWszola
9fb2d22032 [Performance] Performance improvements in non-blockwise fp8 CUTLASS MoE (#20762)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
2025-07-17 09:56:44 -04:00
Harry Mellor
2d6a38209b [Docs] Move code block out of admonition now that it's short (#21118)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-17 06:12:29 -07:00
wangxiyuan
89e3c4e9b4 [Misc] Avoid unnecessary import (#21106)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-17 12:57:41 +00:00
Harry Mellor
fe8a2c544a [Docs] Improve docstring formatting for FusedMoEParallelConfig.make (#21117)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-17 04:13:00 -07:00
kYLe
4ef00b5cac [VLM] Add Nemotron-Nano-VL-8B-V1 support (#20349)
Signed-off-by: Kyle Huang <kylhuang@nvidia.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-17 03:07:55 -07:00
Asher
5a7fb3ab9e [Model] Add ToolParser and MoE Config for Hunyuan A13B (#20820)
Signed-off-by: Asher Zhang <asherszhang@tencent.com>
2025-07-17 09:10:09 +00:00
Varun Sundar Rabindranath
11dfdf21bf [Kernel] DeepGemm MoE : Integrate triton permute / unpermute kernels (#20903)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-17 08:10:37 +00:00
Chauncey
fdc5b43d20 [Bugfix]: Fix final_res_batch list index out of range error (#21055)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-17 00:29:09 -07:00
Jee Jee Li
c5b8b5953a [Misc] Fix PhiMoE expert mapping (#21085)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-17 05:47:49 +00:00
David Ben-David
4fcef49ec4 [V1] [KVConnector] Fix MultiprocExecutor worker output aggregation (#21048)
Signed-off-by: David Ben-David <davidb@pliops.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
2025-07-17 13:29:45 +08:00
Zhonghua Deng
8a4e5c5f3c [V1][P/D]Enhance Performance and code readability for P2pNcclConnector (#20906)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-07-16 22:13:00 -07:00
Lucas Wilkinson
76b494444f [Attention] Refactor attention metadata builder interface (#20466)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-17 04:44:25 +00:00
Michael Goin
28a6d5423d [Bugfix] Fix Machete zero point issue for GPTQ models on SM90 (#21066)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-16 19:54:45 -07:00
XiongfeiWei
58760e12b1 [TPU] Start using python 3.12 (#21000)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-07-16 19:37:44 -07:00
Michael Goin
a50d918225 [Docker] Allow FlashInfer to be built in the ARM CUDA Dockerfile (#21013)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-16 19:37:13 -07:00
Kevin_Xiong
c9ba8104ed [Bugfix] weight loading use correct tp_group with patch_tensor_parallel_group (#21024)
Signed-off-by: KevinXiong-C <kevin_xiong1997@outlook.com>
2025-07-16 19:36:36 -07:00
Michael Goin
4e7dfbe7b4 Update PyTorch to torch==2.7.1 for CUDA (#21011)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-17 02:30:44 +00:00
QiliangCui
72ad273582 Remove torch_xla.tpu.version() from pallas.py. (#21065)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-17 00:25:26 +00:00
Nir David
01513a334a Support FP8 Quantization and Inference Run on Intel Gaudi (HPU) using INC (Intel Neural Compressor) (#12010)
Signed-off-by: Nir David <ndavid@habana.ai>
Signed-off-by: Uri Livne <ulivne@habana.ai>
Co-authored-by: Uri Livne <ulivne@habana.ai>
2025-07-16 15:33:41 -04:00
Cyrus Leung
ac2bf41e53 [Model] Remove model sampler (#21059)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-16 19:03:37 +00:00
Harry Mellor
a931b4cdcf Remove Qwen Omni workaround that's no longer necessary (#21057)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-16 16:25:23 +00:00
Avshalom Manevich
a0f8a79646 [fix] fix qwen image_embeds input (#21049)
Signed-off-by: h-avsha <avshalom.manevich@hcompany.ai>
2025-07-16 15:17:20 +00:00
Mac Misiura
18bdcf4113 feat - add a new endpoint get_tokenizer_info to provide tokenizer/chat-template information (#20575)
Signed-off-by: m-misiura <mmisiura@redhat.com>
2025-07-16 21:52:14 +08:00
Cyrus Leung
1c3198b6c4 [Model] Consolidate pooler implementations (#20927)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-16 13:39:13 +00:00
Michael Yao
260127ea54 [Docs] Add intro and fix 1-2-3 list in frameworks/open-webui.md (#19199)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-07-16 06:11:38 -07:00
Seiji Eicher
d0dc4cfca4 Fix inadvertently silenced PP tests for mp, add DeepSeek V2/V3 model family to PP tests (#20831)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-07-16 00:14:49 -07:00
Lucas Wilkinson
d31a647124 [BugFix] Fix import error on non-blackwell machines (#21020)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-15 22:27:29 -07:00
Chengji Yao
85431bd9ad [TPU] fix kv_cache_update kernel block size choosing logic (#21007)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-16 04:39:48 +00:00
zhiweiz
c11013db8b [Meta] Llama4 EAGLE Support (#20591)
Signed-off-by: qizixi <qizixi@meta.com>
Co-authored-by: qizixi <qizixi@meta.com>
2025-07-15 21:14:15 -07:00
Peter Pan
1eb2b9c102 [CI] update typos config for CI pre-commit and fix some spells (#20919)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-07-15 21:12:40 -07:00
Maximilien de Bayser
6ebf313790 Avoid direct comparison of floating point numbers (#21002)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-07-15 21:12:14 -07:00
Patrick von Platen
cfbcb9ed87 [Voxtral] Add more tests (#21010)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-15 21:11:49 -07:00
Wentao Ye
76ddeff293 [Doc] Remove duplicate docstring (#21012)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-15 20:09:13 -07:00
Michael Goin
f46098335b [Bugfix] Fix Mistral3 support on SM100/SM120 (#20998)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-15 20:08:41 -07:00
Chendi.Xue
e9534c7202 [CI][HPU] update for v0 deprecate by switching to VLLM_TARGET_DEVICE=empty (#21006)
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
2025-07-15 20:07:05 -07:00
Doug Smith
7976446015 Add Dockerfile argument for VLLM_USE_PRECOMPILED environment (#20943)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-07-15 19:53:57 -07:00
Ming Yang
fcb9f879c1 [Bugfix] Correct per_act_token in CompressedTensorsW8A8Fp8MoECutlassM… (#20937)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-07-15 19:53:42 -07:00
Ricardo Decal
3ed94f9d0a [Docs] Enhance Anyscale documentation, add quickstart links for vLLM (#21018)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-15 19:46:56 -07:00
Reid
fa839565f2 [Misc] Refactor: Improve argument handling for conda command (#20481)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-15 19:43:19 -07:00
Brayden Zhong
75a99b98bf [Chore] Remove outdated transformers check (#20989)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-07-15 19:42:40 -07:00
Chauncey
b5c3b68359 [Misc] bump xgrammar version to v0.1.21 (#20992)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-15 19:42:16 -07:00
Thomas Parnell
6cbc4d4bea [Model] Add ModelConfig class for GraniteMoeHybrid to override default max_seq_len_to_capture (#20923)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-15 19:19:10 -07:00
Michael Goin
153c6f1e61 [Frontend] Remove print left in FrontendArgs.add_cli_args (#21004)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-15 19:18:41 -07:00
Chauncey
34cda778a0 [Frontend] OpenAI Responses API supports input image (#20975)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-15 18:59:36 -06:00
Elfie Guo
30800b01c2 [Nvidia] Integrate SM100 cudnn prefill API to MLA prefill (#20411)
Signed-off-by: Elfie Guo <elfieg@nvidia.com>
Co-authored-by: Elfie Guo <eflieg@nvidia.com>
2025-07-15 17:56:45 -07:00
Chen LI
10be209493 [Bug Fix] get_distributed_init_method should get the ip from get_ip i… (#20889)
Signed-off-by: Chen Li <lcpingping@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-07-15 21:23:52 +00:00
Marko Rosenmueller
19c863068b [Frontend] Support cache_salt in /v1/completions and /v1/responses (#20981)
Signed-off-by: Marko Rosenmueller <5467316+dr75@users.noreply.github.com>
2025-07-15 21:01:04 +00:00
Tuan, Hoang-Trong
f29fd8a7f8 [BugFix] fix 3 issues: (1) using metadata for causal-conv1d, (2) indexing overflow in v1 vLLM, and (3) init_states in v0 (#20838)
Signed-off-by: Tuan M. Hoang-Trong <tmhoangt@us.ibm.com>
Co-authored-by: Tuan M. Hoang-Trong <tmhoangt@us.ibm.com>
2025-07-15 16:08:26 -04:00
Gregory Shtrasberg
ed10f3cea1 [ROCm] warpSize is being made non constexpr in ROCm 7.0 (#20330)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-15 14:01:44 -04:00
Harry Mellor
b637e9dcb8 Add full serve CLI reference back to docs (#20978)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 17:42:30 +00:00
Harry Mellor
1e36c8687e [Deprecation] Remove nullable_kvs (#20969)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 17:21:50 +00:00
Harry Mellor
5bac61362b Configure Gemini (#20971)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 09:37:05 -07:00
Harry Mellor
313ae8c16a [Deprecation] Remove everything scheduled for removal in v0.10.0 (#20979)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 15:57:53 +00:00
Cyrus Leung
c847e34b39 [CI/Build] Fix wrong path in Transformers Nightly Models Test (#20994)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-15 08:53:16 -07:00
Patrick von Platen
e7e3e6d263 Voxtral (#20970)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-15 07:35:30 -07:00
Christian Pinto
4ffd963fa0 [v1][core] Support for attention free models (#20811)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
2025-07-15 14:20:01 +00:00
Harry Mellor
56fe4bedd6 [Deprecation] Remove TokenizerPoolConfig (#20968)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 14:00:50 +00:00
Rui Qiao
d91278181d [doc] Add more details for Ray-based DP (#20948)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-15 05:37:12 -07:00
Li Wang
20149d84d9 [MISC] Add init files for python package (#20908)
Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-15 12:16:33 +00:00
Thomas Parnell
3534c39a20 [V1] [Hybrid] Refactor mamba state shape calculation; enable V1 via cli (#20840)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-15 04:04:35 -07:00
Yifei Teng
c586b55667 [TPU] Optimize kv cache update kernel (#20415)
Signed-off-by: Yifei Teng <tengyifei88@gmail.com>
2025-07-15 03:56:43 -07:00
Ricardo Decal
33d560001e [Docs] Improve documentation for ray cluster launcher helper script (#20602)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-15 03:55:45 -07:00
kourosh hakhamaneshi
f148c44c6a [frontend] Refactor CLI Args for a better modular integration (#20206)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-07-15 02:23:42 -07:00
Ricardo Decal
235bfd5dfe [Docs] Improve documentation for RLHF example (#20598)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-15 01:54:10 -07:00
Reid
68d28e37b0 [frontend] Add --help=page option for paginated help output (#20961)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-15 00:42:00 -07:00
Ilya Markov
37a7d5d74a [Misc] Refactor AllReduceFusionPass. Remove parameter (#20918)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
2025-07-15 06:57:40 +00:00
Woosuk Kwon
d4d309409f Implement Async Scheduling (#19970)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-14 23:01:46 -07:00
Jennifer He
85bd6599e4 [Model] Add AutoWeightsLoader support for BERT, RoBERTa (#20534)
Signed-off-by: Jennifer He <islandhe@gmail.com>
Signed-off-by: <islandhe@gmail.com>
Signed-off-by: Jen H <islandhe@gmail.com>
2025-07-15 13:34:24 +08:00
Boyuan Feng
91b3d190ae [cold start] replace VLLM_COMPILE_DEPYF with debug_dump_dir (#20940)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-07-15 13:02:17 +08:00
Isotr0py
fc017915f5 [Doc] Clearer mistral3 and pixtral model support description (#20926)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-14 21:56:53 -07:00
Pavani Majety
9ad0a4588b [Bugfix] Switch bailout logic for kv-cache-dtype with SM100 Flashinfer (#20934)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-07-15 03:27:50 +00:00
Ruheena Suhani Shaik
016b8d1b7f Enabled BnB NF4 inference on Gaudi (#20172)
Signed-off-by: Ruheena Suhani Shaik <rsshaik@habana.ai>
2025-07-14 20:26:08 -07:00
Nicolò Lucchesi
80305c1b24 [CI] Fix flaky test_streaming_response test (#20913)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-14 20:15:15 -07:00
Reid
37e2ecace2 feat: add image zoom to improve image viewing experience (#20763)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-14 20:14:23 -07:00
Ricardo Decal
054c8657e3 [Docs] Add Kuberay to deployment integrations (#20592)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-14 20:13:55 -07:00
XiongfeiWei
d4170fad39 Use w8a8 quantized matmul Pallas kernel (#19170)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-07-15 03:06:33 +00:00
Michael Goin
946aadb4a0 [CI/Build] Split Entrypoints Test into LLM and API Server (#20945)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-15 02:44:18 +00:00
Michael Goin
bcdfb2a330 [Bugfix] Fix incorrect dispatch for CutlassBlockScaledGroupedGemm and DeepGEMM (#20933)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-15 01:42:17 +00:00
Richard Zou
ba8c300018 [BugFix] VLLM_DISABLE_COMPILE_CACHE=1 should disable all reads and writes from the cache (#20942)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-07-15 01:26:18 +00:00
Alexander Matveev
8cdc371217 SM100 Cutlass MLA decode with unrestricted num_heads (< 128) for DeepSeek TP (#20769)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-07-15 01:06:38 +00:00
Yong Hoon Shin
61e20828da Fall back if flashinfer comm module not found (#20936)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-14 23:11:18 +00:00
Kuntai Du
55e1c66da5 [Docs] remove outdated performance benchmark (#20935)
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-07-14 22:14:17 +00:00
Thomas Parnell
86f3ac21ce Fix overflow indexing in causal_conv1d kernel (#20938)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-14 21:43:07 +00:00
Nicolò Lucchesi
149f2435a5 [Misc] Relax translations tests (#20856)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-14 20:08:36 +00:00
Varun Sundar Rabindranath
c0569dbc82 [Misc] ModularKernel : Perform WeightAndReduce inside TritonExperts & DeepGemmExperts (#20725)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-14 19:47:16 +00:00
Michael Goin
8bb43b9c9e Add benchmark dataset for mlperf llama tasks (#20338)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-14 19:10:07 +00:00
Tyler Michael Smith
559756214b Change default model to Qwen3-0.6B (#20335)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-07-14 16:54:52 +00:00
Isotr0py
6d0cf239c6 [CI/Build] Add Transformers nightly tests in CI (#20924)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-14 16:33:17 +00:00
Isotr0py
3fc964433a [Misc] Clean up Aimv2 config registration in Ovis config (#20921)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-14 15:36:43 +00:00
Lu Fang
0caf61c08a [CI] Update codeowner for compilation code (#20929)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-07-14 08:33:19 -07:00
Richard Zou
667624659b [CI] cc folks on changes to vllm/compilation (#20925)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-07-14 07:52:17 -07:00
ant-yy
38efa28278 [Model] Add Ling implementation (#20680)
Signed-off-by: vito.yy <vito.yy@antgroup.com>
2025-07-14 22:10:32 +08:00
Cyrus Leung
e8cc53af5e [Misc] Log the reason for falling back to FlexAttention (#20699)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-14 04:16:51 -07:00
Chauncey
a4851cfe68 [Bugfix]: Fix messy code when using logprobs (#20910)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-14 11:06:45 +00:00
Reid
9887e8ec50 [Misc] Remove unused function (#20909)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-14 10:48:55 +00:00
22quinn
f326ab9c88 [Bugfix] Bump up mistral_common to support v13 tokenizer (#20905)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-07-14 10:45:03 +00:00
Cyrus Leung
dcf2a5e208 [CI/Build] Fix OOM issue in Jina-VL test (#20907)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-14 10:32:35 +00:00
wangxiyuan
1e9438e0b0 [MISC] Move bind_kv_cache to worker module (#20900)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-14 09:40:00 +00:00
Aaron Pham
697ef765ee [Refactor][V1] Move outlines utils for V1 imports (#20878)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-07-14 00:58:35 -07:00
Jee Jee Li
a99b9f7dee [Quantization] add BNB for MixtralForCausalLM (#20893)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-14 07:34:34 +00:00
TJian
c488b928a7 [ROCm] [Bugfix] [Critical]: Fix mamba compilation bug (#20883)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-07-14 15:23:28 +08:00
Reid
2c7fa47161 Fix: Add missing EOFError handling in CLI complete command (#20896)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-14 07:09:57 +00:00
Daniel song
88fc8a97e3 Removing redundant python version check (#20888)
Signed-off-by: Dannyso05 <dansong1177@gmail.com>
2025-07-14 06:15:05 +00:00
Maroon Ayoub
66f6fbd393 [Prefix Cache] Add reproducible prefix-cache block hashing using SHA-256 + CBOR (64bit) (#20511)
Signed-off-by: Maroon Ayoub <maroon.ayoub@ibm.com>
2025-07-14 02:45:31 +00:00
22quinn
8632e831ba [Core] Add update_config RPC method (#20095)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-07-14 00:49:18 +00:00
nopperl
4bbfc36b16 [V1] Hybrid allocator without prefix caching (#20661)
Signed-off-by: nopperl <54780682+nopperl@users.noreply.github.com>
2025-07-13 16:55:14 +00:00
TJian
80d38b8ac8 [V1] [ROCm] [AITER] Upgrade AITER to commit 916bf3c and bugfix APIs (#20880)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-07-13 15:19:32 +00:00
Liuchenlong
211b6a6113 [Bugfix] fix define of RerankDocument (#20877)
Signed-off-by: liuchenlong <liuchenlong@xiaohongshu.com>
Co-authored-by: liuchenlong <liuchenlong@xiaohongshu.com>
2025-07-13 14:32:40 +00:00
Wang Siyuan
247102f07f [Bugfix] Fix: add patch_rope_scaling after hf override (#20857)
Signed-off-by: Wang Siyuan <wsy0227@sjtu.edu.cn>
Signed-off-by: Wang Siyuan <sywang0227@gmail.com>
2025-07-13 00:13:25 -07:00
Minkyu Kim
bd4c1e6fdb Support for LlamaForSequenceClassification (#20807)
Signed-off-by: thechaos16 <thechaos16@gmail.com>
2025-07-13 00:09:34 -07:00
QiliangCui
99b4f080d8 Renable google/gemma-3-1b-it accuracy test. (#20866)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-12 21:48:56 -07:00
Nicolò Lucchesi
020f58abcd [Core] Support multiple tasks per model (#20771)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-12 19:40:11 -07:00
Wentao Ye
c1acd6d7d4 [Refactor] Change the way of import triton (#20774)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-12 19:39:55 -07:00
ElizaWszola
3b3b778d4a [Bugfix] Fix a couple PPLX+CUTLASS MoE bugs (#20825)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
2025-07-12 19:39:14 -07:00
Wentao Ye
42d440c22b [Perf] Use Triton instead of Torch for DeepGEMM Per Token Group Quant (#20841)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-12 19:38:45 -07:00
Woosuk Kwon
f45a332886 [Sched] Enhance the logic to remove stopped requests from queues (#20739) 2025-07-12 15:33:13 -07:00
Michael Goin
6e2c176e1f [Bugfix] Restrict Machete to only run on Hopper (#20830)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-12 17:34:40 +00:00
Reid
a86754a12b [docs] convert supported configs to table (#20858)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-12 06:54:50 -07:00
Alex Brooks
c2a2f19aba [Bugfix] Fix Tensor Parallelism Padding Consistency in Granite Models (#20843)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-07-12 06:11:30 -07:00
Congcong Chen
2c11a738b3 [Model] New model support for microsoft/Phi-4-mini-flash-reasoning (#20702)
Signed-off-by: Congcong Chen <congcongchen@microsoft.com>
2025-07-12 06:02:10 -07:00
Michael Goin
b639327ad9 Revert "Use NVCC --compress-mode to reduce binary size by 30% #20694" (#20853)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-11 23:07:35 -07:00
Zhiyu
4afe687a82 Enable ModelOpt Llama4 fp8 checkpoint deployment (#20419)
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
2025-07-11 23:07:16 -07:00
Maximilien de Bayser
5de8d9f111 Remove extra tensor on CPU (#20693)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-07-12 14:06:34 +08:00
Boyuan Feng
c1c8ca57ff [cold start time] add envs.VLLM_COMPILE_DEPYF to guard decompile (#20790)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-07-11 23:06:13 -07:00
Richard Zou
a3a5a47e48 [Bugfix] Fix torch.compile x LoRA for PyTorch 2.8 (#20823)
Signed-off-by: rzou <zou3519@gmail.com>
2025-07-11 23:06:04 -07:00
Lucia Fang
fb25e95688 [Docs] Update basic.md (#20846) 2025-07-11 23:05:32 -07:00
Wentao Ye
0d4891cd03 [Bug] Fix DeepGemm for EP low latency case (#20833)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-11 23:05:12 -07:00
lkchen
f56d2996ca [Misc] Respect no_use_tqdm_on_load flag while capturing CUDA graph (#20834)
Signed-off-by: Linkun <github@lkchen.net>
2025-07-11 23:04:45 -07:00
Isotr0py
147afb448b [Bugfix] Replace unavailable video url in multimodal test (#20854)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-12 05:25:39 +00:00
Nicolò Lucchesi
3c7d942da8 [Frontend] Abstract prompt and SpeechToTextConfig for transcriptions models (#20637)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-11 21:33:26 -07:00
Varun Sundar Rabindranath
890323dc1b [Bugfix] : Fix typo - logger.warn_once -> logger.warning_once (#20852) 2025-07-11 20:56:24 -07:00
Isotr0py
01cae37713 [CI/Build] Ensure compatability with Transformers v4.53 (#20541)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-11 20:53:07 -07:00
yurhett
11c0198615 [Bugfix] Fix tensor parallel issue in Qwen3 reranker weight loading (#20682)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-07-11 20:52:43 -07:00
Li, Jiang
b1235c3e10 [Bugfix] Lazy import fused_experts in BitsAndBytesMoEMethod to avoid break not-cuda-alike devices (#20822)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-11 20:52:05 -07:00
Jee Jee Li
44d02f54db [Misc] Restrict deep_gemm's log output (#20827)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-11 20:50:42 -07:00
Trevor Morris
a8593237c0 Add pynccl all-gatherv and reducescatterv (#20154)
Signed-off-by: Trevor Morris <tmorris@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-11 18:59:23 -07:00
Ilya Markov
fc0f41d10a Integration SM100 FlashInfer fused allreduce RMSNorm (#20691)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
2025-07-11 18:58:15 -07:00
Wentao Ye
7b828e30d5 [CI Bug] Fix Async Engine, Inputs, Utils, Worker Test: 'State' object has no attribute 'enable_server_load_tracking' (#20845)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-11 18:57:24 -07:00
bigmoyan
5f0af36af5 Update kimi-k2 tool calling docs, enable unit tests (#20821)
Signed-off-by: wangzhengtao <wangzhengtao@moonshot.cn>
Co-authored-by: wangzhengtao <wangzhengtao@moonshot.cn>
Co-authored-by: wangzhengtao <wangzhengtao@msh.team>
2025-07-11 20:16:14 +00:00
Isotr0py
0d21b2664c [Bugfix] Fix OOM in language generation test (#20814)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-11 11:21:52 -07:00
Nick Hill
9907fc4494 [Docs] Data Parallel deployment documentation (#20768)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-11 09:42:10 -07:00
Michael Goin
d47661f0cd [Kernel] Basic tuned configs for NVFP4 CUTLASS dense GEMM (#20646)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-11 10:05:33 -06:00
Varun Sundar Rabindranath
53fa457391 [Misc] Add unit tests for MoE ModularKernel combinations + Profiling utility (#20449)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-11 07:51:46 -07:00
Reid
6fb162447b [doc] fix ordered list issue (#20819)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-11 06:49:46 -07:00
Li, Jiang
66177189c5 [Bugfix] Add missing field to TritonLanguagePlaceholder (#20812)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-11 05:25:11 -07:00
QiliangCui
b4f0b5f9aa Temporarily suspend google/gemma-3-1b-it. (#20722)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-11 11:21:26 +00:00
Cyrus Leung
cbd14ed561 [Bugfix] Refactor /invocations to be task-agnostic (#20764)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-11 03:20:54 -07:00
Pavani Majety
7bd4c37ae7 [Core] Add Flashinfer TRTLLM Backend for Flashinfer decode path (SM100). (#19825)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: shuw <shuw@nvidia.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-11 09:23:23 +00:00
Jee Jee Li
8020e98c9f [Quantization][1/N] MoE support BNB-Inflight Quantization (#20061)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-11 08:01:13 +00:00
Luka Govedič
762be26a8e [Bugfix] Upgrade depyf to 0.19 and streamline custom pass logging (#20777)
Signed-off-by: Luka Govedic <lgovedic@redhat.com>
Signed-off-by: luka <lgovedic@redhat.com>
2025-07-11 00:15:22 -07:00
Reid
6a9e6b2abf [doc] fold long code block (#20795)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-10 23:16:41 -07:00
nopperl
5d09152ff1 [V1] Enable Mamba2 layers other than MambaMixer2 in the v1 engine (#20660)
Signed-off-by: nopperl <54780682+nopperl@users.noreply.github.com>
2025-07-11 05:53:31 +00:00
Luka Govedič
31d5c1797f [Perf][fp8] Use CustomOp abstraction for fp8 quant for better perf (#19830)
Signed-off-by: Luka Govedic <lgovedic@redhat.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-11 04:56:28 +00:00
Ratnam Parikh
35514b682a [XPU] XCCL support enabled in torch 2.8.0.dev nightly builds (#20705)
Signed-off-by: ratnampa <ratnam.parikh@intel.com>
2025-07-10 20:39:52 -07:00
Wentao Ye
e2de455c34 [Feature] Integrate SM100 DeepGEMM support (#20087) 2025-07-10 20:18:05 -07:00
Alexander Matveev
5b032352cc [Attention] MLA - Flashinfer Ragged Prefill (#20034) 2025-07-10 20:17:47 -07:00
Michael Goin
922f316441 [Model] Support HF format of minimax (#20211)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-11 02:55:21 +00:00
Duncan Moss
5923ab9524 [fix]: disable cutlass block scaled group gemm for EP (#20781)
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
2025-07-11 02:39:18 +00:00
bigmoyan
0cf893cae1 Add kimi-k2 tool parser (#20789)
Signed-off-by: wangzhengtao <wangzhengtao@moonshot.cn>
Co-authored-by: wangzhengtao <wangzhengtao@moonshot.cn>
Co-authored-by: wangzhengtao <wangzhengtao@msh.team>
2025-07-11 10:36:23 +08:00
Michael Goin
cf75cd2098 [CI Bugfix] Specify same TORCH_CUDA_ARCH_LIST for flashinfer aot and install (#20772)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-11 01:16:01 +00:00
Simon Mo
b854321ffe [Docs] Lazy import gguf (#20785)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-07-10 16:06:37 -07:00
Kuntai Du
5b6fe23d05 [Bugfix][Benchmark] Make sure the output length > 0 when testing prefill workload. (#20786)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-10 14:52:46 -07:00
Varun Sundar Rabindranath
f0c98cae27 [Misc] MoE ModularKernel : Introduce TopKWeightAndReduce (#20648)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-10 14:40:38 -07:00
Nick Hill
574ad60db9 [KVConnector] Always call connector clear_metadata() at end of step (#20756)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: David Ben-David <sdavidbd@gmail.com>
2025-07-10 22:37:27 +01:00
Varun Sundar Rabindranath
fdadb6f43a [Bugfix] Fused MoE Modular Kernel chunking loop (#20392)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-10 20:31:10 +00:00
Alex Brooks
41060c6e08 [Core] Add Support for Default Modality Specific LoRAs [generate / chat completions] (#19126)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-07-10 21:09:37 +01:00
Ming Yang
3de2ed767f [Bugfix] Remove assertion of expert_map being None (#20714)
Signed-off-by: Ming Yang <yming@meta.com>
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-07-10 19:55:22 +00:00
Wentao Ye
299252ea82 [CI] Fix pre commit issue (#20782)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-10 12:48:13 -07:00
Nathan Hoos
d6902ce79f [V0][V1][Core] Add outlines integration for V1, and update V0 integration. (#15975)
Signed-off-by: Nathan Hoos <thwackyy.y@gmail.com>
2025-07-10 15:30:26 -04:00
Sanger Steel
5e53c89a74 [Bugfix] [CI] Fix Tensorizer LoRA test (#20760)
Signed-off-by: Sanger Steel <sangersteel@gmail.com>
2025-07-10 19:07:06 +00:00
QiliangCui
c66e38ea4c [Test] Remove docker build from test. (#20542)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-10 11:21:58 -07:00
sfbemerk
251595368f Fix DeepSeek-R1-0528 chat template (#20717)
Signed-off-by: Benjamin Merkel <benjamin.merkel@tngtech.com>
Co-authored-by: Benjamin Merkel <benjamin.merkel@tngtech.com>
2025-07-10 17:47:36 +00:00
shineran96
4bed167768 [Model][VLM] Support JinaVL Reranker (#20260)
Signed-off-by: shineran96 <shinewang96@gmail.com>
2025-07-10 10:43:43 -07:00
Asher
b140416abf [Model] Add reason parser for Hunyuan A13B Model. (#20625)
Signed-off-by: Asher Zhang <asherszhang@tencent.com>
2025-07-10 16:33:26 +00:00
Gregory Shtrasberg
5b8366b61a [ROCm][Regression] Remove tensor creation that harms performance on ROCm (#20741)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-10 09:22:23 -07:00
nishith-fujitsu
c7753a9809 [Hardware][CPU] Vllm int8 quantization enablement for ARM CPU (#14129)
Signed-off-by: nishith-fujitsu <nishith.jaiswal@fujitsu.com>
2025-07-10 15:59:04 +00:00
Michael Goin
4b9a9435bb Update Dockerfile FlashInfer to v0.2.8rc1 (#20718)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-10 08:09:02 -07:00
Harry Mellor
3482fd7e4e [Doc] Add engine args back in to the docs (#20674)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-10 08:02:40 -07:00
Isotr0py
77f77a951e [Misc] Clean up mark to fork process in BNB tests (#20692)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-10 13:59:40 +00:00
Michael Goin
1a4f35e2ea Normalize lm-eval command between baseline and correctness test (#18560)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-10 13:27:32 +00:00
Michael Goin
be1e128dfb [CI Bugfix] Skip failing Tensorizer+LoRA test (#20724) 2025-07-10 21:15:03 +09:00
Reid
65393ee064 [doc] fix ordered list (#20749)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-10 03:13:52 -07:00
Gregory Shtrasberg
dc221ad72d [Bugfix][Build][Non-CUDA] Only referencing CMAKE_CUDA_COMPILER_VERSION on CUDA where it is defined (#20738)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-10 02:58:11 -07:00
Jee Jee Li
7571a4a7e5 [CI/Build] Fix Basic Models Test (#20728)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-10 09:57:19 +00:00
Isotr0py
f67d986dd1 [Misc] loose new-model tagger conditions (#20747)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-10 02:54:47 -07:00
Or Ozeri
cc876d0f29 [KVConnector] Aggregate finished requests on the scheduler (#19555)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-07-10 09:22:18 +01:00
Chenyaaang
fdfd409f8f [TPU][Core]Make load weight exceed hbm error more instructive for customers (#20644)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-07-10 07:01:17 +00:00
Nick Hill
ffbcc9e757 [BugFix] Fix VllmConfig() construction on all platforms (#20695)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-10 07:00:20 +00:00
Nick Hill
59389c927b [BugFix][CPU] Fix CPU worker dependency on cumem_allocator (#20696)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-10 14:24:20 +08:00
Chauncey
8f2720def9 [Frontend] Support Tool Calling with both tool_choice='required' and $defs. (#20629)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-10 13:56:35 +08:00
Seiji Eicher
ad6c2e1a0b Correct PPMissingLayer handling in Deepseek-V2-Lite PP deployment (#20665)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-07-09 20:34:40 -07:00
Michael Goin
49e8c7ea25 Use NVCC --compress-mode to reduce binary size by 30% (#20694)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-09 18:26:48 -07:00
Varun Sundar Rabindranath
805d62ca88 [Misc] DP : Add ExpertTokensMetadata (#20332)
Signed-off-by: Varun <vsundarr@redhat.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-07-10 00:33:14 +00:00
Michael Goin
b7d9e9416f [CI/Build] Fix FlashInfer double build in Dockerfile (#20651)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-09 17:41:56 -06:00
Woosuk Kwon
7c12a765aa [Misc] Simplify the prefix caching logic on draft tokens (#20701)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-09 14:48:35 -07:00
Yiming
cd587c93ef [BugFix]: Properly set engine_id when using multi connector (#19487)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: leiyiming <leiyiming@kingsoft.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-07-09 20:32:44 +00:00
fxmarty-amd
332d4cb17b [Feature][Quantization] MXFP4 support for MOE models (#17888)
Signed-off-by: Felix Marty <felmarty@amd.com>
Signed-off-by: Bowen Bao <bowenbao@amd.com>
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
Co-authored-by: Bowen Bao <bowenbao@amd.com>
2025-07-09 13:19:02 -07:00
Jacob Manning
bf03ff3575 [Kernel] Add Conch backend for mixed-precision linear layer (#19818)
Signed-off-by: Jacob Manning <jmanning+oss@stackav.com>
2025-07-09 13:17:55 -07:00
Tuan, Hoang-Trong
47043eb678 [Kernel] Triton implementation of causal-conv1d for Mamba-based models (#18218)
Signed-off-by: Tuan M. Hoang-Trong <tmhoangt@us.ibm.com>
Co-authored-by: Tuan M. Hoang-Trong <tmhoangt@us.ibm.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-07-09 12:53:55 -07:00
Michael Goin
31b96d1c64 Support Llama 4 for cutlass_moe_fp4 (#20453)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-09 15:53:38 -04:00
Li, Jiang
e59ba9e142 [CI/Build] Enlarge tolerance for a CPU multi-modal test (#20684)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-09 17:48:52 +00:00
Harry Mellor
403b481573 Remove heading form installation inc.md file (#20697)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-09 10:42:51 -07:00
Li, Jiang
138709f8d1 [Doc] Update CPU doc (#20676)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-09 10:28:30 -07:00
Michael Goin
0bbac1c1b4 [Bench] Add NVFP4 GEMM benchmark script (#20578)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-09 13:23:48 -04:00
Liangliang Ma
a3e4e85ece [XPU][CI] enhance xpu test support (#20652)
Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com>
Co-authored-by: zhenwei-intel <zhenweiliu@habana.ai>
2025-07-09 16:53:09 +00:00
Chengji Yao
eb58f5953d [TPU][Bugfix] fix test_pallas (#20666)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-09 09:32:48 -07:00
Sanger Steel
4ac9c33f78 [Bugfix] Fix handling of Tensorizer arguments for LoadConfig (#20643)
Signed-off-by: Sanger Steel <sangersteel@gmail.com>
2025-07-09 15:36:37 +00:00
Reid
efe73d0575 [doc] update doc format (#20673)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-09 08:08:19 -07:00
Ricardo Decal
853487bc1b [Docs] Improve docs for RLHF co-location example (#20599)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-09 08:06:43 -07:00
Li Wang
9ff2af6d2b [Benchmark] Parameterization of streaming loading of multimodal datasets (#20528)
Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-09 13:35:16 +00:00
Cyrus Leung
70ca5484f5 [Doc] Update notes (#20668)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-09 03:46:36 -07:00
Thomas Parnell
5358cce5ff [V1] [Doc] Update V1 docs for Mamba models (#20499)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-09 01:02:41 -07:00
Chauncey
2155e95ef1 [Bugfix] Fix the issue where reasoning_content is None when Thinkng is enabled and tool_choice is set to 'required'. (#20662)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-09 07:39:58 +00:00
qscqesze
f95570a52d [Docs] fix minimax tool_calling docs error (#20667)
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-07-09 00:37:07 -07:00
Kunshang Ji
b6e7e3d58f [Intel GPU] support ray as distributed executor backend for XPU. (#20659)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-07-09 00:36:58 -07:00
Dmitry Rogozhkin
e760fcef22 [XPU] Use spawn with XPU multiprocessing (#20649)
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
2025-07-09 00:34:28 -07:00
B-201
6bbf1795b7 [Misc] Fix the size of batched_dummy_mm_inputs in profile_run (#20434)
Signed-off-by: bk-201 <joy25810@foxmail.com>
2025-07-08 20:15:44 -07:00
Michael Goin
9e0ef888f0 Fix bullets in incremental_build.md (#20642) 2025-07-09 11:03:41 +08:00
Duncan Moss
97abeb1daa [feat] enable SM100 CUTLASS block scaled group gemm for smaller batch sizes (#20640)
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
2025-07-09 11:03:35 +08:00
zhrrr
34dad19e7b [Bugfix] set default set cuda_graph_sizes to min(self.max_num_seqs * 2, 512) (#20628)
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
2025-07-09 11:02:51 +08:00
Akash kaothalkar
6db31e7a27 [Hardware][PPC64LE] Enable V1 for ppc64le and ARM (#20554)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Nikhil Gupta <nikhil.gupta2@arm.com>
2025-07-08 20:00:41 -07:00
Ricardo Decal
977180c912 [Docs] Improve documentation for multi-node service helper script (#20600)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-08 19:44:26 -07:00
Ratnam Parikh
c40784c794 [BugFix][Intel GPU] Use refactored API for dist_backend in V1 worker (#20596)
Signed-off-by: ratnampa <ratnam.parikh@intel.com>
2025-07-08 19:44:23 -07:00
kourosh hakhamaneshi
baed180aa0 [tech debt] Revisit lora request model checker (#20636)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-07-09 09:42:41 +08:00
Kunshang Ji
0b407479ef [misc]refactor Platform.set_device method (#20262)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-07-09 01:39:47 +00:00
Wenxin Cheng
5eaf570050 Replace multiply_add with homogeneous_multiply_add to Address Clang Template Parameter Issue (#20142)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-07-09 00:30:18 +00:00
QiliangCui
d8ee5a2ca4 [TPU][Bugfix] disable phi-3 test (#20632)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-08 23:14:26 +00:00
Isotr0py
b9fca83256 [Bugfix] Fix GLM-4.1-V video prompt update (#20635)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-08 23:13:58 +00:00
Cyrus Leung
32dffc2772 [Core] Rename get_max_tokens_per_item for backward compatibility (#20630)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-08 23:11:30 +00:00
Ming Yang
c438183e99 [Bugfix] Fix topk_ids indices_type for CUTLASS w8a8 FP8 MoE (#20166)
Signed-off-by: Ming Yang <yming@meta.com>
2025-07-08 23:10:57 +00:00
wang.yuqi
baba0389f7 [CI] Increase the threshold of the MTEB RERANK tests (#20615)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-08 08:10:11 -07:00
viravera
c6c22f16d3 Revert invalid spellchecker fix on deepseek_vl2 (#20618) 2025-07-08 15:07:14 +00:00
Cyrus Leung
dd382e0fe3 [Model] Implement missing get_language_model for Keye-VL (#20631)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-08 07:47:46 -07:00
XiongfeiWei
849590a2a7 Update torch/xla pin to 20250703 (#20589)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-07-08 07:44:02 -07:00
Yan Ma
a4c23314c0 [xpu]feat: support multi-lora on xpu (#20616)
Signed-off-by: yan <yan.ma@intel.com>
2025-07-08 22:07:10 +08:00
Harry Mellor
b942c094e3 Stop using title frontmatter and fix doc that can only be reached by search (#20623)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-08 03:27:40 -07:00
Harry Mellor
b4bab81660 Remove unnecessary explicit title anchors and use relative links instead (#20620)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-08 02:49:13 -07:00
Ricardo Decal
b91cb3fa5c [Docs] Improve documentation for Deepseek R1 on Ray Serve LLM (#20601)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-08 02:09:06 -07:00
Nicolò Lucchesi
71d1d75b7a [PD][Nixl] Remote consumer READ timeout for clearing request blocks (#20139)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-08 08:56:40 +01:00
Sanger Steel
72d14d0eed [Frontend] [Core] Integrate Tensorizer in to S3 loading machinery, allow passing arbitrary arguments during save/load (#19619)
Signed-off-by: Sanger Steel <sangersteel@gmail.com>
Co-authored-by: Eta <esyra@coreweave.com>
2025-07-07 22:47:43 -07:00
Chenyaaang
e34d130c16 [TPU] Temporary fix vmem oom for long model len by reducing page size (#20278)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-07-08 05:16:16 +00:00
Li, Jiang
7721ef1786 [CI/Build][CPU] Fix CPU CI and remove all CPU V0 files (#20560)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-07 22:13:44 -07:00
Reid
8369b7c2a9 [Misc] improve error msg (#20604)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-07 21:45:18 -07:00
Ricardo Decal
3eb4ad53f3 [Docs] Add Anyscale to frameworks (#20590)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-07 20:09:13 -07:00
Ricardo Decal
90a2769f20 [Docs] Add Ray Serve LLM section to openai compatible server guide (#20595)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-07 20:08:05 -07:00
Ricardo Decal
e60d422f19 [Docs] Improve docstring for ray data llm example (#20597)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-07 20:06:26 -07:00
Ricardo Decal
0d914c81a2 [Docs] Rewrite offline inference guide (#20594)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-07 20:06:02 -07:00
Harry Mellor
6e428cdd7a [Doc] Syntax highlight request responses as JSON instead of bash (#20582)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-07 20:02:45 -07:00
Chauncey
93b9d9f499 [Bugfix]: Fix messy code when using logprobs (#19209)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-08 11:02:15 +08:00
Harry Mellor
af107d5a0e Make distinct code and console admonitions so readers are less likely to miss them (#20585)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-07 19:55:28 -07:00
Woosuk Kwon
31c5d0a1b7 [Optimize] Don't send token ids when kv connector is not used (#20586)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-07 19:04:54 -07:00
Ming Yang
afb7cff1b9 [Bugfix] Fix Maverick correctness by filling zero to cache space in cutlass_moe (#20167)
Signed-off-by: Ming Yang <yming@meta.com>
2025-07-08 01:07:22 +00:00
Kyle Yu
d2e841a10a [Misc] Improve logging for dynamic shape cache compilation (#20573)
Signed-off-by: kyolebu <kyu@redhat.com>
2025-07-08 00:48:09 +00:00
Patrick von Platen
14601f5fba [Config] Refactor mistral configs (#20570)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-07-07 15:25:10 -07:00
Harry Mellor
042d131f39 Fix links in multi-modal model contributing page (#18615)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-07 21:13:52 +00:00
rongfu.leng
8e807cdfa4 [Misc] feat output content in stream response (#19608)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-07 20:45:10 +00:00
Anton
e601efcb10 [Misc] Add fully interleaved support for multimodal 'string' content format (#14047)
Signed-off-by: drobyshev.anton <drobyshev.anton@wb.ru>
Co-authored-by: drobyshev.anton <drobyshev.anton@wb.ru>
2025-07-07 19:43:08 +00:00
jvlunteren
22dd9c2730 [Kernel] Optimize Prefill Attention in Unified Triton Attention Kernel (#20308)
Signed-off-by: Jan van Lunteren <jvl@zurich.ibm.com>
2025-07-07 19:08:12 +00:00
Rui Qiao
a6d795d593 [DP] Copy environment variables to Ray DPEngineCoreActors (#20344)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-07-07 10:14:22 -07:00
ztang2370
a37d75bbec [Front-end] microbatch tokenization (#19334)
Signed-off-by: zt2370 <ztang2370@gmail.com>
2025-07-07 17:54:10 +01:00
Peter Pan
edd270bc78 [Bugfix] Prevent IndexError for cached requests when pipeline parallelism is disabled (#20486)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-07-07 09:41:15 -07:00
wang.yuqi
110df74332 [Model][Last/4] Automatic conversion of CrossEncoding model (#19675)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-07 14:46:04 +00:00
Harry Mellor
1ad69e8375 [Doc] Fix some MkDocs snippets used in the installation docs (#20572)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-07 07:44:34 -07:00
Harry Mellor
b8a498c9b2 [Doc] Add outline for content tabs (#20571)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-07 07:43:26 -07:00
Harry Mellor
923147b5e8 [Doc] Fix internal links so they don't always point to latest (#20563)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-07 04:15:50 -07:00
Harry Mellor
45877ef740 [Doc] Use gh-pr and gh-issue everywhere we can in the docs (#20564)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-07 03:54:22 -07:00
Harry Mellor
6e4bef1bea [Doc] Remove extra whitespace from CI failures doc (#20565)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-07 03:35:47 -07:00
Jee Jee Li
4ff79a136e [Misc] Set the minimum openai version (#20539)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-07 09:15:26 +00:00
Abirdcfly
448acad31e [Misc] remove unused jinaai_serving_reranking (#18878)
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
2025-07-07 09:14:12 +00:00
Michael Yao
eb0b2d2f08 [Docs] Clean up tables in supported_models.md (#20552)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-07-07 01:46:31 -07:00
Yan Ma
3112271f6e [XPU] log clean up for XPU platform (#20553)
Signed-off-by: yan <yan.ma@intel.com>
2025-07-07 01:38:22 -07:00
Michael Yao
1fd471e957 Add docstrings to url_schemes.py to improve readability (#20545)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-07-07 08:31:49 +00:00
Liangliang Ma
2c5ebec064 [XPU][CI] add v1/core test in xpu hardware ci (#20537)
Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com>
2025-07-07 01:16:40 -07:00
Jee Jee Li
2e610deb72 [CI/Build] Enable phi2 lora test (#20540)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-07 05:10:41 +00:00
Yang Yang
6e2c19ce22 [Refactor]Abstract Platform Interface for Distributed Backend and Add xccl Support for Intel XPU (#19410)
Signed-off-by: dbyoung18 <yang5.yang@intel.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2025-07-07 04:32:32 +00:00
Reid
47db8c2c15 [Misc] add a tip for pre-commit (#20536)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-06 19:42:06 -07:00
Woosuk Kwon
462b269280 Implement OpenAI Responses API [1/N] (#20504)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-06 18:32:13 -07:00
Cyrus Leung
c18b3b8e8b [Bugfix] Add use_cross_encoder flag to use correct activation in ClassifierPooler (#20527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 14:01:48 -07:00
Woosuk Kwon
9528e3a05e [BugFix][Spec Decode] Fix spec token ids in model runner (#20530)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-06 19:44:52 +00:00
Cyrus Leung
9fb52e523a [V1] Support any head size for FlexAttention backend (#20467)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 09:54:36 -07:00
Woosuk Kwon
e202dd2736 [V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-07-06 08:48:13 -07:00
Reid
43813e6361 [Misc] call the pre-defined func (#20518)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-06 10:25:29 +00:00
Brayden Zhong
cede942b87 [Benchmark] Add support for multiple batch size benchmark through CLI in benchmark_moe.py (#20516)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-07-06 09:20:11 +00:00
Flora Feng
fe1e924811 [Frontend] Support image object in llm.chat (#19635)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Flora Feng <4florafeng@gmail.com>
2025-07-06 06:47:13 +00:00
Chengji Yao
4548c03c50 [TPU][Bugfix] fix the MoE OOM issue (#20339)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-05 21:19:09 -07:00
Lucas Wilkinson
40b86aa05e [BugFix] Fix: ImportError when building on hopper systems (#20513)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-06 12:17:30 +08:00
Lucia Fang
432870829d [Bugfix] Fix missing per_act_token parameter in compressed_tensors_moe (#20509)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-07-06 12:08:30 +08:00
Vadim Gimpelson
f73d02aadc [BUG] Fix #20484. Support empty sequence in cuda penalty kernel (#20491)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-07-05 19:38:02 -07:00
Jeremy Reizenstein
c5ebe040ac test_attention compat with coming xformers change (#20487)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-05 19:37:59 -07:00
Reid
8d763cb891 [Misc] remove unused import (#20517)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 19:17:06 -07:00
Reid
cf4cd53982 [Misc] Add logger.exception for TPU information collection failures (#20510)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 07:24:32 -07:00
Isotr0py
32c9be2200 [v1] Re-add fp32 support to v1 engine through FlexAttention (#19754)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-05 09:41:10 +00:00
Lucia Fang
8aeaa910a2 Fix unknown attribute of topk_indices_dtype in CompressedTensorsW8A8Fp8MoECutlassMethod (#20507)
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-07-05 14:03:20 +08:00
Jee Jee Li
906e05d840 [Misc] Remove the unused LoRA test code (#20494)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-05 13:48:16 +08:00
Reid
ef9a2990ae [doc] small fix (#20506)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:56:39 -07:00
Reid
7e90870491 [Misc] Add security warning for development mode endpoints (#20508)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:52:13 -07:00
Guy Stone
d3f05c9248 [Doc] fix mutltimodal_inputs.md gh examples link (#20497)
Signed-off-by: Guy Stone <guys@spotify.com>
2025-07-04 16:41:35 -07:00
Michael Goin
c108781c85 [CI Bugfix] Fix pre-commit failures on main (#20502) 2025-07-04 14:17:30 -07:00
Duncan Moss
3d184b95b8 [feat]: CUTLASS block scaled group gemm for SM100 (#19757)
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
Co-authored-by: Duncan Moss <dmoss@nvidia.com>
2025-07-04 12:58:04 -06:00
Thomas Parnell
2f35a022e6 Enable V1 for Hybrid SSM/Attention Models (#20016)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Stanislaw Wozniak <stw@zurich.ibm.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-07-04 17:46:53 +00:00
Chenheli Hua
ffe00ef77a [Misc] Small: Remove global media connector. Each test should have its own test connector object. (#20395)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-07-04 08:15:03 -07:00
Peter Pan
5561681d04 [CI] add kvcache-connector dependency definition and add into CI build (#18193)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-07-04 06:49:18 -07:00
Cyrus Leung
fbd62d8750 [Doc] Fix classification table in list of supported models (#20489)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-04 06:08:02 -07:00
wang.yuqi
2e26f9156a [Model][3/N] Automatic conversion of CrossEncoding model (#20168)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-04 05:47:39 -07:00
sangbumlikeagod
9e5452ee34 [Bug][Frontend] Fix structure of transcription's decoder_prompt (#18809)
Signed-off-by: sangbumlikeagod <oironese@naver.com>
2025-07-04 11:28:07 +00:00
Michael Goin
0e3fe896e2 Support Llama 4 for fused_marlin_moe (#20457)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-04 07:55:10 +00:00
Jee Jee Li
1caca5a589 [Misc] Add SPDX-FileCopyrightText (#20428)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-04 07:40:42 +00:00
Wentao Ye
783921d889 [Perf] Optimize Vectorization Utils for Int 8 Quantization Kernels (#20331)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-04 15:06:24 +08:00
Aaron Pham
4a98edff1f [Structured Outputs][V1] Skipping with models doesn't contain tokenizers (#20365)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-07-04 15:05:49 +08:00
Reid
a7bab0c9e5 [Misc] small update (#20462)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-03 20:33:44 -07:00
汪志鹏
25950dca9b Add ignore consolidated file in mistral example code (#20420)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-07-04 02:55:07 +00:00
Gabriel Marinho
a4113b035c [Platform] Add custom default max tokens (#18557)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
2025-07-04 10:50:17 +08:00
Michael Goin
7e1665b089 [Misc] Change warn_for_unimplemented_methods to debug (#20455) 2025-07-04 02:35:08 +00:00
Seiji Eicher
8d1096e7db [Bugfix] Register reducer even if transformers_modules not available (#19510)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-07-03 22:08:12 +00:00
Nicolò Lucchesi
8d775dd30a [Misc] Fix Unable to detect current VLLM config. Defaulting to NHD kv cache layout warning (#20400)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-03 14:56:09 -07:00
bnellnm
78fe77534b [Kernel] Enable fp8 support for pplx and BatchedTritonExperts. (#18864)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-07-03 14:55:40 -07:00
Yuxuan Zhang
2f2fcb31b8 [Misc] Remove _maybe_ignore_quant_config from GLM4.1v (#20432)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-07-03 21:41:13 +00:00
Ning Xie
1dba2c4ebe [Misc] adjust for ipv6 for mookcacke url parse (#20107)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-03 20:27:17 +00:00
Isotr0py
71d6de3a26 [Misc] Clean up InternVL family config registration (#19992)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-03 20:01:47 +00:00
Alexei-V-Ivanov-AMD
536fd33003 [CI] Trimming some failing test groups from AMDPRODUCTION. (#20390) 2025-07-03 08:21:31 -07:00
Reid
619b9f5c7e [Frontend] fix duplicate output for bench subcmd (#20446)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-03 08:02:06 -07:00
Nicolò Lucchesi
d1b689c445 [Bugfix] Fix flaky test_streaming_response test (#20363)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-03 14:46:24 +00:00
Reid
9854dc9040 [Frontend] improve vllm bench <bench_type> --help display (#20430)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-03 14:22:16 +00:00
Isotr0py
ff5c60fad8 [Misc] Automatically tag PRs to add new models (#20222)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-03 07:11:03 -07:00
wang.yuqi
6f1229f91d [Model][2/N] Automatic conversion of CrossEncoding model (#19978)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-03 13:59:23 +00:00
Jee Jee Li
1819fbda63 [Quantization] Bump to use latest bitsandbytes (#20424)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-03 21:58:46 +08:00
Li, Jiang
7f0367109e [CI/Build][CPU] Enable cross compilation in CPU release pipeline (#20423)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-03 05:26:12 -07:00
Ning Xie
fb14d53cf6 [Kernel] refactor cpu worker v0 cache dtype (#20080)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-03 08:39:14 +00:00
Cyrus Leung
b024a42e93 [Core] Move multimodal placeholder from chat utils to model definition (#20355)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-03 08:18:30 +00:00
Michael Yao
cb97f2bfc5 [Docs] Replace two list with tables in intel_gaudi.md (#20414)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-07-03 00:48:25 -07:00
Reid
359200f6ac [doc] fix link (#20417)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-03 00:21:57 -07:00
Lifans
220aee902a [Misc] Add rules to label Speculative Decoding Related PRs (#20406)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-07-02 23:56:49 -07:00
Nick Hill
67d25eca05 [Tests] Update online DP tests to verify that requests are balanced (#20157)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-03 14:49:13 +08:00
qscqesze
363528de27 [Feature] Support MiniMax-M1 function calls features (#20297)
Signed-off-by: QscQ <qscqesze@gmail.com>
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-07-03 06:48:27 +00:00
QiliangCui
4ff61ababa [TPU] Add a case to cover RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 (#20385)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-07-03 06:46:41 +00:00
Li, Jiang
0ec3779df7 [Bugfix][CI/CD][CPU] Fix CPU CI tests (#20383)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-02 20:11:36 -07:00
Chenheli Hua
b616f6a53d [Misc] Small: Fix video loader return type annotations. (#20389)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-07-03 03:10:39 +00:00
bnellnm
2e25bb12a8 [Bugfix] Fix import of CutlassExpertsFp8 in compressed_tensors_moe.py (#20381)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-07-03 02:07:43 +00:00
Louie Tsai
9965c47d0d Enable CPU nightly performance benchmark and its Markdown report (#18444)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-07-02 17:50:25 -07:00
Nick Hill
059d4cdb49 [BugFix] Fix DP headless mode arg validation (#20398)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-02 17:15:32 -07:00
Tyler Michael Smith
bdb84e26b0 [Bugfix] Fixes for FlashInfer's TORCH_CUDA_ARCH_LIST (#20136)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
2025-07-02 17:15:11 -07:00
Nicolò Lucchesi
3dd359147d [Docs] Update EAGLE example (#20375)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-02 17:13:51 -07:00
Nick Hill
657f2f301a [DP] Support external DP Load Balancer mode (#19790)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-02 10:21:52 -07:00
vllmellm
a1aafc827a [ROCm][FEAT] Enable Full Graph Mode in AITER MLA V1 Attn Backend (Decode Phase only) (#20254)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-07-02 16:25:46 +00:00
rongfu.leng
139508a418 [Misc] add handler HF_TOKEN is emptry string (#20369)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-02 09:14:31 -07:00
Nick Hill
d265414dbc [Minor] Clean up incorrect comment in test (#20382)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-02 09:13:37 -07:00
afeldman-nm
48fb076cbc [V1] LogitsProcessor programming model (#16728)
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-07-02 09:10:42 -07:00
bnellnm
c1909e7e8c [Kernels] MoE refactor (#19636)
Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Co-authored-by: ElizaWszola <ewszola@redhat.com>
2025-07-02 06:08:27 -07:00
cronoik-inceptionai
b95877509b Documentation update tool_calling: mapping back to function from response (#20373) 2025-07-02 05:55:49 -07:00
zichongli5
706ff13224 [Model] Adds support for SlimMoE models Phi-tiny-MoE-instruct (#20286)
Signed-off-by: Zichong Li <t-lizichong@microsoft.com@Reasoning-H100-VM3.drbuo4tcjzruhloch3eo0b25ef.cx.internal.cloudapp.net>
Co-authored-by: Zichong Li <t-lizichong@microsoft.com@Reasoning-H100-VM3.drbuo4tcjzruhloch3eo0b25ef.cx.internal.cloudapp.net>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-07-02 12:54:12 +00:00
WangHuaqiang
ccbfb1d1c9 [Bugfix] Fix the max_seq_len limit of 16384 for DeepSeek models (#20322)
Signed-off-by: Wang Huaqiang <huaqiang.wang@intel.com>
2025-07-02 12:53:36 +00:00
Joonchen Liau
9e5552aa13 [NVIDIA] Support Cutlass w8a8 FP8 for Blackwell Geforce GPUs (sm120) (#17280)
Signed-off-by: kaln27 <liaojuncheng123@foxmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-02 06:47:19 -06:00
Lu Fang
0c600b9ab6 [Build/CI] Automatically tag DeepSeek related PRs (#20370)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-07-02 04:02:43 -07:00
CSWYF3634076
e303dcf523 [Model] Add Ernie4.5 and Ernie4.5MoE Model Support (#20220)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-07-02 03:37:01 -07:00
Michael Yao
ae9c4d416f [Docs] Make TPU ref prettier in google_tpu.md (#20356)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-07-02 02:04:08 -07:00
Michael Yao
d853520b3e [Docs] Fix indentations for 2-level items in deprecation_policy.md (#20352)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-07-01 23:50:31 -07:00
Cyrus Leung
ba51aea65e [Bugfix] Keye-VL compatibility with tok_kwargs (#20058) (#20353)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-01 23:46:59 -07:00
Kwai-Keye
8452946c06 [Model][VLM] Support Keye-VL-8B-Preview (#20126)
Signed-off-by: Kwai-Keye <Keye@kuaishou.com>
2025-07-01 23:35:04 -07:00
Chenheli Hua
2e7cbf2d7d [Frontend] Support configurable mm placeholder strings & flexible video sampling policies via CLI flags. (#20105)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-07-01 23:34:03 -07:00
Chengji Yao
7da296be04 [TPU] kv cache update kernel supports dynamic grid (#20235)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-02 06:33:37 +00:00
QiliangCui
b205e8467d [Doc][TPU] Add models and features supporting matrix. (#20230)
Signed-off-by: Qiliang Cui <cuiq@google.com>
2025-07-02 06:33:20 +00:00
yyzxw
be0cfb2b68 fix[Docs]: link anchor is incorrect #20309 (#20315)
Signed-off-by: zxw <1020938856@qq.com>
2025-07-02 06:32:34 +00:00
Cyrus Leung
1a03dd496b [Bugfix] Fix dynamic rotary embedding (#20343)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-02 06:31:26 +00:00
Kunshang Ji
27b8017636 [FIX][Intel GPU]fix ipex flash_attn_varlen_func api missing parameter (#20348)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-07-01 22:26:40 -07:00
Lifans
9ec1e3065a [Misc][Doc] Add missing comment for LLM (#20285)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-07-01 19:04:24 -07:00
Wentao Ye
9dae7d46bf [Refactor] Remove Unused Env VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON (#20334)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-01 19:03:43 -07:00
Wentao Ye
7058d7dd5d [Refactor] Remove duplicate find_free_port (#20333)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-01 19:03:07 -07:00
Liangliang Ma
a0389e0554 [UT][intel GPU] use current_platform instead of device hardcode in v1 tests (#20169)
Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com>
2025-07-02 09:06:04 +08:00
Tyler Michael Smith
3be8d312a2 [Kernel][Bugfix] Fixup some warnings in nvfp4_blockwise_moe when CUDA < 12.8 (#20324)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-07-01 18:05:47 -07:00
czhu-cohere
3abfe22154 Enable group size 64 for Machete (#20290)
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
2025-07-01 18:05:44 -07:00
Wentao Ye
e81fbefe8a [Refactor] Refactor import utils (#20269)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-01 18:05:42 -07:00
周周周
9290de5667 remove unused variables in marlin_template.h (#20236) 2025-07-02 00:51:52 +00:00
Woosuk Kwon
7f280d69c9 [Optimization] Cache sampled token ids in model runner (#20291)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-01 11:01:31 -07:00
TJian
02cabff207 [V1] [ROCm] Enable EP with AITER Fused MoE (#20270)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-07-01 16:48:30 +00:00
Shintarou Okada
3d19d47d91 [Frontend] Expand tools even if tool_choice="none" (#17177)
Signed-off-by: okada shintarou <okada@preferred.jp>
2025-07-01 12:47:38 -04:00
Woosuk Kwon
8acb4badee [CUDA graphs] Enable full cuda graphs with FA3 AoT scheduling (#20301)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-01 09:07:36 -07:00
Nicolò Lucchesi
314af8617c [Docs] Update transcriptions API to use openai client with stream=True (#20271)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-01 15:47:13 +00:00
Woosuk Kwon
0e96cc9b7e [Misc] Minor refactoring for scheduler (#20299)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-01 07:55:32 -07:00
aiyiwang2025
ecad851cbd [Model]Add Tencent HunYuanMoEV1 Model Support (#20114)
Signed-off-by: aiyiwang <aiyiwang@tencent.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: quinnrong <quinnrong@tencent.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-01 07:28:13 -07:00
Yuxuan Zhang
ed70f3c64f Add GLM4.1V model (Draft) (#19331)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-01 12:48:26 +00:00
Nicolò Lucchesi
650d5dbd04 [Misc] Minor refactor of NIXL background handshake (#20068)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-01 12:40:14 +01:00
Kyle Sayers
9025a9a705 [Quant] [Bugfix] Fix quantization config matching with hf_to_vllm_mapper (#20046) 2025-07-01 19:20:34 +09:00
Lionel Villard
c05596f1a3 [Perf] Validate @config in pre-commit instead of dynamically (#20200)
Signed-off-by: Lionel Villard <villard@us.ibm.com>
2025-07-01 05:10:28 -04:00
Reid
787b13389e [doc] fix the incorrect logo in dark mode (#20289)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-01 08:18:09 +00:00
TY-AMD
96453cfa83 [BugFix][V1][ROCm] Triton MLA uses V0 backend on V1 engine (#19067)
Signed-off-by: Tianyuan Wu <Tianyuan.Wu@amd.com>
2025-07-01 16:12:19 +08:00
Kebe
b1c1fe35a5 [Misc] remove redundant char (#20287)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-07-01 15:33:22 +08:00
Varun Sundar Rabindranath
08d81f1014 [Bugfix] Fix deepep tests (#20288)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-01 15:29:08 +08:00
Li, Jiang
6cc1e7d96d [CPU] Update custom ops for the CPU backend (#20255)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-01 07:25:03 +00:00
czhu-cohere
9909726d2a Enable ZP Support for Machete (#20268)
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
2025-07-01 07:12:20 +00:00
Prashant Gupta
22e9d42040 [Misc] add xgrammar for arm64 (#18359)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
2025-07-01 07:02:20 +00:00
Richard Barnes
86debab54c Fix numel() downcast in vllm/csrc/moe/moe_align_sum_kernels.cu +2 (#17082)
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-01 06:48:10 +00:00
Michael Goin
be250bbc67 [V1] Only print cudagraph tqdm on rank 0 with is_global_first_rank (#19516)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-01 06:02:09 +00:00
Alex Kogan
27949354fa [Feature] A calibration-free RTN-based quantization for accurate and accelerated INT4/INT8 inference (#18768)
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-07-01 05:44:38 +00:00
Ernest Wong
bd5038af07 [Doc] add config and troubleshooting guide for NCCL & GPUDirect RDMA (#15897)
Signed-off-by: Ernest Wong <chwong719@gmail.com>
2025-06-30 21:44:39 -07:00
Chendi.Xue
a2f14dc8f9 [CI][Intel Gaudi][vllm-Plugin]Add CI for hpu-plugin-v1-test (#20196)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-07-01 04:17:07 +00:00
Kuntai Du
92ee7baaf9 [Example] add one-click runnable example for P2P NCCL XpYd (#20246)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2025-06-30 21:03:55 -07:00
Woosuk Kwon
7151f92241 [Misc] Fix spec decode example (#20296)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-30 21:01:48 -07:00
fyuan1316
e28533a16f [Bugfix] Fix include prompt in stream response when echo=true (#15233)
Signed-off-by: Yuan Fang <yuanfang@alauda.io>
2025-07-01 01:30:14 +00:00
Luka Govedič
6d42ce8315 [CLI] Improve CLI arg parsing for -O/--compilation-config (#20156)
Signed-off-by: luka <luka@neuralmagic.com>
2025-07-01 01:03:13 +00:00
Zhonghua Deng
ded1fb635b [Bugfix][V1][P/D]Fix the issue of occasional garbled output for P2pNcclConnector (#20263)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-06-30 16:45:14 -07:00
Wentao Ye
97d9524fe9 [Refactor] Remove useless pdb comment (#20266)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-30 18:15:24 +00:00
Kyle Sayers
d8cf819a9a [Core] [Bugfix] [Multimodal] Fix multimodal profiling and generation for SFT/PTQed models (#20058)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-06-30 17:26:49 +00:00
Wentao Ye
551ef1631a [Unit Test] Add unit test for deep gemm (#20090)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-30 10:26:42 -06:00
Woosuk Kwon
2863befce3 [Optimization] Use Shared CachedRequestData Instance Across All Requests (#20232)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-30 09:07:50 -07:00
Woosuk Kwon
2965c99c86 [Spec Decode] Clean up spec decode example (#20240)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-30 08:28:13 -07:00
Woosuk Kwon
2062c0723d [Spec Decode] Refactor spec decoding into a separate function (#20238)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-30 08:13:50 -07:00
li haoyang
1c50e100a9 [Bugfix] fix quark ptpc (#20251)
Signed-off-by: Haoyang Li <Haoyang.Li@amd.com>
Co-authored-by: Haoyang Li <307790822@qq.com>
2025-06-30 22:24:50 +09:00
Michael Yao
3ee56e26be [Docs] Fix 1-2-3 list in v1/prefix_caching.md (#20243)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-06-30 11:20:51 +00:00
Jee Jee Li
8fe7fc8634 [Quantization] Improve BitsAndBytesModelLoader (#20242)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-30 18:22:09 +08:00
Isotr0py
e936e401de [Bugfix] Fix processor initialization in transformers 4.53.0 (#20244)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-30 10:16:16 +00:00
noiji
f5dfa07531 [Bugfix] Skip loading extra parameters for modelopt Qwen3 MoE model (#19598)
Signed-off-by: noiji <>
2025-06-30 18:21:56 +09:00
Reid
022c58b80f [doc] Add Slack and Forum to the top navigation (#20208)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-06-30 07:53:45 +00:00
Woosuk Kwon
19108ef311 [Misc] Fix import (#20233)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-29 20:34:54 -07:00
Chendi.Xue
5a52f389dd [BUGFIX][DEEPSEEK][MODEL_LOAD] fix w13, w2 weight not initialized assert (#20202)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-06-29 19:46:19 -07:00
redmoe-moutain
65b1cbb138 [Model] support dots1 (#18254)
Signed-off-by: redmoe-moutain <agiredmoe@gmail.com>
2025-06-29 19:34:36 -07:00
Huy Do
6c9837a761 Fix cuda_archs_loose_intersection when handling sm_*a (#20207)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-06-29 16:52:34 -07:00
Dipika Sikka
6f2f53a82d [Quantization] Add compressed-tensors NVFP4 MoE Support (#19990)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Dipika <dipikasikka1@gmail.com>
2025-06-29 22:05:40 +00:00
Michael Goin
7b1895e6ce [CI Fix] Try fixing eagle e2e test OOM by reducing block allocation (#20213)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-29 10:31:37 +08:00
Wentao Ye
4d36693687 [Refactor] Create a function util and cache the results for has_deepgemm, has_deepep, has_pplx (#20187)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-28 22:06:38 +00:00
Stan Wozniak
daec9dea6e [Bugfix] Correct behavior of GraniteMoeHybrid for TensorParallel execution (#20137)
Signed-off-by: Stanislaw Wozniak <stw@zurich.ibm.com>
2025-06-28 08:16:41 -07:00
Nicolò Lucchesi
daceac57c7 [Frontend] Generalize v1/audio/transcriptions endpoint (#20179)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-06-28 08:15:26 -07:00
Thomas Parnell
8615d9776f [CI/Build] Add new CI job to validate Hybrid Models for every PR (#20147)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-06-27 23:00:25 -07:00
Jiayi Yan
7b460c25f9 [BugFix] Fix the incorrect func name in the comments. (config.py) (#20185) 2025-06-27 22:51:16 -07:00
Michael Goin
f719772281 [Bugfix] Properly reject requests with empty list guided_choice (#20195)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-27 22:50:52 -07:00
Wentao Ye
d45417b804 fix ci issue distributed 4 gpu test (#20204)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-27 22:50:00 -07:00
Michael Goin
a29e62ea34 Fix num_token_padding support for static per-tensor scaled_fp8_quant (#20188)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-27 22:48:13 -07:00
Chales Xu
e53be6f00a [Misc] Add type assertion of request_id for LLMEngine.add_request (#19700)
Signed-off-by: n2ptr <xuzhanchaomail@163.com>
2025-06-27 22:47:36 -07:00
Michael Goin
c329ceca6d [CI Fix] Pin tests/models/registry.py MiniMaxText01ForCausalLM to revision due to model changes (#20199)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-28 13:43:06 +08:00
Fabien Dupont
3c545c0c3b [CI/Build] Allow hermetic builds (#18064)
Signed-off-by: Fabien Dupont <fdupont@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Fabien Dupont <fabiendupont@pm.me>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Elias Levy <eliaslevy@google.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-06-27 09:04:39 -07:00
Tyler Michael Smith
e8c3bd2cd1 [Bugfix] Fix some narrowing conversion warnings (#20141)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-27 09:01:28 -07:00
bnellnm
c6c983053d [Bugfix] Mark 'hidden_states' as mutable in moe_forward registration. (#20152)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-27 09:42:22 -06:00
Luka Govedič
aafabaa0d5 [Fix][torch.compile] Enable custom ops by default when Inductor off (#20102)
Signed-off-by: luka <luka@neuralmagic.com>
2025-06-27 09:00:42 -06:00
Hosang
94a55c7681 [Fix][ROCm] Remove unused variables to fix build error on GFX11/12 (#19891)
Signed-off-by: Hosang Yoon <hosang.yoon@amd.com>
2025-06-27 07:14:44 -07:00
Ilya Lavrenov
aa0dc77ef5 [Perf] Improved perf for resolve_chat_template_content_format (#20065)
Signed-off-by: Ilya Lavrenov <ilya.lavrenov@cerebras.net>
2025-06-27 09:16:41 +00:00
Michael Goin
4ab3ac285e [Bugfix] Fix flaky failure when getting DP ports (#20151)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-27 15:30:53 +08:00
Robert Shaw
d1c956dc0f Gemma3n (Text-only) (#20134)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-06-27 07:16:26 +00:00
Chendi.Xue
dec197e3e5 Quick Fix by adding conditional import for flash_attn_varlen_func in flash_attn (#20143)
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
2025-06-27 05:48:13 +00:00
Yazan Sharaya
6e244ae091 [Perf][Frontend] eliminate api_key and x_request_id headers middleware overhead (#19946)
Signed-off-by: Yazan-Sharaya <yazan.sharaya.yes@gmail.com>
2025-06-27 00:44:14 -04:00
wang.yuqi
cd4cfee689 [Model][1/N] Automatic conversion of CrossEncoding model (#20012)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-06-26 21:10:04 -07:00
Thomas Parnell
e110930680 [Fix] Fix gemma CI test failing on main (#20124)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-06-26 21:06:59 -07:00
Yang Wang
8b64c895c0 [CI] Sync test dependency with test.in for torch nightly (#19632)
Signed-off-by: Yang Wang <elainewy@meta.com>
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Concurrensee <yida.wu@amd.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-26 20:55:25 -07:00
li haoyang
0740e29b66 [Feature] add quick all reduce (#19744)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Signed-off-by: Haoyang Li <Haoyang.Li@amd.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
2025-06-26 20:54:24 -07:00
Michael Goin
44d2e6af63 [Bugfix] Build moe_data for both sm100 and sm90 (#20086)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-26 20:50:12 -07:00
Ilya Markov
2d7779f888 [Perf] SM100 FP8 GEMM Optimizations after cutlass_profiler (#20071)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
2025-06-26 20:50:09 -07:00
Dipika Sikka
a57d57fa72 [Quantization] Bump to use latest compressed-tensors (#20033)
Signed-off-by: Dipika <dipikasikka1@gmail.com>
Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>
2025-06-26 20:50:06 -07:00
Michael Goin
71799fd005 [CI Failure] Fix OOM with test_oot_registration_embedding (#20144)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-27 11:21:04 +08:00
Bowen Wang
e9fd658a73 [Feature] Expert Parallelism Load Balancer (EPLB) (#18343)
Signed-off-by: Bowen Wang <abmfy@icloud.com>
2025-06-26 15:30:21 -07:00
Kyle Yu
07b8fae219 [Doc] correct LoRA capitalization (#20135)
Signed-off-by: kyolebu <kyu@redhat.com>
2025-06-26 15:22:12 -07:00
Wentao Ye
562308816c [Refactor] Rename commnication utils (#20091)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-26 22:19:32 +00:00
Chengji Yao
04e1642e32 [TPU] add kv cache update kernel (#19928)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-26 10:01:37 -07:00
Kunshang Ji
b69781f107 [Hardware][Intel GPU] Add v1 Intel GPU support with Flash attention backend. (#19560)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-06-26 09:27:18 -07:00
Tyler Michael Smith
0bceac9810 Spam folks if config.py changes (#20131)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-26 08:19:46 -07:00
Cyrus Leung
34878a0b48 [Doc] Rename page titles (#20130)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-26 08:18:49 -07:00
Cyrus Leung
6393b03986 [Doc] Auto sign-off for VSCode (#20132)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-26 08:18:36 -07:00
wang.yuqi
0907d507bf [Doc] Automatically signed-off by PyCharm (#20120)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-06-26 14:34:17 +00:00
Wentao Ye
c894c5dc1f [Bug Fix] Fix address/port already in use error for deep_ep test (#20094)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-26 22:33:13 +08:00
Michael Goin
1f5d178e9c Revert "[Bugfix] default set cuda_graph_sizes to max_num_seqs for v1 engine" (#20128) 2025-06-26 07:32:22 -07:00
TJian
27c065df50 [Bugfix][V1][ROCm] Fix AITER Flash Attention Backend (Fix API Break and Local Attention Logic: affecting Llama4) (#19904)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-06-26 12:42:31 +00:00
Michael Yao
84c260caeb [Docs] Improve frameworks/helm.md (#20113)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-06-26 10:41:51 +00:00
Reid
167aca45cb [Misc] Use collapsible blocks for benchmark examples. (#20017)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-26 03:35:16 -07:00
Li, Jiang
0567c8249f [CPU] Fix torch version in x86 CPU backend (#19258)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-26 03:34:47 -07:00
Wentao Ye
d188913d99 [Refactor] Remove unused library (#20099)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-26 09:16:10 +00:00
Cyrus Leung
1d7c29f5fe [Doc] Update docs for New Model Implementation (#20115)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-26 00:47:06 -07:00
Seiji Eicher
65397e40f5 [Bugfix] Allow CUDA_VISIBLE_DEVICES='' in Platform.device_id_to_physical_device_id (#18979)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-06-26 00:01:57 -07:00
Ekagra Ranjan
9502c38138 [Benchmark][Bug] Fix multiple bugs in bench and add args to spec_decode offline (#20083) 2025-06-25 22:06:27 -07:00
Nicolò Lucchesi
2582683566 [PD] Skip tp_size exchange with rank0 (#19413)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-06-25 20:04:39 -07:00
Michael Goin
754b00edb3 [Bugfix] Fix Mistral tool-parser regex for nested JSON (#20093)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-26 01:01:17 +00:00
Michael Goin
296ce95d8e [CI] Add SM120 to the Dockerfile (#19794)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-25 16:23:56 -07:00
Chenyaaang
2d7620c3eb [TPU] Add TPU specific var VLLM_TPU_MOST_MODEL_LEN (#19919)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-25 15:51:02 -07:00
Nick Hill
55c65ab495 [P/D] Avoid stranding blocks in P when aborted in D's waiting queue (#19223)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-25 15:19:44 -07:00
Chengji Yao
2cc2069970 [TPU][Bugfix] fix kv cache padding (#20048)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-25 21:24:10 +00:00
zhrrr
9f0608fc16 [Bugfix] default set cuda_graph_sizes to max_num_seqs for v1 engine (#20062)
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
2025-06-25 21:03:17 +00:00
QiliangCui
4e0db57fff Fix the path to the testing script. (#20082)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-25 20:48:17 +00:00
Nick Hill
c40692bf9a [Misc] Add parallel state node_count function (#20045)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-25 13:38:53 -07:00
lkchen
4734704b30 [PD] let toy proxy handle /chat/completions (#19730)
Signed-off-by: Linkun <github@lkchen.net>
2025-06-25 15:17:45 -04:00
Eldar Kurtić
8b8c209e35 static_scaled_fp8_quant should not run when scale.numel is not 1 (#20076) 2025-06-25 15:08:03 -04:00
lsz05
23a04e0895 [Fix] Support cls pooling in ModernBertPooler (#20067)
Signed-off-by: shengzhe.li <shengzhe.li@sbintuitions.co.jp>
2025-06-25 15:07:45 -04:00
Dipika Sikka
02c97d9a92 [Quantization] Add compressed-tensors emulations support for NVFP4 (#19879)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Dipika <dipikasikka1@gmail.com>
2025-06-25 14:28:19 -04:00
Nicolò Lucchesi
e795d723ed [Frontend] Add /v1/audio/translations OpenAI API endpoint (#19615)
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-06-25 17:54:14 +00:00
cjackal
8359f4c8d8 [V1][Speculative Decoding] Fix DeepSeek MTP (#20022)
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
2025-06-25 08:41:02 -07:00
Michael Goin
bf5181583f [Doc] Guide for Incremental Compilation Workflow (#19109) 2025-06-25 22:06:46 +09:00
Reid
c53fec1fcb [doc] add reference link for Intel XPU (#20064)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-25 12:24:07 +00:00
Lucas Wilkinson
0f9e7354f5 [BugFix] Fix full-cuda-graph illegal memory access in FA3 (#20057)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-06-25 08:39:04 +00:00
Aaron Pham
ba7ba35cda [Chore] debloat some initial logs (#19438)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-25 06:36:22 +00:00
bnellnm
015fab8c2f [Kernels][Bugfix] Use torch op for all kernels in FusedMoE forward. Add additional testing for cudagraphs. (#19717)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-24 23:22:58 -07:00
Max Wittig
f59fc60fb3 [Feat][CLI] enforce-include-usage (#19695)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
2025-06-25 01:43:04 -04:00
Wentao Ye
879f69bed3 [Refactor] Remove duplicate ceil_div (#20023)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-25 05:19:09 +00:00
David Xia
7108934142 [Frontend] speed up import time of vllm.config (#18036)
Signed-off-by: David Xia <david@davidxia.com>
2025-06-25 00:41:11 -04:00
h-avsha
3443aaf8dd Move to a faster base64 implementation (#19984)
Signed-off-by: h-avsha <avshalom.manevich@hcompany.ai>
2025-06-24 20:33:51 -07:00
Isotr0py
2273ec322c Revert "Fix(models/siglip): Add compatibility for Gemma models quantized by llm-compressor" (#20030) 2025-06-25 11:23:29 +08:00
Wentao Ye
a6c4b87fbc Revert "[Feature] Integrate new deepgemm (#19820)" (#20049)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-24 19:45:22 -07:00
Brayden Zhong
1afa9948f5 [Llama4] Update attn_temperature_tuning (#19997)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-06-24 22:42:53 -04:00
Eli Uriegas
0d06b533a0 cmake: Update vllm_flash_attn for vllm_kernels (#20032)
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
2025-06-24 22:44:10 +00:00
Boyuan Feng
c01d1c5aba use .dev for version comparison with pytorch nightly release (#20031)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-06-24 21:52:16 +00:00
Brayden Zhong
ead369845d [Easy] Remove submodule added in #19463 (#20039)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-06-24 13:23:15 -07:00
Wentao Ye
c6e3bba8e6 [Feature] Integrate new deepgemm (#19820)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-24 12:51:56 -07:00
lkchen
91f7d9d0b6 [P/D] Asynchronously do _nixl_handshake (#19836)
Signed-off-by: Linkun Chen <github@lkchen.net>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-24 12:46:10 -07:00
Nick Hill
8619e7158c [BugFix] Fix multi-node offline data parallel (#19937)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-24 12:45:20 -07:00
d.transposed
c635c5f744 [Misc][Benchmarking] Add variable request-rate ("ramp-up") to the benchmarking client. (#19423)
Signed-off-by: dtransposed <damian@damian-ml-machine.europe-west3-b.c.jetbrains-grazie.internal>
Co-authored-by: dtransposed <damian@damian-ml-machine.europe-west3-b.c.jetbrains-grazie.internal>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-06-24 18:41:49 +00:00
Lucas Wilkinson
a045b7e89a [Perf] Improve/Fix-regression for FA3 in High QPS regimes (#19463)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-06-24 13:09:01 -04:00
amit
981eeca41a [Fix][V1] Remove --scheduling-policy oracle (#20010)
Signed-off-by: amit <amit.man@gmail.com>
2025-06-24 09:52:15 -07:00
Reid
26d34eb67e refactor example - qwen3_reranker (#19847)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-24 14:03:20 +00:00
Li, Jiang
53da4cd397 [Bugfix][CPU] Fix InputBatch for pooling models in the CPU v1 (#20014)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-24 13:20:04 +00:00
Vadim Gimpelson
9a3b88328f [PERF] Speedup of MRoPE prepare inputs (#19939)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-06-23 23:01:26 -07:00
Reid
3014c920da add some examples for other benchmark scripts (#19893)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-24 05:57:46 +00:00
Kay Yan
0eed516951 [doc] Fix broken link in the installation for CPU (#19980)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-06-24 12:04:11 +08:00
Chenyaaang
ee5ad8d2c5 [Misc][Tools][Benchmark] Add profile to autotune script (#19711)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-24 00:59:41 +00:00
QiliangCui
a738dbb2a1 Update test case parameter to have the throughput above 8.0 (#19994)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-24 00:18:10 +00:00
Chenyaaang
33d5e29be9 [TPU] Fix tpu model runner test (#19995)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-23 16:04:28 -07:00
22quinn
4671ac6e2a [Bugfix][Benchmark] Fix Marlin benchmark (#19929) 2025-06-24 07:25:12 +09:00
Jun-Howie
dd2ccf8dde Feat Dynamic Quantization for MoE Layers in GPTQ Marlin Backend (#19395) 2025-06-24 07:23:28 +09:00
22quinn
a3bc76e4b5 [CI/Build] Push latest tag for cpu and neuron docker image (#19897)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-23 14:15:37 -07:00
cascade
e6327c9b3e [Feature] Support sequence parallelism for static fp8 quantization (#19181)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-06-23 16:09:02 -04:00
lkchen
d0132f025d [Misc] Add type alias ReqId and EngineId for better readability (#19880)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-23 12:57:57 -07:00
Isotr0py
61f4fc5dc6 [Bugfix][v1] Fix step pooler implementation and step pooling usage in v1 (#19956)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-23 18:38:06 +00:00
Tyler Michael Smith
68aaeb3749 [EP+DP] Optimize the little operations in the DeepGEMM + DeepEP low latency case (#19885)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-23 11:07:47 -07:00
Lukas Geiger
c3649e4fee [Docs] Fix syntax highlighting of shell commands (#19870)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-23 17:59:09 +00:00
Reid
53243e5c42 [doc] improve readability for long commands (#19920)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 14:27:07 +00:00
Jee Jee Li
a6e6604d32 [Bugfix] Fix CI bitsandbytes failure (#19969)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-23 21:30:55 +08:00
Reid
b82e0f82cb [doc] use MkDocs collapsible blocks - supplement (#19973)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 10:54:16 +00:00
Isotr0py
5111642a6f [Doc] Update V1 status for decoder-only embedding models (#19952)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-23 09:31:06 +00:00
lkchen
1bcd15edc7 [BugFix][P/D] Fix for cases where _recving_transfers can be cleaned up when *all* transfer done (#19874)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-22 22:41:53 -07:00
Nicolò Lucchesi
2ebff5b77c [P/D][NixlConnector] Support tp_size > num_kv_heads deployments (#19691)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-22 22:41:50 -07:00
Reid
f17aec0d63 [doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 05:24:23 +00:00
Vensen
493c275352 Fix(models/siglip): Add compatibility for Gemma models quantized by llm-compressor (#19643)
Signed-off-by: Vensenmu <vensenmu@gmail.com>
2025-06-23 03:40:28 +00:00
jinqinn
f39ab2d4bd [Misc] Configurable timeout for execute_model RPC calls via env var (#19544)
Signed-off-by: jinqinn <goodqinjin@163.com>
2025-06-22 20:36:26 -07:00
amit
4a0f7888a3 [Core] feat: Implement Priority Scheduling in V1 Engine (#19057)
Signed-off-by: amit <amit.man@gmail.com>
Co-authored-by: Roger Wang <Rogerw0108@gmail.com>
2025-06-22 20:18:08 -07:00
Aaron Pham
c4cf260677 [Perf][CLI] Improve overall startup time (#19941) 2025-06-22 23:11:22 +00:00
Ye (Charlotte) Qi
33d51f599e [BugFix] Add an env to disable moe chunking to work around compile incompatibility (#19642)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-22 15:17:49 -07:00
Aaron Pham
e91386cde1 [Chore] dedup logs (#19955) 2025-06-22 19:43:07 +00:00
Ye (Charlotte) Qi
2c11a29f0b [Misc] Simplify vllm bench cli subcommand implementation (#19948) 2025-06-22 12:34:48 -04:00
Roger Wang
c76a506bd6 [Misc] Update model-specific PR tagging (#19949)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-06-22 12:16:08 +00:00
Reid
ec0db6f51c [doc] use snippets for contact us (#19944)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-22 10:26:13 +00:00
22quinn
c305a2109d [CI/Build] Auto tag perf benchmarks related PRs (#19943)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-22 08:46:21 +00:00
Wang, Yi
202c5df935 [Benchmark] fix request loss if "ping" is returned (#19535)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-22 07:21:04 +00:00
Ning Xie
2bb246b8f7 [MISC] add cpu_kvcache_space_bytes to CacheConfig (#19812)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-22 13:39:09 +08:00
Ning Xie
4c409cabc2 [Misc] add vllm_config in __init__ (#19866)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-21 23:10:46 -04:00
Adrian
3b1e4c6a23 [Docs] Add GPT2ForSequenceClassification to supported models in docs (#19932)
Signed-off-by: nie3e <adrcwiek@gmail.com>
2025-06-21 20:57:19 +00:00
Woosuk Kwon
2c5302fadd [Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-06-21 20:01:07 +00:00
Reid
caa680fd2e [doc] add contact us in community (#19922)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-21 17:29:06 +00:00
汪志鹏
c3bf9bad11 [New model support]Support Tarsier2 (#19887)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-21 04:01:51 +00:00
Isotr0py
6f170f11dd [Bugfix] Fix bnb 8bit model weights loading (#19917)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-21 03:29:09 +00:00
Rabin Adhikari
8ca81bb069 Fix: Check the type of params to be a Sequence not list. (#19910)
Signed-off-by: Rabin Adhikari <rabin.adk1@gmail.com>
2025-06-20 23:03:17 +00:00
wangxiyuan
e773a9e1c2 [Misc] Clean up useless code (#19889)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-20 21:09:09 +00:00
Ning Xie
71baf85ae1 [Kernel] mark TorchSDPABackend swap_blocks NotImplementedError (#19749) 2025-06-20 18:18:11 +00:00
Li, Jiang
79f2f1c2a1 [CPU][CI] Fallback sliding window to v0 and fix CPU pooling model tests (#19901)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-20 15:30:36 +00:00
Vlad Tiberiu Mihailescu
2e3e3c86dc Export NaNs in logits to scheduler_stats if output is corrupted (#18777)
Signed-off-by: Vlad Mihailescu <vtmihailescu@gmail.com>
2025-06-20 22:47:16 +08:00
Chendi.Xue
7e8977fcd4 [custom_op][vllm-plugin] update custom_op class to use op_registry (#19164)
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
2025-06-20 07:44:56 -07:00
Adrian
f1e840e842 [Model] GPT2ForSequenceClassification model (#19663)
Signed-off-by: nie3e <adrcwiek@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-20 12:07:41 +00:00
Thomas Parnell
7771d1de88 [Fix] import regex instead of re (#19875)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-06-20 11:16:48 +00:00
Ning Xie
71d1219545 [Kernel] correct cpu worker function parameter type (#19745)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-20 10:50:13 +00:00
Reid
e384f2f108 [Misc] refactor example - openai_transcription_client (#19851)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-20 08:02:21 +00:00
Reid
089a306f19 [Misc] update cuda version (#19526)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-20 07:25:15 +00:00
kourosh hakhamaneshi
5e666f72cd [Bugfix][Ray] Set the cuda context eagerly in the ray worker (#19583) 2025-06-19 22:01:16 -07:00
qli88
e3a3e4db46 [Bugfix] Enable PP with AITER+V1 (#19822)
Signed-off-by: Qiang Li <qiang.li2@amd.com>
2025-06-20 12:43:20 +08:00
Xerxes
e41bf15cd0 [Chore]: qwen3-moe-type-hints-mistake (#19860)
Co-authored-by: xinnan.hou <hxn02029096@alibaba-inc.com>
2025-06-19 21:43:07 -07:00
Brayden Zhong
5aa4a015ce [Benchmark] Fix Value of type "SampleRequest" is not indexable (#18032)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-06-19 21:28:55 -07:00
Elaine Zhao
b6bad3d186 [CI][Neuron] Fail and exit on first error (#19622)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-20 12:27:51 +08:00
Isotr0py
ee9a1531aa [CI/Build][Bugfix] Fix deadlock on v1 engine test CI (#19872)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-20 09:51:07 +08:00
Robert Shaw
10d82f9ac5 [Benchmark][Bugfix] Fix Dataset Length Calculation (#19868)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-06-19 18:30:41 -07:00
xzbdmw
ea10dd9d9e [Frontend] early return chat format resolution when specified (#19735) 2025-06-19 18:49:59 +00:00
Alex Brooks
ead2110297 [Core][Bugfix] Fix Online MM Beam Search (#19688)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-06-19 17:18:07 +00:00
Li, Jiang
01220ce89a [CI][CPU] Improve dummy Triton interfaces and fix the CPU CI (#19838)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-19 15:46:09 +00:00
22quinn
6f68c49220 [Doc] Update V1 user guide for embedding models (#19842)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-19 09:43:27 +00:00
Alexei-V-Ivanov-AMD
4719460644 Fixing Chunked Prefill Test. (#19762)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-06-19 01:36:16 -07:00
NekoMimiUnagi
466166dcfd [Frontend] Add optional token-level progress bar to LLM.beam_search (#19301)
Signed-off-by: Ruosen Li <rxl190028@utdallas.edu>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Ubuntu <ubuntu@ip-172-31-71-179.ec2.internal>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-19 03:21:41 -04:00
Zuxin
1d0ae26c85 Add xLAM tool parser support (#17148) 2025-06-19 14:26:41 +08:00
Isotr0py
6021999573 [Minor] Allow redirecting model path for HfRunner in test (#19795)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-18 23:04:10 -07:00
Ning Xie
c7b370c603 raise exception for pin_lora (#19809)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-18 22:57:35 -07:00
zsolt-borbely-htec
aa20d10a91 [Misc] [ROCm] Prevent surplus tensor reshape (#19803)
Signed-off-by: Zsolt Borbely <zsolt.borbely@htecgroup.com>
2025-06-19 13:57:16 +08:00
TJian
2de12be428 [ROCm] [AITER] [Bugfix] Patch for AITER commit 648764942e552a8bb5fe16026703716a81f05374 (#18990)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-06-18 22:56:31 -07:00
Yu-Hang "Maxin" Tang
83ca9ae47b Mark invariant normalizer in Gemma as non-persistent (#19788)
Signed-off-by: Yu-Hang Tang <Tang.Maxin@gmail.com>
2025-06-18 22:56:03 -07:00
kourosh hakhamaneshi
e2148dc5ea [Bugfix] Add check_health to v1 async client. (#19821)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-06-18 21:47:01 -07:00
Lu Fang
b1098b4072 [Bugfix] Fix the linter (#19826)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 21:44:41 -07:00
Maximilien de Bayser
799397ee4f Support embedding models in V1 (#16188)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-18 21:36:33 -07:00
Jee Jee Li
4959915089 [Quantization] Modify the logic of BNB double quantization (#19742)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-19 03:52:09 +00:00
Lu Fang
8d1e89d946 [Misc][ROCm] Enforce no unused variable in ROCm C++ files (#19796)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 20:25:15 -07:00
Michael Goin
36239f79dd Fix FA2 fallback for Blackwell V1 (#19781)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-19 09:53:55 +08:00
afeldman-nm
dfada85eee [Frontend] Expose custom args in OpenAI APIs (#16862)
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-18 17:41:11 -07:00
Richard Zou
ed33349738 [BugFix] Fix use_cudagraph=False (#19612)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-19 08:23:12 +08:00
Woosuk Kwon
d49adea1f9 [Multimodal] Use fast processor for Qwen2/2.5-VL (#19789) 2025-06-18 15:49:40 -07:00
Russell Bryant
14fdd21d39 [Core] More fixes to MultiModalEmbeddings type handling (#19715)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 22:48:29 +00:00
QiliangCui
04fefe7c9a [TPU] Update torch-xla version to include paged attention tuned block change (#19813)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-18 22:41:13 +00:00
Lukas Geiger
3b523e38d9 [Core] Do not copy array during hashing (#19484)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-18 15:36:55 -07:00
afeldman-nm
16c16301c8 Disable "Forbid direct 'import triton'" check for vllm/triton_utils/importing.py in an extensible way (#19783)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
2025-06-18 15:08:00 -07:00
Nathan Weinberg
9206d0ff01 docs: fix Slack bulletpoint in README (#19811)
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
2025-06-18 20:47:08 +00:00
Chen Zhang
a89209b78d [v1] Support mamba2 (#19327)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-18 20:34:15 +00:00
Russell Bryant
ffacb222cb [Docs] Add Huzaifa Sidhpurwala to vuln mgmt team doc (#19808)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 20:22:28 +00:00
Chauncey
12575cfa7a [Bugfix] fix RAY_CGRAPH_get_timeout is not set successfully (#19725)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-18 10:26:16 -07:00
Zzz9990
8b6e1d639c [Hardware][AMD] integrate aiter chunked prefill into vllm (#18596)
Signed-off-by: fsx950223 <fsx950223@outlook.com>
Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: fsx950223 <fsx950223@outlook.com>
Co-authored-by: charlifu <charlifu@amd.com>
2025-06-18 08:46:51 -07:00
Lu Fang
735a9de71f [Qwen] Add tagging rule for Qwen related PRs (#19799)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 14:26:43 +00:00
wangxiyuan
257ab95439 [Platform] Allow platform use V1 Engine by default (#19792)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-18 13:03:36 +00:00
Reid
cca91a7a10 [doc] fix the incorrect label (#19787)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-18 10:30:58 +00:00
Woosuk Kwon
f04d604567 [Minor] Zero-initialize attn output buffer (#19784)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-18 06:59:27 +00:00
afeldman-nm
19a53b2783 [V1] Decouple GPU and TPU InputBatch (#19778)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
2025-06-18 06:38:13 +00:00
Zhonghua Deng
eccdc8318c [V1][P/D] An native implementation of xPyD based on P2P NCCL (#18242)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-06-18 06:32:36 +00:00
Russell Bryant
5f52a84685 [V1] Add API docs for EncoderCacheManager (#19294)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 13:37:01 +08:00
lkchen
d4629dc43f [Misc] Add __str__ for RequestStatus (#19780)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-18 03:03:01 +00:00
Ning Xie
6e9cc73f67 [MISC] correct DeviceConfig device field static type analysis (#19699)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-17 17:21:50 -07:00
Ning Xie
c53711bd63 [MISC] correct copy_blocks src_to_dists param type (#19696)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-17 17:21:06 -07:00
Chenyaaang
dac8cc49f4 [TPU] Update torch version to include paged attention kernel change (#19706)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-17 22:24:49 +00:00
Charlie Fu
a44b1c951d [Feature][ROCm] Add full graph capture support for TritonAttentionBackend (#19158)
Signed-off-by: charlifu <charlifu@amd.com>
2025-06-17 17:03:06 -04:00
Michael Goin
b447624ee3 [Bugfix] Fix faulty triton importing logic when using Ray for DP (#19734)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-17 20:59:29 +00:00
Jiayi Yao
cda92307c1 [Misc] Update lmcache connector with the latest connector apis (#19441)
Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>
2025-06-17 19:57:54 +00:00
Michael Goin
bf57ccc5c2 Remove sm120 arch from sm100 cutlass kernel arch list (#19716)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-17 11:49:39 -07:00
Wentao Ye
ffb2cd6b54 [Perf] Optimize moe_align_block_size CUDA kernel (#19572)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-06-17 11:49:26 -07:00
Isotr0py
ca94d7fa00 [Bugfix] Update multimodel models mapping to fit new checkpoint after Transformers v4.52 (#19151)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-17 15:58:38 +00:00
CYJiang
5a1c2e15d8 [Mis] remove duplicate engine status checks (#19647)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-17 08:17:38 -07:00
Nicolò Lucchesi
4c8f64faa7 [V1][Kernel] Flashinfer HND KV cache layout (#19280)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-06-17 09:09:22 -04:00
David Xia
93aee29fdb [doc] split "Other AI Accelerators" tabs (#19708) 2025-06-17 22:05:29 +09:00
Reid
154d063b9f [doc][mkdocs] Add edit button to documentation (#19637)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-17 11:10:31 +00:00
jvlunteren
ccd7c05089 [Kernel] Add Split-KV Support to Unified Triton Attention Kernel (#19152)
Signed-off-by: Jan van Lunteren <jvl@zurich.ibm.com>
2025-06-17 10:45:07 +00:00
Huy Do
c48c6c4008 Add a doc on how to update PyTorch version (#19705) 2025-06-17 18:10:37 +08:00
Isotr0py
aed8468642 [Doc] Add missing llava family multi-image examples (#19698)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-17 07:05:21 +00:00
quanliu
5c76b9cdaf [Core] add remove_seq_from_computed_blocks_tracker to BlockSpaceManager (#19686)
Signed-off-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
Co-authored-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
2025-06-17 04:40:58 +00:00
Driss Guessous
ddfed314f9 Fixes IMA for TP w/ flex-attention (#19712)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-06-17 04:01:50 +00:00
Di Liu
5b3ad5ecf2 [DOC] fix doc typos (#19600)
Signed-off-by: Di Liu <liu-di@sjtu.edu.cn>
2025-06-17 11:34:53 +08:00
nguyenhoangthuan99
ede5c4ebdf [Frontend] add chunking audio for > 30s audio (#19597)
Signed-off-by: nguyenhoangthuan99 <thuanhppro12@gmail.com>
2025-06-17 11:34:00 +08:00
Lucas Wilkinson
07334959d8 [Wheel Size] Only build FA2 8.0+PTX (#19336) 2025-06-17 12:32:49 +09:00
David Xia
119f683949 [doc] add project flag to gcloud TPU command (#19664)
Signed-off-by: David Xia <david@davidxia.com>
2025-06-17 01:00:09 +00:00
Conroy Cheers
0860087aff [Fix] Fall back to Gloo when NCCL backend is unavailable (#19641)
Signed-off-by: conroy-cheers <conroy@corncheese.org>
2025-06-17 08:42:14 +08:00
Dipika Sikka
6bc7b57315 [Quantization] Remove FP4 emulation; Fall-back to marlin for device < 100 (#19563) 2025-06-16 17:33:51 -04:00
Russell Bryant
90f9c2eb5c [V1] Change return type on get_multimodal_embeddings() (#19446)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-16 13:32:15 -04:00
qscqesze
387bdf0ab9 [Model] Add support for MiniMaxM1ForCausalLM (shares architecture with MiniMaxText01ForCausalLM) (#19677)
Signed-off-by: QscQ <qscqesze@gmail.com>
2025-06-16 09:47:14 -07:00
bnellnm
5e5baa91aa [Kernels] Use empty for modular MoE workspaces (#19667)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-16 14:58:01 +00:00
Chauncey
836d4ce140 [Bugfix] fix missing 'finish_reason': null in streaming chat (#19662)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-16 14:10:39 +00:00
Ning Xie
c3fec47bb7 [MISC] bump huggingface_hub pkg to 0.33.0 (#19547)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-16 05:22:28 -07:00
Isotr0py
1173804dca [Bugfix] Fix TP inference for Flex attention backend (#19657)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-16 11:21:37 +00:00
Shawn Tan
4d5424029b [Feature]:Allow for Granite MoE Hybrid models with _only_ shared experts. (#19652)
Signed-off-by: Shawn Tan <shawntan@ibm.com>
2025-06-16 11:14:18 +00:00
Navanit Dubey
3e7506975c [DOC] Add reasoning capability to vLLM streamlit code (#19557) 2025-06-16 07:09:12 -04:00
Nick Hill
ee35e96ac3 [BugFix] Don't catch BaseException when dumping execute_model errors (#19626)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-16 11:01:08 +00:00
Szymon Ożóg
dec66d253b [Kernel] GGUF MMVQ kernel for multiple input vectors (#18754)
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
2025-06-16 17:33:26 +08:00
Russell Bryant
8d120701fd [Docs] Move multiproc doc to v1 dir (#19651)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-16 09:10:12 +00:00
wang.yuqi
f40f763f12 [CI] Add mteb testing for rerank models (#19344) 2025-06-16 01:36:43 -07:00
Ning Xie
26bc46ef89 [MISC] typo fix (#19672)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-16 07:18:49 +00:00
Chengji Yao
a77aea59fd [TPU] support attention head dim smaller than 128 (#19620)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-06-16 06:40:53 +00:00
Ye (Charlotte) Qi
b692e9cd07 [Misc] Fix skipped max-model-len validation when deriving max model length from tokenizer config (#19660)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-16 06:30:29 +00:00
Francesco Bertolotti
367871a469 [Misc][Frontend] passthrough bad_words (#19564)
Signed-off-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
Co-authored-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
Co-authored-by: Aaron Pham <Aaronpham0103@gmail.com>
2025-06-16 05:05:13 +00:00
quanliu
92183b41f3 [Bugfix][Core] Prefix caching causes incorrect outputs due to outdated ComputedBlocksTracker (#18957)
Signed-off-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
Co-authored-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
2025-06-15 21:56:37 -07:00
Lu Fang
c6703d1e0d [MISC] Remove unused variableds in C++ (#19609)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-15 20:05:28 -07:00
Isotr0py
a5e7242d5f [Misc] Remove duplicate multiproc method setting for CPU platform (#19649)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-16 02:26:58 +00:00
Richard Zou
91b2c17a55 [CI/Build] Fix torch nightly CI dependencies part 2 (#19589) 2025-06-15 20:01:10 +08:00
Woosuk Kwon
055915e6ce Enable prefix caching with full cuda graphs (#19617)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-15 01:05:05 -07:00
Wentao Ye
3d330c4c09 [Benchmark] Refactor benchmark script for fp8 & int8 (#19627)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-15 15:15:37 +08:00
22quinn
0b73736a0d [Kernel] Raise verbose error and consolidate num_heads/num_kv_heads divisibility check (#19339)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-15 13:43:48 +08:00
Lu Fang
ee1531bc38 [Bugfix][2/n] Fix speculative decoding CI - Fix test_ngram_e2e_greedy_correctness (#19644) 2025-06-14 21:15:41 -07:00
Ilya Markov
e13945f9dd [Perf] Further tunings for SM100 FP8 CUTLASS kernel (#19566) 2025-06-14 17:25:10 -07:00
maobaolong
08500011d3 [Fix] Convert kv_transfer_config from dict to KVTransferConfig (#19262) 2025-06-14 12:32:07 -07:00
Konrad Zawora
861a0a0a39 [Bugfix] Don't attempt to use triton if no driver is active (#19561) 2025-06-14 12:30:54 -07:00
Huy Do
bc956b38d0 Only build CUTLASS MoE kernels on Hopper (#19648) 2025-06-14 11:44:15 -07:00
jiahanc
294fc1e2c9 [Hardware][NVIDIA][kernel] Fp4 MOE quant kernel optimization (#19500) 2025-06-14 09:34:28 -07:00
Isotr0py
2db9044ab6 [Bugfix] Fix auto dtype casting for BatchFeature (#19316)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-06-14 15:13:08 +00:00
Reid
6fa718a460 [Misc] Modularize CLI Argument Parsing in Benchmark Scripts (#19593)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-14 16:54:52 +08:00
Lu Fang
06be858828 [Bugfix] Fix the speculative decoding test by setting the target dtype (#19633) 2025-06-13 20:57:32 -07:00
Saheli Bhattacharjee
d1e34cc9ac [V1][Metrics] Deprecate metrics with gpu_ prefix for non GPU specific metrics. (#18354)
Signed-off-by: Saheli Bhattacharjee <saheli@krai.ai>
2025-06-14 11:07:36 +08:00
Nick Hill
bd517eb9fe [BugFix] Fix DP Coordinator incorrect debug log message (#19624)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-14 00:18:03 +00:00
Concurrensee
d65668b4e8 Adding "AMD: Multi-step Tests" to amdproduction. (#19508)
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-13 17:08:51 -07:00
Woosuk Kwon
aafbbd981f [torch.compile] Use custom ops when use_inductor=False (#19618) 2025-06-13 15:05:54 -07:00
Anna Pendleton
0f0874515a [Doc] Add troubleshooting section to k8s deployment (#19377)
Signed-off-by: Anna Pendleton <pendleton@google.com>
2025-06-13 21:47:51 +00:00
Luka Govedič
3597b06a4f [CUDA] Enable full cudagraph for FlashMLA (#18581)
Signed-off-by: luka <luka@neuralmagic.com>
2025-06-13 18:12:26 +00:00
Reid
1015296b79 [doc][mkdocs] fix the duplicate Supported features sections in GPU docs (#19606)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-13 16:25:08 +00:00
Wentao Ye
ce9dc02c93 [Refactor] Remove unused variables in moe_permute_unpermute_kernel.inl (#19573)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-13 06:12:15 -07:00
qscqesze
a24cb91600 [Model] Fix minimax model cache & lm_head precision (#19592)
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-06-13 12:08:20 +00:00
Nick Hill
7e8d97dd3f [BugFix] Honor enable_caching in connector-delayed kvcache load case (#19435)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-13 09:46:32 +00:00
youkaichao
d70bc7c029 [torch.compile] reorganize the cache directory to support compiling multiple models (#19064)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-13 15:23:25 +08:00
Boyuan Feng
ce688ad46e use base version for version comparison (#19587)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-06-13 15:09:34 +08:00
汪志鹏
cefdb9962d [Fix] The zip function in Python 3.9 does not have the strict argument (#19549)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-13 14:57:48 +08:00
汪志鹏
ace5cdaff0 [Fix] bump mistral common to support magistral (#19533)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-12 22:28:12 -07:00
Li, Jiang
6458721108 [CPU] Refine default config for the CPU backend (#19539)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-13 13:27:39 +08:00
Hyogeun Oh (오효근)
bb4a0decef [Misc] Correct broken docs link (#19553)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-06-12 22:27:13 -07:00
Reid
c707cfc12e [doc] fix incorrect link (#19586)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-13 04:26:09 +00:00
Aaron Pham
7b3c9ff91d [Doc] uses absolute links for structured outputs (#19582)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-13 03:35:17 +00:00
qizixi
c68698b326 [Bugfix] Fix EAGLE vocab embedding for multimodal target model (#19570)
Signed-off-by: qizixi <qizixi@meta.com>
2025-06-12 23:09:19 -04:00
Varun Sundar Rabindranath
e3b12667d4 [BugFix] : Fix Batched DeepGemm Experts (#19515)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-12 20:43:02 -06:00
kourosh hakhamaneshi
e6aab5de29 Revert "[Build/CI] Add tracing deps to vllm container image (#15224)" (#19378) 2025-06-12 17:26:40 -07:00
Russell Bryant
c57bb199b3 [V1] Resolve failed concurrent structured output requests (#19565)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-12 23:30:09 +00:00
Aaron Pham
dba68f9159 [Doc] Unify structured outputs examples (#18196)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-12 22:50:31 +00:00
Michael Goin
a3319f4f04 [Bugfix] Enforce contiguous input for dynamic_per_token FP8/INT8 quant (#19452)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-12 15:39:15 -04:00
Varun Sundar Rabindranath
9d880f594d [Misc] Turn MOE_DP_CHUNK_SIZE into an env var (#19506) 2025-06-12 18:01:16 +00:00
Ekagra Ranjan
017ef648e9 [Spec Decode][Benchmark] Generalize spec decode offline benchmark to more methods and datasets (#18847) 2025-06-12 10:30:56 -07:00
Reid
4b25ab14e2 [doc] Make top navigation sticky (#19540)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-12 15:48:11 +00:00
Luka Govedič
f98548b9da [torch.compile][ROCm] Fuse quantization onto attention using a torch.compile pass (#16756)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
2025-06-12 08:31:04 -07:00
mobicham
96846bb360 Fix TorchAOConfig skip layers (#19265)
Signed-off-by: mobicham <hicham@mobiuslabs.com>
2025-06-12 22:22:53 +08:00
Wentao Ye
b6efafd9e4 [Perf] Vectorize static / dynamic INT8 quant kernels (#19233)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-12 06:51:41 -07:00
Nicolò Lucchesi
1129e2b1ab [V1][NixlConnector] Drop num_blocks check (#19532)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-06-12 12:36:14 +00:00
Cyrus Leung
c742438f8b [Doc] Add V1 column to supported models list (#19523)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-12 19:16:44 +08:00
Jee Jee Li
73e2e0118f [Quantization] Improve AWQ logic (#19431)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-12 11:02:11 +00:00
jmswen
c9280e6346 [Bugfix] Respect num-gpu-blocks-override in v1 (#19503)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-12 11:00:23 +00:00
Michael Goin
af09b3f0a0 [Bugfix][V1] Allow manual FlashAttention for Blackwell (#19492)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-12 10:40:24 +00:00
Russell Bryant
4f6c42fa0a [Security] Prevent new imports of (cloud)pickle (#18018)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Aaron Pham <Aaronpham0103@gmail.com>
2025-06-12 10:30:17 +00:00
niu_he
dff680001d Fix typo (#19525)
Signed-off-by: 2niuhe <carlton2tang@gmail.com>
2025-06-12 09:24:45 +00:00
rasmith
2e090bd5df [AMD][Kernel][BugFix] fix test_rocm_compressed_tensors_w8a8 for rocm (#19509)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-06-12 07:14:24 +00:00
wonjun Jang
1b0b065eb5 [BugFix] Handle missing sep_token for Qwen3-Reranker in Score API (#19522)
Signed-off-by: strutive07 <strutive07@gmail.com>
2025-06-12 07:00:47 +00:00
Nick Hill
d5bdf899e4 [BugFix] Work-around incremental detokenization edge case error (#19449)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-12 06:43:20 +00:00
22quinn
7e3e74c97c [Frontend] Improve error message in tool_choice validation (#19239)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-12 01:13:00 -04:00
Brayden Zhong
3f6341bf7f Add Triton Fused MoE kernel config for E=16 on B200 (#19518)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-06-12 04:31:51 +00:00
Varun Sundar Rabindranath
e5d35d62f5 [BugFix] Force registration of w8a8_block_fp8_matmul_deepgemm via lazy import (#19514)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-12 04:28:12 +00:00
Ning Xie
2f1c19b245 [CI] change spell checker from codespell to typos (#18711)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-11 19:57:10 -07:00
Richard Zou
42f52cc95b [CI/Build] Fix torch nightly CI dependencies (#19505)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-11 14:40:42 -07:00
Robert Shaw
97a9465bbc [UX] Add Feedback During CUDAGraph Capture (#19501)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-06-11 21:09:05 +00:00
rasmith
c7ea0b56cd [AMD] [Quantization] Add override flag for attention dtype instead of using kv_cache_dtype trigger (#17331)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-06-11 15:53:28 -04:00
bnellnm
29fa5cac1c [Kernels] Add activation chunking logic to FusedMoEModularKernel (#19168)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-11 12:53:10 -04:00
Woosuk Kwon
b2d9be6f7d [Docs] Remove WIP features in V1 guide (#19498)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-11 09:15:03 -07:00
Jee Jee Li
04a55612dd [Misc] Fix misleading ROCm warning (#19486)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-12 00:12:10 +08:00
David Xia
89b0f84e17 [doc] fix "Other AI accelerators" getting started page (#19457)
Signed-off-by: David Xia <david@davidxia.com>
2025-06-11 16:11:17 +00:00
Michael Goin
497a91e9f7 [CI] Update FlashInfer to 0.2.6.post1 (#19297)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 22:57:28 +08:00
runzhen
943ffa5703 [Bugfix] Update the example code, make it work with the latest lmcache (#19453)
Signed-off-by: Runzhen Wang <wangrunzhen@gmail.com>
2025-06-11 12:42:20 +00:00
Louie Tsai
5c8d34a42c Support no privileged mode on CPU for docker and kubernetes deployments (#19241)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-06-11 04:11:47 -07:00
Ximingwang-09
3c8694eabe Fix some typo (#19475)
Signed-off-by: ximing.wxm <ximing.wxm@antgroup.com>
Co-authored-by: ximing.wxm <ximing.wxm@antgroup.com>
2025-06-11 10:36:04 +00:00
Michael Goin
7484e1fce2 Add cache to cuda get_device_capability (#19436)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 17:37:05 +08:00
Cyrus Leung
a2142f0196 Support non-string values in JSON keys from CLI (#19471)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 09:34:04 +00:00
Lu Fang
871d6b7c74 [Misc] Reduce warning message introduced in env_override (#19476)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 17:29:54 +08:00
Cyrus Leung
29a38f0352 [Doc] Support "important" and "announcement" admonitions (#19479)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 01:39:58 -07:00
Cyrus Leung
a5115f4ff5 [Doc] Fix quantization link titles (#19478)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 01:27:22 -07:00
Cyrus Leung
68b4a26149 [Doc] Update V1 User Guide for Hardware and Models (#19474)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 00:49:06 -07:00
artetaout
b8e809a057 [Kernel] Support deep_gemm for linear methods (#19085)
Signed-off-by: artetaout <lulala341@gmail.com>
2025-06-11 15:14:45 +08:00
Lu Fang
5039ec2336 [ROCm] Add rules to automatically label ROCm related PRs (#19405)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 15:09:18 +08:00
leopardracer
7c644ab6d5 Fix Typo in Documentation and Function Name (#19442) 2025-06-10 22:44:11 -07:00
Junhao Li
2d40665fe8 Add fused MOE config for Qwen3 30B A3B on B200 (#19455)
Signed-off-by: Junhao Li <junhao@ubicloud.com>
2025-06-11 13:43:46 +08:00
Lukas Geiger
96ada386b7 [Misc] Remove unused MultiModalHasher.hash_prompt_mm_data (#19422)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-11 05:18:57 +00:00
Michael Goin
1e473b3010 [CI] Disable failing GGUF model test (#19454)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 05:12:38 +00:00
Lu Fang
2b1e2111b0 Fix test_max_model_len in tests/entrypoints/llm/test_generate.py (#19451)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 12:54:59 +08:00
niu_he
a45b979d9f [BugFix] Fix docker build cpu-dev image error (#19394)
Signed-off-by: niu_he <carlton2tang@gmail.com>
2025-06-10 20:56:40 -07:00
wang.yuqi
3952731e8f [New Model]: Support Qwen3 Embedding & Reranker (#19260) 2025-06-10 20:07:30 -07:00
Richard Zou
77f0d465d0 [BugFix] Allow use_cudagraph to work with dynamic VLLM_USE_V1 (#19390)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-11 07:54:41 +08:00
Xu Wenqing
22c3c0aa4a Add H20-3e fused MoE kernel tuning configs for Qwen3-235B-A22B-FP8 (#19401)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-06-11 07:23:57 +08:00
py-andy-c
33f8dba7c6 [Model] use AutoWeightsLoader for commandr (#19399)
Signed-off-by: py-andy-c <pychen1017@gmail.com>
2025-06-10 22:42:21 +00:00
Gregory Shtrasberg
5241ca50d6 [ROCm][V1] Adding ROCm to the list of plaforms using V1 by default (#19440)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-10 22:06:15 +00:00
Russell Bryant
da9b523ce1 [Docs] Note that alternative structured output backends are supported (#19426)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-10 16:20:00 +00:00
Jee Jee Li
b6553be1bc [Misc] Slight improvement of the BNB (#19418)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-10 13:51:49 +00:00
youkaichao
64a9af5afa Simplify ep kernels installation (#19412)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-10 20:06:08 +08:00
Li, Jiang
e4248849ec [BugFix][CPU] Fix CPU CI by ignore collecting test_pixtral (#19411)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-06-10 12:02:40 +00:00
Rachel Guo
467bef18a3 [BugFix][FlashInfer] Fix attention backend interface mismatch with unexpected keyword use_irope (#19134)
Signed-off-by: Yunqiu Guo <guorachel@meta.com>
2025-06-10 16:48:51 +08:00
Isotr0py
5f1ac1e1d1 Revert "[v1] Add fp32 support to v1 engine through flex attn" (#19404) 2025-06-10 01:30:20 -07:00
Louie Tsai
9368cc90b2 Automatically bind CPU OMP Threads of a rank to CPU ids of a NUMA node. (#17930)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-06-10 06:22:05 +00:00
Anna Pendleton
32b3946bb4 Add clear documentation around the impact of debugging flag (#19369)
Signed-off-by: Anna Pendleton <pendleton@google.com>
2025-06-10 06:16:09 +00:00
Reid
6b1391ca7e [Misc] refactor neuron_multimodal and profiling (#19397)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-10 06:12:42 +00:00
Russell Bryant
a3f66e75d1 Add security warning to bug report template (#19365)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-06-10 06:06:36 +00:00
Lukas Geiger
319cb1e351 [Core] Batch multi modal input using pinned memory (#19169)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-10 13:44:59 +08:00
Li Wang
1efef71645 [Bugfix] Fix modelscope token passed in (#19389)
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-10 13:39:37 +08:00
Nick Hill
646d62f636 [Core] Use tuple for kv cache group block ids (#19175)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-10 07:01:17 +02:00
Reid
6cd4ae8acd [Frontend] Add tqdm_leave_pbar to control progress bar visibility (#19357)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-10 04:55:09 +00:00
Harry Mellor
c016047ed7 Fix docs/mkdocs/hooks/remove_announcement.py (#19382) 2025-06-09 21:36:54 -07:00
XiongfeiWei
9af6d22e4c Use xla flag to improve the quantized model performance (#19303)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-06-10 01:28:45 +00:00
Tianyu Guo
4589b94032 [Bugfix] Fix benchmark_moe.py (#19016)
Signed-off-by: Tianyu Guo <guoty9@mail2.sysu.edu.cn>
2025-06-09 18:04:36 -07:00
Ye (Charlotte) Qi
cc867be19c [V1] Reuse V0's memory_profiling util for gpu worker memory profiling (#19312)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-10 08:40:01 +08:00
Siyuan Liu
3a7cd627a8 [Misc] Fix a config typo in disable_hybrid_kv_cache_manager configuration (#19383)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-09 16:41:51 -07:00
Pavani Majety
8058c91108 [HOT-FIX] Add kv_sharing_target_layer_name argument to cutlass_mla backend (#19374)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-06-09 19:00:07 -04:00
Siyuan Liu
7d44c469fe [TPU]Fix KV cache sharing tests (#19371) 2025-06-09 18:38:15 -04:00
liusiqian-tal
31f58be96a [Frontend] Make TIMEOUT_KEEP_ALIVE configurable through env var (#18472)
Signed-off-by: liusiqian <liusiqian@tal.com>
2025-06-09 21:41:21 +00:00
Kyle Sayers
ebb2f383b8 [Quantization] Bump compressed-tensors version (#19295)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-06-09 14:33:15 -07:00
22quinn
c1c7dbbeeb [Bugfix][Core] Prevent token lengths exceeding max_model_len in V0 (#19348)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-09 23:01:29 +08:00
Varun Sundar Rabindranath
5cf2daea9a [Misc] Fixes and Optimizations for DeepEP + DeepGEMM combination. (#19298)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-09 10:50:39 -04:00
Isotr0py
b8089195b4 [v1] Add fp32 support to v1 engine through flex attn (#19319)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-06-09 22:10:44 +08:00
Yinghai Lu
770e5dcdb8 [full_graph] Fix query_start_loc padding (#19321)
Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>
2025-06-09 21:32:56 +08:00
Michael Yao
c57c9415b1 [Docs] Fix a bullet list in usage/security.md (#19358)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-06-09 13:28:51 +00:00
Lu Fang
01810f9236 [CI] Introduce rules for llama auto-label (#19323)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-09 20:05:42 +08:00
Conroy Cheers
59abbd84f9 [Fix] Allow kernel compilation for CUDA capability 8.7 (#19328)
Signed-off-by: Conroy Cheers <conroy@corncheese.org>
2025-06-09 02:57:23 -07:00
Jee Jee Li
95a6568b5c [CI/Build] Fix LoRA test (#19350)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-09 09:52:10 +00:00
Se7en
0eca5eacd0 [Doc] Fix description in the Automatic Prefix Caching design doc (#19333)
Signed-off-by: cr7258 <chengzw258@163.com>
2025-06-09 17:30:02 +08:00
Reid
12e5829221 [doc] improve ci doc (#19307)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-09 07:26:12 +00:00
Richard Zou
3a4d417707 [Misc] Cleanup compilation tests (#19343)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-09 15:05:44 +08:00
Kseniya Parkhamchuk
8335667c22 [Frontend] Remove unreachable code from llm.py (#19288)
Signed-off-by: KsuParkhamchuk <k.parkhamchuk@gmail.com>
2025-06-09 10:22:10 +08:00
Isotr0py
e1c4380d4c [Misc] Add documentation update reminder to PR template (#19289)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-09 10:20:53 +08:00
Cyrus Leung
e31ae3de36 [Deprecation] Remove inputs arg fallback in Engine classes (#18799)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-09 10:19:56 +08:00
wang.yuqi
2ffb9b6e07 [Bugfix] model_max_length should consider max_model_len in tokenizer_config (#19201) 2025-06-08 07:17:53 -07:00
jennyyyyzhen
cda10fa3e2 [Multi Modal] Add an env var for message queue max chunk bytes (#19242)
Signed-off-by: yZhen <yZhen@fb.com>
Co-authored-by: yZhen <yZhen@fb.com>
2025-06-08 21:39:12 +08:00
Dipika Sikka
c123bc33f9 [Quantization] Add compressed-tensors NVFP4 support (#18312) 2025-06-08 09:05:55 -04:00
Akash kaothalkar
b9a1791e2c [Hardware][POWER] Add IBM POWER11 Support to CPU Extension Detection (#19082)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-06-08 09:17:14 +00:00
Xu Wenqing
989dcee981 Add H20-3e fused MoE kernel tuning configs for Qwen3-235B-A22B (#19315)
Signed-off-by: Xu Wenqing <xuwq1993@qq.com>
2025-06-08 16:07:02 +08:00
Richard Zou
3d64d366e0 [Misc] Change tests/compile to use VLLM_V1 by default (#19302)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-08 16:06:48 +08:00
Richard Zou
eaa2e51088 [Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-08 08:56:12 +08:00
Chauncey
d77f7fb871 [Bugfix]: Fix TypeError: 'float' object cannot be interpreted as an integer (#19283)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-08 08:16:31 +08:00
Luka Govedič
2d8476e465 [BugFix][V1] Fix memory profiling bug (#18974)
Signed-off-by: luka <luka@neuralmagic.com>
2025-06-07 10:34:51 -07:00
pramenku
88be823d57 [AMD] Update compatible packaging version (#19309)
Signed-off-by: pramkuma <Pramendra.Kumar@amd.com>
2025-06-07 20:55:09 +08:00
Lifans
4e4f63ad45 [Nit][Benchmark]Fix example in benchmark_serving_structured_output.py (#19311)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-06-07 18:25:38 +08:00
Isotr0py
d2f0e7e615 [CI/Build] Improve Llama GGUF test robustness (#19287)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-07 17:23:28 +08:00
Reid
122cdca5f6 [Misc] refactor context extension (#19246)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-07 05:13:21 +00:00
Driss Guessous
cf02f9b283 Add FlexAttention to V1 (#16078)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-06-06 21:58:55 -07:00
Aaruni Aggarwal
c4296b1a27 [CI][PowerPC] Use a more appropriate way to select testcase in tests/models/language/pooling/test_embedding.py (#19253)
Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
2025-06-07 11:52:52 +08:00
QiliangCui
66c508b137 [TPU][Test] Add script to run benchmark on TPU for buildkite (#19039)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-06 20:10:24 -07:00
ElizaWszola
84166fee97 [Kernel] Integrate CUTLASS MoE kernel with PPLX (#18762)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-06 18:26:11 -07:00
Lu Fang
6e0cd10f72 [Easy][Test] Simplify test_function_tool_use with multiple parametrizes (#19269)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-07 09:19:09 +08:00
Alexei-V-Ivanov-AMD
e010688f50 [Build][ROCm] Update Dockerfile.rocm (#19296)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-06-06 19:35:16 -04:00
Chenyaaang
441b65d8c7 [Misc][Tools][Benchmark] Fix and improve auto tune script (#19163)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-06 23:31:19 +00:00
Nick Hill
46ecc57973 [BugFix] Fix tpu_model_runner block_id concatenation (#19228)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:28:17 -07:00
Nicolò Lucchesi
b6a3a9f76d [Core] Fix abrupt request abort (#18485)
Signed-off-by: nicklucche <nlucches@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>

Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:27:59 -07:00
Adolfo Victoria
ca27f0f9c1 [Bugfix][Core] Update cancellation logic in generate() to handle Generator exits (#19225)
Co-authored-by: Adolfo Victoria <adovi@meta.com>
2025-06-06 20:17:54 +00:00
Nick Hill
aad30bd306 [BugFix] Fix MultiConnector test after HMA changes (#19291)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 20:16:24 +00:00
Nishidha
94ecee6282 Fixed ppc build when it runs on non-RHEL based linux distros (#18422)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
2025-06-06 11:54:26 -07:00
Yu Guo
8267f9916f improve logits bias (#19041) 2025-06-06 19:59:25 +08:00
jmswen
7353492a47 [Core] Raise when non-multi-instance DP clients target a DP rank (#19227)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-06 19:03:01 +08:00
Jee Jee Li
7661e92ef8 [Model] Optimize nemotron_h implementation (#19249)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-06 10:05:14 +00:00
Siqi Yan
f168b85725 Unit Test for run_dp_sharded_vision_model (#19103)
Signed-off-by: Siqi Yan <siqi@meta.com>
Co-authored-by: Siqi Yan <siqi@meta.com>
2025-06-06 16:24:02 +08:00
Richard Zou
da511d54d8 Fix CompilationConfig repr (#19091)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-06 16:23:35 +08:00
Nick Hill
65c69444b1 [Docs] Improve V1 KVConnector interface documentation (#19172)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:22:45 +08:00
Dipika Sikka
94870359cd [Quantization] Bump compressed-tensors version; update NVFP4A16 test model (#19224)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2025-06-06 01:21:54 -07:00
Chengji Yao
0d49483ea9 [TPU] fix kv cache dtype in model runner (#19244)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-06 16:20:16 +08:00
Jinghui Zhang
90b78ec5f9 [v1][P/D] Fix a edge case in kv cache schedule (#19182)
Co-authored-by: jinghui <jinghui@fb.com>
2025-06-05 23:32:55 -07:00
Aaron Pham
91a2ef98ea [Chore] update CODEOWNERS (#19247)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-06 06:09:43 +00:00
Xu Song
3da2313d78 Support allowed_token_ids in ChatCompletionRequest (#19143)
Signed-off-by: Xu Song <xusong.vip@gmail.com>
2025-06-06 05:06:48 +00:00
Chengji Yao
b61dc5f972 [TPU] update torch_xla pin (#19231)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-06 04:27:38 +00:00
Chen Zhang
f8a1a2d108 [v1] Hybrid Memory Allocator (#17996)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-05 20:47:09 -07:00
Benjamin Chislett
3465b87ef8 [Bugfix] Fix EAGLE vocab embedding construction for Llama 70B (#19033)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-06-05 19:10:08 -07:00
Jerry Zhang
c8134bea15 Fix AOPerModuleConfig name changes (#18869)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-06-05 18:51:32 -07:00
Luis Vega
cb6d572e85 [Model] NemotronH support (#18863)
Signed-off-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
Co-authored-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
2025-06-05 21:29:28 +00:00
Michael Goin
87360308b7 [V1] Use FlashInfer by default on Blackwell GPUs (#19118) 2025-06-05 15:40:39 -04:00
Dipika Sikka
aa49f14832 [Quantization] Skip Fp4 Test for compressed-tensors (#19217) 2025-06-05 18:21:53 +00:00
Nicolò Lucchesi
9ef9173cfa [P/D][NixlConnector] Enable FlashInfer backend (#19090) 2025-06-05 17:10:15 +00:00
Povilas Kanapickas
85e2b7bb13 [MISC][Bugfix] Use less CPU when message queue has been empty for some time (#16226)
Signed-off-by: Povilas Kanapickas <povilas@radix.lt>
2025-06-05 16:53:08 +00:00
Chiyue Wei
61059bee40 [Hardware][NVIDIA] FP4 MoE kernel optimization (#19110)
Signed-off-by: Chiyue Wei <chiyuew@nvidia.com>
Co-authored-by: Chiyue Wei <chiyuew@nvidia.com>
2025-06-05 09:48:26 -07:00
Xu Wenqing
ec89524f50 Add H20-3e fused MoE kernel tuning configs for DeepSeek-R1/V3 (#19205) 2025-06-05 16:38:54 +00:00
Patrick von Platen
f20f9f063b [mistral_common] Add v11 tokenizer (#19193)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-06-05 08:27:41 -07:00
Guillaume Calmettes
9bc8bb07cf [Bugfix] properly catch PIL-related errors for vision models when incorrect data urls are provided (#19202)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-06-05 12:59:28 +00:00
Reid
1aeb925f34 [Frontend] improve vllm run-batch --help display (#19187)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-05 11:16:25 +00:00
22quinn
188a4590d8 [Misc] Do not override NCCL_CUMEM_ENABLE if set explicitly (#19105)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-05 11:14:32 +00:00
vllmellm
18093084be [Misc] Remove unnecessary fallback to prefill-decode attention (#19138)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-06-05 16:08:26 +08:00
Simon Mo
da40380214 [Build] Annotate wheel and container path for release workflow (#19162)
Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-04 23:24:56 -07:00
Chauncey
8fc57501d3 [Bugfix]: Fix the incompatibility issue with stream when Thinking is disabled (#19135)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-05 06:24:24 +00:00
Woosuk Kwon
af7fc84fd2 [BugFix][Minor] Fix full cuda graph bug when max_num_seqs < 512 (#19171)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-05 13:41:25 +08:00
Huy Do
0678b52251 Handle non-serializable objects when dumping benchmark results (#19114) 2025-06-04 22:40:04 -07:00
Yang Wang
25b918eee6 [Torch Nightly]add missing dependency (#18770)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-06-04 21:56:12 -07:00
Michael Goin
a408820f2f [Bugfix] Fix port handling in make_zmq_path (#19117) 2025-06-04 21:00:59 -06:00
Robert Shaw
c56ed8bb0e [Bugfix][Nixl] Fix full prefix cache hit bug (#18632)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-05 02:07:32 +00:00
Reid
78dcf56cb3 [doc] small fix (#19167)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-05 09:13:50 +08:00
Nicolò Lucchesi
b2fac67130 [P/D] Heterogeneous TP (#18833)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-06-04 23:25:34 +00:00
CYJiang
23027e2daf [Misc] refactor: simplify EngineCoreClient.make_async_mp_client in AsyncLLM (#18817)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-04 15:37:25 -07:00
Varun Sundar Rabindranath
c3fd4d669a [Kernel] Integrate batched/masked deepgemm kernel (#19111)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-04 21:59:18 +00:00
Kebe
ef3f98b59f [Bugfix] fix v1 cpu worker fails on macOS (#19121) 2025-06-04 20:17:38 +00:00
Siyuan Liu
7ee2590478 [TPU] Update dynamo dump file name in compilation test (#19108)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-04 16:13:43 -04:00
Michael Goin
53a5a0ce30 [Perf] Tunings for SM100 FP8 CUTLASS kernel (#18778)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-04 10:46:28 -07:00
Tyler Michael Smith
d459fae0a2 [Bugfix][EP+DP] Fix internode check (#19112)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
2025-06-04 23:39:23 +08:00
jmswen
c8dcc15921 Allow AsyncLLMEngine.generate to target a specific DP rank (#19102)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-04 08:26:47 -07:00
Cyrus Leung
8f4ffbd373 [Doc] Update V1 Guide for embedding models (#19141)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-04 22:57:55 +08:00
Lain
5f2cd251d2 Sm100 blockwise fp8 swap ab (#18564) 2025-06-04 07:48:45 -07:00
Xu Wenqing
02658c2dfe Add DeepSeek-R1-0528 function call chat template (#18874)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-06-04 13:24:18 +00:00
Cyrus Leung
01dc9a76db [CI/Build][Bugfix] Ensure compatibility with transformers 4.52 (#18678)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-04 04:49:20 -07:00
wang.yuqi
35cf32df30 Improve the output precision of embedding models (#19092) 2025-06-04 11:48:57 +00:00
Isotr0py
8711bc5e68 [Misc] Add packages for benchmark as extra dependency (#19089)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-04 04:18:48 -07:00
Seiji Eicher
2669a0d7b5 Fix ValueError: Missing value for tag key(s): model_name,engine. (#19113)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-06-04 17:10:45 +08:00
Siyuan Liu
8e972d9c44 [TPU] Skip hanging tests (#19115)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-04 01:43:00 -07:00
汪志鹏
3336c8cfbe Fix #19130 (#19132)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-04 01:42:06 -07:00
Woosuk Kwon
b124e1085b [Bugfix] Fix FA3 full cuda graph correctness (#19106)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-03 23:10:15 -07:00
Kaixi Hou
41aa578428 [NVIDIA] Add Cutlass MLA backend (#17625) 2025-06-03 21:40:26 -07:00
Calvin Chen
8d646c2e53 [Cleanup][v1]:remote guided-decoding-backend for example (#19059)
Signed-off-by: calvin chen <120380290@qq.com>
2025-06-04 04:23:26 +00:00
Vadim Gimpelson
5d6d1adf15 [KERNEL] Sampler. CUDA kernel for applying repetition penalty (#18437) 2025-06-03 21:13:01 -07:00
Lukas Geiger
1409ef9134 [Core] Cast multimodal input in hf processor (#18862)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-03 20:24:56 -07:00
Li, Jiang
4555143ea7 [CPU] V1 support for the CPU backend (#16441) 2025-06-03 18:43:01 -07:00
Russell Bryant
52dceb172d [Docs] Add developer doc about CI failures (#18782)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-04 01:09:13 +00:00
Jiaxin Shan
abd7df2fca [Misc] Fix path and python alias errors in disagg_prefill exmaples (#18919) 2025-06-03 17:15:18 -07:00
Yan Ru Pei
b712be98c7 feat: add data parallel rank to KVEventBatch (#18925) 2025-06-03 17:14:20 -07:00
Chen Zhang
a8da78eac9 [Bugfix] Max concurrency estimation and check_enough_kv_cache_memory for models with sliding window layers (#19029)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-04 00:14:06 +00:00
Nicolò Lucchesi
5d96533e22 [Bugfix][P/D] Fix Prefix Cache Bug (#18411)
Signed-off-by: nicklucche <nlucches@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-06-03 23:53:16 +00:00
Chauncey
4de790fcad [Bugfix]: Fix the incompatibility issue with tool_choice 'required' when Thinking is enabled (#19075)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-03 23:27:24 +00:00
Chen Zhang
b5fd9506c1 [Bugfix] get_num_blocks_to_allocate with null_block (#19031)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 15:30:55 -07:00
Ekagra Ranjan
135cf55cd1 [V1][Spec Decode][Ngram] 1.35x gain -> 1.95x gain on InstructCoder with prompt fix (#18971) 2025-06-03 15:26:33 -07:00
Chen Zhang
6cac54f4d1 [v1] Re-init input batch for multiple kv cache groups (#18654)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 21:41:36 +00:00
Harry Mellor
6865fe0074 Fix interaction between Optional and Annotated in CLI typing (#19093)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Yikun Jiang <yikun@apache.org>
2025-06-03 21:07:19 +00:00
Michael Goin
e31446b6c8 [Perf] Tune scaled_fp8_quant by increasing vectorization (#18844)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-03 13:48:25 -07:00
Yong Hoon Shin
bdf13965ab [V1] Support cross-layer KV sharing (#18212)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-06-03 20:33:07 +00:00
Varun Sundar Rabindranath
fa98d77773 [Kernel] DeepEP dispatch-combine kernel integration (#18434)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-03 12:30:02 -07:00
Reid
01eee40536 [doc] update docker version (#19074)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-03 19:08:21 +00:00
SorenDreano
19bdaf32b1 [Doc] Readme standardization (#18695)
Co-authored-by: Soren Dreano <soren@numind.ai>
2025-06-03 11:50:55 -07:00
Simon Mo
02f0c7b220 [Misc] Add SPDX-FileCopyrightText (#19100)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-06-03 11:20:17 -07:00
CYJiang
d054da1992 [Misc] fix: add miss best_of param validation (#18555)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-03 11:02:07 -07:00
Nicolò Lucchesi
4b7817c119 [Misc] Add missing _Backend enums (#19081)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-06-03 16:15:16 +00:00
Lu Fang
d00dd65cd4 [Doc] Improve the Pull Request template with key components (#19086)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-03 23:44:34 +08:00
Raushan Turganbay
d81edded69 [Bugfix] disable processor cache (#19068)
Signed-off-by: raushan <raushan@huggingface.co>
2025-06-03 15:06:04 +00:00
Harry Mellor
476844d44c Fix underscores in dict keys passed via CLI (#19030)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-06-03 14:39:24 +00:00
Jee Jee Li
4e68ae5e59 [CI/Build] Remove V0 LoRA test (#19066)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-03 14:30:18 +00:00
youkaichao
4e88723f32 [doc] clarify windows support (#19088)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-03 21:42:17 +08:00
Cyrus Leung
118ff92111 [Doc] Update V1 user guide for embedding and enc-dec models (#19060)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-03 02:29:41 -07:00
Isotr0py
ec2dcd80bc [Misc] Update WeightsMapper for qwen2-vl/qwen2.5-vl (#19054)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-03 09:08:20 +00:00
Jee Jee Li
42243fbda0 [Doc] Add InternVL LoRA support (#19055)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-03 09:08:03 +00:00
Michael Goin
6d18ed2a2e Update docker docs with ARM CUDA cross-compile (#19037)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-06-03 08:21:53 +00:00
Chen Zhang
f32fcd9444 [v1][KVCacheManager] Rename BlockHashType to BlockHash (#19015)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 08:01:48 +00:00
Lu Fang
d32aa2e670 [Bugfix] Use cmake 3.26.1 instead of 3.26 to avoid build failure (#19019)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-03 00:16:17 -07:00
Michael Goin
cc977286e7 Reduce logs in CLI scripts and plugin loader (#18970)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-03 06:00:45 +00:00
Reid
17430e3653 [bugfix] small fix logic issue (#18999)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-03 05:35:12 +00:00
汪志鹏
1282bd812e Add tarsier model support (#18985)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-03 13:13:13 +08:00
Rui Qiao
bdce64f236 [V1] Support DP with Ray (#18779) 2025-06-02 21:15:13 -07:00
Gregory Shtrasberg
9e6f61e8c3 [ROCm][Build] Clean up the ROCm build (#19040)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-02 20:47:47 -07:00
Li, Jiang
8655f47f37 [CPU][CI] Re-enable the CPU CI tests (#19046)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-06-02 20:46:47 -07:00
Concurrensee
4ce42f9204 Adding "LoRA Test %N" to AMD production tests (#18929)
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
2025-06-02 20:46:44 -07:00
Tyler Michael Smith
8a57872b2a [Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (#19034)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-03 11:36:51 +08:00
Hyogeun Oh (오효근)
5bc1ad6cee [Doc] Remove duplicate TOCs during MkDocs migration (#19021)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-06-02 19:49:48 -07:00
Siyuan Liu
9112b443a0 [Hardware][TPU] Initial support of model parallelism with single worker using SPMD (#18011)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Hossein Sarshar <hossein.sarshar@gmail.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-06-03 00:06:20 +00:00
Calvin Chen
c57d577e8d add an absolute path for run.sh (#18258)
Signed-off-by: calvin chen <120380290@qq.com>
2025-06-02 19:38:23 +00:00
Gregory Shtrasberg
ca2f6b9c30 [Bugfix][Model] Attempt to fix eagle in V0. (#18978)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-02 08:15:53 -07:00
Frαnçois
20133cfee2 [Frontend] enable custom logging for the uvicorn server (OpenAI API server) (#18403)
Signed-off-by: François Paupier <francois.paupier@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-02 15:04:23 +00:00
jennyyyyzhen
ebb1ec9318 [Model] enable data parallel for Llama4 vision encoder (#18368)
Signed-off-by: yzhen <yzhen@devgpu093.cco2.facebook.com>
Co-authored-by: yZhen <yZhen@fb.com>
Co-authored-by: yzhen <yzhen@devgpu093.cco2.facebook.com>
2025-06-02 19:22:54 +08:00
Reid
5b168b6d7a [doc] add pytest tips (#19010)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-02 11:07:26 +00:00
22quinn
9760fd8f6a [Core] Support inplace model weights loading (#18745)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-02 17:38:50 +08:00
Robert Shaw
b9f61e1387 [Bugfix][Nixl] Fix DP Metadata Handshake (#19008)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-06-02 03:30:41 +00:00
zhrrr
d6fd3a33b8 [Misc] reuse num_tokens_across_dp of get_dp_padding to avoid unnecessary dp all reduce in set_forward_context (#18935)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
2025-06-01 19:41:18 +00:00
Reid
432ec9926e [doc] wrong output (#19000)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-01 11:26:14 +00:00
Nick Hill
2b102d51ad [BugFix] Fix incorrect metrics shutdown error log message (#18992)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-01 11:42:23 +08:00
rongfu.leng
aa54a7bf7b [BugFix] fix data parallel construct ipv6 url addres (#18991)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-06-01 11:42:10 +08:00
Michael Goin
2ad6194a02 Let max_num_batched_tokens use human_readable_int for large numbers (#18968)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-01 11:41:29 +08:00
Reid
c594cbf565 [doc] small fix - mkdocs (#18996)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 20:23:43 -07:00
Isotr0py
a35ca765a5 [LoRA] Support dynamically initialize packed_modules_mapping for VLM with arbitrary components (#18987)
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-01 11:06:57 +08:00
Cyrus Leung
6aa8f9a4e7 [Core] Rework dtype resolution (#18751)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-01 11:04:23 +08:00
Benjamin Chislett
1bc86a3da1 [Bugfix] Fix EAGLE3 broken logits (#18909)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-05-31 19:58:07 -07:00
Ekagra Ranjan
bbfa0c61d1 [Misc][Benchmark] Add support for CustomDataset (#18511) 2025-05-31 19:07:38 +00:00
Reid
20079c6e36 [Misc] add return token strs for tokenize (#18941)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 18:00:11 +00:00
Nick Hill
9a1b9b99d7 [BugFix] Fix multi-node offline data-parallel (#18981)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-31 08:34:52 -07:00
ptarasiewiczNV
8bf507d766 [P/D] NixlConnector use cache device index for memory registration (#18969)
Signed-off-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
2025-05-31 11:19:18 -04:00
Charlie Fu
306d60401d [ROCm][Kernel] Add gfx950 support for skinny gemms (#18010)
Signed-off-by: charlifu <charlifu@amd.com>
2025-05-31 07:40:05 -07:00
Fred Reiss
f2c3f66d59 [Bugfix] Fix for issue 17396 (#18773)
Signed-off-by: Fred Reiss <frreiss@us.ibm.com>
2025-05-31 11:58:17 +00:00
vllmellm
0f5e0d567e [FEAT][ROCm] Add AITER grouped topk for DeepSeekV2 (#18825)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-05-31 03:39:31 -07:00
Luka Govedič
c55d804672 [BugFix] Pydantic part 2 (#18911)
Signed-off-by: luka <luka@neuralmagic.com>
2025-05-31 03:39:28 -07:00
Reid
749f5bdd38 [doc] fix the list rendering issue - security.md (#18982)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 10:39:21 +00:00
Satyajith Chilappagari
2a50ef5760 [Neuron] Add Multi-Modal model support for Neuron (#18921)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
Co-authored-by: Ashraf Mahgoub <ashymahg@amazon.com>
Co-authored-by: Rohith Nallamaddi <nalrohit@amazon.com>
Co-authored-by: FeliciaLuo <luof@amazon.com>
Co-authored-by: Elaine Zhao <elaineyz@amazon.com>
2025-05-31 10:39:11 +00:00
Lucia Fang
b8b904795d fix security issue of logging llm output (#18980)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-05-31 10:38:56 +00:00
Chauncey
ba5111f237 [Bugfix]: Fix the incompatibility issue with Structured Outputs when Thinking is disabled (#18879)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-31 09:20:54 +00:00
Yong Hoon Shin
1e123529d7 [Misc] Fix estimated max model len msg (#18966)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-05-31 16:43:44 +08:00
Pooya Davoodi
dff80b0e42 [Frontend] Add rerank support to run_batch endpoint (#16278)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2025-05-31 07:40:01 +00:00
Yu Guo
7782464a17 create util function for batched arange (#18937) 2025-05-31 13:50:38 +08:00
Lukas Geiger
0f71e24034 [Docs] Correct multiprocessing design doc (#18964)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-31 01:30:15 +00:00
Will Eaton
1dab4d5718 Tool parser regex timeout handling (#18960)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-05-30 21:02:54 +00:00
rongfu.leng
7f21e8052b [Misc] add group_size is -1 in awq quantization (#18910)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-05-30 17:34:22 +00:00
Isotr0py
5a8641638a [VLM] Add PP support and fix GPTQ inference for Ovis models (#18958)
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-30 17:11:44 +00:00
Michael Goin
f49239cb45 Benchmark script for fp8 vs bf16 gemm (#17126)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 10:56:11 -06:00
Nick Hill
2dbe8c0774 [Perf] API-server scaleout with many-to-many server-engine comms (#17546) 2025-05-30 08:17:00 -07:00
Richard Zou
84ec470fca Improve "failed to get the hash of the compiled graph" error (#18956)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-30 15:00:54 +00:00
Russell Bryant
b29ca5c4d5 [Docs] Update SECURITY.md with link to our security guide (#18961)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-30 07:37:27 -07:00
Reid
ec6833c5e9 [doc] show the count for fork and watch (#18950)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-30 06:45:59 -07:00
Shawn Huang
e1fadf1197 [Feature] minicpm eagle support (#18943)
Signed-off-by: huangyuxiang03 <huangyx0321@gmail.com>
Co-authored-by: huangyuxiang03 <huangyx0321@gmail.com>
2025-05-30 06:45:56 -07:00
Daniele
43ff405b90 [CI/Build] remove regex from build dependencies (#18945)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-30 04:02:50 -07:00
Carol Zheng
fba02e3bd1 [Bugfix][TPU] Fix tpu model runner testcase failure (#18810)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-30 18:04:03 +08:00
Always-Naive
4577fc9abb [Misc]Fix typo (#18947) 2025-05-30 02:21:35 -07:00
Rabi Mishra
5f1d0c8118 [Bugfix][Failing Test] Fix test_vllm_port.py (#18618)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-30 17:13:47 +08:00
Lukas Geiger
c3bb9f2331 [Model] Use in-place adds in SigLIP (#18922)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-30 17:12:59 +08:00
Reid
8f8900cee9 [doc] add mkdocs doc (#18930)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-30 07:58:44 +00:00
Rabi Mishra
6acb7a6285 [Misc]Fix benchmarks/README.md for speculative decoding (#18897)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-30 07:58:04 +00:00
Cyrus Leung
4f4a6b844a [Deprecation] Remove mean pooling default for Qwen2EmbeddingModel (#18913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-30 06:53:37 +00:00
Michael Goin
4d0a1541be [Bugfix] Remove NVFP4 scales assertions to fix load_format=dummy (#18861)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 13:37:36 +08:00
vllmellm
77b6e74fe2 [ROCm] Remove unnecessary assertion of max_model_len in ROCM_AITER_MLA attention backend. (#18938)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-05-29 22:33:17 -07:00
H
5acf828d99 [docs] fix: fix markdown syntax (#18927) 2025-05-30 05:20:48 +00:00
iLeGend
3987e2ae96 [Model] Use AutoWeightsLoader for mamba2 (#18918)
Signed-off-by: iLeGend <824040212@qq.com>
2025-05-30 04:50:10 +00:00
Chauncey
77164dad5e [Bugfix] Consistent ascii handling in tool parsers (#18883)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-30 04:44:43 +00:00
Wenhua Cheng
3de3eadf5b improve the robustness of parsing vlms config in AutoRound (#18894)
Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
2025-05-29 19:24:47 -07:00
Carol Zheng
3132290a14 [TPU][CI/CD] Clean up docker for TPU tests. (#18926)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-30 10:24:19 +08:00
Cyrus Leung
1aa2f81b43 [Misc] Update type annotation for rotary embedding base (#18914)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-30 10:17:01 +08:00
Michael Goin
d54af615d5 [Bugfix] Fix PP default fallback behavior for V1 (#18915)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 10:13:17 +08:00
Chengji Yao
a1cc9f33a3 [TPU] remove transpose ops in moe kernel (#18923)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-05-29 23:00:11 +00:00
Richard Zou
a521ef06e5 Use standalone_compile by default in torch >= 2.8.0 (#18846)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-30 06:41:58 +08:00
Will Eaton
64eaf5fe05 [P/D] NixlConnector DP fixes (#18903)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-05-29 18:08:40 +00:00
Nick Hill
d1d61f3351 [BugFix] Make DP work with connector-delayed new requests (#18559)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Will Eaton <weaton@redhat.com>
2025-05-29 18:04:18 +00:00
Nicolò Lucchesi
32ce3cf7c9 [V1] Allocate kv_cache with stride order for V1 (#18775)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-05-29 17:54:16 +00:00
CYJiang
d58f9c7f7a [Misc] Remove duplicate init for self.vllm_config (#18896)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-05-29 17:26:07 +00:00
Cyrus Leung
c29034037d [Deprecation] Disallow pos-args other than model when initializing LLM (#18802)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-29 09:36:58 -07:00
Gregory Shtrasberg
1b7cfd5a36 [ROCm][V0][Attention] Revert to the previous FA triton kernel (#18226)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-29 12:13:18 -04:00
Gregory Shtrasberg
da4b69d0b4 [Attention][V1] Toggle for v1 attention backend (#18275)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-29 10:48:24 -04:00
Isotr0py
c9479b2920 [Bugfix] Fix the failing gte embedding test (#18720)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-29 07:39:25 -07:00
Hyogeun Oh (오효근)
6f2909405e [Doc] Fix codeblocks formatting in LoRA adapters documentation (#18907)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-05-29 07:38:55 -07:00
Duyi-Wang
b169d5f7b6 [Misc][Tools][Benchmark] Add benchmark_serving supports for llama.cpp. (#18692)
Signed-off-by: Duyi-Wang <duyi.wang@intel.com>
2025-05-29 20:02:08 +08:00
Chenyaaang
f8977c233f Fix an error in dummy weight loading for quantization models (#18855)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-05-29 03:07:20 -07:00
Luka Govedič
f274581f44 [BugFix] Update pydantic to fix error on python 3.10 (#18852)
Signed-off-by: luka <luka@neuralmagic.com>
2025-05-29 03:05:46 -07:00
Lukas Geiger
0b1447f890 [Bugfix] Ensure tensors are contiguous during serialisation (#18860)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-29 03:05:20 -07:00
Nicolò Lucchesi
24d0ef8970 [Misc] Replace TODO in serving transcription (#18895)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-05-29 02:58:14 -07:00
Jee Jee Li
7fcfd954ff [Bugfix] Fix misleading information in the documentation (#18845)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-29 02:54:14 -07:00
Reid
e740d07f07 [doc] add CLI doc (#18871)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-29 09:51:36 +00:00
Michael Yao
a652e71dd0 [Doc] Remove redundant spaces from compatibility_matrix.md (#18891)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-05-29 02:51:20 -07:00
Jee Jee Li
34d6c447c4 [LoRA] Add LoRA support for InternVL (#18842)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-29 08:46:24 +00:00
Satyajith Chilappagari
972eddf7c9 [Neuron] Add multi-LoRA support for Neuron. (#18284)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
2025-05-29 16:41:22 +08:00
Brent Salisbury
fd7bb88d72 Fixes a dead link in nightly benchmark readme (#18856)
Signed-off-by: Brent Salisbury <bsalisbu@redhat.com>
2025-05-29 04:41:39 +00:00
Yikun Jiang
3c49dbdd03 Skip device and quant Pydantic validation to make plugin device work (#18843)
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-28 20:12:30 -07:00
aws-elaineyz
1661a9c28f [Doc][Neuron] Update documentation for Neuron (#18868)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
2025-05-28 19:44:01 -07:00
Chengji Yao
8e882ffdc0 [Bugfix][TPU] fix moe custom kernel import (#18853)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-05-28 19:34:19 -07:00
Richard Zou
26b4fa45be Add ability to use CUDAGraphs with use_inductor=False (#17345)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-29 10:16:52 +08:00
Maximilien de Bayser
515b413ebf Prevent the cross-encoder logic from being applied to classification tasks (#18838)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-28 19:16:17 -07:00
Hongxia Yang
269d901734 [Bugfix][ROCm] fix the power of 2 exception from triton_unified_attention.py when running llama4 models and unit test fix (#18100)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-29 07:21:46 +08:00
Varun Sundar Rabindranath
7951d78738 [Core] Enable CUDA graphs for DP + All2All kernels (#18724)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-05-28 22:55:30 +00:00
Harry Mellor
6dbe5b5c93 Remove checks for None for fields which should never be None (#17985)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-28 21:32:19 +00:00
Akshat Tripathi
643622ba46 [Hardware][TPU][V1] Multi-LoRA Optimisations for the V1 TPU backend (#15655)
Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Signed-off-by: xihajun <junfan@krai.ai>
Signed-off-by: Jorge de Freitas <jorge.de-freitas22@imperial.ac.uk>
Signed-off-by: Jorge de Freitas <jorge@krai.ai>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: xihajun <junfan@krai.ai>
Co-authored-by: Jorge de Freitas <jorge.de-freitas22@imperial.ac.uk>
Co-authored-by: Jorge de Freitas <jorge@krai.ai>
2025-05-28 19:59:09 +00:00
Aaron Pham
a09c7ca9f2 [Chore][Spec Decode] Update check NoneType instead of assigning variables (#18836)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-28 18:57:19 +00:00
Mark McLoughlin
0e98964e94 [V1][Metrics] Remove metrics that were deprecated in 0.8 (#18837)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-28 18:54:12 +00:00
rongfu.leng
c68b5c63eb [Misc] fix olmoe model layer can't laod in tp gt 1 (#18828)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-05-28 17:36:21 +00:00
Aaron Pham
fced756923 [Chore] update ty configuration (#18839)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-28 08:59:11 -07:00
Alex Brooks
321331b8ae [Core] Add Lora Support to Beam Search (#18346)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-05-28 08:58:24 -07:00
daniel-salib
6e4cea1cc5 decrement server_load on listen for disconnect (#18784)
Signed-off-by: Daniel Salib <danielsalib@meta.com>
2025-05-28 22:15:12 +08:00
Reid
435fa95444 [Frontend] add run batch to CLI (#18804)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-28 07:08:57 -07:00
Harry Mellor
4c2b38ce9e Enable Pydantic mypy checks and convert configs to Pydantic dataclasses (#17599)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-28 12:46:04 +00:00
Mengqing Cao
d781930f90 [Platform][Dist] Make torch distributed process group extendable (#18763)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-05-28 10:52:34 +00:00
Lucas Wilkinson
ce75efeecb [BugFix] FA2 MLA Accuracy Issue (#18807)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-05-28 08:59:39 +00:00
Richard Zou
aa42561e40 Fix PiecewiseCompileInterpreter (#17338)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-28 08:40:53 +00:00
wang.yuqi
de65fc8e1e [CI] improve embed testing (#18747) 2025-05-28 00:16:35 -07:00
Cyrus Leung
0c492b7824 [Deprecation] Remove fallbacks for Embeddings API (#18795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:09:04 +08:00
Cyrus Leung
0f0926b43f [Deprecation] Remove unused sync methods in async_timeout (#18792)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:08:48 +08:00
Cyrus Leung
7f2c1a87e9 [Deprecation] Require overriding get_dummy_text and get_dummy_mm_data (#18796)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:08:35 +08:00
Rabi Mishra
b78f844a67 [Bugfix][FailingTest]Fix test_model_load_with_params.py (#18758)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-28 05:42:54 +00:00
RonaldBXu
5e13c07d00 [V1] [Bugfix] eagle bugfix and enable correct lm_head for multimodal (2) (#18781)
Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
2025-05-28 05:09:14 +00:00
Divakar Verma
774c5fde30 [V1] fix torch profiling for V1 offline scenarios (#18445)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-05-28 04:16:30 +00:00
Guillaume Calmettes
9a21e331ff [Bugfix]: correctly propagate errors message caught at the chat_templating step to the client (#18769)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-05-28 03:35:43 +00:00
wang.yuqi
3e9ce609bd [Bugfix] Fix nomic max_model_len (#18755) 2025-05-27 20:29:53 -07:00
fxmarty-amd
794ae1f551 [rocm] Fix wrong attention log (#18764)
Signed-off-by: Felix Marty <felmarty@amd.com>
2025-05-27 19:45:41 -07:00
Lukas Geiger
d73a9457a5 [Core] Improve Tensor serialisation (#18774)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-28 09:46:21 +08:00
Luka Govedič
a3896c7f02 [Build] Fixes for CMake install (#18570) 2025-05-27 20:49:24 -04:00
cascade
51e98e4ffd [Bugfix] Disable prefix caching by default for benchmark (#18771)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-05-28 08:18:09 +08:00
Michael Goin
e56f44d9ec Support datasets in vllm bench serve and sync with benchmark_[serving,datasets].py (#18566) 2025-05-27 19:59:48 -04:00
Satyajith Chilappagari
e0cbad4e30 [Neuron] Support quantization on neuron (#18283)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
2025-05-27 22:10:33 +00:00
Carol Zheng
b48d5cca16 [CI/Build] [TPU] Fix TPU CI exit code (#18282)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-27 14:54:59 -07:00
Michael Goin
5873877241 [Bugfix] Mistral tool calling when content is list (#18729)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-27 09:05:37 -07:00
Cyrus Leung
696259ca01 [Core] Automatically cast multi-modal input dtype (#18756)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-27 23:45:48 +08:00
chunxiaozheng
6b6d496114 optimize get_kv_cache_torch_dtype (#18531)
Signed-off-by: idellzheng <idellzheng@tencent.com>
2025-05-27 13:08:44 +00:00
cascade
aaa4ac1c95 Disable prefix cache by default for benchmark (#18639)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-05-27 20:06:34 +08:00
Mark McLoughlin
06a0338015 [V1][Metrics] Add API for accessing in-memory Prometheus metrics (#17010)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-27 09:37:06 +00:00
Cyrus Leung
4318c0559d [CI/Build] Remove imports of built-in re (#18750)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-27 09:19:18 +00:00
Hyogeun Oh (오효근)
a68e293cb9 [Doc] Convert Sphinx directives ( {class}, {meth}, {attr}, ...) to MkDocs format for better documentation linking (#18663)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-05-27 01:44:20 -07:00
Shawn Huang
6881107948 [BUG FIX] minicpm (#18739)
Signed-off-by: huangyuxiang03 <huangyx0321@gmail.com>
Co-authored-by: huangyuxiang03 <huangyx0321@gmail.com>
2025-05-27 01:04:49 -07:00
Kebe
e0f0ff87b8 [Build] fix cpu build missing libtbbmalloc.so (#18744)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-05-27 01:03:56 -07:00
maobaolong
c24b1572ac Minor fix about MooncakeStoreConnector (#18721)
Signed-off-by: baoloongmao <baoloongmao@tencent.com>
2025-05-27 08:02:28 +00:00
Calvin Chen
4693a3438c [Doc] cleanup deprecated flag for doc (#18715)
Signed-off-by: calvin chen <120380290@qq.com>
2025-05-27 07:12:02 +00:00
Łukasz Durejko
bbd9a84dc5 [Hardware][Intel-Gaudi] [CI/Build] Fix multiple containers using the same name in run-hpu-test.sh (#18752)
Signed-off-by: Lukasz Durejko <ldurejko@habana.ai>
2025-05-27 00:10:26 -07:00
almersawi
a547aeb828 feat(rocm-support): support mamba2 on rocm (#18565)
Signed-off-by: Islam Almersawi <islam.almersawi@openinnovation.ai>
Co-authored-by: Islam Almersawi <islam.almersawi@openinnovation.ai>
2025-05-27 00:07:53 -07:00
Reid
fc6d0c290f [Misc] improve docs (#18734)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-27 07:07:01 +00:00
Cyrus Leung
753944fa9b [Doc] Update reproducibility doc and example (#18741)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-27 07:03:13 +00:00
Cyrus Leung
25a817f202 [Doc] Update OOT model docs (#18742)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-27 06:30:31 +00:00
vllmellm
d260f799a9 [FEAT] [ROCm] Upgrade AITER Fused MoE kernels. (#18271)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-05-26 23:14:07 -07:00
Lukas Geiger
b50602d5f0 [Model][Gemma3] Cast image pixel values already on CPU (#18732)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-27 05:42:54 +00:00
Isotr0py
1f1b1bc03b [V1][Quantization] Add CUDA graph compatible v1 GGUF support (#18646)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-27 04:40:28 +00:00
Reid
1f88dbd2bb [Misc] improve web section group title display (#18684)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-27 04:35:16 +00:00
Lukas Geiger
0eebd74842 [Model][Gemma3] Simplify image input validation (#18710)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-27 11:13:37 +08:00
Harry Mellor
27bebcd897 Convert examples to ruff-format (#18400)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-26 16:57:54 +00:00
Lukas Geiger
e7523c2e03 [V1][Sampler] Improve performance of FlashInfer sampling by sampling logits instead of probs (#18608) 2025-05-26 11:49:36 -04:00
Cyrus Leung
a869baca73 [Bugfix] Fix Llama GGUF initialization (#18717)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-26 07:49:22 -07:00
Cyrus Leung
82e2339b06 [Doc] Move examples and further reorganize user guide (#18666)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-26 07:38:04 -07:00
Cyrus Leung
9553fdb41e [Doc] Improve API docs (#18713)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-26 07:33:34 -07:00
dylan
243eb9199f [Bugfix]: handle hf-xet CAS error when loading Qwen3 weights in vLLM (#18701) 2025-05-26 07:10:56 -07:00
Reid
0665e29998 [Misc] add AutoGen integration (#18712)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-26 13:56:18 +00:00
Łukasz Durejko
e76be06550 [Hardware][Intel-Gaudi] [CI/Build] Add tensor parallel size = 2 test to HPU CI (#18709)
Signed-off-by: Lukasz Durejko <ldurejko@habana.ai>
2025-05-26 05:26:07 -07:00
Isotr0py
0877750029 [CI/Build] Split pooling and generation extended language models tests in CI (#18705)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-26 04:00:08 -07:00
Naveassaf
6d68030f1c [Model] Add support for YARN in NemotronNAS models (#18427)
Signed-off-by: Nave Assaf <nassaf@nvidia.com>
2025-05-26 10:31:49 +00:00
Ning Xie
5a2c76cbe1 [CI] fix dump_input for str type (#18697)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-26 18:23:35 +08:00
Cyrus Leung
38b13dfe78 [CI/Build] Replace math.isclose with pytest.approx (#18703)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-26 02:05:17 -07:00
Cyrus Leung
61a45e7a72 [Bugfix] Fix Mistral-format models with sliding window (#18693)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-26 01:44:04 -07:00
Cyrus Leung
65523a0995 [Doc] Fix issue template format (#18699)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-26 00:45:39 -07:00
Cyrus Leung
4b7740a105 [GH] Add issue template for reporting CI failures (#18696)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-26 00:42:04 -07:00
Ning Xie
4ea62c0ea0 [CI] add missing argument (#18694)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-26 00:22:04 -07:00
Maximilien de Bayser
561b77a0d6 [Bugfix] Fix the lm_head in gpt_bigcode in lora mode (#6357)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
2025-05-26 14:52:25 +08:00
CYJiang
abd4030d94 refactor: simplify request handler, use positive condition check for handler assignment (#18690)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-05-26 06:32:28 +00:00
AlexZhao
8820821b59 [Misc] Fixed the abnormally high TTFT issue in the PD disaggregation example (#18644)
Signed-off-by: zhaohaidao <zhaohaidao2008@hotmail.com>
Signed-off-by: zhaohaiyuan <zhaohaiyuan@xiaohongshu.com>
Co-authored-by: zhaohaiyuan <zhaohaiyuan@xiaohongshu.com>
2025-05-26 13:51:27 +08:00
Cyrus Leung
fba0642704 [CI/Build][Doc] Update gte-Qwen2-1.5B-instruct usage (#18683)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-05-25 20:27:50 -07:00
Lukas Geiger
6071e989df [Core][Multimodal] Convert PIL Image to array without data copy when hashing (#18682)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-25 17:33:35 +00:00
Cyrus Leung
57fd13a707 [Bugfix] Fix profiling dummy data for Pixtral (#18677)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-25 14:05:30 +00:00
Reid
3a886bd58c [Misc] small improve (#18680)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-25 06:05:38 -07:00
Reid
35be8fad62 [CI/build] fix no regex (#18676)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-25 10:10:51 +00:00
Yuqi Zhang
f2faac745d [Bugfix] Fix cpu usage and cache hit stats reporting on cpu environment (#18674)
Signed-off-by: zzzyq <zhangyuqi94@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-25 02:36:06 -07:00
Reid
279f854519 [doc] improve readability (#18675)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-25 01:40:31 -07:00
Reid
624b77a2b3 [doc] fix broken links (#18671)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-25 01:36:33 -07:00
Cyrus Leung
503f8487c2 [Misc] Reduce logs on startup (#18649)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-24 23:03:53 -07:00
Ning Xie
44073a7ac3 [BUGFIX] catch subclass first for try...except (#18672)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-25 05:34:24 +00:00
Michael Goin
63934543a0 Speed up the kernels/quantization/ tests (#18669)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-25 05:02:59 +00:00
Isotr0py
75f81750f3 [VLM] Initialize video input support for InternVL models (#18499)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-25 04:51:25 +00:00
Mengqing Cao
6ab681bcbe [Misc][ModelScope] Change to use runtime VLLM_USE_MODELSCOPE (#18655)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-05-25 04:51:21 +00:00
Chenguang Li
cebc22f3b6 [Misc]Replace cuda hard code with current_platform in Ray (#14668)
Signed-off-by: noemotiovon <757486878@qq.com>
2025-05-24 20:26:31 -07:00
Ning Xie
6c6dcd8611 [MISC] correct signature for LoaderFunction (#18670)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-24 20:17:47 -07:00
Seiji Eicher
7891fdf0c6 [V1] Fix _pickle.PicklingError: Can't pickle <class 'transformers_modules.deepseek-ai.DeepSeek-V2-Lite... (#18640)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-05-24 20:07:20 -07:00
Woosuk Kwon
6825d9a998 [BugFix][Spec Decode] Improve Prefix Caching Logic in Speculative Decoding (#18668)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-05-24 17:33:46 -07:00
Reid
b554ab736e [CI/Build] fix permission denied issue (#18645)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-24 16:09:10 +00:00
Aaron Pham
9ea7f1abf3 fix(regression): clone from reference items (#18662)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-24 15:25:20 +00:00
Aaron Pham
2807271c86 [CI] enforce import regex instead of re (#18665)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-24 08:04:14 -07:00
wangxiyuan
b9018a3f9f [BugFix] Fix import error for fused_moe (#18642)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-24 07:53:36 -07:00
Ning Xie
4ceafb6299 [MISC] typo fix and clean import (#18664)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-24 07:52:09 -07:00
Cyrus Leung
2e6705784f [CI/Build] chmod +x to cleanup_pr_body.sh (#18650)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-24 07:26:45 -07:00
Cyrus Leung
1cb194a018 [Doc] Reorganize user guide (#18661)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-24 07:25:33 -07:00
ztang2370
2cd4d58df4 [Model] use AutoWeightsLoader for gpt2 (#18625)
Signed-off-by: zt2370 <ztang2370@gmail.com>
2025-05-24 13:36:13 +00:00
Cyrus Leung
6d166a8d35 [Doc] Add community links (#18657)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-24 06:06:38 -07:00
Cyrus Leung
ef1dd6870f [Doc] Fix indentation problems in V0 Paged Attention docs (#18659)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-24 06:06:35 -07:00
Mengqing Cao
e77dc4bad8 [MISC][pre-commit] Add pre-commit check for triton import (#17716)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-05-24 20:09:15 +08:00
Cyrus Leung
07458a51ce [Doc] Update README links, mark external links (#18635)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-24 09:57:15 +00:00
qizixi
c1e4a4052d [V1][Spec Decode] Support multi-layer eagle draft model (#18030)
Signed-off-by: qizixi <qizixi@meta.com>
2025-05-24 09:45:34 +00:00
Yuanhao WU
a859320575 [Model] Add support for Qwen2.5-Omni-7B-AWQ (Qwen2_5OmniForConditionalGeneration) (#18647) 2025-05-24 09:15:36 +00:00
Reid
441dc63ac7 [Frontend] improve vllm serve --help display (#18643)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-24 07:53:22 +00:00
qizixi
d55e446d13 [V1][Spec Decode] Small refactors to improve eagle bookkeeping performance (#18424)
Signed-off-by: qizixi <qizixi@meta.com>
2025-05-24 06:51:22 +00:00
Wenhua Cheng
ec82c3e388 FIX MOE issue in AutoRound format (#18586)
Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
2025-05-23 22:01:40 -07:00
Mathieu Borderé
45ab403a1f config.py: Clarify that only local GGUF checkpoints are supported. (#18623)
Signed-off-by: Mathieu Bordere <mathieu@letmetweakit.com>
2025-05-24 08:46:34 +08:00
Robert Shaw
2b10ba7491 [Bugfix][Nixl] Fix Preemption Bug (#18631)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-05-23 23:30:16 +00:00
Feng XiaoLong
4fc1bf813a [Bugfix] Migrate to REGEX Library to prevent catastrophic backtracking (#18454)
Signed-off-by: Crucifixion-Fxl <xmufxl@gmail.com>
Co-authored-by: Crucifixion-Fxl <xmufxl@gmail.com>
2025-05-23 16:16:26 -07:00
Pavani Majety
f2036734fb [ModelOpt] Introduce VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE env var to control blockscale tensor allocation (#18160)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-05-23 15:52:20 -07:00
Cyrus Leung
7d9216495c [Doc] Update references to doc files (#18637)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-23 15:49:21 -07:00
Michael Goin
0ddf88e16e [CI] Enable test_initialization to run on V1 (#16736)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-23 15:09:44 -07:00
Huy Do
1645b60196 Use prebuilt FlashInfer x86_64 PyTorch 2.7 CUDA 12.8 wheel for CI (#18537)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-05-23 21:17:16 +00:00
Jiayi Yao
2628a69e35 [V1] Support Deepseek MTP (#18435)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>
Co-authored-by: Rui Qiao <ruisearch42@gmail.com>
2025-05-23 10:26:28 -07:00
Cyrus Leung
371f7e4ca2 [Doc] Fix broken links and unlinked docs, add shortcuts to home sidebar (#18627)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-23 10:22:40 -07:00
Cyrus Leung
15b45ffb9a [Doc] Avoid documenting dynamic / internal modules (#18626)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-23 09:58:02 -07:00
Cyrus Leung
273cb3b4d9 [Doc] Fix top-level API links/docs (#18621)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-23 09:46:56 -07:00
David Xia
8ddd1cf26a [Doc] fix list formatting (#18624)
Signed-off-by: David Xia <david@davidxia.com>
2025-05-23 09:41:17 -07:00
Chen Zhang
6550114c9c [v1] Redo "Support multiple KV cache groups in GPU model runner (#17945)" (#18593)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-23 09:39:47 -07:00
Michael Goin
9520a989df [Docs] Change mkdocs to not use directory urls (#18622)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-23 09:33:21 -07:00
Harry Mellor
3d28ad343f Fix figures in design doc (#18612)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-23 09:09:54 -07:00
youkaichao
6a7988c55b Refactor pplx init logic to make it modular (prepare for deepep) (#18200)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-05-23 23:43:43 +08:00
Cyrus Leung
022d8abe29 [Doc] Use a different color for the announcement (#18616)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-23 08:25:03 -07:00
Hyogeun Oh (오효근)
5221815a00 [Doc] Fix markdown list indentation for MkDocs rendering (#18620)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-05-23 08:23:21 -07:00
Simon Mo
1068556b2c [Bugfix][Build/CI] Fixup CUDA compiler version check for CUDA_SUPPORTED_ARCHS (#18579) 2025-05-23 07:43:58 -07:00
Reid
2cd1fa4556 [Misc] add Haystack integration (#18601)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-23 06:21:19 -07:00
Harry Mellor
d4c2919760 Include private attributes in API documentation (#18614)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-23 06:18:31 -07:00
Tristan Leclercq
6220f3c6b0 [Bugfix] Fix transformers model impl ignored for mixtral quant (#18602)
Signed-off-by: Tristan Leclercq <tristanleclercq@gmail.com>
2025-05-23 05:54:13 -07:00
Harry Mellor
52fb23f47e Fix examples with code blocks in docs (#18609)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-23 05:53:44 -07:00
Cyrus Leung
6dd51c7ef1 [CI/Build] Fix V1 flag being set in entrypoints tests (#18598)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-23 05:51:53 -07:00
Harry Mellor
2edb533af2 Replace {func} with mkdocs style links (#18610)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-23 05:51:38 -07:00
Hyogeun Oh (오효근)
38a95cb4a8 [Doc] Fix indent of contributing to vllm (#18611)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-05-23 05:50:07 -07:00
Ning Xie
cd821ea5d2 [CI] fix kv_cache_type argument (#18594)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-23 04:49:18 -07:00
Kay Yan
7ab056c273 [Hardware][CPU] Update intel_extension_for_pytorch 2.7.0 and move to requirements/cpu.txt (#18542)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-05-23 04:38:42 -07:00
Harry Mellor
6526e05111 Add myself as docs code owner (#18605)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-23 04:08:31 -07:00
Madeesh Kannan
e493e48524 [V0][Bugfix] Fix parallel sampling performance regression when guided decoding is enabled (#17731)
Signed-off-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-05-23 03:38:23 -07:00
Mengqing Cao
4ce64e2df4 [Bugfix][Model] Fix baichuan model loader for tp (#18597)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-05-23 02:39:05 -07:00
Cyrus Leung
fbb13a2c15 Revert "[V1] [Bugfix] eagle bugfix and enable correct lm_head for multimodal (#18034)" (#18600)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-23 02:18:22 -07:00
Harry Mellor
a1fe24d961 Migrate docs from Sphinx to MkDocs (#18145)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-23 02:09:53 -07:00
Yuqi Zhang
d0bc2f810b [Bugfix] Add half type support in reshape_and_cache_cpu_impl on x86 cpu platform (#18430)
Signed-off-by: Yuqi Zhang <yuqizhang@google.com>
Co-authored-by: Yuqi Zhang <yuqizhang@google.com>
2025-05-23 01:41:37 -07:00
Chauncey
b046cf792d [Feature][V1]: suupports cached_tokens in response usage (#18149)
Co-authored-by: simon-mo <xmo@berkeley.edu>
2025-05-23 01:41:03 -07:00
Michael Goin
54af915949 [Doc] Update quickstart and install for cu128 using --torch-backend=auto (#18505)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-23 08:36:37 +00:00
cascade
71ea614d4a [Feature]Add async tensor parallelism using compilation pass (#17882)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-05-23 01:03:34 -07:00
RonaldBXu
4c611348a7 [V1] [Bugfix] eagle bugfix and enable correct lm_head for multimodal (#18034)
Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
2025-05-23 00:37:18 -07:00
Ning Xie
60cad94b86 [Hardware] correct method signatures for HPU,ROCm,XPU (#18551)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-22 22:31:59 -07:00
Shanshan Shen
9c1baa5bc6 [Misc] Replace cuda hard code with current_platform (#16983)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-05-23 04:38:50 +00:00
Teruaki Ishizaki
4be2255c81 [Bugfix][Benchmarks] Fix a benchmark of deepspeed-mii backend to use api_key (#17291)
Signed-off-by: Teruaki Ishizaki <teruaki.ishizaki@ntt.com>
2025-05-23 12:30:47 +08:00
aws-elaineyz
ed5d408255 [Neuron] Remove bypass on EAGLEConfig and add a test (#18514)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
2025-05-22 21:26:32 -07:00
Benjamin Chislett
583507d130 [Spec Decode] Make EAGLE3 draft token ID mapping optional (#18488)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-05-22 20:17:39 -07:00
lkchen
e44d8ce8c7 [Bugfix] Set KVTransferConfig.engine_id in post_init (#18576)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-05-23 02:54:42 +00:00
Nick Hill
93ecb8139c [BugFix] Increase TP execute_model timeout (#18558)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-23 10:22:11 +08:00
CYJiang
fae453f8ce [Misc] refactor: simplify input validation and num_requests handling in _convert_v1_inputs (#18482)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-05-23 10:15:32 +08:00
Harry Mellor
4b0da7b60e Enable hybrid attention models for Transformers backend (#18494)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-23 10:12:08 +08:00
Mark McLoughlin
c6b636f9fb [V1][Spec Decoding] Use model_loader.get_model() to load models (#18273)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-23 02:05:44 +00:00
Chenheli Hua
04eb88dc80 Re-submit: Fix: Proper RGBA -> RGB conversion for PIL images. (#18569)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-05-23 01:59:18 +00:00
rasmith
46791e1b4b [AMD] [P/D] Compute num gpus for ROCm correctly in run_accuracy_test.sh (#18568)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-05-22 18:45:35 -07:00
Sanger Steel
c32e249a23 [Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)
Signed-off-by: Sanger Steel <sangersteel@gmail.com>
2025-05-22 18:44:18 -07:00
Kai Wu
c91fe7b1b9 [Frontend][Bug Fix] Update llama4 pythonic jinja template and llama4_pythonic parser (#17917)
Signed-off-by: Kai Wu <kaiwu@meta.com>
2025-05-22 16:44:08 -07:00
Ekagra Ranjan
a04720bc36 [V1][Spec Decode][Bugfix] Load quantize weights for EAGLE (#18290) 2025-05-22 15:17:33 -07:00
lkchen
7b9d832c80 [Tool] Add NIXL installation script (#18172)
Signed-off-by: Linkun <github@lkchen.net>
2025-05-22 14:33:16 -07:00
Tyler Michael Smith
6e588da0f4 [Build/CI] Fix CUDA 11.8 build (#17679)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-22 12:13:54 -07:00
Mengqing Cao
f8d2cc5f55 [Compile][Platform] Make PiecewiseBackend pluggable and extendable (#18076)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-05-22 12:11:53 -07:00
wangxiyuan
721fb9b181 [Platform] Move platform check to right place (#18470)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-22 12:11:28 -07:00
David Xia
1f3a1200e4 [Bugfix] make test_openai_schema.py pass (#18224)
Signed-off-by: David Xia <david@davidxia.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-22 18:34:06 +00:00
Lukas Geiger
54631f8262 [Misc] Call ndarray.tobytes() directly instead of ndarray.data.tobytes() (#18347)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-22 09:00:13 -07:00
Reid
cb506ecb5a [Misc] improve Automatic Prefix Caching example (#18554)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-22 14:50:46 +00:00
Li, Jiang
93f71673ce [BugFix][CPU] Fix x86 SHM distributed module initialization (#18536)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-05-22 07:35:00 -07:00
Calvin Chen
3f505233fd [Doc] Add stream flag for chat completion example (#18524)
Signed-off-by: calvin chen <120380290@qq.com>
2025-05-22 14:07:10 +00:00
Bowen Wang
4e04eceb58 [Bugfix] Use random hidden states in dummy sampler run (#18543)
Signed-off-by: Bowen Wang <abmfy@icloud.com>
2025-05-22 06:48:56 -07:00
CYJiang
71075029f2 [Doc] Support --stream arg in openai_completion_client.py script (#18388)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-05-22 13:20:17 +00:00
Harry Mellor
ca86a7cf6e [CI/Build] Update bamba test model location (#18544)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-22 06:01:07 -07:00
lkchen
a35a494745 [Bugfix] Add kwargs to RequestOutput __init__ to be forward compatible (#18513)
Signed-off-by: Linkun <github@lkchen.net>
2025-05-22 05:24:43 -07:00
f6037d1907 [Bugfix] Fix MRoPE Errors in the Qwen-VL Model When Processing Pure Text (#18526)
Co-authored-by: 松灵 <wpf272043@alibaba-inc.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-22 05:22:53 -07:00
aws-elaineyz
fa72f9a812 Order sequence ids + config update to support specifying custom quantization layers (#18279)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Tailin Pan <tailinpa@amazon.com>
Co-authored-by: Rishabh Rajesh <rishyraj@amazon.com>
Co-authored-by: Yishan McNabb <yishanm@amazon.com>
Co-authored-by: Patrick Lange <patlange@amazon.com>
Co-authored-by: Maxwell Goldberg <mgld@amazon.com>
Co-authored-by: Aakash Shetty <sheaak@amazon.com>
2025-05-22 02:20:36 -07:00
aws-elaineyz
ebed81fbf5 Update default neuron config for speculation (#18274)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Shashwat Srijan <sssrijan@amazon.com>
Co-authored-by: Aakash Shetty <sheaak@amazon.com>
2025-05-22 02:18:55 -07:00
Satyajith Chilappagari
e2d7d31244 [Neuron] Update Dockerfile.neuron to use latest neuron release (2.23) (#18512)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
2025-05-22 02:17:34 -07:00
Cyrus Leung
23b67b37b2 [Doc] Fix invalid JSON in example args (#18527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-22 07:11:46 +00:00
Jee Jee Li
db5a29ba19 [Bugfix] Fix LoRA test (#18518)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-21 21:48:53 -07:00
Shane A
51797775c3 [Bugfix][Model] Make Olmo2Model weight loading return loaded weights (#18504)
Signed-off-by: Shane A <shanea@allenai.org>
2025-05-21 21:17:03 -07:00
Nick Hill
cf5984b2fe [BugFix][DP] Send DP wave completion only from dp_rank==0 (#18502)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: kourosh hakhamaneshi <kourosh@anyscale.com>
2025-05-21 20:25:25 -07:00
youngrok cha
d022115cc6 [Bugfix] Inconsistent token calculation compared to HF in llava family (#18479)
Signed-off-by: jaycha <jaycha@ncsoft.com>
2025-05-21 20:21:47 -07:00
Rabi Mishra
acb54ca8e1 Intialize io_thread_pool attribute in the beginning. (#18331)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-21 20:21:14 -07:00
Russell Bryant
6e0fd34d3c [CI] Fix race condition with StatelessProcessGroup.barrier (#18506)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-21 20:19:13 -07:00
Ning Xie
176d62e4ea [MISC] update project urls in pyproject.toml (#18519)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-21 20:17:34 -07:00
Dhia Eddine Rhaiem
20bd6f4d2e [FalconH1] Fix output dtype in RMSNorm fallback path for Falcon-H1 (e.g. 0.5B) (#18500)
Signed-off-by: dhia.rhaiem <dhia.rhaiem@tii.ae>
Co-authored-by: younesbelkada <younesbelkada@gmail.com>
Co-authored-by: Ilyas Chahed <ilyas.chahed@tii.ae>
Co-authored-by: Jingwei Zuo <jingwei.zuo@tii.ae>
2025-05-21 19:23:59 -07:00
Sebastian Schoennenbeck
1f079540db [Bugfix] Consistent ascii handling in tool parsers (#17704)
Signed-off-by: Sebastian Schönnenbeck <sebastian.schoennenbeck@comma-soft.com>
2025-05-21 20:41:23 +00:00
vllmellm
94d8ec8d2b [FEAT][ROCm] Upgrade AITER MLA v1 backend (#18338)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-05-21 10:34:28 -07:00
Mark McLoughlin
bb0a311213 Revert "[v1] Support multiple KV cache groups in GPU model runner (#17945) (#18459)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-21 10:25:23 -07:00
Hosang
dd5fa7e04f [ROCm][Kernel][V1] Enable AMD Radeon GPU Custom Paged Attention on v1 (#17004)
Signed-off-by: Hosang Yoon <hosang.yoon@amd.com>
2025-05-21 08:35:00 -07:00
Hyogeun Oh (오효근)
2b16104557 [Misc] Update deprecation message for --enable-reasoning (#18404) 2025-05-21 07:33:11 -07:00
Kebe
371376f996 [Build] fix Dockerfile shell (#18402) 2025-05-21 07:32:06 -07:00
bnellnm
c6c10ca920 [Bugfix] Reduce moe_sum test size to avoid OOM (#18484)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-21 06:46:39 -07:00
GiantCroc
c154d89306 [Doc] fix arg docstring in linear layers (#18410)
Signed-off-by: giantcroc <1204449533@qq.com>
2025-05-21 06:45:57 -07:00
Dhia Eddine Rhaiem
eca18691d2 [MODEL] FalconH1 (#18406)
Signed-off-by: dhia.rhaiem <dhia.rhaiem@tii.ae>
Co-authored-by: younesbelkada <younesbelkada@gmail.com>
Co-authored-by: Ilyas Chahed <ilyas.chahed@tii.ae>
Co-authored-by: Jingwei Zuo <jingwei.zuo@tii.ae>
2025-05-21 04:59:06 -07:00
Rabi Mishra
61acfc45bc [Bugfix][Failing Test] Fix test_events.py (#18460)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-21 04:57:28 -07:00
Reid
107f5fc4cb [Misc] refactor disaggregated-prefill-v1 example (#18474)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-21 11:10:14 +00:00
Yong Hoon Shin
907f935de9 [V1] Fix general plugins not loaded in engine for multiproc (#18326)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-05-21 01:21:49 -07:00
Kebe
5d7f545204 [Frontend] deprecate --device arg (#18399)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-05-21 01:21:17 -07:00
Nicolò Lucchesi
cd8dfc6dfc [Misc] MultiConnector._connectors type (#18423)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-05-20 22:48:43 -07:00
wwl2755
d06dd72ba9 [Bugfix][Failing Test] Fix nixl connector test when promt size < block size (#18429)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-05-20 22:41:44 -07:00
Cyrus Leung
ad0012a0ac Revert "[Bugfix] Fix MRoPE Errors in the Qwen-VL Model When Processing Pure Text (#18407)" (#18456)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-20 22:39:22 -07:00
bnellnm
92247c522e [Bug] Fix moe_sum signature (#18440)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-05-20 22:37:08 -07:00
Gregory Shtrasberg
0c15c2e486 [Bugfix] config.head_dim is now explicitly set to None (#18432)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-20 21:04:33 -07:00
Michael Goin
3b17ea26e4 [TPU] Re-enable the Pallas MoE kernel (#18025)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-05-20 19:52:27 -07:00
Dilip Gowda Bhagavan
23baa2180b fix:Build torch wheel inline rather than picking from nightly (#18351)
Signed-off-by: Dilip Gowda Bhagavan <dilip.bhagavan@ibm.com>
2025-05-20 22:22:24 +00:00
Percy
980a172474 [Kernel] update comment for KV shape in unified triton attn (#18099)
Signed-off-by: haochengxia <xhc_1007@163.com>
2025-05-20 11:19:34 -07:00
Calvin Chen
e1f5a71ed7 [Model] use AutoWeightsLoader for bloom (#18300)
Signed-off-by: calvin chen <120380290@qq.com>
2025-05-20 09:40:05 -07:00
Michael Goin
f4a8a37465 [Minor] Rename quantization nvfp4 to modelopt_fp4 (#18356)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-20 09:08:37 -07:00
Reid
8f55962a7f [Misc] refactor prompt embedding examples (#18405)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-20 15:26:12 +00:00
be48360c1f [Bugfix] Fix MRoPE Errors in the Qwen-VL Model When Processing Pure Text (#18407)
Co-authored-by: 松灵 <wpf272043@alibaba-inc.com>
2025-05-20 06:59:48 -07:00
wang.yuqi
86847700d7 [CI] Add mteb testing to test the accuracy of the embedding model (#17175) 2025-05-20 06:51:12 -07:00
汪志鹏
d6c86d09ae Update cpu.txt (#18398)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-05-20 10:53:23 +00:00
Jee Jee Li
6b35cb10a0 [Misc] Add LoRA code owner (#18387)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-20 03:27:30 -07:00
Reid
1b1e8e05ff [doc] update env variable export (#18391)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-20 08:53:27 +00:00
Random Fly
bca55b556f [Bugfix] fix adding bias twice in ipex GPTQ quantization (#18363)
Signed-off-by: rand-fly <randfly@outlook.com>
2025-05-20 00:54:33 -07:00
Kevin H. Luu
d981396778 [release] Change dockerhub username for TPU release (#18389) 2025-05-19 23:49:23 -07:00
Nan Qin
9609327fa4 [Core] [Bugfix]: tensor parallel with prompt embeds (#18171)
Signed-off-by: Nan2018 <nan@protopia.ai>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
2025-05-19 20:21:27 -07:00
Isotr0py
f07a673eb2 [Misc] Allow AutoWeightsLoader to skip loading weights with specific substr in name (#18358)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-19 20:20:12 -07:00
Liangfu Chen
d565e0976f [neuron] fix authorization issue (#18364)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-05-19 23:30:32 +00:00
Lucia Fang
258bf621d5 fix CUDA_check redefinition in #17918 (#18287)
Signed-off-by: Lucia Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-05-19 13:42:35 -07:00
Satyajith Chilappagari
dc1440cf9f Neuron up mistral (#18222)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
2025-05-19 09:54:47 -07:00
Gong Shufan
8171221834 [Misc] Fix typo (#18330) 2025-05-19 09:51:01 -07:00
sunyicode0012
7937c2fd52 Add files via uploadAdd fused MoE kernel tuning configs (fp8_w8a8) for DeepSeek V3/R1 on a single-node 8x NVIDIA H20 96GB setup (#18337) 2025-05-19 09:49:57 -07:00
Wenhua Cheng
e2ee1e8e9e [Feature]Add support for models quantized with AutoRound (#17850)
Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
2025-05-19 09:38:53 -07:00
Reid
20d8ce81eb [Frontend] add --quick option for vllm chat/complete (#18297)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-19 09:36:13 -07:00
Elad Segal
84ab4feb7e [Doc] Fix typo (#18355) 2025-05-19 16:05:16 +00:00
Jee Jee Li
6781af5608 [Quantization] Pool model support bitsandbytes (#18087)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-19 09:03:43 -07:00
Nick Hill
1b15df2546 [BugFix] Fix handling of num_computed_tokens with connector (#18232)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-05-19 09:03:25 -07:00
Cyrus Leung
43b5f61dce [Doc] Move input-related docs to Features (#18353)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-19 15:08:39 +00:00
Li Wang
c5bb0ebdc6 [Doc] Fix prompt embedding examples (#18350)
Signed-off-by: wangli <wangli858794774@gmail.com>
2025-05-19 06:48:16 -07:00
Shaoyu Yang
d637b96099 [BugFix] [Vul] Add missing usedforsecurity=False in MD5 hashing to enable FIPS (#18319)
Signed-off-by: cascade812 <cascade812@outlook.com>
Signed-off-by: shaoyuyoung <shaoyuyoung@gmail.com>
Co-authored-by: cascade <cascade812@outlook.com>
2025-05-19 01:31:23 -07:00
CYJiang
275c5daeb0 fix: Add type specifications for CLI arguments in tensorizer options (#18314) 2025-05-18 23:42:17 -07:00
Simon Mo
47fda6d089 [Build] Supports CUDA 12.6 and 11.8 after Blackwell Update (#18316)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-05-18 23:19:33 -07:00
Reid
27d0952600 [Misc] extract parser.parse_args() (#18323)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-19 04:06:26 +00:00
Nan Qin
221cfc2fea Feature/vllm/input embedding completion api (#17590)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Nan2018 <nan@protopia.ai>
Co-authored-by: 临景 <linjing.yx@alibaba-inc.com>
Co-authored-by: Bryce1010 <bryceyx@gmail.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: Andrew Sansom <qthequartermasterman@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-18 20:18:05 -07:00
wwl2755
9da1095daf [Spec Decode][V0] Fix spec decode correctness test in V0 eagle/medusa (#18175)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-05-18 19:49:46 -07:00
Robin
d1211f8794 [Doc] Add doc to explain the usage of Qwen3 thinking (#18291)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-05-18 23:04:07 +00:00
Reid
b6a6e7a529 [Misc] add litellm integration (#18320)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-18 15:32:30 +00:00
Lifu Huang
4fb349f66a Fix copy-paste error in phi4mm image processing (#18315)
Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
2025-05-18 07:00:12 -07:00
22quinn
908733aca7 [Model] Use sigmoid for single-label classification (#18313)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-05-18 07:00:09 -07:00
Reid
1a8f68bb90 [doc] update reasoning doc (#18306)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-18 06:59:14 -07:00
cascade
9ab2c02ff8 Support sequence parallelism combined with pipeline parallelism (#18243)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-05-17 22:47:25 +00:00
Ning Xie
66e63e86ec [MISC] fix typo (#18305)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-17 10:52:09 -07:00
rongfu.leng
9214e60631 [Model] use AutoWeightsLoader for solar (#18113) 2025-05-17 00:24:17 -07:00
Nishidha
f880d42582 Fixed build on ppc64le due to openssl conflicts (#18262)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
2025-05-17 00:23:46 -07:00
Michael Goin
dcfe95234c Update Dockerfile to build for Blackwell (#18095) 2025-05-17 00:23:25 -07:00
Siyuan Liu
48ac2bed5b [Hardware][TPU] Optionally import for TPU backend (#18269)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
Co-authored-by: Carol Zheng <cazheng@google.com>
Co-authored-by: Jade Zheng <zheng.shoujian@outlook.com>
Co-authored-by: Hongmin Fan <fanhongmin@google.com>
2025-05-17 15:23:12 +08:00
David Ben-David
3e0d435027 [P/D][V1] Support dynamic loading of external KV connector implementations (#18142)
Signed-off-by: David Ben-David <davidb@pliops.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
2025-05-17 06:40:39 +00:00
汪志鹏
4ee4826ede [BugFix] Correct max_model_len derivation from config.json for Mistral format (#17937)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
Co-authored-by: tracelogfb <48808670+tracelogfb@users.noreply.github.com>
Co-authored-by: Stephen Chen <tracelog@meta.com>
2025-05-17 04:20:13 +00:00
Reid
60017dc841 [Misc] reformat the collect-env output (#18285)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-16 19:46:18 -07:00
Trevor Royer
55f1a468d9 Move cli args docs to its own page (#18228) (#18264)
Signed-off-by: Trevor Royer <troyer@redhat.com>
2025-05-16 19:43:45 -07:00
Michael Goin
fd195b194e [V1][P/D] Local attention optimization for NIXL (#18170)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-16 21:16:33 -04:00
Woosuk Kwon
fabe89bbc4 [Spec Decode] Don't fall back to V0 when spec decoding is enabled (#18265) 2025-05-16 16:10:27 -07:00
Jinzhen Lin
e73b7dfd69 [Bugfix] fix an illegal memory access was encountered of marlin kernel + act_order (#18245) 2025-05-16 16:02:44 -07:00
Bowen Wang
7fdfa01530 [Sampler] Adapt to FlashInfer 0.2.3 sampler API (#15777)
Signed-off-by: Bowen Wang <abmfy@icloud.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-05-16 15:14:03 -07:00
Sanger Steel
aef94c6d07 [CI] Assign reviewer to mergify with changes to Tensorizer files (#18278) 2025-05-16 12:04:14 -07:00
Nick Hill
0ceaebf87b [BugFix] Fix ordering of KVConnector finished send/rcv sets (#18211)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-16 09:20:54 -07:00
Nick Hill
1db4f47f81 [BugFix] Fix multi async save in MultiConnector (#18246)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-16 08:13:47 -07:00
Reid
d3d91b6f71 [Misc][MacOS] fix bfloat16 error (#18249)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-16 15:05:59 +00:00
learner0810
87d871470d [Model] Use autoweightloader for dbrx (#18251)
Signed-off-by: learner0810 <zhongjun.li@daocloud.io>
2025-05-16 07:54:13 -07:00
fxmarty-amd
a5f8c111c2 [Fix] Fix typo in resolve_hf_chat_template (#18259)
Signed-off-by: Felix Marty <felmarty@amd.com>
2025-05-16 14:52:41 +00:00
Lain
e23564cb70 use ceil_div in cutlass block scaling shape check (#17918) 2025-05-16 03:02:58 -07:00
Isotr0py
390ec88905 [Misc] Consolidate Audio tests into multimodal common generation tests (#18214)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-16 09:18:08 +00:00
Seiji Eicher
541817670c [Misc] Add Ray Prometheus logger to V1 (#17925)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-05-16 01:02:42 -07:00
Vadim Gimpelson
67da5720d4 [PERF] Speed up Qwen2.5-VL model by speed up rotary position embedding (#17973)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-05-15 23:31:02 -07:00
David Xia
5c04bb8b86 [doc] fix multimodal example script (#18089)
Signed-off-by: David Xia <david@davidxia.com>
2025-05-16 06:05:34 +00:00
Lucia Fang
3d2779c29a [Feature] Support Pipeline Parallism in torchrun SPMD offline inference for V1 (#17827)
Signed-off-by: Lucia Fang <fanglu@fb.com>
2025-05-15 22:28:27 -07:00
Will Eaton
6b31c84aff Throw better error for when running into k8s service discovery issue (#18209)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-05-15 21:07:28 -07:00
Harry Mellor
b18201fe06 Allow users to pass arbitrary JSON keys from CLI (#18208)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-15 21:05:34 -07:00
Sky Lee
f4937a51c1 [Model] vLLM v1 supports Medusa (#17956)
Signed-off-by: lisiqi23 <lisiqi23@xiaomi.com>
Signed-off-by: skylee-01 <497627264@qq.com>
Co-authored-by: lisiqi23 <lisiqi23@xiaomi.com>
2025-05-15 21:05:31 -07:00
kliuae
ee659e3b60 [Bugfix][ROCm] Use chunked_prefill_paged_decode as fallback for V1 attention on ROCm (#18093)
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
2025-05-15 19:30:17 -07:00
Lucas Wilkinson
4e1c6a0264 [Bugfix] fix rotary embedding test for _get_padded_tensor_shape (#18229)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-16 01:32:45 +00:00
Lucas Wilkinson
c7852a6d9b [Build] Allow shipping PTX on a per-file basis (#18155)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-15 16:41:55 -07:00
Lucia Fang
8795eb9975 [Bugfix] Fix test_eagle test (#18223)
Signed-off-by: Lucia Fang <fanglu@fb.com>
2025-05-15 15:59:42 -07:00
Alexei-V-Ivanov-AMD
0b34593017 Adding "AMD: Tensorizer Test" to amdproduction. (#18216) 2025-05-15 11:01:25 -07:00
Nicolò Lucchesi
e3f3aee6f4 [Misc] Avoid cuda graph log when sizes still match (#18202)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-05-15 09:59:38 -07:00
TJian
92540529c0 [Bugfix] [ROCm]: Remove assertion logic when using AITER fused moe in unquantizedMethod to reenable LLama4 BF16 (#18205)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-15 09:53:18 -07:00
Zhonghua Deng
fadb8d5c2d [Bugfix]Change the exception thrown by call_hf_processor from RuntimeError to ValueError (#18181)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-05-15 09:01:47 -07:00
Sebastian Schoennenbeck
2aa5470ac5 [Frontend] Fix chat template content format detection (#18190)
Signed-off-by: Sebastian Schönnenbeck <sebastian.schoennenbeck@comma-soft.com>
2025-05-15 09:00:21 -07:00
Harry Mellor
51ff154639 Improve examples rendering in docs and GitHub (#18203)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-15 15:57:49 +00:00
Alexei-V-Ivanov-AMD
566ec04c3d Adding "Basic Models Test" and "Multi-Modal Models Test (Extended) 3" in AMD Pipeline (#18106)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-15 08:49:23 -07:00
Thomas Parnell
01c22335ba [Kernel] [V1] Fix performance regression for triton unified attention (#18161)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-15 06:39:00 -07:00
hustxiayang
451da4bcbd add tools into TokenizeChatRequest (#18187)
Signed-off-by: yangxia <yangxiast@gmail.com>
2025-05-15 04:01:49 -07:00
Harry Mellor
07ad27121f Update deprecated type hinting in model_loader (#18130)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-15 04:00:21 -07:00
omahs
a9944aabfa fix: typos (#18151)
Signed-off-by: omahs <73983677+omahs@users.noreply.github.com>
2025-05-15 02:16:15 -07:00
Russell Bryant
a8f5aec20a [V1] Update zmq socket creation in nixl connector (#18148)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-14 23:17:57 -07:00
David Xia
de71fec81b [CI] don't skip fixed test_kv_cache_events() (#18183)
Signed-off-by: David Xia <david@davidxia.com>
2025-05-14 23:17:16 -07:00
Mengqing Cao
70f8b96724 [Bugfix] Fix FusedMoEPrepareAndFinalize for cuda-disalike backends (#18178)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-05-14 23:16:31 -07:00
inkcherry
dd2a94596a [Model] Allow the use of sliding window in Qwen2 (#17772)
Signed-off-by: inkcherry <mingzhi.liu@intel.com>
2025-05-14 22:29:38 -07:00
Ning Xie
420caf7557 [UT] Add ut for none hash (#17892)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-05-15 13:28:11 +08:00
Chenheli Hua
4f07a64075 Support custom implementations of VideoLoader backends. (#18091) 2025-05-15 13:26:49 +08:00
Thomas Parnell
e6b8e65d2d [Bugfix] Fix fp8 tests for triton_unified_attention for Triton 3.3 (#18013)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-15 13:26:34 +08:00
Harry Mellor
26d0419309 Update deprecated type hinting in models (#18132)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-14 22:06:50 -07:00
Luka Govedič
83f74c698f [Fix][ROCm] Enforce eager for all encoder-decoder models on ROCm (#18154)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-05-14 22:04:43 -07:00
Reid
2dff093574 [Misc] add lobe-chat support (#18177)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-15 05:02:23 +00:00
Aaron Pham
afe3236e90 [Chore] astral's ty (#18116)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-15 05:00:43 +00:00
Mark McLoughlin
65334ef3b9 [V1][Metrics] Remove unused code (#18158)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-14 20:13:17 -07:00
Chen Zhang
e60f550b38 [v1] Support multiple KV cache groups in GPU model runner (#17945)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-14 18:54:54 -07:00
David Xia
f25e0d1125 [Bugfix]: make most of test_openai_schema.py pass (#17664) 2025-05-14 17:04:35 -07:00
Andrey Talman
09f106a91e Upload vllm index for the rc builds (#18173) 2025-05-14 16:35:56 -07:00
Michael Goin
2142035b51 [V1] Support multiple kv connectors (#17564)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-05-14 16:28:02 -07:00
Russell Bryant
78aa341d12 [CI] Fix race condition in test_kv_cache_events test (#18169)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-14 16:27:48 -07:00
Jerry Zhang
7974736740 Add support for loading torchao models with AOPerModuleConfig (#17826)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-05-14 16:24:59 -07:00
Aaron Pham
2fc9075b82 [V1] Structured Outputs + Thinking compatibility (#16577)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-05-14 15:45:24 -07:00
Lucas Wilkinson
d93c976a0d [Kernel] Have rotary embeddings support tensors (#18046)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-14 15:43:55 -07:00
David Xia
749f792553 [Frontend] decrease import time of vllm.multimodal (#18031)
Co-authored-by: Aaron Pham <Aaronpham0103@gmail.com>
2025-05-14 15:43:32 -07:00
Robert Shaw
856865008e [CI] Disable Failing Tests (#18165) 2025-05-14 13:49:56 -07:00
bnellnm
f9c069c85e Modularize fused experts and integrate PPLX kernels (#15956) 2025-05-14 13:11:54 -07:00
Ekagra Ranjan
418d2f8bfb [V1][Spec Decode] Share input embedding of target model with EAGLE draft model to free ~1GB for llama 3 model (#17326)
Co-authored-by: root <root@ekagra-8xh100.us-east5-a.c.serving-efficiency-poc.internal>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-05-14 12:31:46 -07:00
Chen Zhang
964472b966 [Doc] Update prefix cache metrics to counting tokens (#18138)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-14 15:23:30 +00:00
Nick Hill
59dd311cf5 [KVConnector] Keep KVTransferParams as a dict (#18033) 2025-05-14 08:05:57 -07:00
Cyrus Leung
d066e52013 [Bugfix] Fix chat utils tests (#18139)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-14 05:38:21 -07:00
Harry Mellor
c8ea982d9b Update deprecated type hinting in platform, plugins, triton_utils, vllm_flash_attn (#18129)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-14 05:28:16 -07:00
Harry Mellor
dc372b9c8a Update deprecated type hinting in vllm/device_allocator and vllm/distributed (#18126)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-14 04:07:57 -07:00
Harry Mellor
9b5b39b650 Update deprecated type hinting in vllm/lora (#18128)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-14 03:57:59 -07:00
Reid
9ccc6ded42 [doc] add missing import (#18133)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-14 10:57:34 +00:00
Cyrus Leung
d62a076e84 [Model] GritLM supports other attention backends (#18109)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-14 03:33:19 -07:00
Jee Jee Li
259127f8b8 [Bugfix] Fix LoRA test (#18123)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-14 10:25:47 +00:00
TJian
612c2edb4f [FEAT] [ROCm]: Add AITER CK 2 Stages MoE support (#17110)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-14 03:03:11 -07:00
Andrzej Kotłowski
38fe728d60 [Bugfix] Fix QKVCrossParallelLinear::sync_weight_attrs for PyTorch compile (#17844)
Signed-off-by: Andrzej Kotłowski <akotlowski@habana.ai>
2025-05-14 09:39:51 +00:00
rongfu.leng
82e7f9bb03 [Misc] replace does not exist model (#18119)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-05-14 02:13:47 -07:00
Jee Jee Li
63dc3426e0 [Model] Add packed_modules_mapping for Qwen3-MOE (#18118)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-14 02:13:19 -07:00
Cyrus Leung
8f5dc41481 [Bugfix] Fix entrypoints audio test failure (#18111)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-14 09:08:07 +00:00
wang.yuqi
63ad622233 [New Model]: support GTE NewModel (#17986) 2025-05-14 01:31:31 -07:00
majianpeng
e7ef61c1f0 [Bugfix][Example] make lmcache v0 work. (#18051)
Signed-off-by: Ma, Jianpeng <jianpeng.ma@intel.com>
2025-05-13 23:43:44 -07:00
Jinzhen Lin
d4154c35a2 [Bugfix] fix moe marlin topk_weight loading (#18080)
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-05-13 23:31:57 -07:00
lkchen
6685890d11 [Fix] Move "model_config" as keyword args in chat_utils.py (#18098)
Signed-off-by: Linkun <github@lkchen.net>
2025-05-13 23:27:26 -07:00
Ecthlion_zyy
33011318c2 Fix broken example: examples/offline_inference/profiling at scheduler_config (#18117) 2025-05-13 23:19:14 -07:00
qli88
4f8b373225 [BugFix][AMD] Compatible patch for AITER lib after 04/20 (#17912)
Signed-off-by: Qiang Li <qiang.li2@amd.com>
2025-05-13 23:05:20 -07:00
Charlie Fu
7b2f28deba [AMD][torch.compile] Enable silu+fp8_quant fusion for rocm (#18082)
Signed-off-by: charlifu <charlifu@amd.com>
2025-05-13 22:13:56 -07:00
vllmellm
2d912fb66f [FEAT] [ROCm] [V1]: Add AITER biased group topk for DeepSeekV3 (#17955)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-13 22:03:47 -07:00
Michael Goin
12e6c0b41c [Bugfix][V1] Fix FlashInfer V1 backend using the wrong VllmConfig (#18086) 2025-05-13 20:36:17 -07:00
Michael Goin
9a2a6357de [Bugfix] Fix FP8 Marlin MoE and enable for compressed-tensors models (#18026)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-13 19:48:33 -07:00
youkaichao
6266c57bae [core][distributed] add ep group and all2all interface (#18077)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-05-14 10:46:49 +08:00
Jon Gill
754b699cbe [Bug]: Fix S3 model/tokenizer path resolution (#18083)
Signed-off-by: Jon Gill <jon@yurts.ai>
2025-05-13 19:34:17 -07:00
Roger Wang
6e27c6d86b [Misc] Remove unused numpy tensor (#18084)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-05-13 19:33:40 -07:00
Nick Hill
d5af47a149 [P/D] Add some more debug logs to NixlConnector (#18102)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-13 19:33:03 -07:00
Pavani Majety
65f0f74b66 [Hardware/NVIDIA/Modelopt] Fix modelopt forward method for v1 torch.compile (#18101)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-05-13 19:33:00 -07:00
Luka Govedič
176a95c670 [Fix] Support CUDAGraph capture for encoder-decoder on ROCm (#18104)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-05-13 19:31:42 -07:00
Chen Zhang
f2ae883b67 [v1][KVCacheManager] pass num_new_computed_tokens to kv cache manager (#18001)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-13 19:09:39 -07:00
vllmellm
40de1ef455 [FEAT] [ROCm]: Add AITER Block-Scaled GEMM Feature (#14968)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-13 19:08:20 -07:00
Russell Bryant
0189a65a2e [Docs] Expand security doc with firewall info (#18081)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-13 19:36:00 +00:00
Nick Hill
55aa7af994 [V1] DP scale-out (2/N): Decouple engine process management and comms (#15977)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-13 10:48:21 -07:00
Harry Mellor
0b217da646 Update deprecated type hinting in vllm/adapter_commons (#18073)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 08:32:51 -07:00
Harry Mellor
19324d660c Update deprecated type hinting in vllm/compilation (#18072)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 08:32:48 -07:00
Harry Mellor
fc407a1425 Give auto-merge label workflow permission to add labels to issues (#18078)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 07:53:13 -07:00
Harry Mellor
009d9e7590 Convert benchmarks to ruff format (#18068)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 13:43:29 +00:00
Cyrus Leung
b922c2ebd2 [Bugfix] Fix entrypoints metrics tests (#18063)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-13 06:42:43 -07:00
Russell Bryant
00b14e0f16 [CI] set token permissions for pre-commit CI job (#17729)
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-05-13 13:38:30 +00:00
Russell Bryant
54e467e6f8 [CI] Add token permissions for add-ready-label CI job (#17730)
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-05-13 13:38:13 +00:00
Russell Bryant
79a1d25bbd [CI] Add workflow permissions for helm CI job (#17727)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-05-13 12:49:07 +00:00
Russell Bryant
9944011b30 [CI] Set token permissions for reminder comment CI job (#17728)
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-05-13 12:46:58 +00:00
Harry Mellor
8c946cecca Update deprecated type hinting in vllm/transformers_utils (#18058)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 04:34:37 -07:00
Harry Mellor
ff334ca1cd Update deprecated type hinting in vllm/profiler (#18057)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 04:34:34 -07:00
Harry Mellor
6223dd8114 Update deprecated type hinting in model_executor/layers (#18056)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 04:17:23 -07:00
Reid
906f0598fc [doc] add download/list/delete HF model CLI usage (#17940)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-13 11:15:51 +00:00
Aaron Pham
cb528d0585 [Fix] check to make sure processor has chat templates (#18047)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-13 03:04:10 -07:00
Harry Mellor
98fcba1575 Convert .buildkite to ruff format (#17656)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 09:28:31 +00:00
Russell Bryant
23b3134eb5 [Benchmarks] Refactor run_structured_output_benchmarks.sh (#17722)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-13 01:47:29 -07:00
Michael Goin
ea6ae8cb45 [Bugfix] Fix marlin moe fallback logic for llama4 (#18042)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-13 07:53:28 +00:00
Woosuk Kwon
2ff297dce9 [BugFix] Set default random seed to 0 for V1 (#17929)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-05-13 07:52:19 +00:00
Jin Huang
8dd0671bac [Bugfix][V1] Only get input embeddings w/ multi-modal models if first PP (#17916)
Signed-off-by: Jin Huang <jinhun@amazon.com>
Co-authored-by: Jin Huang <jinhun@amazon.com>
2025-05-13 15:10:07 +08:00
Chen Zhang
f0d610a8ae [v1][KVCacheManager] Avoid full cache hit by controlling max_length (#17999)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-05-13 06:50:38 +00:00
Driss Guessous
e57e4d6e9e Fix Broken macro for cutlass moe (#18049)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-05-12 23:31:06 -07:00
Nick Hill
ee5be834e7 [BugFix] Fix 4-GPU RLHF tests (#18007)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-12 23:03:55 -07:00
Calvin Chen
48545728d8 cleanup invalid prints (#18050)
Signed-off-by: calvin chen <120380290@qq.com>
2025-05-12 23:01:57 -07:00
Chauncey
dc1a821768 [Feature][V1] Support tool_choice: required when using Xgrammar as the StructuredOutputBackend. (#17845)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-12 23:01:31 -07:00
Cyrus Leung
61e0a506a3 [Bugfix] Avoid repeatedly creating dummy data during engine startup (#17935)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-12 22:40:19 -07:00
Michael Goin
1df491c522 [Bugfix] Fixes for new marlin moe usage (#18017)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-13 03:50:04 +00:00
Arjun Kathuria
d8487ef557 [ROCm]: Fix build from source failure with gcc14 and ROCm 6.3 (#13779)
Signed-off-by: Arjun Kathuria <arjun.kathuria8@gmail.com>
2025-05-12 20:36:33 -07:00
Jee Jee Li
c06af9a959 [Misc] Slight spelling modification (#18039)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-12 20:36:27 -07:00
Tao He
60f7624334 Implements dual-chunk-flash-attn backend for dual chunk attention with sparse attention support (#11844) 2025-05-12 19:52:47 -07:00
hissu-hyvarinen
f6518b2b48 [ROCm] Skip tests for quantizations incompatible with ROCm (#17905)
Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com>
2025-05-12 18:39:28 -06:00
Harry Mellor
d67085c2c8 Remove noisy warnings from SchedulerConfig (#17995)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 00:33:45 +00:00
Michael Goin
307939f299 Use NVFP4 Marlin for CompressedTensorsW4A16Fp4 (#18000)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Dipika <dipikasikka1@gmail.com>
Co-authored-by: Dipika <dipikasikka1@gmail.com>
2025-05-12 18:07:34 -06:00
Harry Mellor
9d7ea9dbbf Update some more deprecated type hinting (#17998)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-12 23:49:33 +00:00
bwshen-mi
acee8f48aa [Model] Support MiMo-7B inference with MTP (#17433)
Signed-off-by: wp-alpha <wangpeng66@xiaomi.com>
Co-authored-by: wangpeng66 <wangpeng66@xiaomi.com>
2025-05-12 23:25:33 +00:00
Michael Goin
f065de4e88 Fix FBGEMM integration (#18002)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-12 23:02:07 +00:00
wwl2755
dc9905368d [V1][Spec Decode] Eagle unit tests (#17350)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-05-12 23:01:17 +00:00
Russell Bryant
ebab1ac37c [CI] Make JSON output tests less likely to fail (#17859)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-12 22:31:54 +00:00
Yang Wang
2b0db9b0e2 Enable standard language model for torhc nightly (#18004)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-05-12 14:00:04 -07:00
Robert Shaw
195adb47c0 [Chore] Remove unused method (#18024)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-05-12 13:59:47 -07:00
Chen Zhang
302f3aca7e [v1][KVCacheManager] Change prefix caching metric from counting blocks to counting tokens (#18003)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-12 13:46:12 -07:00
Alexei-V-Ivanov-AMD
e9c730c9bd Enabling "Weight Loading Multiple GPU Test - Large Models" (#18020) 2025-05-12 13:05:33 -07:00
Jade Zheng
289199feb6 [Core] Use platform-agnostic device control for DP engine core (#17245)
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-05-12 12:09:16 -07:00
Carol Zheng
b9fd0d7a69 [CI/Build] Fix TPU V1 Test mixed use of & and && across tests (#17968) 2025-05-12 12:06:59 -07:00
Harry Mellor
72a3f6b898 Construct KVTransferConfig properly from Python instead of using JSON blobs without CLI (#17994)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-12 11:25:33 -07:00
Jonathan Berkhahn
98ea35601c [Lora][Frontend]Add default local directory LoRA resolver plugin. (#16855)
Signed-off-by: jberkhahn <jaberkha@us.ibm.com>
2025-05-12 10:39:10 -07:00
Robert Shaw
d19110204c [P/D] NIXL Integration (#17751)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Brent Salisbury <bsalisbu@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: ApostaC <yihua98@uchicago.edu>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Brent Salisbury <bsalisbu@redhat.com>
2025-05-12 09:46:16 -07:00
Maximilien de Bayser
05a4324f8e Initialize the delta tool call fields explicitly (#17340)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: igmainc <igmainc@icloud.com>
2025-05-12 13:28:58 +00:00
Jee Jee Li
7ea6cb28b2 [Misc] Improve modelscope import error (#17983)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-12 10:46:45 +00:00
Aaruni Aggarwal
9fbf2bfbd5 Correcting testcases in builkite job for IBM Power (#17675)
Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
2025-05-12 08:11:55 +00:00
Xu Wenqing
3a5ea75129 [Feature] Support DeepSeekV3 Function Call (#17784)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
Signed-off-by: Xu Wenqing <xuwq1993@qq.com>
2025-05-12 00:45:21 -07:00
Brayden Zhong
891b9d33de [Fix] Benchmark "EngineClient" has no attribute "model_config" (#17976)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-05-11 22:55:53 -07:00
Siyuan Liu
430783018c [Bugfix][TPU] Use np array when updating cache slot_mapping (#17971)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-05-12 12:58:33 +08:00
Li Wang
19a3c78d1f [Bugfix] Fix pydantic.errors.PydanticUserError (#17962)
Signed-off-by: wangli <wangli858794774@gmail.com>
2025-05-12 12:58:23 +08:00
Reid
ada50aa295 [bugfix] fix the wrong parser (#17958)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-12 04:58:02 +00:00
Cheng Kuan Yong Jason
08bf784078 [Bugfix] validate grammar and throw 400 error instead of crashing the engine when xgrammar validation fails (#17623)
Signed-off-by: Jason Cheng <jasoncky96@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-05-12 09:06:10 +08:00
youkaichao
d45fe333fb [misc] add instructions on how to install nvshmem/pplx/deepep (#17964)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-05-11 18:02:39 -07:00
Isotr0py
021c16c7ca [Model] Broadcast Ovis2 implementation to fit Ovis1.6 (#17861)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-11 17:56:30 -07:00
TJian
7de18d541b [BUG] [ROCm] [MLA] Fix variable name bug due to change in variable name in PR #17483 (#17961)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-11 09:14:30 -07:00
TJian
a810b5b088 [BugFix] [ROCm]: Bugfix and handle addition case of input for rocm_aiter_rms_norm (#17857)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-11 04:17:11 -07:00
Reid
009b3d5382 [Misc] not show --model in vllm serve --help (#16691)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-11 08:47:58 +00:00
wang.yuqi
e4b8713380 [New Model]: nomic-embed-text-v2-moe (#17785) 2025-05-11 00:59:43 -07:00
Gregory Shtrasberg
06c0922a69 [FP8][ROCm][Attention] Enable FP8 KV cache on ROCm for V1 (#17870)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-11 15:58:45 +08:00
Dipika Sikka
cd3edfc908 [Misc] Add compressed-tensors NVFP4A16 emulation support (#17914)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Dipika <dipikasikka1@gmail.com>
2025-05-11 15:58:38 +08:00
Frieda Huang
9cea90eab4 [Frontend] Add /classify endpoint (#17032)
Signed-off-by: Frieda (Jingying) Huang <jingyingfhuang@gmail.com>
2025-05-11 07:57:07 +00:00
Reid
d1110f5b5a [doc] update lora doc (#17936)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-11 15:56:21 +08:00
Ben Browning
8132365b74 [Bugfix]: v1 engine - consider lora adapters in allowed_token_ids (#17855)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-05-11 00:53:58 -07:00
Shiyan Deng
eea22a56ab fix amd triton mla path (#17871) 2025-05-11 07:53:31 +00:00
Kuntai Du
9112155283 [Perf] Use small max_num_batched_tokens for A100 (#17885)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2025-05-11 07:53:23 +00:00
xinli-centml
90d0a74b60 [Bugfix] Add revision to transformers.Auto*.from_pretrained processors (#17948)
Signed-off-by: Xin Li <xin@centml.ai>
2025-05-11 07:52:44 +00:00
Jinzhen Lin
d74e5f37bc [Kernel] fp4 marlin kernel (#17687)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-05-10 19:58:49 -07:00
Chen Zhang
ca66a1674c [v1] Rename specialized_manager.py to single_type_kv_cache_manager.py (#17946)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-10 16:14:12 -07:00
Chen Zhang
950751a987 [v1] Pass BlockTable and KVCacheSpec to AttentionMetadataBuilders (#17483)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-10 16:12:04 -07:00
Reid
4c31218f80 [Misc] remove --model from vllm serve usage (#17944)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-10 13:23:31 +00:00
Harry Mellor
68311891f5 Don't default construct ModelConfig when default constructing VllmConfig (#17943)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-10 13:23:00 +00:00
Ximo Guanter
fc4441a4ee Add missing content type headers to /ping and /health (#17036) (#17786)
Signed-off-by: Ximo Guanter <ximo.guanter@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-10 07:13:32 +01:00
tracelogfb
246e3e0a36 fix broken test vllm:test_kernels - test_attention_selector.py::test_flash_attn (#17873)
Co-authored-by: Stephen Chen <tracelog@meta.com>
2025-05-10 10:46:54 +08:00
Mark McLoughlin
7042cc96b0 [V1][Spec Decoding] Log accumulated metrics after system goes idle (#17913)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-09 18:23:07 -07:00
Pavani Majety
0c0fdae84f [Hardware/NVIDIA/Kernel] Enable nvidia/DeepSeek-R1-FP4 Model (#16362) 2025-05-09 16:24:41 -07:00
Alexei-V-Ivanov-AMD
3b602cdea7 AMD conditional all test execution // new test groups (#17556)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
2025-05-09 15:35:58 -07:00
Harry Mellor
4b2ed7926a Improve configs - the rest! (#17562)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-09 15:18:44 -07:00
Mark McLoughlin
7e3571134f [V1][Spec Decoding] Include bonus tokens in mean acceptance length (#17908)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-09 13:32:36 -07:00
Richard Zou
ea2236bf95 Add option to use torch._inductor.standalone_compile (#17057)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-09 12:59:04 -07:00
Harry Mellor
7d4aedae7c Handle error when str passed to /v1/audio/transcriptions (#17909)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-09 19:23:59 +00:00
Michael Goin
22481fbfa3 Update CT WNA16MarlinMoE integration (#16666)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-09 13:19:45 -04:00
Isotr0py
5c4c08f6f1 [Misc] Auto fallback to float16 for pre-Ampere GPUs when detected bfloat16 config (#17265)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-09 17:16:12 +00:00
Rui Qiao
c44c384b1c [Misc] Add references in ray_serve_deepseek example (#17907)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-05-09 16:59:36 +00:00
Michael Goin
85b72cb7b1 Revert "[BugFix][AMD] Compatible patch for latest AITER(05/07/2025)" (#17910) 2025-05-09 08:58:18 -07:00
Cyrus Leung
6e5595ca39 [CI/Build] Automatically retry flaky tests (#17856)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-09 09:55:17 -06:00
Chen Zhang
200da9a517 [v1] Move block management logic from KVCacheManager to SpecializedManager (#17474)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-09 15:25:34 +00:00
qli88
9f64e93415 [BugFix][AMD] Compatible patch for latest AITER(05/07/2025) (#17864)
Signed-off-by: Qiang Li <qiang.li2@amd.com>
2025-05-09 08:59:36 -06:00
Reid
ec61ea20a8 [Misc] add dify integration (#17895)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-09 03:42:39 -07:00
Harry Mellor
c6798baa9c Change top_k to be disabled with 0 (still accept -1 for now) (#17773)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-09 10:01:49 +00:00
inkcherry
5b2dcbf0b8 Fix Whisper crash caused by invalid`` max_num_batched_tokens`` config (#17853)
Signed-off-by: inkcherry <mingzhi.liu@intel.com>
2025-05-09 09:16:26 +00:00
Isotr0py
6e4a93e3f7 [Bugfix][CPU] Fix broken AVX2 CPU TP support (#17252)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-09 08:55:14 +00:00
vllmellm
217db4baa6 [Bugfix][ROCm] Fix AITER MLA V1 (#17880)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-05-09 08:38:21 +00:00
Yan Ma
ff8c400502 [Doc] remove visible token in doc (#17884)
Signed-off-by: yan <yanma1@habana.ai>
2025-05-09 01:21:31 -07:00
Michael Yao
89a0315f4c [Doc] Update several links in reasoning_outputs.md (#17846)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-05-09 01:20:55 -07:00
Simon Mo
3d1e387652 [Docs] Add Slides from NYC Meetup (#17879)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-05-08 21:46:54 -07:00
Ning Xie
d310e6de98 [BUGFIX]: return fast when request requires prompt logprobs (#17251) 2025-05-08 21:25:41 -07:00
Lucas Wilkinson
5e6f939484 [Attention] MLA move rotary embedding to cuda-graph region (#17668)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-09 11:14:42 +08:00
Shanshan Shen
760e3ecc8f [V1][Structured Output] Update llguidance (>= 0.7.11) to avoid AttributeError (no StructTag) (#17839)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-05-08 20:14:18 -07:00
vllmellm
3c9396a64f [FEAT][ROCm]: Support AITER MLA on V1 Engine (#17523)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: qli88 <qiang.li2@amd.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
2025-05-09 10:42:05 +08:00
Shu Wang
376786fac1 Add cutlass support for blackwell fp8 blockwise gemm (#14383)
Signed-off-by: Shu Wang <shuw@nvidia.com>
2025-05-08 15:09:55 -07:00
Michael Goin
4f605a6de5 Fix noisy warning for uncalibrated q_scale/p_scale (#17414)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-08 15:56:59 -04:00
Michael Goin
8342e3abd1 [CI] Prune down lm-eval small tests (#17012)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-08 19:00:26 +00:00
yarongmu-google
a83a0f92b5 [Test] Attempt all TPU V1 tests, even if some of them fail. (#17334)
Signed-off-by: Yarong Mu <ymu@google.com>
2025-05-08 17:20:54 +00:00
Russell Bryant
226a4272cf [V1] Improve VLLM_ALLOW_INSECURE_SERIALIZATION logging (#17860)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-08 16:57:35 +00:00
Russell Bryant
ec54d73c31 [CI] Fix test_collective_rpc (#17858)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-08 16:47:12 +00:00
Jee Jee Li
a944f8ede7 [Misc] Delete LoRA-related redundancy code (#17841)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-08 06:02:21 -07:00
Cyrus Leung
015815fe01 [Bugfix] use_fast failing to be propagated to Qwen2-VL image processor (#17838)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-08 05:39:21 -07:00
Harry Mellor
e4ca6e3a99 Fix transient dependency error in docs build (#17848)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-08 03:42:03 -07:00
Reid
53d0cb7423 [Misc] add chatbox integration (#17828)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-08 10:05:26 +00:00
Lu Fang
f50dcb7c21 [Easy] Eliminate c10::optional usage in vllm/csrc (#17819) 2025-05-08 03:05:10 -07:00
Cyrus Leung
a1e19b635d [Doc] Fix a typo in the file name (#17836)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-08 18:04:18 +08:00
fxmarty-amd
bb239a730f [Bugfix] Fix quark fp8 format loading on AMD GPUs (#12612)
Signed-off-by: Felix Marty <felmarty@amd.com>
Signed-off-by: kewang2 <kewang2@amd.com>
Co-authored-by: kewang2 <kewang2@amd.com>
2025-05-08 02:53:53 -07:00
Jevin Jiang
a463555dee [TPU] Fix the test_sampler (#17820) 2025-05-08 05:51:33 -04:00
Rick Yuan
ca04b97c93 [Bugfix] Fix tool call template validation for Mistral models (#17644)
Signed-off-by: Rick Yuan <yuan821120@gmail.com>
Signed-off-by: RIck Yuan <yuan821120@gmail.com>
Co-authored-by: Aaron Pham <Aaronpham0103@gmail.com>
2025-05-08 09:47:19 +00:00
xsank
0a9bbaa104 [Misc] support model prefix & add deepseek vl2 tiny fused moe config (#17763)
Signed-off-by: 唯勤 <xsank.mz@alibaba-inc.com>
Co-authored-by: 唯勤 <xsank.mz@alibaba-inc.com>
2025-05-08 07:50:22 +00:00
Qiong Zhou Huang
39956efb3f [Bugfix] Fix bad words for Mistral models (#17753)
Signed-off-by: Qiong Zhou Huang <qiong@phonic.co>
2025-05-07 23:32:10 -07:00
Ximingwang-09
597051e56f [Qwen3]add qwen3-235b-bf16 fused moe config on A100 (#17715) 2025-05-07 23:09:32 -07:00
Cyrus Leung
96722aa81d [Frontend] Chat template fallbacks for multimodal models (#17805)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-07 23:05:54 -07:00
Agata Dobrzyniewicz
843b222723 [Hardware][Intel-Gaudi] Support Automatic Prefix Caching on HPU (#17648)
Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai>
2025-05-07 22:37:03 -07:00
Akash kaothalkar
e515668edf [Hardware][Power] Enable compressed tensor W8A8 INT8 quantization for POWER (#17153)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-05-07 22:35:03 -07:00
Hashem Hashemi
5a499e70d5 [Kernel][Hardware][AMD] Bf16 mfma opt for ROCm skinny GEMMs (#17071)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: charlifu <charlifu@amd.com>
2025-05-07 22:34:49 -07:00
Russell Bryant
6930a41116 [V1] Add VLLM_ALLOW_INSECURE_SERIALIZATION env var (#17490)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-05-08 13:34:02 +08:00
Harry Mellor
998eea4a0e Only log non-default CLI args for online serving (#17803)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-07 22:33:29 -07:00
Mikhail Podvitskii
c747d84576 [Installation] OpenTelemetry version update (#17771)
Signed-off-by: Mikhail Podvitskii <podvitskiymichael@gmail.com>
2025-05-07 22:32:49 -07:00
Vadim Markovtsev
b2da14a05a Improve exception reporting in MP engine (#17800)
Signed-off-by: Vadim Markovtsev <vadim@poolside.ai>
2025-05-08 05:32:39 +00:00
Chanh Nguyen
7ea2adb802 [Core] Support full cuda graph in v1 (#16072)
Signed-off-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
2025-05-07 22:30:15 -07:00
Nick Hill
3d13ca0e24 [BugFix] Fix --disable-log-stats in V1 server mode (#17600)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-08 04:08:15 +00:00
Harry Mellor
66ab3b13c9 Don't call the venv vllm (#17810)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-08 04:06:39 +00:00
Aaron Pham
a8238bbdb0 [Chore][Doc] uses model id determined from OpenAI client (#17815)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-08 01:48:57 +00:00
Wallas Henrique
d43f914d42 [Core][Feature] Input metadata dump on crash (#13407)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
2025-05-07 22:15:09 +00:00
Nick Hill
ed5272cf21 [BugFix] Avoid secondary missing MultiprocExecutor.workers error (#17811)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-07 21:55:04 +00:00
Akshat Tripathi
c20ef40fd0 [Hardware][TPU][V1] Multi-LoRA implementation for the V1 TPU backend (#14238)
Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-05-07 16:28:47 -04:00
Bowen Bao
db593aa67f [Quantization] Quark MXFP4 format loading (#16943) 2025-05-07 15:05:05 -04:00
Isotr0py
f98e307588 [Bugfix] Fix missing lora name mapping for lora without prefix (#17793)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-07 16:17:12 +00:00
Harry Mellor
646a31e51e Fix and simplify deprecated=True CLI kwarg (#17781)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-07 16:51:06 +01:00
Isotr0py
be8ff88e66 [Bugfix] Fix Video IO error for short video (#17791)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-07 15:36:06 +00:00
Christian Heimes
1a6af1453d Only depend on importlib-metadata for Python < 3.10 (#17776)
Signed-off-by: Christian Heimes <christian@python.org>
2025-05-07 07:51:06 -07:00
Gregory Shtrasberg
32aa74c09c [ROCm][FP8][Kernel] FP8 quantization fused into Custom Paged Attention (#17139)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-07 07:12:35 -07:00
Reid
7377dd0307 [doc] update the issue link (#17782)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-07 20:29:05 +08:00
Yong Hoon Shin
98c89e16ff Make key optional for rotary embedding (#17566)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-05-07 00:11:46 -07:00
Yong Hoon Shin
324a3119b0 Fix test_memory_usage_no_spec (#17754)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-05-07 00:10:33 -07:00
Cyrus Leung
8a15c2603a [Frontend] Add missing chat templates for various MLLMs (#17758)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-07 00:10:01 -07:00
Satyajith Chilappagari
043e4c4955 Add NeuronxDistributedInference support, Speculative Decoding, Dynamic on-device sampling (#16357)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
Co-authored-by: Aaron Dou <yzdou@amazon.com>
Co-authored-by: Shashwat Srijan <sssrijan@amazon.com>
Co-authored-by: Chongming Ni <chongmni@amazon.com>
Co-authored-by: Amulya Ballakur <amulyaab@amazon.com>
Co-authored-by: Patrick Lange <patlange@amazon.com>
Co-authored-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Lin Lin Pan <tailinpa@amazon.com>
Co-authored-by: Navyadhara Gogineni <navyadha@amazon.com>
Co-authored-by: Yishan McNabb <yishanm@amazon.com>
Co-authored-by: Mrinal Shukla <181322398+mrinalks@users.noreply.github.com>
2025-05-07 00:07:30 -07:00
Jee Jee Li
ba7703e659 [Misc] Remove qlora_adapter_name_or_path (#17699)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-06 23:10:37 -07:00
Wanrui Dai
f80ae5bdcf [Kernel] Use fused rmsnorm for some models like qwen3 series (#17735)
Signed-off-by: evian <eviantai@u.nus.edu>
Co-authored-by: evian <eviantai@u.nus.edu>
2025-05-06 23:10:02 -07:00
Szymon Ożóg
1a45a61387 [Kernel] GGUF MoeVec kernel (#16780)
Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-05-06 23:07:23 -07:00
Isotr0py
c3e9d5060e [Misc] Use apply_rotary_emb from vllm_flash_attn for Qwen2-VL vision RoPE (#17726)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-07 04:51:33 +00:00
Jee Jee Li
822de7fb94 [Misc] Split model loader (#17712)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-07 12:42:26 +08:00
Woosuk Kwon
8d84d836d1 [BugFix][Spec Decode] Fix hidden size mismatch between target and eagle head (#17740)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-05-06 19:51:26 -07:00
Michael Goin
950b71186f Replace lm-eval bash script with pytest and use enforce_eager for faster CI (#17717)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-06 18:00:10 -07:00
Michael Goin
e50a1f1a9c [TPU] Add kernel test for moe_pallas (#17496)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-05-06 17:59:57 -07:00
Michael Goin
a17cef70ea Removed unused marlin cuda code (#17684)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-06 17:59:47 -07:00
Chih-Chieh Yang
18dd5e01f2 [Model] Mamba2 causal conv1d Refactor to Split Prefill and Decode Requests for Corresponding Kernels (#17146)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-05-06 17:59:30 -07:00
Yang Wang
6de3e13413 Add logging for torch nightly version (#17669)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-05-07 00:45:51 +00:00
Hongxia Yang
ed3a1d2106 [ROCm] fix num_stages for default moe config to avoid triton OutOfResource error (#17744)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
2025-05-07 00:39:48 +00:00
Harry Mellor
022afbeb4e Fix doc build performance (#17748)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-07 00:36:41 +00:00
Thomas Parnell
2f925e5777 [Kernel] Unified Triton kernel that doesn't distinguish between prefill + decode (#16828)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-06 18:21:48 -04:00
Gregory Shtrasberg
de906b95f9 [Bugfix] Fix for the condition to accept empty encoder inputs for mllama (#17732)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-06 19:59:06 +00:00
d.transposed
d456aea71f [Misc] Add Next Edit Prediction (NEP) datasets support in benchmark_serving.py (#16839)
Signed-off-by: dtransposed <damian@damian-ml-machine.europe-west3-b.c.jetbrains-grazie.internal>
Signed-off-by: dtransposed <>
Co-authored-by: dtransposed <damian@damian-ml-machine.europe-west3-b.c.jetbrains-grazie.internal>
2025-05-06 15:38:45 -04:00
Jevin Jiang
621ca2c0ab [TPU] Increase block size and reset block shapes (#16458) 2025-05-06 13:55:04 -04:00
Harry Mellor
6115b11582 Make right sidebar more readable in "Supported Models" (#17723)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-06 16:48:26 +00:00
Cyrus Leung
5b8c390747 [Bugfix] Fix modality limits in vision language example (#17721)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-06 16:12:28 +00:00
Reid
7525d5f3d5 [doc] Add RAG Integration example (#17692)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-06 16:10:23 +00:00
Chen Zhang
aabcd2cae3 [v1] Introduce KVCacheBlocks as interface between Scheduler and KVCacheManager (#17479)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-06 08:50:34 -07:00
Michael Yao
0d115460a7 [Docs] Use gh-file to add links to tool_calling.md (#17709)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-05-06 15:27:19 +00:00
Aaron Pham
175bda67a1 [Feat] Add deprecated=True to CLI args (#17426)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-06 08:11:27 -07:00
Chen Zhang
cba31c47c4 [v1] AttentionMetadata for each layer (#17394)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-06 07:58:37 -07:00
Li, Jiang
a6fed02068 [V1][PP] Support PP for MultiprocExecutor (#14219)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-05-06 07:58:05 -07:00
Michael Goin
d419aa5dc4 [V1] Enable TPU V1 backend by default (#17673)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-06 06:49:49 -07:00
Mengqing Cao
f9bc5a0693 [Bugfix] Fix triton import with local TritonPlaceholder (#17446)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-05-06 17:53:09 +08:00
Harry Mellor
05e1f96419 Fix dockerfilegraph pre-commit hook (#17698)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-06 08:56:48 +00:00
Lucas Wilkinson
6eae34533a [Misc] Fix ScalarType float4 naming (#17690)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-06 01:07:15 -07:00
Cyrus Leung
63ced7b43f [Doc] Update notes for H2O-VL and Gemma3 (#17219)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-06 07:51:02 +00:00
Mikhail Podvitskii
dc47ba32f8 [Bugfix] Fixed prompt length for random dataset (#17408)
Signed-off-by: Mikhail Podvitskii <podvitskiymichael@gmail.com>
2025-05-06 07:00:08 +00:00
Richard Zou
edbf2d609e [easy] Fix logspam on PiecewiseBackend errors (#17138)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-05 23:46:11 -07:00
Stan Wozniak
999328be0d [Model] Add GraniteMoeHybrid 4.0 model (#17497)
Signed-off-by: Thomas Ortner <boh@zurich.ibm.com>
Signed-off-by: Stanislaw Wozniak <stw@zurich.ibm.com>
Co-authored-by: Thomas Ortner <boh@zurich.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
2025-05-06 12:00:31 +08:00
Michael Goin
98834fefaa Update nm to rht in doc links + refine fp8 doc (#17678)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-06 00:41:14 +00:00
Varun Sundar Rabindranath
90bd2ae172 [Bugfix] LoRA - Retire unused maxnreg LoRA kernel argument (#17677) 2025-05-05 17:34:29 -07:00
Nicolò Lucchesi
5941e0b7ea [TPU][V1] Add support for top-logprobs (#17072)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-05-05 14:20:15 -07:00
XiongfeiWei
9765940824 [TPU] Enable gemma3-27b with TP>1 on multi-chips. (#17335)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-05-05 14:19:58 -07:00
Nick Hill
5ea5c514da [BugFix] Increase timeout for startup failure test (#17642)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-05-05 20:53:19 +00:00
Russell Bryant
d3efde8176 [Benchmarks] Remove invalid option under V1 engine (#17651)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-05 16:30:22 -04:00
Thomas J. Fan
aea302be6c Use git-path commit in hook (#17616)
Signed-off-by: Thomas J. Fan <thomasjpfan@gmail.com>
2025-05-05 17:55:32 +00:00
Isotr0py
cc05b90d86 [Doc] Fix broken cuda installation doc rendering (#17654)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-05 17:52:40 +00:00
Jinzhen Lin
1d0c9d6b2d [Kernel] some optimizations for dense marlin and moe marlin (#16850)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-05-05 09:39:30 -07:00
Tyler Michael Smith
f62cad6431 [Build/CI] Upgrade CUTLASS to 3.9.2 (#17641)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-05-04 19:23:17 -07:00
Chauncey
5394ad7387 [Bugfix] fix KeyError on top logprobs are special tokens (#17637)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-04 19:22:35 -07:00
Tyler Michael Smith
68e1ee0072 [Bugfix][Easy] Fix whitespace in shm_broadcast.py logging (#17635)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-05-04 19:20:19 -07:00
Cyrus Leung
2858830c39 [Bugfix] Prioritize dtype in root config before checking text config (#17629)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-04 12:43:05 +00:00
Harry Mellor
d6484ef3c3 Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-03 19:42:43 -07:00
Cyrus Leung
46fae69cf0 [Misc] V0 fallback for --enable-prompt-embeds (#17615)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-03 22:59:24 +00:00
Isotr0py
f66f1e0fa3 [Bugfix] Fix broken Qwen2.5-omni tests (#17613)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-03 17:08:14 +00:00
Cyrus Leung
887d7af882 [Core] Gate prompt_embeds behind a feature flag (#17607)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-04 00:19:20 +08:00
Gregory Shtrasberg
a92842454c [Bugfix][ROCm] Using device_type because on ROCm the API is still torch.cuda (#17601)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-02 22:25:47 -07:00
Tyler Michael Smith
c8386fa61d [Build/CI] Upgrade CUTLASS to 3.9.1 (#17602)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-05-02 22:25:14 -07:00
Chenyaaang
87baebebd8 [Frontend][TPU] Add TPU default max-num-batched-tokens based on device name (#17508)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-05-02 21:42:44 -07:00
rasmith
e3d0a1d190 [Quantizaton] [AMD] Add support for running DeepSeek int8 w8a8 MoE on ROCm (#17558)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-05-02 21:41:10 -07:00
22quinn
d47b605eca Update test requirements to CUDA 12.8 (#17576)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-05-02 21:40:15 -07:00
Liangfu Chen
22c6f6397f [Neuron][Build] Require setuptools >= 77.0.3 for PEP 639 (#17603)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-05-03 02:41:59 +00:00
Kevin H. Luu
3ec97e2cc5 [release] Add command to clean up Docker containers/images in TPU release machine (#17606) 2025-05-02 18:54:34 -07:00
Eric Hartford
9b103a1d76 fix typo in logging (#17605) 2025-05-02 18:04:40 -07:00
Richard Zou
b90b0852e9 [easy] Print number of needed GPUs in skip message (#17594)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-02 15:27:43 -07:00
Xiaodong Wang
9352cdb56d [Hardware][AMD] Improve OAM device ID + llama4 Maverick MOE tuning (#16263)
Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Lu Fang <lufang@fb.com>
2025-05-02 19:44:19 +00:00
Zhiyu
182f40ea8b Add NVIDIA TensorRT Model Optimizer in vLLM documentation (#17561) 2025-05-02 11:36:46 -07:00
Caleb_Du
3e887d2e0c permute/unpermute kernel for moe optimization (#14568)
Signed-off-by: Caleb_Du <Caleb_Du@zju.edu.cn>
2025-05-02 11:31:55 -07:00
Lucas Wilkinson
0f87d8f7b2 [BugFix][Attention] Fix sliding window attention in V1 giving incorrect results (#17574)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-02 11:01:38 -07:00
Hui Liu
4c33d67321 [Bugfix] fix tmp_out and exp_sums dimensions (#17438)
Signed-off-by: Hui Liu <96135754+hliuca@users.noreply.github.com>
2025-05-02 16:44:07 +00:00
Cyrus Leung
cb234955df [Misc] Clean up input processing (#17582)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-02 08:11:53 -07:00
Reid
3a500cd0b6 [doc] miss result (#17589)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-02 07:04:49 -07:00
Michael Goin
868c546da4 Support W8A8 INT8 MoE for compressed-tensors (#16745)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-02 10:03:32 -04:00
Cyrus Leung
99404f53c7 [Security] Fix image hash collision (#17378)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-02 08:36:39 -04:00
Harry Mellor
785d75a03b Automatically tell users that dict args must be valid JSON in CLI (#17577)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-02 05:24:55 -07:00
Reid
6d1479ca4b [doc] add the print result (#17584)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-02 05:24:45 -07:00
Yang Wang
b8b0859b5c add more pytorch related tests for torch nightly (#17422)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-05-02 03:29:59 -07:00
Cyrus Leung
d7543862bd [Misc] Rename assets for testing (#17575)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-02 03:29:25 -07:00
Robert Shaw
c777df79f7 [BugFix] Fix Memory Leak (#17567)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-05-02 01:07:03 -07:00
Andrew Sansom
cc2a77d7f1 [Core] [Bugfix] Add Input Embeddings (#15428)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: 临景 <linjing.yx@alibaba-inc.com>
Co-authored-by: Bryce1010 <bryceyx@gmail.com>
Co-authored-by: Nan2018 <nan@protopia.ai>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-02 01:06:39 -07:00
Isotr0py
9e2de9b9e9 [Bugifx] Remove TritonPlaceholder from sys.modules (#17317)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-02 00:45:01 -07:00
Jerry Zhang
109e15a335 Add pt_load_map_location to allow loading to cuda (#16869)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-05-01 23:23:42 -07:00
Michael Goin
f192ca90e6 Fix PixtralHF missing spatial_merge_size (#17571)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-01 22:14:09 -07:00
Cyrus Leung
f89d0e11bf [Misc] Continue refactoring model tests (#17573)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 22:06:08 -07:00
Michael Goin
b4003d11fc Check if bitblas is installed during support check (#17572)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-02 04:32:54 +00:00
Michael Goin
292fc59d61 [CI] Actually run tests/kv_transfer/test_disagg.py in CI (#17555)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-02 04:05:04 +00:00
Lucas Wilkinson
afcb3f8863 [Attention] MLA move o_proj q_proj into cuda-graph region (#17484)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-02 03:16:26 +00:00
David Xia
afb12e4294 [Doc] note that not all unit tests pass on CPU platforms (#17554)
Signed-off-by: David Xia <david@davidxia.com>
2025-05-02 02:57:21 +00:00
Michael Goin
24aebae177 [Bugfix] Disable gptq_bitblas for <SM80 to fix GPTQ on V100/T4 (#17541)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-01 17:59:35 -07:00
qizixi
39c0813a7f [V1][Spec Decode] Apply torch.compile & cudagraph to EAGLE3 (#17504)
Signed-off-by: qizixi <qizixi@meta.com>
2025-05-01 16:19:30 -07:00
Chenyaaang
9b70e2b4c1 [Misc][Tools][Benchmark] Publish script to auto tune server parameters (#17207)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-05-01 19:53:03 +00:00
Chen Xia
173daac19d [Bug]change the position of cuda_graph_sizes in dataclasses (#17548)
Signed-off-by: CXIAAAAA <cxia0209@gmail.com>
2025-05-01 11:52:37 -07:00
sstamenk
04f2cfc894 Remove duplicate code from dbrx.py (#17550) 2025-05-01 11:51:58 -07:00
Juan Villamizar
811a6c0972 [ROCM] Add gfx950 to the custom attention archs (#16034)
Signed-off-by: jpvillam <Juan.Villamizar@amd.com>
Signed-off-by: seungrokjung <seungrok.jung@amd.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: seungrokjung <seungrok.jung@amd.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-01 11:18:28 -07:00
Cyrus Leung
9b1769dd9a [Bugfix] Fix lint error (#17547)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 11:12:19 -07:00
Chen Xia
61c299f81f [Misc]add configurable cuda graph size (#17201)
Signed-off-by: CXIAAAAA <cxia0209@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-01 11:04:50 -07:00
Hongxia Yang
4acfa3354a [ROCm] update installation guide to include build aiter from source instructions (#17542)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-01 11:01:28 -07:00
Isotr0py
88c8304104 [Model] Refactor Ovis2 to support original tokenizer (#17537)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-01 11:00:53 -07:00
Harry Mellor
6768ff4a22 Move the last arguments in arg_utils.py to be in their final groups (#17531)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-01 10:31:44 -07:00
Cyrus Leung
f2e7af9b86 [CI/Build] Remove awscli dependency (#17532)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 09:20:54 -07:00
Reid
7423cf0a9b [Misc] refactor example - cpu_offload_lmcache (#17460)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-01 15:05:24 +00:00
Sage Moore
460a2b1100 [torch.compile] Add torch inductor pass for fusing silu_and_mul with subsequent scaled_fp8_quant operations (#10867)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-05-01 07:59:28 -07:00
Hongxia Yang
28566d73b3 [ROCm] remove unsupported archs from rocm triton flash-attention supported list (#17536)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
2025-05-01 07:54:25 -07:00
Chauncey
98060b001d [Feature][Frontend]: Deprecate --enable-reasoning (#17452)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-01 06:46:16 -07:00
TJian
f5a3c655b2 [FEAT] [ROCm]: Add Qwen/Qwen3-235B-A22B-FP8 TP4 triton fused moe config (#17535)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-01 06:37:17 -07:00
Reid
7169f87ad0 [doc] add streamlit integration (#17522)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-01 13:34:02 +00:00
Huy Do
b74d888c63 Fix more broken speculative decode tests (#17450)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-05-01 06:05:58 -07:00
TJian
2007d4d54f [FEAT] [ROCm]: Add Qwen/Qwen3-30B-A3B-FP8 fused moe config for MI300X (#17530)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-01 06:03:13 -07:00
Cyrus Leung
48e925fab5 [Misc] Clean up test docstrings and names (#17521)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 05:19:32 -07:00
Cyrus Leung
1903c0b8a3 [Frontend] Show progress bar for adding requests (#17525)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 05:15:32 -07:00
Teruaki Ishizaki
86a1f67a3b [Bugfix][Benchmarks] Allow benchmark of deepspeed-mii backend to select a model (#17285)
Signed-off-by: Teruaki Ishizaki <teruaki.ishizaki@ntt.com>
2025-05-01 11:54:51 +00:00
Harry Mellor
a257d9bccc Improve configs - ObservabilityConfig (#17453)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-01 03:52:05 -07:00
Chauncey
015069b017 [Misc] Optimize the Qwen3_ReasoningParser extract_reasoning_content (#17515)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-01 03:29:01 -07:00
Russell Bryant
fbefc8a78d [Core] Enable IPv6 with vllm.utils.make_zmq_socket() (#16506)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-01 09:38:18 +00:00
Keyun Tong
26bc4bbcd8 Avoid overwriting vllm_compile_cache.py (#17418)
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
2025-05-01 07:30:57 +00:00
Lucas Wilkinson
3c3d767201 [BugFix] Fix mla cpu - missing 3 required positional arguments (#17494)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-05-01 14:36:52 +08:00
Noah Yoshida
13cf6b6236 [BugFix] fix speculative decoding memory leak when speculation is disabled (#15506)
Signed-off-by: Noah Yoshida <noahcy117@gmail.com>
2025-04-30 23:28:17 -07:00
Hongxia Yang
90d0a54c4d [ROCm] Effort to reduce the number of environment variables in command line (#17229)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
2025-04-30 23:27:06 -07:00
Russell Bryant
7a0a146c54 [Build] Require setuptools >= 77.0.3 for PEP 639 (#17389)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-30 23:25:36 -07:00
Alexei-V-Ivanov-AMD
7ab643e425 FIxing the AMD test failures caused by PR#16457 (#17511)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-04-30 23:23:07 -07:00
Cyrus Leung
afb4429b4f [CI/Build] Reorganize models tests (#17459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-30 23:03:08 -07:00
Michael Goin
aa4502e7f3 [CI][Bugfix] Fix failing V1 Test due to missing 'cache_salt' arg (#17500)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-30 21:03:30 -07:00
Michael Goin
17b4d85f63 [CI][TPU] Skip structured outputs+spec decode tests on TPU (#17510)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-30 20:36:20 -07:00
NaLan ZeYu
1144a8efe7 [Bugfix] Temporarily disable gptq_bitblas on ROCm (#17411)
Signed-off-by: Yan Cangang <nalanzeyu@gmail.com>
2025-04-30 19:51:45 -07:00
Gregory Shtrasberg
08fb5587b4 [Bugfix][ROCm] Fix import error on ROCm (#17495)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-30 19:51:42 -07:00
Siyuan Liu
dbc18e7816 [CI][TPU] Skip Multimodal test (#17488)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-04-30 19:51:39 -07:00
Alex Brooks
02bd654846 [Misc] Rename Audios -> Audio in Qwen2audio Processing (#17507)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-04-30 19:51:36 -07:00
Rahul Tuli
200bbf92e8 Bump Compressed Tensors version to 0.9.4 (#17478)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-30 15:24:45 -07:00
Chen Zhang
81ecf425f0 [v1][Spec Decode] Make sliding window compatible with eagle prefix caching (#17398)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-04-30 18:25:53 +00:00
David Xia
42d9a2c4c7 doc: fix bug report Github template formatting (#17486)
Signed-off-by: David Xia <david@davidxia.com>
2025-04-30 10:03:20 -07:00
Reid
2ac74d098e [doc] add install tips (#17373)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-30 17:02:41 +00:00
Gregory Shtrasberg
584f5fb4c6 [Bugfix][ROCm] Restrict ray version due to a breaking release (#17480)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-30 09:59:06 -07:00
zh Wang
d586ddc691 [BugFix] Fix authorization of openai_transcription_client.py (#17321)
Signed-off-by: zh Wang <rekind133@outlook.com>
2025-04-30 09:51:05 -07:00
Michael Goin
0b7e701dd4 [Docs] Update optimization.md doc (#17482)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-30 09:34:02 -07:00
Russell Bryant
947f2f5375 [V1] Allow turning off pickle fallback in vllm.v1.serial_utils (#17427)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-30 16:10:54 +00:00
Pete Savage
739e03b344 [Bugfix] Fixed mistral tokenizer path when pointing to file (#17457)
Signed-off-by: Pete Savage <psavage@redhat.com>
2025-04-30 08:08:37 -07:00
Aaron Pham
da4e7687b5 [Fix] Support passing args to logger (#17425)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-04-30 08:06:58 -07:00
Russell Bryant
39317cf42b [Docs] Add command for running mypy tests from CI (#17475)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-30 08:06:09 -07:00
Chauncey
2990cee95b [Feature] The Qwen3 reasoning parser supports guided decoding (#17466)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-30 07:48:21 -07:00
Alec
0be6d05b5e [V1][Metrics] add support for kv event publishing (#16750)
Signed-off-by: alec-flowers <aflowers@nvidia.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
2025-04-30 07:44:45 -07:00
Marko Rosenmueller
77073c77bc [Core] Prevent side-channel attacks via cache salting (#17045)
Signed-off-by: Marko Rosenmueller <5467316+dr75@users.noreply.github.com>
2025-04-30 20:27:21 +08:00
Nicolò Lucchesi
a7d5b016bd [TPU][V1][CI] Update regression test baseline for v6 CI (#17064)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-30 04:03:22 -07:00
rongfu.leng
d803786731 [V1][Bugfix]: vllm v1 verison metric num_gpu_blocks is None (#15755)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-30 18:20:39 +08:00
Chauncey
1534d389af [Misc] Remove deprecated files (#17447)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-30 01:52:19 -07:00
Lu Fang
ece5a8b0b6 Make the _apply_rotary_emb compatible with dynamo (#17435) 2025-04-30 07:52:48 +00:00
Marco
54072f315f [MODEL ADDITION] Ovis2 Model Addition (#15826)
Signed-off-by: Marco <121761685+mlinmg@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-04-30 07:33:29 +00:00
Chauncey
be633fba0f [Bugfix] Fix AttributeError: 'State' object has no attribute 'engine_client' (#17434)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-30 00:11:04 -07:00
Kunshang Ji
ed6cfb90c8 [Hardware][Intel GPU] Upgrade to torch 2.7 (#17444)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Qiming Zhang <qiming1.zhang@intel.com>
2025-04-30 00:03:58 -07:00
Kunshang Ji
6ed9f6047e [Intel GPU] [CI]Fix XPU ci, setuptools >=80.0 have build issue (#17298)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-04-29 22:54:10 -07:00
Michael Goin
a44c4f1d2f Support LoRA for Mistral3 (#17428)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-29 21:10:30 -07:00
Huy Do
88fcf00dda Fix some speculative decode tests with tl.dot (#17371)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-04-29 19:41:02 -07:00
Harry Mellor
d1f569b1b9 Fix call to logger.info_once (#17416)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 19:39:18 -07:00
Harry Mellor
13698db634 Improve configs - ModelConfig (#17130)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-30 10:38:22 +08:00
Huy Do
2c4f59afc3 Update PyTorch to 2.7.0 (#16859) 2025-04-29 19:08:04 -07:00
Gabriel Marinho
1c2bc7ead0 Truncation control for embedding models (#14776)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-04-30 09:24:57 +08:00
Kevin H. Luu
4055130a85 [release] Always git fetch all to get latest tag on TPU release (#17322) 2025-04-29 17:52:11 -07:00
Benjamin Chislett
34120f5acd [V1][Feature] Enable Speculative Decoding with Structured Outputs (#14702)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
2025-04-30 00:02:10 +00:00
Harry Mellor
7489ec0bab Remove Bamba 9B from CI (#17407)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 21:10:31 +00:00
Bryan Lu
70788bdbdc [V1][Spec Decode] Apply torch.compile & cudagraph to EAGLE (#17211)
Signed-off-by: Bryan Lu <yuzhelu@amazon.com>
2025-04-29 21:10:00 +00:00
Dilip Gowda Bhagavan
c9c1b59e59 Fix: Python package installation for opentelmetry (#17049)
Signed-off-by: Dilip Gowda Bhagavan <dilip.bhagavan@ibm.com>
2025-04-29 20:20:24 +00:00
Harry Mellor
0350809f3a Remove Falcon3 2x7B from CI (#17404)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 19:52:25 +00:00
Harry Mellor
a6977dbd15 Simplify (and fix) passing of guided decoding backend options (#17008)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 19:02:23 +00:00
Isotr0py
2fa2a50bf9 [Bugfix] Fix Minicpm-O-int4 GPTQ model inference (#17397)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-29 18:21:42 +00:00
Reid
08e15defa9 [CI/Build] Add retry mechanism for add-apt-repository (#17107)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-29 10:40:52 -07:00
Aaron Pham
b37685afbb [CI] Uses Python 3.11 for TPU (#17359)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-04-29 17:39:16 +00:00
Nicolò Lucchesi
792595b59d [TPU][V1][CI] Replace python3 setup.py develop with standard pip install --e on TPU (#17374)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-29 10:36:48 -07:00
casinca
0c1c788312 [Doc][Typo] Fixing label in new model requests link in overview.md (#17400) 2025-04-29 10:29:48 -07:00
Russell Bryant
56d64fbe30 [Docs] Propose a deprecation policy for the project (#17063)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-29 10:29:44 -07:00
Alexei-V-Ivanov-AMD
608968b7c5 Enabling multi-group kernel tests. (#17115)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-04-29 10:27:27 -07:00
TY-AMD
06ffc7e1d3 [Misc][ROCm] Exclude cutlass_mla_decode for ROCm build (#17289)
Signed-off-by: Tianyuan Wu <Tianyuan.Wu@amd.com>
2025-04-29 10:26:42 -07:00
Qiming Zhang
d3cf61b89b fix gemma3 results all zero (#17364)
Signed-off-by: mayuyuace <qiming1.zhang@intel.com>
2025-04-29 09:40:25 -07:00
mofanke
a39203f99e [Bugfix] add qwen3 reasoning-parser fix content is None when disable … (#17369)
Signed-off-by: mofanke <mofanke@gmail.com>
2025-04-29 16:32:40 +00:00
Chen Zhang
24e6ad3f16 [V1] Remove num_input_tokens from attn_metadata (#17193)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-04-29 09:28:41 -07:00
Harry Mellor
2ef5d106bb Improve literal dataclass field conversion to argparse argument (#17391)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 16:25:08 +00:00
a2q1p
0ed27ef66c Fix: Spelling of inference (#17387) 2025-04-29 09:23:39 -07:00
Harry Mellor
900edfa8d4 Transformers backend tweaks (#17365)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 09:08:03 -07:00
Cyrus Leung
88ad9ec6b2 [Frontend] Support chat_template_kwargs in LLM.chat (#17356)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-29 22:03:35 +08:00
Harry Mellor
40896bdf3f pre-commit autoupdate (#17380)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 06:46:55 -07:00
Cyrus Leung
00ee37efa2 [Bugfix] Clean up MiniMax-VL and fix processing (#17354)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-29 20:42:16 +08:00
Jee Jee Li
890f104cdf [Doc] Fix QWen3MOE info (#17381)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-29 12:38:32 +00:00
Harry Mellor
4a5e13149a Update docs requirements (#17379)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-29 11:35:47 +00:00
Ekagra Ranjan
97cc8729f0 [Model] Ignore rotary embed load for Cohere model (#17319) 2025-04-29 00:30:40 -07:00
Gregory Shtrasberg
4464109219 [Build][Bugfix] Restrict setuptools version to <80 (#17320)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-29 00:17:23 -07:00
Hyogeun Oh (오효근)
193e78e35d [Fix] Documentation spacing in compilation config help text (#17342)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-04-29 00:16:17 -07:00
ponix-j
bdb2cddafc [Misc]Use a platform independent interface to obtain the device attributes (#17100) 2025-04-29 06:59:13 +00:00
Cyrus Leung
ebb3930d28 [Misc] Move config fields to MultiModalConfig (#17343)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-29 06:37:21 +00:00
qscqesze
cde384cd92 [Model] support MiniMax-VL-01 model (#16328)
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-04-29 12:05:50 +08:00
Chauncey
96e06e3cb7 [Misc] Add a Jinja template to support Mistral3 function calling (#17195)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-28 19:53:44 -07:00
Zhengyuan Su (苏政渊)
17eb306fcc [Bugfix] Add contiguous call inside rope kernel wrapper (#17091)
Signed-off-by: 苏政渊 <suzhengyuan@moonshot.cn>
Co-authored-by: 苏政渊 <suzhengyuan@moonshot.cn>
2025-04-28 19:24:07 -07:00
Richard Zou
165cb56329 Ignore '<string>' filepath (#17330)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-28 19:23:29 -07:00
Richard Barnes
d6da8a8ff2 [Bugfix] Fix numel() downcast in fused_layernorm_dynamic_per_token_quant.cu (#17316) 2025-04-28 19:23:18 -07:00
Lucia Fang
b4ac4fa04d [model] make llama4 compatible with pure dense layers (#17315)
Signed-off-by: Lucia Fang <fanglu@fb.com>
2025-04-29 10:22:22 +08:00
Ekagra Ranjan
e136000595 [V1][Spec Decode] Make Eagle model arch config driven (#17323) 2025-04-29 10:22:02 +08:00
Michał Moskal
86d9fc29cb implement Structural Tag with Guidance backend (#17333)
Signed-off-by: Michal Moskal <michal@moskal.me>
2025-04-29 02:21:32 +00:00
Cyrus Leung
506475de5f [Optim] Compute multimodal hash only once per item (#17314)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-29 09:40:35 +08:00
Ekagra Ranjan
cfe4532093 [Benchmark] Add single turn MTBench to Serving Bench (#17202) 2025-04-28 16:46:15 -07:00
Michael Goin
8fc88d63f1 [Model] Add tuned triton fused_moe configs for Qwen3Moe (#17328)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-28 15:20:24 -07:00
Alex Wu
6e74fd4945 Support loading transformers models with named parameters (#16868)
Signed-off-by: Alex <alexwu@character.ai>
2025-04-28 23:15:58 +01:00
Simon Mo
dcbac4cb4b [Model] Qwen3 Dense FP8 Compat Fixes (#17318)
Signed-off-by: simon-mo <xmo@berkeley.edu>
2025-04-28 14:12:01 -07:00
Charlie Fu
ed2462030f [Bugfix] Fix moe weight losing all extra attrs after process_weights_after_loading. (#16854)
Signed-off-by: charlifu <charlifu@amd.com>
2025-04-28 21:05:07 +00:00
Lucas Wilkinson
cc5befbced [BugFix] Fix cascade attention - RuntimeError: scheduler_metadata must have shape (metadata_size) (#17283)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-28 13:55:50 -07:00
Aaron Pham
2c89cd96a8 [Chore] cleanup license indicators in light of SPDX (#17259)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-04-28 19:43:52 +00:00
Russell Bryant
a0304dc504 [Security] Don't bind tcp zmq socket to all interfaces (#17197)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-28 10:08:20 -07:00
Harry Mellor
c7941cca18 Explicitly explain quant method override ordering and ensure all overrides are ordered (#17256)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-28 16:55:31 +00:00
Harry Mellor
b6dd32aa07 Make name of compressed-tensors quant method consistent across vLLM (#17255)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-28 16:28:13 +00:00
Harry Mellor
f94886946e Improve conversion from dataclass configs to argparse arguments (#17303)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-28 16:22:12 +00:00
Russell Bryant
72dfe4c74f [Docs] Add a security guide (#17230)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-28 15:12:17 +00:00
Cyrus Leung
8b464d9660 [Misc] Clean up Qwen2.5-Omni code (#17301)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-28 06:20:45 -07:00
Nicolò Lucchesi
889ebb2638 [Misc] Minor typo/grammar in platforms/interface.py (#17307)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-28 05:45:42 -07:00
Reid
3ad986c28b [doc] update wrong model id (#17287)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-28 04:20:51 -07:00
Cyrus Leung
344e193b7d [Bugfix] Add missing get_language_model to new MLLMs (#17300)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-28 04:09:57 -07:00
Harry Mellor
fb1c933ade Add missing class docstring for PromptAdapterConfig (#17302)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-28 04:06:59 -07:00
idouba
72c5b97231 Update tpu_worker.py 's typo (#17288) 2025-04-28 04:01:15 -07:00
Alex Brooks
fa93cd9f60 [Model] Add Granite Speech Support (#16246)
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-04-28 10:05:00 +00:00
Cyrus Leung
aec9674dbe [Core] Remove legacy input mapper/processor from V0 (#15686)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-28 15:38:48 +08:00
Wanrui Dai
7fcc4223dc [Minor][Models] Pass partial_rotary_factor parameter to rope (#17266)
Signed-off-by: evian <eviantai@u.nus.edu>
Co-authored-by: evian <eviantai@u.nus.edu>
2025-04-28 04:28:59 +00:00
Nick Hill
8262a3e23b [Misc] Validate stop_token_ids contents (#17268)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-28 03:54:05 +00:00
Reid
f211331c48 [Doc] small fix (#17277)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-28 03:53:35 +00:00
Kuntai Du
9053d0b134 [Doc] Fix wrong github link in LMCache examples (#17274)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2025-04-28 03:09:11 +00:00
Michael Goin
cb3f2d8d10 [Bugfix] Fix Mistral3 spatial merge error (#17270)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-27 19:40:05 -07:00
TherLF
c12df53b60 [Bugfix] Fix cutlass dispatch for fp8/int8 to properly invoke M<=16 c… (#16751)
Signed-off-by: Ther-LF <2639852836@qq.com>
2025-04-27 19:38:42 -07:00
Lennart K. M. Schulz
d1aeea7553 [Bugfix] Fix missing ARG in Dockerfile for arm64 platforms (#17261)
Signed-off-by: lkm-schulz <44176356+lkm-schulz@users.noreply.github.com>
2025-04-27 19:38:14 -07:00
Lucas Wilkinson
d8bccde686 [BugFix] Fix vllm_flash_attn install issues (#17267)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
2025-04-27 17:27:56 -07:00
Lily Liu
20e489eaa1 [V1][Spec Decode] Make eagle compatible with prefix caching. (#17137)
Signed-off-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2025-04-27 09:29:43 -07:00
Cyrus Leung
4213475ec7 [Metrics] Fix minor inconsistencies in bucket progression (#17262)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-27 16:19:39 +00:00
Reid
d92879baf6 [doc] Add feature status legend (#17257)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-27 08:17:02 -07:00
cascade
690fe019f0 [Feature] support sequence parallelism using compilation pass (#16155)
Signed-off-by: cascade812 <cascade812@outlook.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-04-27 06:29:35 -07:00
Kaixi Hou
ed7a29d9f8 [NVIDIA] Support Cutlass MLA for Blackwell GPUs (#16032)
Signed-off-by: kaixih <kaixih@nvidia.com>
2025-04-27 06:29:21 -07:00
Alex Brooks
756848e79e [Bugfix] Fix Lora Name Parsing (#17196)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-27 20:33:09 +08:00
Flex Wang
18445edd0f [Misc] Change buckets of histogram_iteration_tokens to [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096] to represent number of tokens (#17033)
Signed-off-by: sfc-gh-zhwang <flex.wang@snowflake.com>
2025-04-27 12:30:53 +00:00
Jade Zheng
30215ca61f [MISC] Use string annotation types for class definitions (#17244)
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-04-27 08:39:57 +00:00
Chen Zhang
838cedade7 [Bugfix] Get a specific type of layer from forward context (#17222)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-04-27 00:58:05 -07:00
Jee Jee Li
4283a28c2f [Bugfix] Fix QWen2 VL multimodal mapping (#17240)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-27 05:53:23 +00:00
Cyrus Leung
93a126fbc7 [Misc] Make cached tokenizer pickle-compatible (#17048)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-27 13:05:00 +08:00
rasmith
8e4b351a0c [Kernel][Triton][FP8] Adding fp8 and variable length sequence support to Triton FAv2 kernel (#12591)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-04-27 00:35:08 +00:00
Happy
9869453c42 Update test_flash_attn.py (#17102)
Signed-off-by: ShuaibinLi <lishuaibin@live.cn>
2025-04-26 22:17:35 +00:00
Reid
3642c59aa8 [CI/Build] remove -t for run-lm-eval-gsm-hf-baseline.sh (#16271)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-26 18:25:05 +00:00
Woosuk Kwon
43eea2953b [Minor] Fix lint error in main branch (#17233)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-26 11:10:14 -07:00
Kero Liang
de7eb10ce4 [Bugfix] Fix Qwen2.5-Omni M-RoPE position ids generation (#16878)
Signed-off-by: imkero <kerorek@outlook.com>
2025-04-26 10:41:35 -07:00
Ning Xie
fd11a325b8 [MISC] rename interval to max_recent_requests (#14285) 2025-04-26 16:59:18 +00:00
Lu Fang
4d17e20310 Disable the torch.compile cache checks when VLLM_DISABLE_COMPILE_CACHE=1 (#16573)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-04-26 09:17:58 -07:00
changjun.lee
10fd1d7380 [Bugfix] fix error due to an uninitialized tokenizer when using skip_tokenizer_init with num_scheduler_steps (#9276)
Signed-off-by: changjun.lee <pord7457@gmail.com>
2025-04-26 11:51:17 -04:00
Russell Bryant
52b4f4a8d7 [Docs] Update structured output doc for V1 (#17135)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-26 15:12:18 +00:00
Aaron Pham
e782e0a170 [Chore] added stubs for vllm_flash_attn during development mode (#17228)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-04-26 07:45:26 -07:00
Ning Xie
dc2ceca5c5 [BUGFIX] use random for NONE_HASH only when PYTHONHASHSEED not set (#17088)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-04-26 14:34:24 +00:00
Russell Bryant
f8acd01ff7 [V1] Add structural_tag support using xgrammar (#17085) 2025-04-26 14:06:37 +00:00
Agata Dobrzyniewicz
c48334d405 [Hardware][Intel-Gaudi] Update hpu-extension and update bucketing system for HPU device (#17186)
Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai>
2025-04-26 05:55:14 -07:00
Cyrus Leung
909fdaf152 [Bugfix] Fix standard models tests (#17217)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-26 02:26:41 -07:00
Isotr0py
8c1c926d00 [Bugfix] Fix missing int type for -n in multi-image example (#17223) 2025-04-26 08:49:52 +00:00
Nick Hill
df6f3ce883 [Core] Remove prompt string from engine core data structures (#17214)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-25 23:41:05 -07:00
Woosuk Kwon
513f074766 [CI/test] Fix Eagle Correctness Test (#17209)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-25 23:40:36 -07:00
Nick Hill
b07bf83c7d [BugFix] Avoid race conditions in zero-copy tensor transmission (#17203)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-26 06:00:07 +00:00
Zijing Liu
53e8cf53a4 [V1][Metrics] Allow V1 AsyncLLM to use custom logger (#14661)
Signed-off-by: Zijing Liu <liuzijing2014@gmail.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-25 22:05:40 -07:00
Charlie Fu
54271bb766 [ROCm][Misc] Follow-ups for Skinny Gemms on ROCm. (#17011)
Signed-off-by: charlifu <charlifu@amd.com>
2025-04-25 22:05:10 -07:00
Shu Wang
9e96f56efb Allocate kv_cache with stride order (#16605)
Signed-off-by: shuw <shuw@nvidia.com>
2025-04-25 22:03:31 -07:00
Woosuk Kwon
b278911229 [Minor][Models] Fix Return Types of Llama & Eagle (#17220)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-25 21:54:47 -07:00
yarongmu-google
7bd0c7745c [Doc] Minor fix for the vLLM TPU setup page (#17206)
Signed-off-by: Yarong Mu <ymu@google.com>
2025-04-26 04:39:56 +00:00
Woosuk Kwon
1cf0719ebd [Minor][Spec Decode] Add use_eagle to SpeculativeConfig (#17213)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-25 21:08:15 -07:00
Reid
537d5ee025 [doc] add Anything LLM integration (#17216)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-25 21:03:23 -07:00
Lu Fang
c8e5be35f7 [MISC][AMD] Add unused annotation to rocm kernel file (#17097)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-04-25 20:33:35 -07:00
James Wu
a6e72e1e4f [Bugfix] [pytorch] Patch AOTAutogradCache._get_shape_env (#17142)
Signed-off-by: James Wu <jjwu@meta.com>
2025-04-26 11:28:20 +08:00
Yihua Cheng
5e83a7277f [v1] [P/D] Adding LMCache KV connector for v1 (#16625) 2025-04-26 03:03:38 +00:00
rasmith
68af5f6c5c [AMD][FP8][BugFix] Remove V1 check in arg_utils.py for FP8 since it is not necessary (#17215)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-04-25 19:55:05 -07:00
Chen Zhang
8de2901fea [Bugfix] gemma[2,3] interleaved attention when sliding window is disabled (#17180)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-04-25 19:53:51 -07:00
Rui Qiao
c53e0730cb [Misc] Refine ray_serve_deepseek example (#17204)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-04-25 16:06:59 -07:00
Benjamin Chislett
a0e619e62a [V1][Spec Decode] EAGLE-3 Support (#16937)
Signed-off-by: Bryan Lu <yuzhelu@amazon.com>
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Co-authored-by: Bryan Lu <yuzhelu@amazon.com>
2025-04-25 15:43:07 -07:00
Nick Hill
70116459c3 [BugFix][Frontend] Fix LLM.chat() tokenization (#16081)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-25 22:20:05 +00:00
Christian Heimes
65e262b93b Fix Python packaging edge cases (#17159)
Signed-off-by: Christian Heimes <christian@python.org>
2025-04-26 06:15:07 +08:00
Cyrus Leung
43faa0461a [Bugfix] Fix hybrid model tests (#17182)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-25 15:14:37 -07:00
Daniel Li
48cb2109b6 [V1] Move usage stats to worker and start logging TPU hardware (#16211) 2025-04-25 14:06:01 -06:00
Russell Bryant
a5450f11c9 [Security] Use safe serialization and fix zmq setup for mooncake pipe (#17192)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
Co-authored-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-04-25 16:53:23 +00:00
Cyrus Leung
9d98ab5ec6 [Misc] Inline Molmo requirements (#17190)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-25 16:41:44 +00:00
Reid
df5c879527 [doc] update wrong hf model links (#17184)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-25 16:40:54 +00:00
Harry Mellor
423e9f1cbe Use Transformers helper get_text_config() instead of checking for text_config (#17105)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-25 08:47:35 -07:00
Harry Mellor
0bd7f8fca5 Bump Transformers to 4.51.3 (#17116)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-25 08:34:34 -07:00
Jasmond L
d5615af9ae [Bugfix] Fix Mistral ChatCompletionRequest Body Exception (#16769)
Signed-off-by: Jasmond Loh <Jasmond.Loh@hotmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-25 07:26:30 -07:00
Cyrus Leung
19dcc02a72 [Bugfix] Fix mistral model tests (#17181)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-25 06:03:34 -07:00
Alex Brooks
7feae92c1f [Doc] Move todo out of beam search docstring (#17183)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-04-25 04:44:58 -07:00
Michael Yao
f851b84266 [Doc] Add two links to disagg_prefill.md (#17168)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-25 10:23:57 +00:00
Lu Fang
fc966e9cc6 Only turn on FastIncrementalDetokenizer when tokenizers >= 0.21.1 (#17158) 2025-04-25 17:10:32 +08:00
Michael Yao
ef19e67d2c [Doc] Add headings to improve gptqmodel.md (#17164)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-25 01:13:13 -07:00
rasmith
a41351f363 [Quantization][FP8] Add support for FP8 models with input_scale for output projection and QK quantization (#15734)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
2025-04-25 00:45:02 -07:00
Sangyeon Cho
6aae216b4e [Bugfix] remove fallback in guided_json (int range, patterns) (#16725)
Signed-off-by: csy1204 <josang1204@gmail.com>
Co-authored-by: 조상연[플레이스 AI] <sang-yeon.cho@navercorp.com>
2025-04-25 06:54:43 +00:00
yexin(叶鑫)
b22980a1dc [Perf]Optimize rotary_emb implementation to use Triton operator for improved inference performance (#16457)
Signed-off-by: cynthieye <yexin93@qq.com>
Co-authored-by: MagnetoWang <magnetowang@outlook.com>
2025-04-25 14:52:28 +08:00
Lucas Wilkinson
881f735827 [Misc] Benchmark Serving Script Support Appending Results (#17028)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-24 22:53:55 -07:00
Mengqing Cao
2f54045508 [Bugfix][Misc] Use TritonPlaceholderModule to defensively import triton (#15099)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-04-24 22:51:02 -07:00
Lifu Huang
5aa6efb9a5 [Misc] Clean up redundant code in uniproc_executor.py (#16762)
Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
2025-04-24 22:49:30 -07:00
Harry Mellor
6ca0234478 Move missed SchedulerConfig args into scheduler config group in EngineArgs (#17131)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-24 22:48:53 -07:00
Michael Goin
649818995f [Docs] Fix True->true in supported_models.md (#17141) 2025-04-25 04:20:04 +00:00
Varun Sundar Rabindranath
7a0a9da72b [Doc] V1 : Update LoRA status (#17133)
Signed-off-by: varun sundar rabindranath <vsundarr@redhat.com>
Co-authored-by: varun sundar rabindranath <vsundarr@redhat.com>
2025-04-24 20:17:22 -07:00
Zaida Zhou
69bff9bc89 fix float16 support for kimi-vl (#17156)
Co-authored-by: zhouzaida <zhouzaida@msh.team>
2025-04-24 20:16:32 -07:00
Lucas Wilkinson
41ca7eb491 [Attention] FA3 decode perf improvement - single mma warp group support for head dim 128 (#16864)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-24 20:12:21 -07:00
vllmellm
eef364723c [FEAT] [ROCm]: AITER Fused MOE V1 Support (#16752)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-04-25 11:06:50 +08:00
jglaser
0d6e187e88 Use custom address for listening socket (#15988)
Signed-off-by: Jens Glaser <glaserj@ornl.gov>
2025-04-25 01:57:16 +00:00
Michael Goin
9420a1fc30 Better error message for missing mistral params.json (#17132)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-24 23:43:08 +00:00
Rui Qiao
583e900996 [Misc] Add example to run DeepSeek with Ray Serve LLM (#17134)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-04-24 22:25:21 +00:00
Maximilien de Bayser
05e1fbfc52 Add chat template for Llama 4 models (#16428)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-04-24 20:19:36 +00:00
Yinghai Lu
fe92176321 Add collective_rpc to llm engine (#16999)
Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>
2025-04-24 20:16:52 +00:00
Russell Bryant
6d0df0ebeb [Docs] Generate correct github links for decorated functions (#17125)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-24 10:39:43 -07:00
Harry Mellor
0fa939e2d1 Improve configs - LoRAConfig + PromptAdapterConfig (#16980)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-24 10:29:34 -07:00
Harry Mellor
0422ce109f Add :markdownhelp: to EngineArgs docs so markdown docstrings render properly (#17124)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-24 10:28:45 -07:00
Eyshika Agarwal
47bdee409c Molmo Requirements (#17026)
Signed-off-by: Eyshika Agarwal <eyshikaengineer@gmail.com>
Signed-off-by: eyshika <eyshikaengineer@gmail.com>
2025-04-24 10:08:37 -07:00
Atilla
49f189439d existing torch installation pip command fix for docs (#17059) 2025-04-24 10:07:21 -07:00
Aaruni Aggarwal
5adf6f6b7f Updating builkite job for IBM Power (#17111)
Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
2025-04-24 10:06:17 -07:00
Russell Bryant
4115f19958 [CI] Add automation for the tool-calling github label (#17118)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-24 09:22:00 -07:00
Mark McLoughlin
340d7b1b21 [V1][Spec Decoding] Add num_drafts and num_accepted_tokens_per_position metrics (#16665)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-04-24 08:57:40 -07:00
Reid
1bcbcbf574 [Misc] refactor example series - structured outputs (#17040)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-24 07:49:48 -07:00
Michael Goin
82e43b2d7e Add missing rocm_skinny_gemms kernel test to CI (#17060)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-24 07:49:37 -07:00
wang.yuqi
67309a1cb5 [Frontend] Using matryoshka_dimensions control the allowed output dimensions. (#16970) 2025-04-24 07:06:28 -07:00
Shanshan Shen
b724afe343 [V1][Structured Output] Clear xgrammar compiler object when engine core shut down to avoid nanobind leaked warning (#16954)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-24 06:15:03 -07:00
Harry Mellor
21f4f1c9a4 Improve static type checking in LoRAModelRunnerMixin (#17104)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-24 06:14:47 -07:00
Isotr0py
b0c1f6202d [Misc] Remove OLMo2 config copy (#17066)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-24 06:14:32 -07:00
Rui Qiao
c0dfd97519 [V1][PP] Optimization: continue scheduling prefill chunks (#17080)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-04-24 05:27:08 -07:00
Harry Mellor
a9138e85b1 Fix OOT registration test (#17099)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-24 04:44:12 -07:00
Harry Mellor
0a05ed57e6 Simplify TokenizerGroup (#16790)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-24 04:43:56 -07:00
Michael Goin
14288d1332 Disable enforce_eager for V1 TPU sampler and structured output tests (#17016)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-24 02:50:09 -07:00
Woosuk Kwon
b411418ff0 [Chore] Remove Sampler from Model Code (#17084)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-24 02:49:33 -07:00
omer-dayan
2bc0f72ae5 Add docs for runai_streamer_sharded (#17093)
Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-24 01:03:21 -07:00
Reid
9c1244de57 [doc] update to hyperlink (#17096)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-24 00:58:08 -07:00
Reid
db2f8d915c [V1] Update structured output (#16812)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-23 23:57:17 -07:00
张宇
6167c0e5d2 [Bugfix][Core] add seq_id_to_seq_group clearing to avoid memory leak when s… (#16472)
Signed-off-by: 开哲 <kaizhe.zy@alibaba-inc.com>
Co-authored-by: 开哲 <kaizhe.zy@alibaba-inc.com>
2025-04-24 11:25:37 +08:00
Areeb Syed
ed2e464653 Addendum Fix to support FIPS enabled machines with MD5 hashing (#17043)
Signed-off-by: sydarb <areebsyed237@gmail.com>
2025-04-23 19:55:00 -07:00
Harry Mellor
2c8ed8ee48 More informative error when using Transformers backend (#16988)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-23 19:54:03 -07:00
Michael Goin
ed50f46641 [Bugfix] Enable V1 usage stats (#16986)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-23 19:54:00 -07:00
Woosuk Kwon
46e678bcff [Minor] Use larger batch sizes for A100/B100/B200/MI300x (#17073)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-23 19:18:59 -07:00
Chen Xia
6b2427f995 [Quantization]add prefix for commandA quantized model (#17017) 2025-04-23 17:32:40 -07:00
Sangyeon Cho
b07d741661 [CI/Build] workaround for CI build failure (#17070)
Signed-off-by: csy1204 <josang1204@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-04-23 16:14:18 -07:00
Woosuk Kwon
41fb013d29 [V1][Spec Decode] Always use argmax for sampling draft tokens (#16899)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-23 14:57:43 -07:00
Yong Hoon Shin
32d4b669d0 [BugFix][V1] Fix int32 token index overflow when preparing input ids (#16806) 2025-04-23 12:12:35 -07:00
Travis Johnson
3cde34a4a4 [Frontend] Support guidance:no-additional-properties for compatibility with xgrammar (#15949)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2025-04-23 18:34:41 +00:00
Harry Mellor
bdb3660312 Use @property and private field for data_parallel_rank_local (#17053)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-23 08:50:08 -07:00
Harry Mellor
f3a21e9c68 CacheConfig.block_size should always be int when used (#17052)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-23 08:50:05 -07:00
Harry Mellor
8e630d680e Improve Transformers backend model loading QoL (#17039)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-23 07:33:51 -07:00
Russell Bryant
af869f6dff [CI] Update structured-output label automation (#17055)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-23 07:33:14 -07:00
Harry Mellor
53c0fa1e25 Ensure that pid passed to kill_process_tree is int for mypy (#17051)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-23 07:32:26 -07:00
Michael Yao
f7912cba3d [Doc] Add top anchor and a note to quantization/bitblas.md (#17042)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-23 07:32:16 -07:00
Michael Goin
6317a5174a Categorize tests/kernels/ based on kernel type (#16799)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-23 09:21:07 -04:00
Michael Goin
aa72d9a4ea Mistral-format support for compressed-tensors (#16803)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-23 08:46:23 -04:00
Russell Bryant
ce17db8085 [CI] Run v1/test_serial_utils.py in CI (#16996)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-23 01:13:34 -07:00
Chauncey
8c87a9ad46 [Bugfix] Fix AssertionError: skip_special_tokens=False is not supported for Mistral tokenizers (#16964)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-23 07:24:09 +00:00
huafeng
ec69124eb4 [Misc] Improve readability of get_open_port function. (#17024)
Signed-off-by: gitover22 <qidizou88@gmail.com>
2025-04-23 06:16:53 +00:00
Lucas Wilkinson
d0da99fb70 [BugFix] llama4 fa3 fix - RuntimeError: scheduler_metadata must have shape (metadata_size) (#16998)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-22 21:49:24 -07:00
Nick Hill
b2f195c429 [V1] Avoid socket errors during shutdown when requests are in in-flight (#16807)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-23 12:36:29 +08:00
vllmellm
047797ef90 [Bugfix] Triton FA function takes no keyword arguments (#16902)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-04-22 21:35:24 -07:00
Reid
eb8ef4224d [doc] add download path tips (#17013)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-23 04:06:30 +00:00
Chendi.Xue
56a735261c [INTEL-HPU][v0] Port delayed sampling to upstream (#16949)
Signed-off-by: Michal Adamczyk <michal.adamczyk@intel.com>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Co-authored-by: Michal Adamczyk <madamczyk@habana.ai>
2025-04-22 20:14:11 -07:00
youkaichao
e1cf90e099 [misc] tune some env vars for GB200 (#16992)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-04-23 10:59:48 +08:00
Chauncey
6bc1e30ef9 Revert "[Misc] Add S3 environment variables for better support of MinIO." (#17021) 2025-04-22 19:22:29 -07:00
vllmellm
7e081ba7ca [BugFix] Revert ROCm Custom Paged Attention Env Flag Check (#17022)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-04-22 19:17:48 -07:00
Nick Hill
1e013fa388 [V1][DP] More robust DP/EP dummy request coordination (#16277)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-22 19:12:15 -07:00
Aleksandr Malyshev
bc7c4d206b [Kernel][ROCM] Upstream prefix prefill speed up for vLLM V1 (#13305)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: root <root@banff-cyxtera-s73-5.ctr.dcgpu>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: root <root@banff-cyxtera-s65-4.amd.com>
Signed-off-by: maleksan85 <maleksan@amd.com>
Signed-off-by: <>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: root <root@banff-cyxtera-s73-5.ctr.dcgpu>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: qli88 <qiang.li2@amd.com>
Co-authored-by: root <root@banff-cyxtera-s65-4.amd.com>
2025-04-22 19:11:56 -07:00
Yang Wang
f67e9e9f22 add Dockerfile build vllm against torch nightly (#16936)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-04-22 19:08:27 -07:00
Guillaume Calmettes
36fe78769f [Bugfix] validate urls object for multimodal content parts (#16990)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-23 09:43:06 +08:00
Chenyaaang
83d933718c [Core][V1][TPU] Enable structured decoding on TPU V1 (#16499)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-04-22 18:05:23 -06:00
Nick Hill
5175b884f7 [BugFix] Remove default multiproc executor collective_rpc timeout (#17000)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-22 23:27:14 +00:00
Alexei-V-Ivanov-AMD
5536b30a4c Fencing Kernels Tests for enabling on AMD (#16929)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-04-22 09:32:40 -07:00
Richard Zou
7f58fb9718 Add assertion for no objects while hashing hf_config (#16930)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-22 09:32:22 -07:00
vllmellm
30bc3e0f66 [FEAT][ROCm]: Support AITER MLA (#15893)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: qli88 <qiang.li2@amd.com>
2025-04-22 09:31:13 -07:00
Reid
f34410715f [frontend] enhance tool_calls type check (#16882)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-22 15:40:24 +00:00
Chauncey
68d4c33202 [Misc] Add S3 environment variables for better support of MinIO. (#16977)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-22 14:27:36 +00:00
Zhengyuan Su (苏政渊)
f961d7f6ef [BugFix] Pass in correct VLLM config in FlashInfer backend (#13207) (#16973)
Signed-off-by: 苏政渊 <suzhengyuan@moonshot.cn>
Co-authored-by: 苏政渊 <suzhengyuan@moonshot.cn>
2025-04-22 06:44:10 -07:00
Harry Mellor
d059110498 Improve configs - SpeculativeConfig (#16971)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-22 12:55:36 +00:00
Yang Fan
571e8dd65e [Bugfix] Fix distributed bug again in Qwen2.5-VL & Qwen2.5-Omni (#16974)
Signed-off-by: fyabc <suyang.fy@alibaba-inc.com>
2025-04-22 12:23:17 +00:00
Reid
4b91c927f6 [Misc] refactor example series (#16972)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-22 11:44:21 +00:00
vllmellm
0e237f0035 [FEAT][ROCm] Integrate Paged Attention Kernel from AITER (#15001)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-04-22 02:46:28 -07:00
Cyrus Leung
8f7bace7c3 [Doc] Improve documentation for multimodal CLI args (#16960)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-22 08:35:35 +00:00
Nick Hill
e4d6144232 [BugFix] Fix incremental detokenization perf issue (#16963)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-22 08:16:19 +00:00
Lei Wang
8d32dc603d [Kernel] Support Microsoft Runtime Kernel Lib for our Low Precision Computation - BitBLAS (#6036)
Signed-off-by: xinyuxiao <xinyuxiao2024@gmail.com>
Co-authored-by: xinyuxiao <xinyuxiao2024@gmail.com>
2025-04-22 09:01:36 +01:00
Woosuk Kwon
c4ab9f3e71 [V1] Remove pre-allocation for KV cache (#16941)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-22 00:52:18 -07:00
Flora Feng
2689d5c027 [Model] Use autoweightloader for mamba (#16950)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-04-22 07:48:15 +00:00
Chauncey
acba33a0f1 [Bugfix] Fix the issue where llm.generate cannot be called repeatedly after setting GuidedDecodingParams (#16767)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-04-22 06:02:20 +00:00
SnowCharm
a114bf20a3 [Perf] Optimize _update_states for GPU model runner (#16910)
Signed-off-by: snowcharm <snowcharmqq@gmail.com>
2025-04-22 14:01:54 +08:00
Michael Yao
3097ce3a32 [Doc] Update ai_accelerator/hpu-gaudi.inc.md (#16956)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-22 05:33:27 +00:00
Cyrus Leung
d6da9322c8 [Bugfix] Fix f-string for Python 3.9-3.11 (#16962)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-21 21:45:55 -07:00
omer-dayan
71ce44047f Support S3 Sharded loading with RunAI Model Streamer (#16317)
Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-21 21:21:49 -07:00
Charlie Fu
188b7f9b8c [Performance][ROCm] Add skinny gemms for unquantized linear on ROCm (#15830)
Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
2025-04-21 20:46:22 -07:00
wangxiyuan
b9b4746950 [V1] Remove additional_config check (#16710)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-21 20:45:27 -07:00
Varun Sundar Rabindranath
7b8a2ab76f [Kernel] Add expert_map support to Cutlass FP8 MOE (#16861)
Signed-off-by: varun sundar rabindranath <vsundarr@redhat.com>
Co-authored-by: varun sundar rabindranath <vsundarr@redhat.com>
2025-04-21 20:44:32 -07:00
Jee Jee Li
c9acbf1141 [Misc] Remove the chunked prefill warning for LoRA (#16925)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-21 20:44:24 -07:00
kliuae
5b794cae8d [ROCm] Add aiter tkw1 kernel for Llama4 fp8 (#16727)
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-04-21 20:42:34 -07:00
Jeffrey Li
0e4254492f [Bugfix]: fix issue with n>1 sampling on v1 requests overriding each other (#16863)
Signed-off-by: Jeffrey Li <jeffrey.dot.li@gmail.com>
2025-04-22 11:40:19 +08:00
Woosuk Kwon
1311913f55 [BugFix][Spec Decode] No in-place update to draft probs (#16952)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-21 19:54:19 -07:00
Cyrus Leung
29f395c97c [Doc] Remove unnecessary V1 flag (#16924)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-21 21:04:38 -04:00
Nicolò Lucchesi
fa3bba2a53 [TPU][V1] Enable Top-P (#16843)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-22 00:46:07 +00:00
Michael Goin
986537f1c3 [V1] V1 FlashInfer Attention (#16684)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Aurick Qiao <qiao@aurick.net>
2025-04-22 00:38:41 +00:00
Nicolò Lucchesi
210207525e [TPU][V1] Capture multimodal encoder during model compilation (#15051)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Liu <lsiyuan@google.com>
2025-04-21 18:36:59 -06:00
Michael Goin
71eda0bb76 Update Qwen1.5-MoE-W4A16-compressed-tensors.yaml (#16946) 2025-04-21 18:35:32 -06:00
Chengji Yao
471fe65630 [TPU][V1] Implicitly adjust page size when there's SMEM OOM (#16871)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-21 15:43:13 -06:00
Woosuk Kwon
3a0fba5cf4 [V1][Spec Decode] Handle draft tokens beyond max_model_len (#16087)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-21 12:38:50 -07:00
Chanh Nguyen
299ebb62b2 [Core] Speed up decode by remove synchronizing operation in sampler (#16436)
Signed-off-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
2025-04-21 18:18:22 +00:00
David Xia
f728ab8e35 [Doc] mention how to install in CPU editable mode (#16923)
Signed-off-by: David Xia <david@davidxia.com>
2025-04-21 17:45:51 +00:00
David Xia
63e26fff78 [doc] install required python3-dev apt package (#16888)
Signed-off-by: David Xia <david@davidxia.com>
2025-04-21 16:15:18 +00:00
Yan Ma
fe3462c774 [XPU][Bugfix] minor fix for XPU (#15591)
Signed-off-by: yan ma <yan.ma@intel.com>
2025-04-22 00:02:57 +08:00
Kartik Ramesh
3b34fd5273 Raise error for data-parallel with benchmark_throughput (#16737)
Signed-off-by: Kartik Ramesh <kartikx2000@gmail.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-04-21 23:51:43 +08:00
Isotr0py
55d6d3fdb8 [Bugfix] Fix GLM rotary_dim issue and support v1 (#16912)
Signed-off-by: isotr0py <2037008807@qq.com>
2025-04-21 14:26:34 +00:00
Shanshan Shen
7272bfae77 [Misc] Refactor platform to get device specific stream and event (#14411)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-21 21:25:49 +08:00
wangxiyuan
d9ac9e3dc5 [Misc] fix collect_env version parse (#15267)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-21 20:29:40 +08:00
Han Zhang
d41faaf9df Restore buffers when wake up from level 2 sleep (#16564) (#16889)
Signed-off-by: Han <zh950713@gmail.com>
2025-04-21 20:18:28 +08:00
Alex Brooks
b34f33438a [Doc] Split dummy_processor_inputs() in Multimodal Docs (#16915)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-04-21 11:10:01 +00:00
Yang Fan
26c0406555 [Bugfix] Fix distributed bug in Qwen2.5-VL & Qwen2.5-Omni (#16907) 2025-04-21 10:25:21 +00:00
Woosuk Kwon
4c41278b77 [CI/CD][V1] Add spec decode tests to CI (#16900)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-20 22:37:16 -07:00
qizixi
bb3605db85 [Bugfix] Fix v1/spec_decode/test_ngram.py (#16895)
Signed-off-by: qizixi <qizixi@meta.com>
2025-04-20 20:54:29 -07:00
Richard Zou
fe742aef5a [easy] Pass compile_fx only the config patches (#16845)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-20 12:25:19 +08:00
Harry Mellor
4b07d36891 Improve configs - CacheConfig (#16835)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-20 12:25:04 +08:00
Staszek Paśko
87aaadef73 Serialize tensors using int8 views (#16866)
Signed-off-by: Staszek Pasko <staszek@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-19 10:28:34 -07:00
Richard Zou
682e0b6d2f Log how much time loading a compiled artifact takes (#16848)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-19 16:50:46 +00:00
Reid
d6195a748b [doc] update hyperlink (#16877)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-19 16:40:38 +00:00
Cyrus Leung
205d84aaa9 [VLM] Clean up models (#16873)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-19 12:13:06 +00:00
Roger Wang
5124f5bf51 [Model] Qwen2.5-Omni Cleanup (#16872) 2025-04-19 09:37:02 +00:00
Isotr0py
83f3c3bd91 [Model] Refactor Phi-4-multimodal to use merged processor and support V1 (#15477)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-19 02:26:11 -07:00
vie-serendipity
d9737ca1c6 [V1][Misc] stop update prefix cache stats when logs_stats is disabled (#16460)
Signed-off-by: vie-serendipity <2733147505@qq.com>
2025-04-19 02:25:19 -07:00
Nicolò Lucchesi
9d4ca19d50 [Misc] Benchmarks for audio models (#16505)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-19 02:24:14 -07:00
Nicolò Lucchesi
2ef0dc53b8 [Frontend] Add sampling params to v1/audio/transcriptions endpoint (#16591)
Signed-off-by: Jannis Schönleber <joennlae@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Jannis Schönleber <joennlae@gmail.com>
2025-04-19 07:03:54 +00:00
Divakar Verma
1d4680fad2 [rocm][MI300] llama4 maverick fp8 moe config tp8 (#16847)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-04-19 06:21:43 +00:00
Yang Fan
2c1bd848a6 [Model][VLM] Add Qwen2.5-Omni model support (thinker only) (#15130)
Signed-off-by: fyabc <suyang.fy@alibaba-inc.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Xiong Wang <wangxiongts@163.com>
2025-04-18 23:14:36 -07:00
omrishiv
5c9121203c [release] Publish neuron docker image (#16733)
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
2025-04-18 17:11:25 -07:00
Justin Ho
490b1698a5 [Doc] Updated Llama section in tool calling docs to have llama 3.2 config info (#16857)
Signed-off-by: jmho <jaylenho734@gmail.com>
2025-04-18 23:28:53 +00:00
Reid
5a5e29de88 [Misc] refactor examples series - Chat Completion Client With Tools (#16829)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-18 23:24:42 +00:00
wang.yuqi
3d3ab3689f [New Model]: Snowflake Arctic Embed (Family) (#16649) 2025-04-18 08:11:57 -07:00
Harry Mellor
686623c5e7 Fix nullable_kvs fallback (#16837)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-18 05:58:39 -07:00
Cyrus Leung
aadb656562 [Misc] Clean up Kimi-VL (#16833)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-18 05:15:09 -07:00
Jonghyun Choe
87e067de41 [Model] use AutoWeightsLoader for BigCode, GPT-J (#16823)
Signed-off-by: Jonghyun Choe <andy.choe729@gmail.com>
2025-04-18 10:42:41 +00:00
Michael Yao
26507f8973 [Docs] Fix a link and grammar issue in production-stack.md (#16809)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-18 06:42:58 +00:00
Nathan Weinberg
9c1d5b456d [Doc] add podman setup instructions for official image (#16796)
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
2025-04-18 06:10:49 +00:00
Lucia Fang
e31045f95c [Bugfix] fix pp for llama4 (#16746)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-04-18 13:51:30 +08:00
Luka Govedič
aaec845f8e [ROCm] [Attention] Cleanup ROCm output passing (#16431)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-04-18 05:46:45 +00:00
rongfu.leng
7bdfd29a35 [Misc] add collect_env to cli and docker image (#16759)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-17 22:13:35 -07:00
Harry Mellor
e78587a64c Improve-mm-and-pooler-and-decoding-configs (#16789)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-17 22:13:32 -07:00
Lucas Wilkinson
7eb4255628 [BugFix] Accuracy fix for llama4 int4 - improperly casted scales (#16801)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-17 22:13:29 -07:00
Michael Goin
6a0f547561 Add hardware print to TPU V1 test (#16792) 2025-04-17 22:13:26 -07:00
Shanshan Shen
30ed81b7ca [V1][Structured Output] Minor modification to _validate_structured_output() (#16748)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-18 13:12:54 +08:00
Chauncey
7a4a5de729 [Misc] Update outdated note: LMCache now supports chunked prefill (#16697)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-18 05:12:42 +00:00
Cyrus Leung
c16fb5dae8 [Doc] Improve help examples for --compilation-config (#16729)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-17 21:22:34 -07:00
Tarun Kumar
e37073efd7 Add property-based testing for vLLM endpoints using an API defined by an OpenAPI 3.1 schema (#16721)
Signed-off-by: Tarun Kumar <takumar@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-17 21:08:27 -07:00
Lucas Wilkinson
183dad7a85 [Attention] Update to lastest FA3 code (#13111)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-17 15:14:07 -07:00
Yihua Cheng
3408e47159 [P/D][V1] KV Connector API V1 (#15960)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: remi <remi@mistral.ai>
Co-authored-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Rémi Delacourt <54138269+Flechman@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
2025-04-17 13:22:40 -07:00
Nick Hill
0377b8310b [MLA] Simplification to batch P/D reordering (#16673)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-17 16:12:09 -04:00
Mark McLoughlin
e4755f7fac [V1][Metrics] Fix http metrics middleware (#15894) 2025-04-17 19:52:18 +00:00
Sijia(Jackson) Chen
92edf35826 [ROCM] enable aiter fused moe kernel for llama4 bf16 checkpoints (#16674) 2025-04-17 11:44:34 -07:00
Nicolò Lucchesi
eb5819b2d9 [V1][TPU] Enable Top K (#15489)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Hyesoo Yang <hyeygit@gmail.com>
Co-authored-by: Hyesoo Yang <hyeygit@gmail.com>
2025-04-17 18:18:11 +00:00
Nicolò Lucchesi
5989f4684d [TPU][V1] Fix padding recompilation when max-num-batched-tokens is not even (#16726)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-17 18:09:57 +00:00
rongfu.leng
5125d72f02 [Model] use AutoWeightsLoader for olmoe,opt,orion,persimmon,phi3_small (#16548)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-17 17:48:31 +00:00
Ximingwang-09
a018e555fd [Kernel] Add fp8_w8a8 fused MoE kernel tuning configs for DeepSeek V3/R1 on NVIDIA H20 (#16753)
Signed-off-by: ximing.wxm <ximing.wxm@antgroup.com>
Co-authored-by: ximing.wxm <ximing.wxm@antgroup.com>
2025-04-18 00:01:30 +08:00
Robin
6211b92273 [Bugfix]Fix index out of range error in api server log (#16787)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-04-17 09:01:07 -07:00
Nick Hill
05fcd1b430 [V1][Perf] Faster incremental detokenization (#15137)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-17 07:45:24 -07:00
Insu Kim
7c02d6a137 [Doc] Changed explanation of generation_tokens_total and prompt_tokens_total counter type metrics to avoid confusion (#16784)
Signed-off-by: insukim1994 <insu.kim@moreh.io>
2025-04-17 14:10:08 +00:00
wang.yuqi
11c3b98491 [Doc] Document Matryoshka Representation Learning support (#16770) 2025-04-17 13:37:37 +00:00
Cyrus Leung
dbe7f07001 [Doc] Make sure to update vLLM when installing latest code (#16781)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-17 06:53:31 -06:00
Reid
c69bf4ee06 fix: hyperlink (#16778)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-17 11:34:20 +00:00
Harry Mellor
d27ea94034 Improve configs - TokenizerPoolConfig + DeviceConfig (#16603)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-17 11:19:42 +00:00
Reid
99ed526101 [Misc] refactor examples series - lmcache (#16758)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-17 11:02:35 +00:00
Michael Yao
207da28186 [Doc] Fix a 404 link in installation/cpu.md (#16773)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-17 10:46:21 +00:00
intervitens
5b1aca2ae3 [Bugfix] Fix GLM4 model (#16618)
Signed-off-by: intervitens <intervitens@tutanota.com>
2025-04-17 03:35:07 -07:00
Reid
d8e557b5e5 [doc] add open-webui example (#16747)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-17 18:27:32 +08:00
Cyrus Leung
61a44a0b22 [Doc] Add more tips to avoid OOM (#16765)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-17 09:54:34 +00:00
DefTruth
a6481525b8 [misc] ignore marlin_moe_wna16 local gen codes (#16760)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-17 17:15:14 +08:00
Richard Liaw
8cac35ba43 [Ray] Improve documentation on batch inference (#16609)
Signed-off-by: Richard Liaw <rliaw@berkeley.edu>
2025-04-16 22:19:26 -07:00
Russell Bryant
9dbf7a2dc1 [V1] Remove log noise when idle (#16735)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-16 21:34:08 -07:00
David Heineman
607029e515 [Bugfix] Revert max_prompt_len validation for decoder-only models. (#16741)
Signed-off-by: David Heineman <david@davidheineman.com>
2025-04-16 21:33:15 -07:00
Isotr0py
cb072ce93b [Bugfix] Update Florence-2 tokenizer to make grounding tasks work (#16734)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-17 04:17:39 +00:00
Divakar Verma
95aca283b4 [rocm][V0] fix selection logic for custom PA in V0 (#16426)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-04-16 19:52:11 -07:00
Robert Shaw
2b05b8ce69 [V1][Frontend] Improve Shutdown And Logs (#11737)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Andrew Feldman <afeldman@neuralmagic.com>
Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-16 19:48:34 -07:00
Aaruni Aggarwal
3c776dcefb Adding vllm buildkite job for IBM Power (#16679)
Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
2025-04-17 10:47:47 +08:00
Bryan Lu
2cbd4d2999 [V1][Spec Dec Bug Fix] Respect Spec Dec Method Specification (#16636)
Signed-off-by: Bryan Lu <yuzhelu@amazon.com>
2025-04-16 19:47:26 -07:00
Staszek Paśko
3092375e27 [V1][Performance] Implement custom serializaton for MultiModalKwargs [Rebased] (#16432)
Signed-off-by: Staszek Pasko <staszek@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-16 19:28:32 -07:00
Harry Mellor
3cd91dc955 Help user create custom model for Transformers backend remote code models (#16719)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-17 01:05:59 +00:00
Jade Zheng
8a7368e069 [Misc] Remove redundant comment (#16703)
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-04-17 00:44:52 +00:00
Harry Mellor
93e561ec4d Improve error for structured output backend selection (#16717)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-17 00:35:35 +00:00
Joe Runde
e1b004839a [Hardware] Add processor inputs to platform validation (#16680)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-04-16 09:28:42 -07:00
xsank
ee378f3d49 [Model] support modernbert (#16648)
Signed-off-by: 唯勤 <xsank.mz@alibaba-inc.com>
Co-authored-by: 唯勤 <xsank.mz@alibaba-inc.com>
2025-04-16 05:30:15 -07:00
DefTruth
e82ee40de3 [Bugfix][Kernel] fix potential cuda graph broken for merge_attn_states kernel (#16693)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-16 03:31:39 -07:00
Cyrus Leung
facbe2a114 [Doc] Improve OOM troubleshooting (#16704)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-16 18:29:48 +08:00
Reid
7168920491 [Misc] refactor examples series (#16708)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-16 10:16:36 +00:00
Kay Yan
21378a2323 [CI] Cleanup additional_dependencies: [toml] for pre-commit yapf hook (#16405)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-04-16 10:05:31 +00:00
Shanshan Shen
976711d9db [V1][Structured Output] Move xgrammar related utils to backend_xgrammar.py (#16578)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-16 17:01:36 +08:00
Sage Moore
44fa4d556c [ROCM] Bind triton version to 3.2 in requirements-built.txt (#16664)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-04-16 14:05:28 +08:00
billishyahao
3ac98edcb1 [Feature] add model aware kv ops helper (#16020)
Signed-off-by: billishyahao <bill.he@amd.com>
2025-04-15 23:00:43 -07:00
Richard Zou
966c742ed2 Disable remote caching when calling compile_fx (#16611)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-15 22:18:28 -07:00
Jee Jee Li
0d7d05f4b6 [Misc] Modify LRUCache touch (#16689)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-16 04:51:38 +00:00
rongfu.leng
96bb8aa68b [Bugfix] fix gpu docker image mis benchmarks dir (#16628)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-15 21:21:14 -07:00
Shinichi Hemmi
3badb0213b [Model] Add PLaMo2 (#14323)
Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Signed-off-by: shemmi <shemmi@preferred.jp>
Co-authored-by: Kento Nozawa <nzw0301@preferred.jp>
Co-authored-by: Hiroaki Mikami <mhiroaki@preferred.jp>
Co-authored-by: Calvin Metzger <metzger@preferred.jp>
2025-04-15 19:31:30 -07:00
Angky William
fdcb850f14 [Misc] Enable vLLM to Dynamically Load LoRA from a Remote Server (#10546)
Signed-off-by: Angky William <angkywilliam@Angkys-MacBook-Pro.local>
Co-authored-by: Angky William <angkywilliam@Angkys-MacBook-Pro.local>
2025-04-15 22:31:38 +00:00
Dipika Sikka
54a66e5fee [Misc] Update compressed-tensors WNA16 to support zero-points (#14211) 2025-04-15 07:33:51 -06:00
DefTruth
280d62b8a2 [Kernel] Remove redundant Exp calculations (#16123)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-15 12:58:37 +00:00
Xihui Cang
1666e66443 Add "/server_info" endpoint in api_server to retrieve the vllm_config.  (#16572)
Signed-off-by: Xihui Cang <xihuicang@gmail.com>
2025-04-15 11:50:38 +00:00
Jee Jee Li
1575c1701a [CI/Build] Fix LoRA OOM (#16624)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-15 16:38:19 +08:00
Reid
6ae996a873 [Misc] refactor argument parsing in examples (#16635)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-15 08:05:30 +00:00
Richard Zou
b590adfdc1 Fix vLLM x torch.compile config caching (#16491)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-14 23:11:11 -07:00
Michael Goin
b4fe16c75b Add vllm bench [latency, throughput] CLI commands (#16508)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-14 23:10:35 -07:00
Pooya Davoodi
bc5dd4f669 [Bugfix] Fix broken GritLM model and tests (missing pooling_metadata) (#16631)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2025-04-14 23:09:58 -07:00
Tyler Michael Smith
dbb036cf61 [Bugfix] Fix tests/kernels/test_mamba_ssm_ssd.py (#16623)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-04-15 05:35:38 +00:00
Taneem Ibrahim
70e7ed841d [BugFix]: Update minimum pyzmq version (#16549)
Signed-off-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-04-14 20:06:03 -07:00
Jinzhen Lin
d06ba4ed3f [Kernel] moe wna16 marlin kernel (#14447)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-14 20:05:22 -07:00
Alex Brooks
6b40996ae8 [Core][Bugfix] Fix Offline MM Beam Search (#16390)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-15 10:33:02 +08:00
Shuqiao Li
d2020acac7 config check sleep mode support oot platforms (#16562) 2025-04-14 16:31:50 -07:00
Chengji Yao
1eb3c2ed48 [DOC][TPU] Add core idea about avoiding recompilation after warmup (#16614)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-14 21:56:06 +00:00
Siyuan Liu
c64ee87267 [Hardware][TPU] Add torchvision to tpu dependency file (#16616)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-04-14 17:50:46 -04:00
courage17340
b1308b84a3 [Model][VLM] Add Kimi-VL model support (#16387)
Signed-off-by: courage17340 <courage17340@163.com>
2025-04-14 21:41:48 +00:00
Nishan Acharya
7b5ecf79bd s390x: Fix PyArrow build and add CPU test script for Buildkite CI (#16036)
Signed-off-by: Nishan Acharya <Nishan.Acharya@ibm.com>
2025-04-14 10:55:32 -07:00
Harry Mellor
9883a18859 Fix triton install condition on CPU (#16600)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-14 17:06:01 +00:00
Nicolò Lucchesi
b3f2fddd17 [TPU][V1] Fix exponential padding when max-num-batched-tokens is not a power of 2 (#16596)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-14 17:01:05 +00:00
Cyrus Leung
aa29841ede [Bugfix] Multi-modal caches not acting like LRU caches (#16593)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-14 09:24:16 -07:00
Md. Shafi Hussain
6bf27affb6 [fix]: Dockerfile.ppc64le fixes for opencv-python and hf-xet (#16048)
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
2025-04-14 17:08:39 +01:00
shangmingc
1dd23386ec [Misc] Update usage with mooncake lib for kv transfer (#16523)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-04-14 11:31:37 +00:00
Reid
7cbfc10943 [Misc] refactor examples (#16563)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-14 09:59:15 +00:00
DefTruth
ce4ddd2d1a [Misc] remove warning if triton>=3.2.0 (#16553)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-14 02:39:47 -07:00
Harry Mellor
e51929ebca Improve configs - SchedulerConfig (#16533)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-14 17:24:16 +08:00
Russell Bryant
dc1b4a6f13 [Core][V0] Enable regex support with xgrammar (#13228)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-14 10:13:38 +08:00
Jennifer Zhao
63d2705edb [Benchmark][Bugfix] Fix SonnetDataset default values in benchmark_throughput.py (#16556) 2025-04-13 17:20:26 -07:00
Michael Goin
d085a44082 Enable PTPC FP8 for CompressedTensorsW8A8Fp8MoEMethod (triton fused_moe) (#16537)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-13 14:55:18 +00:00
Lily Liu
f49e5aff11 [V1][Spec Decode] KV cache slots for eagle heads (#16370)
Signed-off-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2025-04-12 19:42:51 -07:00
Ryan McConville
6c11ecf8d3 [Bugfix] Validate logit biases to prevent out of vocab ids crashing engine (#16529)
Signed-off-by: Ryan McConville <ryan@ryanmcconville.com>
2025-04-12 20:19:19 +00:00
SnowCharm
93e5f3c5fb [Perf] Optimize Preparing Inputs for GPU Model Runner (#16484)
Signed-off-by: snowcharm <snowcharmqq@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-12 22:54:37 +08:00
Jie Fu (傅杰)
70363bccfa Fix syntaxWarning: invalid escape sequence '\s' (#16532)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-04-12 14:39:42 +00:00
Jee Jee Li
3cdc57669f [Misc] Delete redundant code (#16530)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-04-12 11:21:37 +00:00
Huazhong Ji
68bb122eb4 [MISC] Make GroupCoordinator compatible with out-of-tree devices (#16464)
Signed-off-by: hzji210@gmail.com <hzji210@gmail.com>
2025-04-12 09:20:25 +00:00
Cyrus Leung
d9fc8cd9da [V1] Enable multi-input by default (#15799)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-12 08:52:39 +00:00
Nicolò Lucchesi
f069f3ea74 [Misc] Openai transcription client example use same Whisper model (#16487)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-12 07:27:03 +00:00
Cyrus Leung
c5bc0e7fcc [Misc] Update chat utils tests (#16520)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-12 06:48:43 +00:00
Tianer Zhou
4a3a518722 fix: spelling (#16466)
Signed-off-by: Tianer Zhou <ezhoureal@gmail.com>
2025-04-11 23:24:22 -07:00
wang.yuqi
fbf722c6e6 [Frontend] support matryoshka representation / support embedding API dimensions (#16331) 2025-04-11 23:23:10 -07:00
leon-seidel
e92d7085bf [Feature][V1] Add xgrammar to support minLength, maxLength with test (#16516)
Signed-off-by: Leon Seidel <leon.seidel@fau.de>
2025-04-11 23:22:07 -07:00
Michael Goin
bd6028d6b0 Optimized topk for topk=1 (Llama-4) (#16512)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-12 14:21:08 +08:00
Ye (Charlotte) Qi
802329dee9 [Doc] Update Llama4 Model Names in Supported Models (#16509)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-04-12 02:53:10 +00:00
Nick Hill
41cc883c29 [BugFix] Handle non-contiguous tensors properly when serializing (#16492)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-11 17:54:06 -07:00
Michael Goin
57504a4bcf [CI][Bugfix] Add mistral_tool_use to Ci (#16517)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 17:52:38 -07:00
Yuan Tang
ed4792c990 [Doc] Fix link to vLLM blog (#16519)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-04-11 17:39:23 -07:00
Michael Goin
87b836ba77 Bugfix for PixtralHF models without spatial_merge_size (#16513)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 23:32:22 +00:00
rongfu.leng
56c76c2e0e [Bugfix] clean up duplicated code (#16485)
Signed-off-by: Gogs <gogs@fake.local>
Co-authored-by: Gogs <gogs@fake.local>
2025-04-11 23:19:40 +00:00
Christian Sears
c09632a66c Update openai_compatible_server.md (#16507)
Signed-off-by: Christian Sears <csears@redhat.com>
2025-04-11 22:54:58 +00:00
Yong Hoon Shin
a3bf8d4a2b [Kernel] Add tuned FusedMoE kernel config for Llama4 Scout, TP=8 on H100 (#16488) 2025-04-12 06:26:55 +08:00
Ye (Charlotte) Qi
16eda8c43a [Frontend] Added chat templates for LLaMa4 pythonic tool calling (#16463)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Kai Wu <kaiwu@meta.com>
2025-04-12 06:26:17 +08:00
Harry Mellor
cd77382ac1 Improve configs - LoadConfig (#16422)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-11 20:27:27 +00:00
Travis Johnson
71b9cde010 [Bugfix] handle alignment of encoder_seq_lens in mllama.py (#14784)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2025-04-11 19:59:50 +00:00
Isotr0py
5285589f37 [Doc] Document InternVL3 support (#16495)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-11 19:41:09 +00:00
Michael Goin
f41647ee6b [Kernel] Support W8A8 channel-wise weights and per-token activations in triton fused_moe_kernel (#16366)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 17:54:08 +00:00
Nicolò Lucchesi
4d022cbc75 [TPU][V1] Make --disable_chunked_mm_input mandatory for serving MM models (#16483)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-11 17:06:14 +00:00
Richard Zou
70de35a881 Fix erroneous "model doesn't support compile" warning (#16486)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-11 16:24:36 +00:00
Tomasz Zielinski
34b2cf3b33 [Hardware][Intel-Gaudi] Multi-step scheduling implementation for HPU (#12779)
Signed-off-by: Tomasz Zielinski <tomasz.zielinski@intel.com>
2025-04-11 07:38:36 -07:00
chaow-amd
9e90c9f73f [Bugfix] Fix bugs of running Quark quantized models (#16236)
Signed-off-by: chaow <chaow@amd.com>
2025-04-11 10:18:32 -04:00
DefTruth
e9528f6dc6 [Kernel] support merge_attn_states CUDA kernel, 3x speedup (#16173)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-11 06:50:50 -06:00
Harry Mellor
51baa9c333 Don't install triton on ppc64le platform (#16470)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-11 10:11:00 +00:00
Reid
35e076b3a8 [Misc] update api_client example (#16459)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-11 10:05:40 +00:00
Jee Jee Li
a26f59ccbc [Misc] Raise error for V1 not supporting Long LoRA. (#16415)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-11 01:51:20 -07:00
Michael Goin
aa3b3d76e0 Enforce valid max_num_batched_tokens when disable_chunked_mm_input=True (#16447)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 08:09:52 +00:00
Jee Jee Li
f7030df3be [Core][LoRA][1/N] Add LoRA for EncoderDecoderModelRunner (#15990)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-11 15:32:37 +08:00
DefTruth
905e91e9ac Revert "[Model] use AutoWeightsLoader for deepseek_v2, internlm2" (#16453) 2025-04-11 06:44:22 +00:00
Alex Brooks
f8f9c0ba62 [Bugfix] Don't set an upper bound on repetition penalty (#16403)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-11 14:19:40 +08:00
Li, Jiang
dda811021a [CPU][Bugfix] Fix CPU docker issues (#16454)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-04-11 14:19:07 +08:00
Isotr0py
93195146ea [Bugfix][VLM] Fix failing Phi-4-MM multi-images tests and add vision-speech test (#16424)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-11 04:57:16 +00:00
Michael Goin
ed37599544 Update supported_hardware.md for TPU INT8 (#16437) 2025-04-11 12:28:07 +08:00
Yong Hoon Shin
99ef59cf7f [Llama4] Enable attention temperature tuning by default for long context (>32k) (#16439)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-04-10 21:26:07 -07:00
Chenyaaang
d544d141ec update benchmark_serving_structured_output to include auto backend (#16438)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-04-11 12:25:52 +08:00
Alexey Belyakov
3e397a9484 check input length of sonnet samples (#16423)
Signed-off-by: alexey-belyakov <alexey.belyakov@intel.com>
2025-04-11 10:15:06 +08:00
WWW
268c325078 Fix range_ratio Bug in RandomDataset (#16126)
Signed-off-by: jadewang21 <jadewangcn@outlook.com>
2025-04-10 15:31:17 -07:00
Nicolò Lucchesi
3cc9af88ff [TPU][V1] Disable per-request seed/Generator (#16172)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-10 17:05:44 -04:00
look
7cd0bd7212 [Bugfix] Fix output token length check logic (#16419)
Signed-off-by: look <eeslook@163.com>
2025-04-10 20:16:48 +00:00
Cyrus Leung
56d4aefa33 [VLM] Avoid unnecessary dummy multimodal data during processing (#16416)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-10 19:32:14 +00:00
Nick Hill
dd143ef541 [V1] Zero-copy tensor/ndarray serialization/transmission (#13790)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-10 19:23:14 +00:00
Chih-Chieh Yang
daefed052c [Model] Reduce redundant computations in mamba2 blocks for Bamba-9B (#15423)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
2025-04-10 19:07:07 +00:00
Chenyaaang
5fbab20e02 [Bugfix] Fix bug when dataset is json (#15899)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-04-10 18:35:41 +00:00
Lily Liu
e8224f3dca [V1][Spec Decode] Eagle Model loading (#16035)
Signed-off-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2025-04-10 11:21:48 -07:00
Russell Bryant
9665313c39 [V1] Set structured output backend to auto by default (#15724)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-10 17:53:26 +00:00
Harry Mellor
0c54fc7273 Improve configs - ParallelConfig (#16332)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-10 17:34:37 +00:00
Nicolò Lucchesi
c1b57855ec [TPU][V1] Use language_model interface for getting text backbone in MM (#16410)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-10 17:32:04 +00:00
Cyrus Leung
83b824c8b4 [VLM] Remove BaseProcessingInfo.get_mm_max_tokens_per_item (#16408)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-10 09:06:58 -07:00
Lu Fang
7678fcd5b6 Fix the torch version parsing logic (#15857) 2025-04-10 07:37:47 -07:00
wineandchord
8661c0241d [CI] Add auto update workflow for Dockerfile graph (#11879)
Signed-off-by: wineandchord <guoqizhou19@gmail.com>
2025-04-10 13:43:05 +00:00
Reid
ce8d6b75fc [doc] update the wrong link (#16401)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-10 21:02:37 +08:00
Ye (Charlotte) Qi
61de3ef74b [Model] Remove image mm limit for LLaMa4 (#16365)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-04-10 09:36:27 +00:00
cyyever
ec1f9c8c91 Update Numba to 0.61.2 (#16376)
Signed-off-by: cyy <cyyever@outlook.com>
2025-04-10 07:59:37 +00:00
Reid
65e09094c4 [doc] add download model tips (#16389)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-10 07:45:26 +00:00
Michael Goin
c70cf0fe06 [Kernel] Use moe_wna16 kernel for compressed tensors wna16 moe models (#16038)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-10 15:08:47 +08:00
Cyrus Leung
a5d11a54dc [Bugfix] Fix validation error for text-only Mllama 3.2 (#16377)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-10 14:19:42 +08:00
Cyrus Leung
3d4c87758e [Misc] Update transformers version limits of multi-modal tests (#16381)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-09 23:03:33 -07:00
Aaron Ang
a9bd832fc5 [Model] use AutoWeightsLoader for deepseek_v2, internlm2 (#16383)
Signed-off-by: Aaron Ang <aaron.angyd@gmail.com>
2025-04-09 23:01:00 -07:00
Chenyaaang
417bcefbae fix sonnet dataset sample when prefix len is very small (#16379)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-04-10 05:35:07 +00:00
Michael Goin
baada0e737 [Bugfix][TPU] Fix TPU validate_request (#16369)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-04-10 12:55:12 +08:00
Benjamin Kitor
82eb61dd4c [misc] use tqdm.auto where appropriate (#16290)
Signed-off-by: Benjamin Kitor <bkitor@gigaio.com>
2025-04-09 21:54:54 -07:00
Roger Wang
0d4d06fe2f [CI][Bugfix] Pin triton version for CPU (#16384)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-04-10 04:35:00 +00:00
Jintao
4aed0ca6a2 [bugfix] Avoid the time consumption caused by creating dummy videos. (#16371) 2025-04-10 04:30:05 +00:00
Chengji Yao
1621b25288 [TPU] Fix dummy loading OOM (#16372)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-10 04:06:16 +00:00
Aaron Ang
a564797151 [Model] use AutoWeightsLoader for granite, granitemoe, granitemoeshared, grok1, mixtral (#16325)
Signed-off-by: Aaron Ang <aaron.angyd@gmail.com>
2025-04-09 20:07:40 -07:00
Guillaume Calmettes
1da6a09274 [Bugfix]: do not shutdown server if skip_special_use=False for MistralTokenizer (#14094)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-09 19:43:09 -07:00
Yuxuan Zhang
1e44ffc3ff Add GLM-4-0414 support (#16338)
Signed-off-by: lvfei.lv <lvfei.lv@alibaba-inc.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Ajay Vohra <ajayvohr@amazon.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
Co-authored-by: Accelerator1996 <lvfei.lv@alibaba-inc.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: yihong <zouzou0208@gmail.com>
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: ajayvohra2005 <ajayvohr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-10 09:19:42 +08:00
Chengji Yao
a454748544 [TPU][V1] Refine tpu_model_runner to mitigate future recompilation issues (#16275)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-09 18:51:51 -06:00
Reid
1bff42c4b7 [Misc] refactor Structured Outputs example (#16322)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-09 23:32:42 +00:00
Joe Runde
cb391d85dc [Hardware] add platform-specific request validation api (#16291)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-04-09 12:50:01 -07:00
Russell Bryant
fee5b8d37f [Build/CI] Add tracing deps to vllm container image (#15224)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-09 19:14:06 +00:00
Michael Goin
b2ce859bd2 Fix benchmark_throughput.py --backend=hf (#16352)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-09 19:09:28 +00:00
Chendi.Xue
566f10a929 [CI]Fix hpu docker and numpy version for CI (#16355)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-04-09 17:52:26 +00:00
Guillaume Calmettes
c3b5189137 [Bugfix] catch AssertionError in MistralTokenizer as ValueError (#16344)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-09 17:33:24 +00:00
zh Wang
a25866ac8d [Bugfix] Fix profiling.py (#16202)
Signed-off-by: zh Wang <rekind133@outlook.com>
2025-04-09 17:03:34 +00:00
Michael Goin
098900d7c2 Revert "Update label-tpu mergify and remove removal bot" (#16350) 2025-04-09 07:59:36 -07:00
Guillaume Calmettes
98d01d3ce2 [Bugfix][Frontend] respect provided default guided decoding backend (#15476)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-09 05:11:10 -07:00
Nicolò Lucchesi
d55244df31 [Model] Add SupportsMultiModal.get_language_model interface (#16007)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-09 04:12:54 -07:00
yihong
04149cce27 [BugFix] fix some typos found by typos. (#16314)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-09 03:43:59 -07:00
ajayvohra2005
24834f4894 update neuron config (#16289)
Signed-off-by: Ajay Vohra <ajayvohr@amazon.com>
2025-04-09 03:43:22 -07:00
Lucia Fang
ec7da6fcf3 [BugFix] llama4 qknorm should be not shared across head (#16311)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-04-09 00:59:14 -07:00
yihong
819d548e8a [BugFix] logger is not callable (#16312)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-09 00:59:02 -07:00
Michael Goin
477d2a8aa2 Update label-tpu mergify and remove removal bot (#16298) 2025-04-09 07:56:25 +00:00
Cyrus Leung
e484e02857 [Bugfix] Avoid transferring cached multi-modal items from P0 to P1 (#16273)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-09 00:51:27 -07:00
Accelerator1996
24f6b9a713 [Misc] Fix test_sharded_state_loader.py(#16004) (#16005)
Signed-off-by: lvfei.lv <lvfei.lv@alibaba-inc.com>
2025-04-09 14:47:30 +08:00
Luka Govedič
9cdde47289 [BugFix] Fix fusion test and add them to CI (#16287)
Signed-off-by: luka <luka@neuralmagic.com>
2025-04-08 23:46:45 -07:00
Chengji Yao
b1eb4ca152 [TPU] Update PyTorch/XLA (#16288)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-09 14:46:32 +08:00
Michael Goin
87b4ac56c2 [CI][Bugfix] Fix bad tolerance for test_batch_base64_embedding (#16221)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-09 04:14:46 +00:00
Russell Bryant
cb84e45ac7 [Core] Upgrade to xgrammar 0.1.18, add cache size limit (#16283)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-08 19:13:22 -07:00
rongfu.leng
4716377fbc [Feature] Estimate max-model-len use available KV cache memory (#16168)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-08 19:12:51 -07:00
rongfu.leng
4e9cf8c1dd [Bugfix] fix gettid method is not define (#16084)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-08 19:12:44 -07:00
TJian
2976dc27e9 [Bug] [ROCm] Fix Llama 4 Enablement Bug on ROCm: V0 ROCmFlashAttentionImpl and Triton Fused MoE bugs (#16198)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Co-authored-by: Hongxia Yang <hongxia.yang@amd.com>
Co-authored-by: kliuae <kuanfu.liu@embeddedllm.com>
2025-04-08 19:12:34 -07:00
Chauncey
102bf967f0 [Model] Add smolvlm support (#16017)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-08 19:12:17 -07:00
yueshen2016
1f4b09b525 Add support to modelopt quantization of Mixtral model (#15961)
Signed-off-by: Yue <yueshen@nvidia.com>
2025-04-09 01:53:31 +00:00
Jee Jee Li
86c3369eb8 [CI/Build] Fix CI LoRA failure (#16270)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-09 09:13:56 +08:00
Russell Bryant
2755c34a8f [V1] Update structured output offline inference example (#15721)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-08 22:34:09 +00:00
Jinzhen Lin
db10422184 [Bugfix] fix deepseek fp16 scale bug (#14809)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-08 16:56:09 -04:00
Lucas Wilkinson
e1a2c699dd [BugFix] Fix Llama4 - Index Error When Single Request Near Max Context (#16209)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-08 18:56:51 +00:00
Harry Mellor
0115ccd5c0 Add warning that content below line in template will be removed (#16276)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-08 18:18:40 +00:00
Isotr0py
40b4284fe3 [Bugfix] Handle process_weights_after_loading for QKVCrossParallelLinear (#15328)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-08 10:02:23 -07:00
Cyrus Leung
4ebc0b9640 [Bugfix] Proper input validation for multi-modal encoder-decoder models (#16156)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-08 09:45:21 -07:00
Kero Liang
dc96fd54c6 [Misc] Avoid stripping meaningful whitespace from nvidia-smi topo -m output in collect_env.py (#16272)
Signed-off-by: imkero <kerorek@outlook.com>
2025-04-08 16:08:09 +00:00
wang.yuqi
1f5d13ab9f [New Model]: jinaai/jina-embeddings-v3 (#16120) 2025-04-08 08:39:12 -07:00
Harry Mellor
90cb44eb02 Update to transformers==4.51.1 (#16257)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-08 06:53:39 -07:00
Kebe
e11880deea [Bugfix] Remove triton do_bench fast_flush arg (#16256)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-04-08 13:51:06 +00:00
TY-AMD
9351f91be9 [BugFix][ROCm] Fix GGUF MoE Dispatch Block_Dim for ROCm (#16247)
Signed-off-by: Tianyuan Wu <Tianyuan.Wu@amd.com>
2025-04-08 05:10:26 -07:00
rongfu.leng
5a1e1c8353 [Model] use AutoWeightsLoader for phimoe,qwen2_moe,qwen3_moe (#16203)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-08 04:05:47 -07:00
Alex Brooks
69ecaa7c79 [Misc] Add warning for multimodal data in LLM.beam_search (#16241)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-04-08 04:05:27 -07:00
Reid
7f00899ff7 [Misc] format and refactor some examples (#16252)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-08 10:42:32 +00:00
Simon Mo
995e3d1f41 [Docs] Add Slides from Singapore Meetup (#16213)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-04-08 07:20:22 +00:00
Kebe
b4ac449a83 [Misc] Merge the logs of pp layers partitions (#16225)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-04-08 00:18:15 -07:00
Michael Goin
8e5314a468 [V1] Add disable_chunked_mm_input arg to disable partial mm input prefill (#15837)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-07 23:24:07 -07:00
Siyuan Liu
87918e40c4 [torch.compile][TPU] Make @support_torch_compile work for XLA backend (#15782)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-08 14:23:53 +08:00
Isotr0py
f6b32efb7f [Bugfix] Fix and reorganize broken GGUF tests and bump gguf version (#16194)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-08 13:38:13 +08:00
Michael Goin
b99733d092 [Bugfix] Do not skip "empty" parts of chats that are parsable (#16219)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-08 05:14:15 +00:00
Yong Hoon Shin
05a015d6a5 Add warning for Attention backends that do not support irope yet (#16212) 2025-04-08 03:59:26 +00:00
zxfan-cpu
ad971af8c7 [Bugfix] fix use-ep bug to enable ep by dp/tp size > 1 (#16161) 2025-04-07 20:48:47 -07:00
Roger Wang
f2ebb6f541 [V1] Scatter and gather placeholders in the model runner (#16076)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
2025-04-08 10:43:41 +08:00
Satyajith Chilappagari
1d01211264 Update BASE_IMAGE to 2.22 release of Neuron (#16218) 2025-04-07 19:11:18 -07:00
Miles Williams
f94ab12f79 [Misc] Update compressed-tensors to version 0.9.3 (#16196)
Signed-off-by: Miles Williams <42222518+mlsw@users.noreply.github.com>
2025-04-07 19:09:06 -07:00
youkaichao
a865bc1ca6 [core] do not send error across process (#16174)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-04-07 19:09:03 -07:00
Michael Goin
21802c4b6d [ROCm][Bugfix][FP8] Make fp8 quant respect fused modules mapping (#16031)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-04-07 21:28:14 -04:00
Driss Guessous
652907b354 Torchao (#14231)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-04-07 19:39:28 -04:00
leon-seidel
24f1c01e0f [Bugfix][V0] XGrammar structured output supports Enum (#15878)
Signed-off-by: Leon Seidel <leon.seidel@fau.de>
2025-04-07 22:38:25 +00:00
Reid
fad6e2538e [Misc] add description attribute in CLI (#15921)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-07 22:30:35 +00:00
Nick Hill
7f6d47c1a2 [V1][BugFix] Exit properly if engine core fails during startup (#16137)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-07 15:30:15 -07:00
Benjamin Chislett
3147586ebd [Bugfix] Fix guidance backend for Qwen models (#16210)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-04-07 22:15:43 +00:00
Roger Wang
ed636d99ca [Misc] Move Llama 4 projector call into encoder execution (#16201) 2025-04-07 14:02:05 -07:00
Nicolò Lucchesi
090c856d76 [Misc] Human-readable max-model-len cli arg (#16181)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-04-07 14:40:58 -04:00
Gregory Shtrasberg
ad434d4cfe Print the warning only once (#16193)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-07 18:30:06 +00:00
Cyrus Leung
66d433b94f [V1] Revert the default max_num_seqs to V0 values for most hardware (#16158)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 13:54:36 -04:00
Cyrus Leung
027b204ff1 [Bugfix] Re-enable support for ChatGLMForConditionalGeneration (#16187)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 23:15:58 +08:00
Lu Fang
55dcce91df Upstream Llama4 Support to Main (#16113)
Signed-off-by: Aston Zhang <22279212+astonzhang@users.noreply.github.com>
Signed-off-by: Chris Thi <chris.c.thi@gmail.com>
Signed-off-by: drisspg <drisspguessous@gmail.com>
Signed-off-by: Jon Swenson <jmswen@gmail.com>
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
Signed-off-by: Lu Fang <fanglu@meta.com>
Signed-off-by: Xiaodong Wang <xdwang@meta.com>
Signed-off-by: Yang Chen <yangche@fb.com>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Signed-off-by: Zijing Liu <liuzijing2014@gmail.com>
Signed-off-by: Lu Fang <lufang@fb.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Lucia Fang <fanglu@fb.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 08:06:27 -07:00
Robin
8017c8db7f [Doc]Update image to latest version (#16186)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-04-07 14:17:39 +00:00
Reid
dc3529dbf6 [Misc] improve example mlpspeculator and llm_engine_example (#16175)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-07 11:53:52 +00:00
YamPengLi
7699258ef0 [Model] Add Qwen3 and Qwen3MoE (#15289)
Signed-off-by: YamPengLi <yampayne.lyp@alibaba-inc.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-07 04:06:41 -07:00
Shanshan Shen
e9ba99f296 [V1][Structured Output] Add supports_structured_output() method to Platform (#16148)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-07 11:06:24 +00:00
Isotr0py
7c80368710 [VLM] Florence-2 supports online serving (#16164)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-07 04:04:02 -07:00
yihong
95d63f38c0 doc: fix some typos in doc (#16154)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-07 05:32:06 +00:00
Roger Wang
bb8dab821e [CI] Set max transformers version for Ultravox model test (#16149)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-04-07 04:37:58 +00:00
Isotr0py
fc0f87768a [Bugfix] Make dummy encoder prompt padding alternative and add missing warnings (#16129)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-07 04:07:15 +00:00
Cyrus Leung
0a57386721 [Misc] Update Mistral-3.1 example (#16147)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 03:57:37 +00:00
Woosuk Kwon
3749e28774 [V1][Minor] Minor simplification for get_computed_blocks (#16139)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-06 20:38:12 -07:00
Kay Yan
86fc2321ff [Metrics] Add bucket for request_latency, time_to_first_token and time_per_output_token (#15202)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-04-06 20:34:51 -07:00
Martin Hoyer
2549c0dfef Fix requires-python (#16132) 2025-04-06 19:22:25 -07:00
Woosuk Kwon
b10e519895 [V1][Minor] Optimize get_cached_block (#16135) 2025-04-06 20:48:14 +00:00
Chengji Yao
9bde5ba127 [TPU] Update PyTorch/XLA (#16130)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-06 18:25:55 +00:00
Reid
72c8f1ad04 [Misc] update requires-python in pyproject.toml (#16116)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-06 14:56:34 +00:00
paolovic
da224daaa9 [Bugfix] add hf_token to EngineArgs (#16093)
Signed-off-by: paolovic <paul-philipp.luley@uzh.ch>
Co-authored-by: paolovic <paul-philipp.luley@uzh.ch>
2025-04-06 14:47:33 +00:00
Varun Sundar Rabindranath
3a100b9278 [Bugfix] LoRA : Fix the order in which the kernels process LoRAs (#16040)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-04-06 14:04:50 +00:00
rongfu.leng
242a637aea [Model] use AutoWeightsLoader for stablelm,starcoder2,zamba2 (#16103)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-06 05:52:01 -07:00
Isotr0py
c2a9671510 [Misc] Improve model redirect to accept json dictionary (#16119)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-06 05:51:45 -07:00
Paul Schweigert
d5ae4f7f42 [Doc][Bugfix] Add missing EOF in k8s deploy doc (#16025) 2025-04-06 12:10:57 +00:00
Reid
b6c502a150 [Misc] refactor example eagle (#16100)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-06 09:42:48 +00:00
Roger Wang
9ca710e525 [CI][V1] Fix passing tokenizer as kwarg to validate_guidance_grammar (#16117)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-04-06 16:18:00 +08:00
Ben Jackson
eb07c8cb5b [Frontend] Fix typo in tool chat templates for llama3.2 and toolace (#14501)
Signed-off-by: Ben Jackson <ben@ben.com>
2025-04-06 07:44:36 +00:00
Hyesoo Yang
ba10801961 [Benchmark] Add sampling parameters to benchmark_serving. (#16022)
Signed-off-by: Hyesoo Yang <hyeygit@gmail.com>
2025-04-06 12:30:35 +08:00
Lucia Fang
620fc2d09e [Model] fix model testing for TeleChat2ForCausalLM and V0 llama4 (#16112)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-04-05 21:23:40 -07:00
Jonghyun Choe
29283eaa7e [Model] use AutoWeightsLoader for phi, gemma, deepseek (#16088)
Signed-off-by: Jonghyun Choe <andy.choe729@gmail.com>
2025-04-05 20:34:38 -07:00
Jinzhen Lin
2fa66ef713 [Bugfix] fix use_atomic_add support of marlin kernel when using v1 engine (#15946)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-04-05 20:04:22 -07:00
Chauncey
13affc432d [Misc] Remove redundant code (#16098)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-05 20:03:50 -07:00
Reid
d8f094a92a [Misc] format output for encoder_decoder.py (#16095)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-05 19:57:18 -07:00
Harry Mellor
97ae6d777f Fix some capitalisations in generated examples doc titles (#16094)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-05 13:44:03 +00:00
yihong
6baeee70d1 Revert "doc: add info for macos clang errors (#16049)" (#16091)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-05 11:51:51 +00:00
Reid
d2517a4939 [doc] fix 404 (#16082)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-05 11:39:18 +00:00
yihong
6342adc438 fix: support clang17 for macos and fix the real libomp (#16086)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-05 11:00:12 +00:00
Kevin H. Luu
0adba91547 [CI] Fix benchmark script level (#16089) 2025-04-05 03:36:01 -07:00
Tristan Leclercq
4285e423a6 [Misc] Auto detect bitsandbytes pre-quantized models (#16027)
Signed-off-by: Tristan Leclercq <tristanleclercq@gmail.com>
2025-04-04 23:30:45 -07:00
Woosuk Kwon
63375f0cdb [V1][Spec Decode] Update N-gram Proposer Interface (#15750)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-04 16:32:54 -07:00
Michael Goin
70ad3f9e98 [Bugfix][TPU] Fix V1 TPU worker for sliding window (#16059)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-04-04 23:31:19 +00:00
bnellnm
d6fc629f4d [Kernel][Minor] Re-fuse triton moe weight application (#16071)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-04-04 23:27:34 +00:00
Roger Wang
af51d80fa1 Revert "[V1] Scatter and gather placeholders in the model runner" (#16075) 2025-04-04 14:50:57 -07:00
Cyrus Leung
f5722a5052 [V1] Scatter and gather placeholders in the model runner (#15712)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-04-04 21:26:44 +00:00
Nick Hill
651cf0fec1 [V1] DP scale-out (1/N): Use zmq ROUTER/DEALER sockets for input queue (#15906)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-04 12:56:43 -07:00
Kevin H. Luu
4dc52e1c53 [CI] Reorganize .buildkite directory (#16001)
Signed-off-by: kevin <kevin@anyscale.com>
2025-04-04 12:16:20 -07:00
Michael Goin
4708f13a9c [Bugfix] Fix default behavior/fallback for pp in v1 (#16057)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-04 17:58:08 +00:00
Gregory Shtrasberg
a6d042df0a [ROCm][Bugfix] Bring back fallback to eager mode removed in #14917, but for ROCm only (#15413)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-04 09:40:37 -07:00
Gregory Shtrasberg
40a36ccfeb [ROCm][Bugfix] Use platform specific FP8 dtype (#15717)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-04 09:40:20 -07:00
Ilya Markov
ef608c37a7 [Distributed] [ROCM] Fix custom allreduce enable checks (#16010)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
2025-04-04 09:39:08 -07:00
Li, Jiang
2386803f2a [CPU] Change default block_size for CPU backend (#16002)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-04-04 09:39:05 -07:00
Ziji Shi (Steven)
95862f7b4d [Benchmark][Doc] Update throughput benchmark and README (#15998)
Signed-off-by: StevenShi-23 <shi.ziji.sm@gmail.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-04-04 09:39:02 -07:00
Isotr0py
230b131b54 [Bugfix][kernels] Fix half2float conversion in gguf kernels (#15995)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-04 09:38:58 -07:00
liuzhenwei
0812d8dd41 [Hardware][Gaudi][BugFix] fix arguments of hpu fused moe (#15945)
Signed-off-by: zhenwei <zhenweiliu@habana.ai>
2025-04-04 09:38:55 -07:00
Jonghyun Choe
bf7e3c51ae [Model] use AutoWeightsLoader for baichuan, gpt-neox, mpt (#15939)
Signed-off-by: Jonghyun Choe <andy.choe729@gmail.com>
2025-04-04 09:38:52 -07:00
Mark McLoughlin
a35a8a8392 [V1][Spec Decode] Avoid logging useless nan metrics (#16023)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-04-04 08:52:41 -07:00
yihong
4ef0bb1fcf doc: add info for macos clang errors (#16049)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-04 14:58:16 +00:00
Chengji Yao
fadc59c0e6 [TPU][V1] Remove ragged attention kernel parameter hard coding (#16041)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-04 07:48:50 -04:00
Reid
86cbd2eee9 [Misc] improve gguf check (#15974)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-04 01:33:36 +00:00
Huy Do
092475f738 [ROCm] Tweak the benchmark script to run on ROCm (#14252) 2025-04-03 17:12:48 -07:00
bnellnm
dcc56d62da [Bugfix] Fix function names in test_block_fp8.py (#16033)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-04-03 23:01:34 +00:00
Robert Shaw
f15e70d906 [TPU] Switch Test to Non-Sliding Window (#15981)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-04-03 14:28:45 -07:00
iefgnoix
b6be6f8d1e [TPU] Support sliding window and logit soft capping in the paged attention kernel for TPU. (#15732)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-04-03 14:23:28 -07:00
Alexei-V-Ivanov-AMD
03a70eacaf Re-enable the AMD Testing for the passing tests. (#15586)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-04-03 11:05:17 -07:00
yarongmu-google
45b1ff7a25 [Misc][Performance] Advance tpu.txt to the most recent nightly torch … (#16024) 2025-04-03 17:32:54 +00:00
bnellnm
15ba07ef25 [Minor] Fused experts refactor (#15914)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-04-03 10:19:38 -07:00
Liangfu Chen
d2b58ca203 [Neuron][kernel] Fuse kv cache into a single tensor (#15911)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-04-03 09:51:32 -07:00
Kyle Sayers
82e7e19a6e [SupportsQuant] Chameleon, Chatglm, Commandr (#15952)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-04-03 08:25:22 -07:00
Kyle Sayers
421c462948 [SupportsQuant] Bert, Blip, Blip2, Bloom (#15573)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-04-03 08:23:19 -07:00
yihong
84884cd9ac fix: tiny fix make format.sh excutable (#16015)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-03 15:18:05 +00:00
Reid
a43aa183dc [doc] update contribution link (#15922)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-03 10:47:31 +00:00
wwl2755
463bbb1835 [Bugfix][V1] Fix bug from putting llm_engine.model_executor in a background process (#15367)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-04-03 07:32:10 +00:00
youkaichao
5e125e74d1 [misc] improve error message for "Failed to infer device type" (#15994)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-04-03 14:45:03 +08:00
Ziji Shi (Steven)
06f21ce7a5 [Benchmark] Add AIMO Dataset to Benchmark (#15955)
Signed-off-by: Ziji Shi <shi.ziji.sm@gmail.com>
Signed-off-by: StevenShi-23 <shi.ziji.sm@gmail.com>
2025-04-03 06:09:18 +00:00
Aleksandr Malyshev
57a810db9c [ROCM][V0] PA kennel selection when no sliding window provided (#15982)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
2025-04-03 05:28:44 +00:00
youkaichao
8b664706aa [bugfix] add seed in torchrun_example.py (#15980)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-04-03 12:25:01 +08:00
yihong
37bfee92bf fix: better error message for get_config close #13889 (#15943)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-03 03:53:19 +00:00
Aleksandr Malyshev
e73ff24e31 [ROCM][KERNEL] Paged attention for V1 (#15720)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: root <root@banff-cyxtera-s65-4.amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: root <root@banff-cyxtera-s65-4.amd.com>
2025-04-02 19:48:00 -07:00
Nicolò Lucchesi
bd7599d34a [V1][TPU] Do not compile sampling more than needed (#15883)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-03 01:36:01 +00:00
Chengji Yao
01b6113659 [TPU] optimize the all-reduce performance (#15903)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-03 00:25:14 +00:00
Hyesoo Yang
1b84eff03a [V1][TPU] TPU-optimized top-p implementation (avoids scattering). (#15736)
Signed-off-by: Hyesoo Yang <hyeygit@gmail.com>
Co-authored-by: root <root@t1v-n-822696b7-w-0.us-central2-b.c.tpu-prod-env-large-adhoc.internal>
2025-04-02 17:18:08 -07:00
Harry Mellor
55acf86bf8 Fix huggingface-cli[hf-xet] -> huggingface-cli[hf_xet] (#15969)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-02 23:37:30 +00:00
Michael Goin
f021b97993 [V1] Support Mistral3 in V1 (#15950)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-02 15:36:24 -07:00
youkaichao
1cab43c2d2 [misc] instruct pytorch to use nvml-based cuda check (#15951)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-04-03 01:02:58 +08:00
Nishidha
8bd651b318 Restricted cmake to be less than version 4 as 4.x breaks the build of… (#15859)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
2025-04-02 16:19:39 +00:00
Jee Jee Li
58e234a754 [Misc] V1 LoRA support CPU offload (#15843)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-02 23:04:43 +08:00
rongfu.leng
e86c414d6a [Model] use AutoWeightsLoader in model load_weights (#15770)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-02 07:47:31 -07:00
Li, Jiang
550b2801ad [CPU][Bugfix] Using custom allreduce for CPU backend (#15934)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-04-02 07:46:47 -07:00
Matthias Matt
cefb9e5a28 [Frontend] Implement Tool Calling with tool_choice='required' (#13483)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
Signed-off-by: Matt, Matthias <matthias.matt@tuwien.ac.at>
Co-authored-by: Liangfu Chen <liangfc@amazon.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-04-02 07:45:45 -07:00
Mark McLoughlin
98d7367b61 [Metrics] Hide deprecated metrics (#15458)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-04-02 07:37:19 -07:00
Chauncey
594a8b9030 [Bugfix] Fix the issue where the model name is empty string, causing no response with the model name. (#15938)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-02 06:33:52 -07:00
Kay Yan
44f990515b [CI] Remove duplicate entrypoints-test (#15940)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-04-02 02:44:01 -07:00
Brayden Zhong
252937806c [Bugfix][Benchmarks] Ensure async_request_deepspeed_mii uses the OpenAI choices key (#15926)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-04-02 02:19:35 -07:00
Harry Mellor
51826d51fa Add minimum version for huggingface_hub to enable Xet downloads (#15873)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-02 02:03:36 -07:00
Russell Bryant
14e53ed11f [V1] Fix json_object support with xgrammar (#15488)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-02 02:00:08 -07:00
Eric Tang
ddb94c2605 [core] Add tags parameter to wake_up() (#15500)
Signed-off-by: Eric <erictang000@gmail.com>
2025-04-02 01:59:27 -07:00
LukasBluebaum
90969fb39a [Kernel] Add more dtype support for GGUF dequantization (#15879)
Signed-off-by: lukas.bluebaum <lukas.bluebaum@aleph-alpha.com>
2025-04-02 01:58:48 -07:00
Chris Thi
101f1481f9 [Build/CI] Update lm-eval to 0.4.8 (#15912)
Signed-off-by: Chris Thi <chris.c.thi@gmail.com>
2025-04-02 01:47:57 -07:00
Thien Tran
2edc87b161 [Bugfix] Fix cache block size calculation for CPU MLA (#15848)
Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
2025-04-02 01:45:02 -07:00
Jee Jee Li
4203926f10 [CI/Build] Further clean up LoRA tests (#15920)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-02 01:39:09 -07:00
Chauncey
cdb57015a7 [Misc] Replace print with logger (#15923)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-02 01:37:38 -07:00
Li Wang
aa557e6422 [Benchmark]Fix error message (#15866)
Signed-off-by: wangli <wangli858794774@gmail.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2025-04-02 01:32:24 -07:00
Roger Wang
0e00d40e4f [V1][Bugfix] Fix typo in MoE TPU checking (#15927)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-04-01 23:46:42 -07:00
chun
c920e01242 [Doc] Update rocm.inc.md (#15917)
Signed-off-by: chun37 <chun.jb.37@gmail.com>
2025-04-01 23:38:26 -07:00
Woosuk Kwon
274d8e8818 [V1][Minor] Enhance SpecDecoding Metrics Log in V1 (#15902)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-01 23:38:02 -07:00
Thien Tran
2039c6305b [Bugfix] Fix imports for MoE on CPU (#15841)
Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
2025-04-02 03:33:55 +00:00
Brayden Zhong
6efb195a6e [V1] Fix: make sure k_index is int64 for apply_top_k_only (#15907)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-04-01 19:06:44 -07:00
Ekagra Ranjan
24b7fb455a [Spec Decode] Fix input triton kernel for eagle (#15909) 2025-04-01 18:15:14 -07:00
Simon Mo
58f5a59769 [Docs] Add Intel as Sponsor (#15913)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-04-01 17:16:55 -07:00
Simon Mo
db9dfcfa6a [Docs] Add Ollama meetup slides (#15905)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-04-01 13:58:59 -07:00
Gerald
9ef98d527e [Model][MiniMaxText01] Support MiniMaxText01 model inference (#13454)
Signed-off-by: qscqesze <475517977@qq.com>
Co-authored-by: qingjun <qingjun@minimaxi.com>
Co-authored-by: qscqesze <475517977@qq.com>
2025-04-01 16:23:55 -04:00
yihong
93491aefc7 [BugFix] make sure socket close (#15875)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-01 13:10:24 -07:00
Simon Mo
7acd539cd7 [Docs] update usage stats language (#15898)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-04-01 12:54:13 -07:00
Woosuk Kwon
e75a6301bd [V1][Spec Decode] Implement Eagle Proposer [1/N] (#15729)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-01 12:33:16 -07:00
Mark McLoughlin
a79cc68b3a [V1][Metrics] Initial speculative decoding metrics (#15151)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-04-01 10:45:04 -07:00
Roger Wang
7e3f7a4ee7 [CI] Disable flaky structure decoding test temporarily. (#15892)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-04-01 17:42:34 +00:00
cloud11665
9ec8257914 [Model] Add module name prefixes to gemma3 (#15889)
Signed-off-by: Bartholomew Sabat <bartek@recursal.ai>
Co-authored-by: Bartholomew Sabat <bartek@recursal.ai>
2025-04-01 10:13:40 -07:00
Jennifer Zhao
38327cf454 [Model] Aya Vision (#15441)
Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-04-01 16:30:43 +00:00
Jee Jee Li
dfa82e2a3d [CI/Build] Clean up LoRA tests (#15867)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-01 16:28:50 +00:00
bnellnm
e59ca942f5 Add option to use DeepGemm contiguous grouped gemm kernel for fused MoE operations. (#13932)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-04-01 12:07:43 -04:00
Gregory Shtrasberg
a57a3044aa [ROCm][Build][Bugfix] Bring the base dockerfile in sync with the ROCm fork (#15820)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-01 08:56:39 -07:00
Isotr0py
4e5a0f6ae2 [Misc] Allow using OpenCV as video IO fallback (#15055)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-01 15:55:13 +00:00
Harry Mellor
b63bd14999 Reinstate format.sh and make pre-commit installation simpler (#15890)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-01 15:41:30 +00:00
chaow-amd
2041c0e360 [Doc] Quark quantization documentation (#15861)
Signed-off-by: chaow <chaow@amd.com>
2025-04-01 08:32:45 -07:00
wang.yuqi
085cbc4f9f [New Model]: jinaai/jina-reranker-v2-base-multilingual (#15876)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-01 08:32:26 -07:00
Harry Mellor
2b93162fb0 Remove format.sh as it's been unsupported >70 days (#15884)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-01 22:27:46 +08:00
Reid
2e45bd29fe [Misc] remove unused script (#15746)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-01 13:58:05 +00:00
Michael Goin
51d7c6a2b2 [Model] Support Mistral3 in the HF Transformers format (#15505)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-01 06:10:05 -07:00
Yang Chen
f3aca1ee30 setup correct nvcc version with CUDA_HOME (#15725)
Signed-off-by: Yang Chen <yangche@fb.com>
2025-04-01 06:09:40 -07:00
Rui Qiao
8dd41d6bcc [Misc] Use envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE (#15831)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-01 06:07:53 -07:00
Isotr0py
0a298ea418 [Bugfix] Fix no video/image profiling edge case for MultiModalDataParser (#15828)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-01 18:17:11 +08:00
Harry Mellor
d330558bab [Docs] Fix small error in link text (#15868)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-01 10:05:14 +00:00
shangmingc
656fd72976 [Misc] Fix speculative config repr string (#15860)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-04-01 02:26:22 -07:00
Varun Sundar Rabindranath
79455cf421 [Misc] Enable V1 LoRA by default (#15320)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-04-01 16:53:56 +08:00
Wei Zeng
30d6a015e0 [Feature] specify model in config.yaml (#15798)
Signed-off-by: weizeng <weizeng@roblox.com>
2025-04-01 01:20:06 -07:00
yihong
8af5a5c4e5 fix: can not use uv run collect_env close #13888 (#15792)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-01 07:45:49 +00:00
Chen Zhang
3a5f0afcd2 [V1] Implement sliding window attention in kv_cache_manager (#14097)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-04-01 00:33:17 -07:00
Gregory Shtrasberg
c7e63aa4d8 [ROCm] Use device name in the warning (#15838)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-01 00:10:48 -07:00
Lionel Villard
4a9ce1784c [sleep mode] clear pytorch cache after sleep (#15248)
Signed-off-by: <villard@us.ibm.com>
2025-03-31 22:58:58 -07:00
Alexander Matveev
7e4e709b43 [V1] TPU - Fix fused MOE (#15834)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-03-31 22:58:07 -07:00
Alexey Kiryushin
63d8eabed0 [Bugfix]: Fix is_embedding_layer condition in VocabParallelEmbedding (#15824)
Signed-off-by: alexwl <alexey.a.kiryushin@gmail.com>
2025-03-31 22:57:59 -07:00
Percy
e830b01383 [Bugfix] Fix extra comma (#15851)
Signed-off-by: haochengxia <xhc_1007@163.com>
2025-03-31 22:57:28 -07:00
Yan Ma
ff6473980d [Bugfix][Model] fix mllama multi-image (#14883)
Signed-off-by: yan ma <yan.ma@intel.com>
2025-03-31 22:53:37 -07:00
Kinfey
a164aea35d [Frontend] Add Phi-4-mini function calling support (#14886)
Signed-off-by: Kinfey <kinfeylo@microsoft.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-03-31 22:50:05 -07:00
Harry Mellor
a76f547e11 Rename fallback model and refactor supported models section (#15829)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-31 22:49:41 -07:00
Ilya Markov
b7b7676d67 [Distributed] Add custom allreduce support for ROCM (#14125)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
2025-03-31 22:49:12 -07:00
Harry Mellor
e6e3c55ef2 Move dockerfiles into their own directory (#14549)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-31 13:47:32 -07:00
Mark McLoughlin
f98a4920f9 [V1][Core] Remove unused speculative config from scheduler (#15818)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-03-31 19:15:21 +00:00
Harry Mellor
d4bfc23ef0 Fix Transformers backend compatibility check (#15290)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-31 10:27:07 -07:00
Alexander Matveev
9a2160fa55 [V1] TPU CI - Add basic perf regression test (#15414)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-03-31 13:25:20 -04:00
yihong
2de4118243 fix: change GB to GiB in logging close #14979 (#15807)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-03-31 10:00:50 -07:00
shangmingc
239b7befdd [V1][Spec Decode] Remove deprecated spec decode config params (#15466)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-03-31 09:19:35 -07:00
Cyrus Leung
09e974d483 [Bugfix] Check dimensions of multimodal embeddings in V1 (#15816)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-31 09:01:35 -07:00
Harry Mellor
e5ef4fa99a Upgrade transformers to v4.50.3 (#13905)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-31 08:59:37 -07:00
Mrm
037bcd942c [Bugfix] Fix missing return value in load_weights method of adapters.py (#15542)
Signed-off-by: noc-turne <2270929247@qq.com>
2025-03-31 06:56:42 -07:00
Alex Brooks
c2e7507ad4 [Bugfix] Fix Crashing When Loading Modules With Batchnorm Stats (#15813)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-03-31 13:23:53 +00:00
Naveassaf
3aa2b6a637 [Model] Update support for NemotronNAS models (#15008)
Signed-off-by: Nave Assaf <nassaf@nvidia.com>
2025-03-31 20:35:14 +08:00
youkaichao
555aa21905 [V1] Fully Transparent Implementation of CPU Offloading (#15354)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-31 20:22:34 +08:00
yihong
e7ae3bf3d6 fix: better install requirement for install in setup.py (#15796)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-03-31 05:13:32 -07:00
Harry Mellor
b932c048ac Recommend developing with Python 3.12 in developer guide (#15811)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-03-31 11:54:49 +00:00
Charlie Fu
e85829450d [Feature][ROCm]Enable fusion pass for torch.compile on ROCm (#15050)
Signed-off-by: charlifu <charlifu@amd.com>
2025-03-31 04:42:18 -07:00
Jennifer Zhao
effc5d24fa [Benchmark] Update Vision Arena Dataset and HuggingFaceDataset Setup (#15748)
Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
2025-03-31 15:38:58 +08:00
Chengyang LIU
18ed3132d2 [Misc] update the comments (#15780)
Signed-off-by: chengyang liu <lcy4869@gmail.com>
Co-authored-by: chengyang liu <lcy4869@gmail.com>
2025-03-30 19:39:56 -07:00
Woosuk Kwon
9b459eca88 [V1][Scheduler] Avoid calling _try_schedule_encoder_inputs for every request (#15778)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-30 14:10:42 -07:00
yihong
70fedd0f79 fix: Comments to English for better dev experience (#15768)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-03-30 10:47:57 -07:00
kYLe
bb103b29bf [Bugfix] Added embed_is_patch mask for fuyu model (#15731)
Signed-off-by: Kyle Huang <kylhuang@nvidia.com>
2025-03-30 03:45:08 -07:00
yihong
248e76c4df fix: lint fix a ruff checkout syntax error (#15767)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-03-30 03:36:02 -07:00
Cyrus Leung
803d5c35f3 [V1] Override mm_counts for dummy data creation (#15703)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-30 03:20:42 -07:00
pansicheng
7fd8c0f85c fix test_phi3v (#15321)
Signed-off-by: pansicheng <sicheng.pan.chn@gmail.com>
2025-03-30 02:01:34 -07:00
Reid
44c3a5abc3 [doc] update conda to usage link in installation (#15761)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-03-30 08:12:13 +00:00
Julien Denize
6909a76201 [Bugfix] Fix Mistral guided generation using xgrammar (#15704)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
2025-03-29 20:20:19 -07:00
Chauncey
045533716b [CI] xgrammar structured output supports Enum. (#15757)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-03-29 20:20:02 -07:00
Isotr0py
3c0ff914ac [Bugfix] Fix Mllama interleaved images input support (#15564)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-03-29 18:11:15 +00:00
Woosuk Kwon
2bc4be4e32 [V1][Minor] Simplify rejection sampler's parse_output (#15741)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-29 09:25:17 -07:00
Roger Wang
c67abd614f [V1] Support interleaved modality items (#15605)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-29 06:30:09 -07:00
shangmingc
6fa7cd3dbc [Feature][Disaggregated] Support XpYd disaggregated prefill with MooncakeStore (#12957)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-03-29 04:01:46 -07:00
wwl2755
94744ba41a [V1] [Feature] Collective RPC (#15444)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-03-29 03:39:14 -07:00
TJian
4965ec42d2 [FEAT] [ROCm] Add AITER int8 scaled gemm kernel (#15433)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-03-29 03:33:56 -07:00
Reid
73aa7041bf [doc] update doc (#15740)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-03-29 04:27:22 +00:00
yarongmu-google
7c1f760024 [Kernel][TPU][ragged-paged-attn] vLLM code change for PR#8896 (#15659)
Signed-off-by: Yarong Mu <ymu@google.com>
2025-03-28 21:13:15 -07:00
Nicolò Lucchesi
da461f3cbf [TPU][V1][Bugfix] Fix w8a8 recompiilation with GSM8K (#15714)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-03-28 21:13:06 -07:00
Jinzhen Lin
5b800f0932 [Bugfix] set VLLM_WORKER_MULTIPROC_METHOD=spawn for vllm.entrypoionts.openai.api_server (#15700)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-03-28 21:12:26 -07:00
cyyever
8427f70493 Use numba 0.61 for python 3.10+ to support numpy>=2 (#15692)
Signed-off-by: cyy <cyyever@outlook.com>
2025-03-29 12:11:51 +08:00
Russell Bryant
7a7992085b [CI] Speed up V1 structured output tests (#15718)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-28 21:10:45 -07:00
Varun Sundar Rabindranath
1286211f57 [Bugfix] LoRA V1: add and fix entrypoints tests (#15715)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-28 21:10:41 -07:00
Nick Hill
6d531ad7b8 [Misc][V1] Misc code streamlining (#15723)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-28 20:59:47 -07:00
Ce Gao
762b424a52 [Docs] Document v0 engine support in reasoning outputs (#15739)
Signed-off-by: Ce Gao <cegao@tensorchord.ai>
2025-03-29 03:46:57 +00:00
pengyuange
de1cb38769 [Model] Support Skywork-R1V (#15397)
Signed-off-by: jiacai.liu <932997367@qq.com>
Co-authored-by: jiacai.liu <932997367@qq.com>
2025-03-28 20:39:21 -07:00
Gregory Shtrasberg
c802f5430d [ROCm][AMD][Build] Update AMD supported arch list (#15632)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-03-28 20:39:18 -07:00
simpx
cff8991a50 [Docs][V1] Optimize diagrams in prefix caching design (#15716) 2025-03-29 03:33:58 +00:00
daniel-salib
f3f8d8fff4 implement prometheus fast-api-instrumentor for http service metrics (#15657) 2025-03-29 00:12:02 +00:00
Reid
26df46ee59 [Misc] cli auto show default value (#15582)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-03-28 22:23:00 +00:00
Alexander Matveev
c3f687ac22 [V1] TPU - Fix the chunked prompt bug (#15713)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-03-28 20:19:04 +00:00
Luka Govedič
04437e313d [Bugfix] [torch.compile] Add Dynamo metrics context during compilation (#15639)
Signed-off-by: luka <luka@neuralmagic.com>
2025-03-28 14:01:09 -06:00
Robert Shaw
038bededba [TPU] [Perf] Improve Memory Usage Estimation (#15671)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 17:37:52 +00:00
shangmingc
d03308be0c [Misc] Remove stale func in KVTransferConfig (#14746)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-03-28 17:33:32 +00:00
Cyrus Leung
c6bc0034d0 [Misc] Remove unused utils and clean up imports (#15708)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-28 09:41:16 -07:00
Woosuk Kwon
70e132244a [Minor] Remove TGI launching script (#15646)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-28 09:30:08 -07:00
Michael Goin
47e9038d23 Fix cpu offload testing for gptq/awq/ct (#15648)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-29 00:29:32 +08:00
Kebe
432cf22a6a [Bugfix] Fix regex compile display format (#15368)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-03-28 08:58:44 -07:00
Reid
2914006fe0 [doc] add missing imports (#15699)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-03-28 15:56:48 +00:00
Russell Bryant
7329ff5468 [V1] Support disable_any_whtespace for guidance backend (#15584)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-28 23:46:45 +08:00
Cyrus Leung
541d1df486 [Bugfix] embed_is_patch for Idefics3 (#15696)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-28 08:27:52 -07:00
Chauncey
3b00ff9138 [Bugfix][v1] xgrammar structured output supports Enum. (#15594)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-03-28 06:14:53 -07:00
Jee Jee Li
91276c5721 [Model] Adding torch compile annotations to chatglm (#15624)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-28 21:14:09 +08:00
Harry Mellor
0b4167526d [Docs] Add "Generation quality changed" section to troubleshooting (#15701)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-28 13:03:21 +00:00
Reid
fd5fd26902 [Frontend] update priority for --api-key and VLLM_API_KEY (#15588)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-03-28 19:40:12 +08:00
Ce Gao
3bbaacbe15 [Bugfix][Frontend] Eliminate regex based check in reasoning full generator (#14821)
Signed-off-by: Ce Gao <cegao@tensorchord.ai>
2025-03-28 11:20:35 +00:00
Lize Cai
a10314c6b3 [Misc] Fix test_sleep to use query parameters (#14373)
Signed-off-by: Lize Cai <lize.cai@sap.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-03-28 18:00:14 +08:00
Jee Jee Li
70f2c2a709 [Bugfix] Fix 'InductorAdaptor object has no attribute 'cache_dir' (#15674)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-28 17:10:40 +08:00
Li, Jiang
280d074103 [CPU][CI] Improve CPU Dockerfile (#15690)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-03-28 01:36:31 -07:00
Ce Gao
32b14baf8a [Refactor][Frontend] Keep all logic about reasoning into one class (#14428)
Signed-off-by: Ce Gao <cegao@tensorchord.ai>
2025-03-28 00:23:30 -07:00
Robert Shaw
2d9045fce8 [TPU][CI] Fix TPUModelRunner Test (#15667)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-03-28 00:01:26 -07:00
Cyrus Leung
355f66348c [V1] Remove legacy input registry (#15673)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-27 23:34:34 -07:00
Cyrus Leung
8693e47e6a [Bugfix] Fix mm_hashes forgetting to be passed (#15668)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-28 05:51:05 +00:00
Jason (Siyu) Zhu
cec8c7d7f8 Refactor error handling for multiple exceptions in preprocessing (#15650)
Signed-off-by: JasonZhu1313 <jasonchu13@outlook.com>
2025-03-28 03:27:20 +00:00
Gregory Shtrasberg
4d0ec37267 [Quantization][FP8] Adding support for fp8 gemm layer input in fp8 (#14578)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-03-28 02:58:16 +00:00
Chen Xia
e7f720ea56 [Misc]add coding benchmark for speculative decoding (#15303)
Signed-off-by: CXIAAAAA <cxia0209@gmail.com>
2025-03-28 10:47:05 +08:00
Wes
4ae17bf1e2 Revert "Use Cache Hinting for fused_moe kernel (#15511)" (#15645)
Signed-off-by: Wes Medford <wryanmedford@gmail.com>
2025-03-27 19:45:55 -07:00
Robert Shaw
8a49eea74b [CI][TPU] Temporarily Disable Quant Test on TPU (#15649)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-27 19:45:05 -07:00
wwl2755
b4245a48df [Doc] Fix dead links in Job Board (#15637)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-03-28 02:43:40 +00:00
Kebe
4e0f6076be [Bugfix] Fix failure to launch in Tensor Parallel TP mode on macOS. (#14948)
Signed-off-by: Kebe <mail@kebe7jun.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-03-28 10:13:41 +08:00
Jee Jee Li
726efc6a32 [Quantization][V1] BitsAndBytes support V1 (#15611)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-28 10:12:47 +08:00
Robert Shaw
bd45912b99 [TPU] Lazy Import (#15656)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-28 09:57:01 +08:00
Nick Hill
15dac210f0 [V1] AsyncLLM data parallel (#13923)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-27 16:14:41 -07:00
Russell Bryant
112b3e5b3b [CI] Update rules for applying tpu label. (#15634)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-27 22:15:26 +00:00
cnorman
32d669275b Correct PowerPC to modern IBM Power (#15635)
Signed-off-by: Christy Norman <christy@linux.vnet.ibm.com>
2025-03-27 15:04:32 -07:00
Nicolò Lucchesi
4098b72210 [Bugfix][TPU][V1] Fix recompilation (#15553)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-03-27 19:15:06 +00:00
Harry Mellor
46450b8d33 Use absolute placement for Ask AI button (#15628)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-27 18:52:18 +00:00
Cyrus Leung
13ac9cab21 [Misc] Avoid direct access of global mm_registry in compute_encoder_budget (#15621)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-27 17:52:00 +00:00
Yuan Tang
66aa4c0bf4 [Feature] Add middleware to log API Server responses (#15593)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-03-27 17:49:38 +00:00
Cyrus Leung
247181536f [Misc] Replace is_encoder_decoder_inputs with split_enc_dec_inputs (#15620)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-27 17:36:32 +00:00
Cyrus Leung
07bf813fb5 [Doc] Link to onboarding tasks (#15629)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-27 16:30:53 +00:00
Hiroaki Sugiyama
8958217ad5 [Bugfix] Fix use_cascade_attention handling for Alibi-based models on vllm/v1 (#15211)
Signed-off-by: h-sugi <h.sugi@ieee.org>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-27 22:29:29 +08:00
Cyrus Leung
ac5bc615b0 [Model] MiniCPM-V/O supports V1 (#15487)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-27 06:07:29 -07:00
Reid
8063dfc61a [Doc] update --system for transformers installation in docker doc (#15616)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-03-27 20:38:46 +08:00
Richard Zou
6278bc829e Fix incorrect filenames in vllm_compile_cache.py (#15494)
Signed-off-by: <zou3519@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-03-27 18:33:41 +08:00
wang.yuqi
3f532cb6a6 [Misc] Use model_redirect to redirect the model name to a local folder. (#14116) 2025-03-27 02:21:23 -07:00
Cyrus Leung
e6c9053f9e [Misc] Clean up scatter_patch_features (#15559)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-27 07:45:00 +00:00
Robert Shaw
43ed4143c4 [Quantization] Fp8 Channelwise Dynamic Per Token GroupedGEMM (#15587)
Signed-off-by: ElizaWszola <eliza@neuralmagic.com>
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Co-authored-by: ElizaWszola <eliza@neuralmagic.com>
Co-authored-by: Lucas Wilkinson <wilkinson.lucas@gmail.com>
Co-authored-by: ElizaWszola <ewszola@redhat.com>
2025-03-27 06:47:25 +00:00
Bella kira
f4c98b4d4c [Misc] Consolidate LRUCache implementations (#15481)
Signed-off-by: Bella kira <2374035698@qq.com>
2025-03-27 06:43:43 +00:00
Robert Shaw
e1e0fd7543 [TPU] Avoid Triton Import (#15589)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-27 06:43:02 +00:00
Rui Qiao
df8d3d1287 [Misc] Restrict ray version dependency and update PP feature warning in V1 (#15556) 2025-03-27 06:21:07 +00:00
Chengji Yao
619d3de8bd [TPU] [V1] fix cases when max_num_reqs is set smaller than MIN_NUM_SEQS (#15583)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-03-26 22:46:26 -07:00
Gregory Shtrasberg
ecff8309a3 [ROCm] Env variable to trigger custom PA (#15557)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-03-26 22:46:12 -07:00
Jerry Zhang
dcf2a590f5 Allow torchao quantization in SiglipMLP (#15575) 2025-03-26 22:45:51 -07:00
Cody Yu
54aa619459 [V1] Refactor num_computed_tokens logic (#15307)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-27 04:54:36 +00:00
Mengqing Cao
fb22be5817 [moe][quant] add weight name case for offset (#15515)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-03-27 04:50:29 +00:00
Wei Zeng
7f301dd8ef [Doc] Update V1 user guide for fp8 kv cache support (#15585)
Signed-off-by: weizeng <weizeng@roblox.com>
2025-03-26 19:39:03 -07:00
Varun Sundar Rabindranath
8095341a01 [misc] LoRA: Remove unused long context test data (#15558)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-27 10:04:51 +08:00
Chenyaaang
69db16a46a add platform check back (#15578)
Signed-off-by: Chenyaaang <llccyy1212@gmail.com>
2025-03-27 01:50:27 +00:00
Michael Goin
ce78f9af4e Add automatic tpu label to mergify.yml (#15560) 2025-03-26 21:39:58 -04:00
ElizaWszola
9239bf718e [Kernel] CUTLASS grouped gemm fp8 MoE kernel (#13972)
Signed-off-by: ElizaWszola <eliza@neuralmagic.com>
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Co-authored-by: Lucas Wilkinson <wilkinson.lucas@gmail.com>
2025-03-27 00:54:44 +00:00
Matthew Vine
7a6d45bc8a Support FIPS enabled machines with MD5 hashing (#15299)
Signed-off-by: Matthew Vine <32849887+MattTheCuber@users.noreply.github.com>
2025-03-26 20:19:46 -04:00
Chengji Yao
e74ff409e0 [TPU] support disabling xla compilation cache (#15567)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-03-27 00:09:28 +00:00
Wes
7a888271f5 Use Cache Hinting for fused_moe kernel (#15511) 2025-03-26 23:21:34 +00:00
Alexander Matveev
9d119a86ae [V1] TPU CI - Fix test_compilation.py (#15570)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-03-26 21:51:54 +00:00
Alexander Matveev
b2e85e26f4 [V1] TPU - Revert to exponential padding by default (#15565)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-03-26 21:35:05 +00:00
Alexei-V-Ivanov-AMD
dd8a29da99 Applying some fixes for K8s agents in CI (#15493)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-03-26 20:35:11 +00:00
marko
27df5199d9 Support SHA256 as hash function in prefix caching (#15297)
Signed-off-by: Marko Rosenmueller <5467316+dr75@users.noreply.github.com>
2025-03-26 11:11:28 -07:00
Nick Hill
35fad35a48 [V1][Sampler] Faster top-k only implementation (#15478)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-26 10:56:47 -07:00
Aaron Pham
733e7c9e95 [Refactor] Remove unnecessary backend parameter in structured output interface (#15317)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-03-26 17:51:56 +00:00
Harry Mellor
0af4d764d6 Fix weight loading for some models in Transformers backend (#15544)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-26 10:17:53 -07:00
youkaichao
e64afa455c multi-node offline DP+EP example (#15484)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-26 23:54:24 +08:00
Alex Brooks
1711b929b6 [Model] Add Reasoning Parser for Granite Models (#14202)
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
Co-authored-by: Joe Runde <joe@joerun.de>
2025-03-26 14:28:07 +00:00
Harry Mellor
c091c0a588 Improve validation of TP in Transformers backend (#15540)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-26 07:26:48 -07:00
cyyever
1aa162e030 Apply torchfix (#15532)
Signed-off-by: cyy <cyyever@outlook.com>
2025-03-26 12:09:06 +00:00
Harry Mellor
cf5c8f1686 Separate base model from TransformersModel (#15467)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-03-26 18:13:38 +08:00
Reid
4ec2cee000 [Misc] improve example script output (#15528)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-03-26 10:12:47 +00:00
wwl2755
99f536f830 [Misc] Enhance warning information to user-defined chat template (#15408)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-03-26 02:21:15 -07:00
vllmellm
5ebf66748b [FEAT][ROCm] Integrate Fused MoE Kernels from AITER (#14967)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-03-26 16:30:30 +08:00
Bryan Lu
781d056280 [Feature] Enhance EAGLE Architecture with Proper RMS Norms (#14990)
Signed-off-by: Bryan Lu <yuzhelu@amazon.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-03-26 08:24:07 +00:00
daniel-salib
5aefd6ac31 Fix raw_request extraction in load_aware_call decorator (#15382)
Signed-off-by: Daniel Salib <danielsalib@meta.com>
2025-03-25 22:29:54 -07:00
Varun Sundar Rabindranath
6c663dfd5e [misc] LoRA - Skip LoRA kernels when not required (#15152)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-26 11:33:45 +08:00
Lucas Wilkinson
33437bc6e7 [BugFix] Fix nightly MLA failure (FA2 + MLA chunked prefill, i.e. V1, producing bad results) (#15492)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-03-25 20:33:22 -07:00
Tyler Michael Smith
23114d3364 [Misc] Warn about v0 in benchmark_paged_attn.py (#15495)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-25 20:31:04 -07:00
Cyrus Leung
997c8811d6 [Model] Support multi-image for Molmo (#15438)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-26 11:26:33 +08:00
Harry Mellor
e42389f9d7 Transformers backend already supports V1 (#15463)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-25 20:26:16 -07:00
Varun Sundar Rabindranath
ff38f0a32c [CI/Build] LoRA: Delete long context tests (#15503)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-25 17:18:34 -07:00
Varun Sundar Rabindranath
a5cfbab3c8 [Core] LoRA: V1 Scheduler optimization (#15422)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-25 22:50:09 +00:00
Chenyaaang
ac3cd6e83c [core] add bucket padding to tpu_model_runner (#14995)
Signed-off-by: Chenyaaang <llccyy1212@gmail.com>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Co-authored-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-25 17:27:22 -04:00
Lu Fang
082ab86f5f [V1] Support long_prefill_token_threshold in v1 scheduler (#15419)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-03-25 14:22:26 -07:00
Nick Hill
6aa196c8dc [V1][Minor] Use SchedulerInterface type for engine scheduler field (#15499)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-25 14:21:36 -07:00
Nicolò Lucchesi
a0dd7dcd49 [TPU][V1] Fix Sampler recompilation (#15309)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-03-25 16:43:54 -04:00
Maximilien de Bayser
e977c11111 Add workaround for shared field_names in pydantic model class (#13925)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-03-25 20:31:08 +00:00
Joe Runde
5f063a80bd [bugfix] add supports_v1 platform interface (#15417)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-03-25 15:00:32 -04:00
Antonio Gómez
5d8e1c9279 [Bugfix] Support triton==3.3.0+git95326d9f for RTX 5090 (Unsloth + vLLM compatibility) (#15471)
Co-authored-by: ServerAI <ai@exc-mad-ai.com>
2025-03-25 17:59:25 +00:00
yarongmu-google
0a049c7d86 [CI/Build] Add tests for the V1 tpu_model_runner. (#14843)
Signed-off-by: Yarong Mu <ymu@google.com>
2025-03-25 12:27:16 -04:00
youkaichao
d0cfec7ab9 [bugfix] fix inductor cache on max_position_embeddings (#15436)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-25 07:05:39 -07:00
Szymon Ożóg
a608160027 [Kernel] Fix conflicting macro names for gguf kernels (#15456)
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
2025-03-25 13:50:49 +00:00
Cyrus Leung
3f04a7fbf2 [Doc] Update V1 user guide for multi-modality (#15460)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-25 11:01:58 +00:00
Cyrus Leung
5994430b84 [Misc] Remove redundant num_embeds (#15443)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-25 18:27:57 +08:00
Cyrus Leung
a9e879b316 [Misc] Clean up MiniCPM-V/O code (#15337)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-25 10:22:52 +00:00
Md. Shafi Hussain
3e2f37a69a Dockerfile.ppc64le changes to move to UBI (#15402)
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
2025-03-25 10:15:14 +00:00
Thien Tran
4f044b1d67 [Kernel][CPU] CPU MLA (#14744)
Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
2025-03-25 09:34:59 +00:00
Siyuan Liu
4157f563b4 [Hardware][TPU][Bugfix] Fix v1 mp profiler (#15409)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-03-25 01:43:00 -07:00
Lu Fang
051da7efe3 Fix CUDA kernel index data type in vllm/csrc/quantization/gptq_marlin/awq_marlin_repack.cu +10 (#15160)
Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Richard Barnes <rbarnes@meta.com>
2025-03-25 15:36:45 +08:00
2814 changed files with 384774 additions and 135758 deletions

View File

@@ -1,19 +1,20 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
def print_top_10_largest_files(zip_file):
"""Print the top 10 largest files in the given zip file."""
with zipfile.ZipFile(zip_file, 'r') as z:
with zipfile.ZipFile(zip_file, "r") as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]:
@@ -28,14 +29,18 @@ def check_wheel_size(directory):
wheel_path = os.path.join(root, file_name)
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
if wheel_size_mb > VLLM_MAX_SIZE_MB:
print(f"Not allowed: Wheel {wheel_path} is larger "
f"({wheel_size_mb:.2f} MB) than the limit "
f"({VLLM_MAX_SIZE_MB} MB).")
print(
f"Not allowed: Wheel {wheel_path} is larger "
f"({wheel_size_mb:.2f} MB) than the limit "
f"({VLLM_MAX_SIZE_MB} MB)."
)
print_top_10_largest_files(wheel_path)
return 1
else:
print(f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb:.2f} MB).")
print(
f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb:.2f} MB)."
)
return 0
@@ -45,4 +50,4 @@ if __name__ == "__main__":
sys.exit(1)
directory = sys.argv[1]
sys.exit(check_wheel_size(directory))
sys.exit(check_wheel_size(directory))

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os
@@ -7,7 +8,8 @@ template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1/>
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""
@@ -20,7 +22,25 @@ filename = os.path.basename(args.wheel)
with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}")
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
if "x86_64" in filename:
x86_wheel = filename
arm_wheel = filename.replace("x86_64", "aarch64").replace(
"manylinux1", "manylinux2014"
)
elif "aarch64" in filename:
x86_wheel = filename.replace("aarch64", "x86_64").replace(
"manylinux2014", "manylinux1"
)
arm_wheel = filename
else:
raise ValueError(f"Unsupported wheel: {filename}")
# cloudfront requires escaping the '+' character
f.write(
template.format(wheel=filename,
wheel_html_escaped=filename.replace("+", "%2B")))
template.format(
x86_wheel=x86_wheel,
x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
arm_wheel=arm_wheel,
arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
)
)

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:

View File

@@ -1,3 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:

View File

@@ -1,3 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:

View File

@@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.335
- name: "exact_match,flexible-extract"
value: 0.323
limit: 1319
num_fewshot: 5

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:

View File

@@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"

View File

@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.30
- name: "exact_match,flexible-extract"
value: 0.465
limit: 1319
num_fewshot: 5

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:

View File

@@ -1,11 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
value: 0.54
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
value: 0.59
limit: 1319
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.47
- name: "exact_match,flexible-extract"
value: 0.64
limit: 1319
num_fewshot: 5

View File

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:

View File

@@ -1,10 +1,6 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml

View File

@@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import pytest
def pytest_addoption(parser):
parser.addoption(
"--config-list-file",
action="store",
help="Path to the file listing model config YAMLs (one per line)",
)
parser.addoption(
"--tp-size",
action="store",
default="1",
help="Tensor parallel size to use for evaluation",
)
@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
rel_path = pytestconfig.getoption("--config-list-file")
return config_dir / rel_path
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
return pytestconfig.getoption("--tp-size")
def pytest_generate_tests(metafunc):
if "config_filename" in metafunc.fixturenames:
rel_path = metafunc.config.getoption("--config-list-file")
config_list_file = Path(rel_path).resolve()
config_dir = config_list_file.parent
with open(config_list_file, encoding="utf-8") as f:
configs = [
config_dir / line.strip()
for line in f
if line.strip() and not line.startswith("#")
]
metafunc.parametrize("config_filename", configs)

View File

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``

View File

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size "$BATCH_SIZE"

View File

@@ -1,59 +0,0 @@
#!/bin/bash
usage() {
echo``
echo "Runs lm eval harness on GSM8k using vllm and compares to "
echo "precomputed baseline (measured by HF transformers.)"
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
echo " -t - tensor parallel size"
echo
}
SUCCESS=0
while getopts "c:t:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
LOCAL_SUCCESS=0
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
export LM_EVAL_TP_SIZE=$TP_SIZE
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
else
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
fi
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
done
if [ "${SUCCESS}" -eq "0" ]; then
exit 0
else
exit 1
fi

View File

@@ -1,69 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
pytest -s -v test_lm_eval_correctness.py \
--config-list-file=configs/models-small.txt \
--tp-size=1
"""
import os
from pathlib import Path
import lm_eval
import numpy
import pytest
import numpy as np
import yaml
RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
RTOL = 0.08
def launch_lm_eval(eval_config):
trust_remote_code = eval_config.get('trust_remote_code', False)
model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \
f"add_bos_token=true," \
f"trust_remote_code={trust_remote_code}"
def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
model_args = (
f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size},"
f"enforce_eager=true,"
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}"
)
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto")
batch_size="auto",
)
return results
def test_lm_eval_correctness():
eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
def test_lm_eval_correctness_param(config_filename, tp_size):
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
if eval_config[
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
pytest.skip("FBGEMM is currently failing on main.")
results = launch_lm_eval(eval_config, tp_size)
# Launch eval requests.
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}')
success = success and numpy.isclose(
ground_truth, measured_value, rtol=RTOL)
print(
f"{task['name']} | {metric['name']}: "
f"ground_truth={ground_truth} | measured={measured_value}"
)
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
# Assert at the end, print all scores even on failure for debugging.
assert success

View File

@@ -7,11 +7,11 @@ This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
**Benchmarking Duration**: about 1hr.
@@ -28,16 +28,34 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
## Trigger the benchmark
Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
Manually Trigger the benchmark
```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```
Runtime environment variables:
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
>
### Latency test
Here is an example of one test inside `latency-tests.json`:
@@ -60,7 +78,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
- The `parameters` attribute controls the command line arguments to be used for `vllm bench latency`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
@@ -68,13 +86,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are passed to `vllm bench throughput`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```json
[
@@ -86,7 +104,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -104,8 +121,8 @@ Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
@@ -113,12 +130,29 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
A comparison diagram will be generated below the table.
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
@@ -126,9 +160,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- Finally, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
### Nightly tests
@@ -138,6 +172,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@@ -1,3 +1,4 @@
# Nightly benchmark annotation
## Description
@@ -13,15 +14,15 @@ Please download the visualization scripts in the post
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```console
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.

View File

@@ -13,25 +13,25 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
## Setup
- Docker images:
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- 8x Nvidia A100 GPUs
- 8x Nvidia A100 GPUs
- Workload:
- Dataset
- ShareGPT dataset
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Models: llama-3 8B, llama-3 70B.
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
- Dataset
- ShareGPT dataset
- Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
- Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Models: llama-3 8B, llama-3 70B.
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
## Known issues

View File

@@ -1,10 +1,12 @@
# Performance benchmarks descriptions
## Latency tests
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
{latency_tests_markdown_table}
@@ -14,7 +16,8 @@
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput.
{throughput_tests_markdown_table}
@@ -25,12 +28,18 @@
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B, under QPS 2
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
{serving_tests_markdown_table}
## Platform Information
{platform_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.

View File

@@ -0,0 +1,307 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import os
from importlib import util
import pandas as pd
plotly_found = util.find_spec("plotly.express") is not None
def compare_data_columns(
    files, name_column, data_column, info_cols, drop_column, debug=False
):
    """
    Build a side-by-side comparison of one metric across result files.

    Rows are aligned by key columns derived from ``info_cols`` instead of by
    row order:

    - Pick one canonical key list: the subset of ``info_cols`` present in ALL
      files (falling back to those present in the first file).
    - For each file: index by those keys and aggregate duplicate keys
      (mean for the metric, first for test names).
    - Concat along axis=1 so the indexes align, then turn the aligned index
      back into columns so callers can group by them. Taking the keys from
      the aligned index (rather than from the first file only) keeps the key
      columns populated for rows that exist only in a later file.

    Args:
        files: paths of JSON result files (``records`` orientation).
        name_column: column holding the test name; only used when ``debug``.
        data_column: metric column to compare.
        info_cols: candidate key columns, in the desired output order.
        drop_column: rows with a missing value in this column are dropped
            (only when the column exists in the file).
        debug: if True, add a ``<file_label>_name`` column per file.

    Returns:
        ``(concat_df, raw_data_cols)``: the comparison table (key columns
        first, then one metric column per file plus ``Ratio 1 vs N`` columns)
        and the list of per-file metric column labels.

    Raises:
        ValueError: if no files are given, a file cannot be read, or no key
            column from ``info_cols`` can be found.
    """
    print("\ncompare_data_column:", data_column)

    if not files:
        raise ValueError("No input files given to compare.")

    # Parse every file exactly once; the frames are reused below.
    loaded = []
    for f in files:
        try:
            loaded.append(pd.read_json(f, orient="records"))
        except Exception as err:
            raise ValueError(f"Failed to read {f}") from err

    # 1) choose a canonical key list from info_cols that exists in ALL files
    cols_per_file = [set(df.columns) for df in loaded]
    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
    if not key_cols:
        # soft fallback: use any info_cols present in the first file
        key_cols = [c for c in info_cols if c in cols_per_file[0]]
    if not key_cols:
        raise ValueError(
            "No common key columns found from info_cols across the input files."
        )

    frames = []
    raw_data_cols = []
    compare_frames = []

    # 2) build one aligned metric series (and optional name series) per file
    for file, df in zip(files, loaded):
        # Keep rows that actually have the compared metric
        if drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)

        # Stabilize numeric key columns (harmless if missing)
        for c in (
            "Input Len",
            "Output Len",
            "TP Size",
            "PP Size",
            "# of max concurrency.",
            "qps",
        ):
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")

        # Ensure all key columns exist (only relevant after the soft fallback)
        for c in key_cols:
            if c not in df.columns:
                df[c] = pd.NA

        df_idx = df.set_index(key_cols, drop=False)

        # metric series for this file, aggregated to one row per key
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
        s = df_idx[data_column]
        if not s.index.is_unique:
            s = s.groupby(level=key_cols, dropna=False).mean()
        s.name = file_label

        # debug: aligned test-name column per file
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"
            frames.append(name_s)

        frames.append(s)
        raw_data_cols.append(file_label)
        compare_frames.append(s)

        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
        if len(compare_frames) >= 2:
            base = compare_frames[0]
            current = compare_frames[-1]
            ratio = current / base
            # avoid inf when baseline is 0; reindex so the mask lines up with
            # the (possibly larger) union index of the ratio
            ratio = ratio.mask(base.reindex(ratio.index) == 0)
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)

    # 3) concat on columns with aligned indexes, then move the keys out of
    #    the index so every row (from any file) carries its key values
    concat_df = pd.concat(frames, axis=1)
    concat_df.index = concat_df.index.set_names(key_cols)
    concat_df = concat_df.reset_index()

    # Ensure key/info columns appear first (in info_cols order)
    front = [c for c in info_cols if c in concat_df.columns]
    rest = [c for c in concat_df.columns if c not in front]
    concat_df = concat_df[front + rest]

    print(raw_data_cols)
    return concat_df, raw_data_cols
def split_json_by_tp_pp(
    input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
    """
    Fan a benchmark JSON out into one file per (TP Size, PP Size) pair.

    Only rows whose test name contains "serving" are kept (when a test-name
    column exists). Each group is written to
    ``<output_root>/tp{TP}_pp{PP}/benchmark_results.json``.

    Returns the list of file paths written, in group order.
    """
    with open(input_file, encoding="utf-8") as fh:
        payload = json.load(fh)

    # A dict payload may wrap the record list under one of a few common keys.
    if isinstance(payload, dict):
        wrapper_keys = ("results", "serving_results", "benchmarks", "data")
        nested = next(
            (payload[k] for k in wrapper_keys if isinstance(payload.get(k), list)),
            None,
        )
        if nested is not None:
            payload = nested

    table = pd.DataFrame(payload)

    # Restrict to "serving" tests when we can find the test-name column.
    name_col = None
    for candidate in ("Test name", "test_name", "Test Name"):
        if candidate in table.columns:
            name_col = candidate
            break
    if name_col:
        is_serving = (
            table[name_col].astype(str).str.contains(r"serving", case=False, na=False)
        )
        table = table[is_serving].copy()

    # Normalize alias column names to the canonical TP/PP headers.
    aliases = {
        "tp_size": "TP Size",
        "tensor_parallel_size": "TP Size",
        "pp_size": "PP Size",
        "pipeline_parallel_size": "PP Size",
    }
    present = {old: new for old, new in aliases.items() if old in table.columns}
    table.rename(columns=present, inplace=True)

    # Missing TP/PP columns default to 1.
    for size_col in ("TP Size", "PP Size"):
        if size_col not in table.columns:
            table[size_col] = 1

    # Coerce TP/PP to plain ints; unparsable or missing values become 1.
    for size_col in ("TP Size", "PP Size"):
        numeric = pd.to_numeric(table.get(size_col, 1), errors="coerce")
        table[size_col] = numeric.fillna(1).astype(int)

    # One output folder per (TP, PP) combination.
    written: list[str] = []
    grouped = table.groupby(["TP Size", "PP Size"], dropna=False)
    for (tp, pp), part in grouped:
        target_dir = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
        os.makedirs(target_dir, exist_ok=True)
        out_path = os.path.join(target_dir, "benchmark_results.json")
        part.to_json(out_path, orient="records", indent=2, force_ascii=False)
        print(f"Saved: {out_path}")
        written.append(out_path)

    return written
if __name__ == "__main__":
    # Command-line entry point: compare benchmark_results.json files and write
    # the per-metric tables (and optional Plotly charts) to
    # perf_comparison.html in the current directory.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f",
        "--file",
        action="append",
        type=str,
        # Without at least one file args.file is None and len() below fails
        # with an unhelpful TypeError; let argparse report it instead.
        required=True,
        help="input file name",
    )
    parser.add_argument(
        "--debug", action="store_true", help="show all information for debugging"
    )
    parser.add_argument(
        "--plot",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="plot perf diagrams or not --no-plot --plot",
    )
    parser.add_argument(
        "-x",
        "--xaxis",
        type=str,
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    args = parser.parse_args()

    drop_column = "P99"
    name_column = "Test name"
    info_cols = [
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        "TP Size",
        "PP Size",
        "# of max concurrency.",
        "qps",
    ]
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    # NOTE(review): "/n" is written literally into the HTML; it looks like a
    # typo for "\n" — confirm before changing the emitted text.
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]

    if len(args.file) == 1:
        # Single input: compare the TP/PP configurations inside that file, so
        # TP/PP stop being grouping keys.
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
        if not files:
            parser.error(f"no serving results found in {args.file[0]}")
    else:
        files = args.file
    print("comparing : " + ", ".join(files))
    debug = args.debug
    plot = args.plot

    # Pick the x-axis column for the plots. The fallback is looked up by name:
    # a fixed position (6) is out of range once TP/PP have been removed from
    # info_cols in the single-file case.
    default_xaxis = "# of max concurrency."
    xaxis = args.xaxis if args.xaxis in info_cols else default_xaxis
    y_axis_index = info_cols.index(xaxis)

    with open("perf_comparison.html", "w") as text_file:
        for i in range(len(data_cols_to_compare)):
            output_df, raw_data_cols = compare_data_columns(
                files,
                name_column,
                data_cols_to_compare[i],
                info_cols,
                drop_column,
                debug=debug,
            )

            # For the plot, the x-axis column goes in front of the metric columns
            raw_data_cols.insert(0, info_cols[y_axis_index])

            # Group by every info column except the last two
            # ("# of max concurrency." and "qps"), which vary within a chart.
            filtered_info_cols = info_cols[:-2]
            existing_group_cols = [
                c for c in filtered_info_cols if c in output_df.columns
            ]
            if not existing_group_cols:
                raise ValueError(
                    f"No valid group-by columns "
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
            output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for name, group in output_groups:
                html = group.to_html()
                text_file.write(html_msgs_for_data_cols[i])
                text_file.write(html)

                if plot and plotly_found:
                    # Imported lazily so the script still works without plotly
                    import plotly.express as px

                    df = group[raw_data_cols]
                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
                    # Melt DataFrame for plotting: one row per (x, configuration)
                    df_melted = df_sorted.melt(
                        id_vars=info_cols[y_axis_index],
                        var_name="Configuration",
                        value_name=data_cols_to_compare[i],
                    )
                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
                    # Create Plotly line chart
                    fig = px.line(
                        df_melted,
                        x=info_cols[y_axis_index],
                        y=data_cols_to_compare[i],
                        color="Configuration",
                        title=title,
                        markers=True,
                    )
                    # Export to HTML
                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))

View File

@@ -1,14 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import os
import shlex
from importlib import util
from pathlib import Path
from typing import Any
import pandas as pd
import psutil
import regex as re
from tabulate import tabulate
results_folder = Path("results/")
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
@@ -28,28 +33,39 @@ throughput_results = []
throughput_results_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "num_requests": "# of req.",
# "total_num_tokens": "Total # of tokens",
# "elapsed_time": "Elapsed time (s)",
"num_requests": "# of req.",
"total_num_tokens": "Total # of tokens",
"elapsed_time": "Elapsed time (s)",
"requests_per_second": "Tput (req/s)",
# "tokens_per_second": "Tput (tok/s)",
"tokens_per_second": "Tput (tok/s)",
}
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"model_id": "Model",
"dataset_name": "Dataset Name",
"input_len": "Input Len",
"output_len": "Output Len",
"tp_size": "TP Size",
"pp_size": "PP Size",
"dtype": "dtype",
"gpu_type": "GPU",
# "completed": "# of req.",
"completed": "# of req.",
"qps": "qps",
"max_concurrency": "# of max concurrency.",
"request_throughput": "Tput (req/s)",
# "input_throughput": "Input Tput (tok/s)",
# "output_throughput": "Output Tput (tok/s)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
# "total_input_tokens": "Total input tokens",
# "total_output_tokens": "Total output tokens",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
# "mean_tpot_ms": "Mean TPOT (ms)",
# "median_tpot_ms": "Median",
# "p99_tpot_ms": "P99",
"mean_tpot_ms": "Mean TPOT (ms)",
"median_tpot_ms": "Median",
"p99_tpot_ms": "P99",
"mean_itl_ms": "Mean ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"p99_itl_ms": "P99 ITL (ms)",
@@ -65,24 +81,134 @@ def read_markdown(file):
def results_to_json(latency, throughput, serving):
return json.dumps({
'latency': latency.to_dict(),
'throughput': throughput.to_dict(),
'serving': serving.to_dict()
})
return json.dumps(
{
"latency": latency.to_dict(),
"throughput": throughput.to_dict(),
"serving": serving.to_dict(),
}
)
def get_size_with_unit(bytes, suffix="B"):
    """
    Scale a byte count to a human-readable string using 1024-based units.

    e.g:
    1253656 => '1.20MB'
    1253656678 => '1.17GB'

    Args:
        bytes: size in bytes (int or float).
        suffix: unit suffix appended after the scale prefix.

    Returns:
        The value formatted with two decimals followed by the scale prefix
        ("", K, M, G, T, P, or E) and ``suffix``.
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    # Still >= 1024 after the P step: report in the next unit instead of
    # falling off the loop and returning None.
    return f"{bytes:.2f}E{suffix}"
def _coerce(val: str) -> Any:
"""Best-effort type coercion from string to Python types."""
low = val.lower()
if low == "null":
return None
if low == "true":
return True
if low == "false":
return False
# integers
if re.fullmatch(r"[+-]?\d+", val):
try:
return int(val)
except ValueError:
pass
# floats (keep 'inf'/'-inf'/'nan' as strings)
if re.fullmatch(r"[+-]?\d*\.\d+", val):
try:
return float(val)
except ValueError:
pass
return val
def parse_client_command(cmd: str) -> dict[str, Any]:
    """Split a shell command string into {executable, script, args}.

    The first two tokens are taken as the executable and the script. Of the
    remaining tokens only ``--long`` options are recorded:

    - ``--key=value`` and ``--key value`` store the coerced value,
    - a lone ``--key`` stores True,
    - ``--metadata`` collects its ``k=v`` / bare tokens into a dict,
    - any other positional token is ignored.

    Raises ValueError when the command has fewer than two tokens.
    """
    tokens = shlex.split(cmd)
    if len(tokens) < 2:
        raise ValueError("client_command must include an executable and a script")

    def _record_metadata(store: dict, item: str) -> None:
        # "k=v" stores the coerced value; a bare token is stored as a True flag.
        if "=" in item:
            name, raw = item.split("=", 1)
            store[name] = _coerce(raw)
        else:
            store[item] = True

    executable, script, *rest = tokens
    args: dict[str, Any] = {}
    total = len(rest)
    pos = 0
    while pos < total:
        tok = rest[pos]

        if not tok.startswith("--"):
            # unexpected positional; skip
            pos += 1
            continue

        if "=" in tok:
            # --key=value form (value may itself be a metadata k=v pair)
            key, val = tok.split("=", 1)
            if key == "--metadata":
                md = {}
                if val:
                    _record_metadata(md, val)
                args[key] = md
            else:
                args[key] = _coerce(val)
            pos += 1
            continue

        if tok == "--metadata":
            # consume metadata entries until the next --flag
            pos += 1
            md = {}
            while pos < total and not rest[pos].startswith("--"):
                _record_metadata(md, rest[pos])
                pos += 1
            args[tok] = md
            continue

        # --key value, or a lone boolean flag
        has_value = pos + 1 < total and not rest[pos + 1].startswith("--")
        if has_value:
            args[tok] = _coerce(rest[pos + 1])
            pos += 2
        else:
            args[tok] = True
            pos += 1

    return {"executable": executable, "script": script, "args": args}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-r",
"--result",
type=str,
default="results",
help="Folder name for benchmark output results.",
)
args = parser.parse_args()
results_folder = Path(args.result)
if not results_folder.exists():
raise FileNotFoundError(f"results folder does not exist: {results_folder}")
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# this result is generated via `vllm bench serve` command
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
@@ -90,18 +216,50 @@ if __name__ == "__main__":
except OSError as e:
print(e)
continue
# Parse Server Command Arg
out: dict[str, Any] = {
"server_command": parse_client_command(command["server_command"])
}
parse_args = [
"--tensor-parallel-size",
"--pipeline-parallel-size",
"--dtype",
]
col_mapping = ["tp_size", "pp_size", "dtype"]
for index, arg in enumerate(parse_args):
if arg in out["server_command"]["args"]:
raw_result.update(
{col_mapping[index]: out["server_command"]["args"][arg]}
)
# Parse Client Command Arg
out: dict[str, Any] = {
"client_command": parse_client_command(command["client_command"])
}
parse_args = [
"--dataset-name",
"--random-input-len",
"--random-output-len",
"--request-rate",
]
col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
for index, arg in enumerate(parse_args):
if arg in out["client_command"]["args"]:
raw_result.update(
{col_mapping[index]: out["client_command"]["args"][arg]}
)
# Add Server, Client command
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
elif "latency" in f.name:
# this result is generated via `benchmark_latency.py`
# this result is generated via `vllm bench latency` command
# attach the benchmarking command to raw_result
try:
@@ -120,7 +278,8 @@ if __name__ == "__main__":
for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms
raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
)
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result
@@ -128,7 +287,7 @@ if __name__ == "__main__":
continue
elif "throughput" in f.name:
# this result is generated via `benchmark_throughput.py`
# this result is generated via `vllm bench throughput` command
# attach the benchmarking command to raw_result
try:
@@ -153,26 +312,51 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json(latency_results, throughput_results,
serving_results)
svmem = psutil.virtual_memory()
platform_data = {
"Physical cores": [psutil.cpu_count(logical=False)],
"Total cores": [psutil.cpu_count(logical=True)],
"Total Memory": [get_size_with_unit(svmem.total)],
}
if util.find_spec("numa") is not None:
from numa import info
platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]
if util.find_spec("cpuinfo") is not None:
from cpuinfo import get_cpu_info
platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]
platform_results = pd.DataFrame.from_dict(
platform_data, orient="index", columns=["Platform Info"]
)
raw_results_json = results_to_json(
latency_results, throughput_results, serving_results
)
# remapping the key, for visualization purpose
if not latency_results.empty:
latency_results = latency_results[list(
latency_column_mapping.keys())].rename(
columns=latency_column_mapping)
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
columns=latency_column_mapping
)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
valid_columns = [
col for col in serving_column_mapping if col in serving_results.columns
]
serving_results = serving_results[valid_columns].rename(
columns=serving_column_mapping
)
if not throughput_results.empty:
throughput_results = throughput_results[list(
throughput_results_column_mapping.keys())].rename(
columns=throughput_results_column_mapping)
throughput_results = throughput_results[
list(throughput_results_column_mapping.keys())
].rename(columns=throughput_results_column_mapping)
processed_results_json = results_to_json(latency_results,
throughput_results,
serving_results)
processed_results_json = results_to_json(
latency_results, throughput_results, serving_results
)
for df in [latency_results, serving_results, throughput_results]:
if df.empty:
@@ -184,38 +368,45 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
)
# get markdown tables
latency_md_table = tabulate(latency_results,
headers='keys',
tablefmt='pipe',
showindex=False)
serving_md_table = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
throughput_md_table = tabulate(throughput_results,
headers='keys',
tablefmt='pipe',
showindex=False)
latency_md_table = tabulate(
latency_results, headers="keys", tablefmt="pipe", showindex=False
)
serving_md_table = tabulate(
serving_results, headers="keys", tablefmt="pipe", showindex=False
)
throughput_md_table = tabulate(
throughput_results, headers="keys", tablefmt="pipe", showindex=False
)
platform_md_table = tabulate(
platform_results, headers="keys", tablefmt="pipe", showindex=True
)
# document the result
with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown("../.buildkite/nightly-benchmarks/" +
"performance-benchmarks-descriptions.md")
md_file = "benchmark_results.md"
json_file = "benchmark_results.json"
with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/"
+ "performance-benchmarks-descriptions.md"
)
results = results.format(
latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json)
platform_markdown_table=platform_md_table,
benchmarking_results_in_json_string=processed_results_json,
)
f.write(results)
# document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f:
results = latency_results.to_dict(
orient='records') + throughput_results.to_dict(
orient='records') + serving_results.to_dict(orient='records')
with open(results_folder / json_file, "w") as f:
results = (
latency_results.to_dict(orient="records")
+ throughput_results.to_dict(orient="records")
+ serving_results.to_dict(orient="records")
)
f.write(json.dumps(results))

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
@@ -14,15 +15,12 @@ def main(model, cachedir):
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer")
parser.add_argument("--model",
type=str,
required=True,
help="Name of the model")
parser.add_argument("--cachedir",
type=str,
required=True,
help="Directory to save the tokenizer")
description="Download and save Hugging Face tokenizer"
)
parser.add_argument("--model", type=str, required=True, help="Name of the model")
parser.add_argument(
"--cachedir", type=str, required=True, help="Directory to save the tokenizer"
)
args = parser.parse_args()
main(args.model, args.cachedir)

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
@@ -11,33 +12,33 @@ from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
description="Parse command line arguments for summary-nightly-results script."
)
parser.add_argument(
"--results-folder",
type=str,
required=True,
help="The folder where the results are stored.",
)
parser.add_argument(
"--description", type=str, required=True, help="Description of the results."
)
args = parser.parse_args()
return args
def get_perf(df, method, model, metric):
means = []
for qps in [2, 4, 8, 16, "inf"]:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
target = target & df['Test name'].str.contains("qps_" + str(qps))
target = df["Test name"].str.contains(model)
target = target & df["Engine"].str.contains(method)
target = target & df["Test name"].str.contains("qps_" + str(qps))
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
means.append(0.0)
else:
means.append(filtered_df[metric].values[0])
@@ -45,7 +46,6 @@ def get_perf(df, method, model, metric):
def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist()
@@ -60,7 +60,8 @@ def get_perf_w_std(df, method, model, metric):
else:
assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)")
df, method, model, "Output Tput (tok/s)"
)
mean = mean.tolist()
std = None
@@ -80,18 +81,17 @@ def main(args):
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
with open(args.description) as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
description = description.format(nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
if __name__ == '__main__':
if __name__ == "__main__":
args = parse_arguments()
main(args)

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from lmdeploy.serve.openai.api_client import APIClient

View File

@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
}
@@ -95,12 +95,14 @@ json2args() {
}
kill_gpu_processes() {
pkill -f python
pkill -f python3
pkill -f tritonserver
pkill -f pt_main_thread
pkill -f text-generation
pkill -f lmdeploy
pkill -f '[p]ython'
pkill -f '[p]ython3'
pkill -f '[t]ritonserver'
pkill -f '[p]t_main_thread'
pkill -f '[t]ext-generation'
pkill -f '[l]mdeploy'
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pkill -f '[V]LLM'
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
@@ -125,7 +127,7 @@ ensure_installed() {
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
@@ -225,7 +227,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="python3 benchmark_serving.py \
client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -246,7 +248,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \
client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -265,13 +267,13 @@ run_serving_tests() {
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -302,7 +304,7 @@ run_serving_tests() {
}
run_genai_perf_tests() {
# run genai-perf tests
# run genai-perf tests
# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
@@ -311,14 +313,14 @@ run_genai_perf_tests() {
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@@ -369,10 +371,10 @@ run_genai_perf_tests() {
qps=$num_prompts
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
@@ -380,7 +382,7 @@ run_genai_perf_tests() {
client_command="genai-perf profile \
-m $model \
--service-kind openai \
--backend vllm \
--backend "$backend" \
--endpoint-type chat \
--streaming \
--url localhost:$port \
@@ -413,7 +415,7 @@ prepare_dataset() {
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() {

View File

@@ -10,15 +10,38 @@ set -x
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
if command -v nvidia-smi; then
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
fi
echo "GPU type is $gpu_type"
}
check_cpus() {
  # Read the NUMA node count reported by lscpu and abort (exit 1) when no
  # node is found. Publishes two globals for the rest of the script:
  #   numa_count - third field of the "NUMA node(s):" line of lscpu
  #   gpu_type   - fixed to "cpu" on this code path
  declare -g numa_count=$(lscpu | awk '/NUMA node\(s\):/ {print $3}')
  # Guard clause: anything that is not a positive count ends the run.
  [[ $numa_count -gt 0 ]] || {
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  }
  echo "NUMA found."
  echo $numa_count
  declare -g gpu_type="cpu"
  echo "GPU type is $gpu_type"
}
@@ -60,6 +83,22 @@ json2args() {
echo "$args"
}
json2envs() {
  # Transforms a JSON object into a space-separated list of KEY=VALUE
  # environment assignments, printed on stdout.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  #
  # $1: JSON string. A JSON null (which is what `jq -r '.some_key'` prints
  #     when the key is absent) or an empty/missing argument yields an empty
  #     string instead of a jq "null has no keys" error.
  #
  # NOTE: values are emitted verbatim (not shell-quoted), so a value that
  # contains whitespace will be split when the result is spliced into a
  # command line.
  local json_string=${1:-null}
  # Declare and assign separately so the exit status of the jq pipeline is
  # not masked by `local`.
  local args
  args=$(
    echo "$json_string" | jq -r '
      (. // {}) |
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
@@ -87,12 +126,19 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
if command -v nvidia-smi; then
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
elif command -v amd-smi; then
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
sleep 1
done
fi
# remove vllm config file
rm -rf ~/.config/vllm
@@ -119,7 +165,7 @@ upload_to_buildkite() {
}
run_latency_tests() {
# run latency tests using `benchmark_latency.py`
# run latency tests using `vllm bench latency` command
# $1: a json file specifying latency test cases
local latency_test_file
@@ -143,15 +189,26 @@ run_latency_tests() {
# get arguments
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
latency_envs=$(json2envs "$latency_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi
latency_command="python3 benchmark_latency.py \
latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
@@ -177,7 +234,7 @@ run_latency_tests() {
}
run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py`
# run throughput tests using `vllm bench throughput`
# $1: a json file specifying throughput test cases
local throughput_test_file
@@ -201,15 +258,26 @@ run_throughput_tests() {
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
throughput_envs=$(json2envs "$throughput_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi
throughput_command="python3 benchmark_throughput.py \
throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
@@ -234,7 +302,7 @@ run_throughput_tests() {
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
@@ -257,18 +325,36 @@ run_serving_tests() {
# get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"
# check if there is enough GPU to run the test
# check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi
# check if server model and client model is aligned
@@ -279,23 +365,33 @@ run_serving_tests() {
continue
fi
server_command="python3 \
server_command="$server_envs python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
if wait_for_server; then
echo ""
echo "vllm server is up and running."
# support remote vllm server
client_remote_args=""
if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
if wait_for_server; then
echo ""
echo "vLLM server is up and running."
else
echo ""
echo "vLLM failed to start within the timeout period."
fi
else
echo ""
echo "vllm failed to start within the timeout period."
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then
client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
else
client_remote_args=" --host=$REMOTE_HOST "
fi
fi
# iterate over different QPS
@@ -307,35 +403,39 @@ run_serving_tests() {
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name"
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
--metadata "tensor_parallel_size=$tp" \
$client_args $client_remote_args "
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="python3 benchmark_serving.py \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--metadata "tensor_parallel_size=$tp" \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
bash -c "$client_command"
bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
done
# clean up
@@ -345,7 +445,14 @@ run_serving_tests() {
}
main() {
check_gpus
local ARCH
ARCH=''
if [ "$ON_CPU" == "1" ];then
check_cpus
ARCH='-cpu'
else
check_gpus
fi
check_hf_token
# Set to v1 to run v1 benchmark
@@ -358,7 +465,7 @@ main() {
(which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by benchmark_serving.py
# get the current IP address, required by `vllm bench serve` command
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn off the reporting of the status of each request, to clean up the terminal output
export VLLM_LOGGING_LEVEL="WARNING"
@@ -371,9 +478,9 @@ main() {
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
# postprocess benchmarking results
pip install tabulate pandas

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json
@@ -34,10 +35,8 @@ serving_column_mapping = {
}
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
raw_result = json.loads(f.read())
@@ -56,17 +55,16 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
columns=serving_column_mapping
)
serving_md_table_with_headers = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
serving_md_table_with_headers = tabulate(
serving_results, headers="keys", tablefmt="pipe", showindex=False
)
# remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split('\n')
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
serving_md_table_lines = serving_md_table_with_headers.split("\n")
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@@ -76,10 +74,9 @@ if __name__ == "__main__":
# document results with header.
# for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers)
f.write('\n')
f.write("\n")
# document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient='records')
results = serving_results.to_dict(orient="records")
f.write(json.dumps(results))

View File

@@ -11,9 +11,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},

View File

@@ -0,0 +1,30 @@
[
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
}
]

View File

@@ -35,9 +35,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -90,9 +88,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -145,9 +141,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -197,9 +191,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -251,9 +243,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -305,9 +295,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},

View File

@@ -0,0 +1,610 @@
[
{
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@@ -0,0 +1,820 @@
[
{
"test_name": "serving_llama8B_bf16_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@@ -0,0 +1,168 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 100
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 100
}
}
]

View File

@@ -7,7 +7,6 @@
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -26,7 +25,6 @@
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -45,7 +43,6 @@
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -60,13 +57,14 @@
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"draft_tensor_parallel_size": 1
}
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@@ -0,0 +1,32 @@
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

46
.buildkite/pyproject.toml Normal file
View File

@@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true

View File

@@ -1,76 +1,138 @@
steps:
- label: "Build wheel - CUDA 12.4"
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build arm64 wheel - CUDA 12.9"
id: build-wheel-arm64-cuda-12-9
agents:
queue: cpu_queue_postmerge
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.1"
- block: "Build CUDA 12.8 wheel"
key: block-build-cu128-wheel
- label: "Build wheel - CUDA 12.8"
depends_on: block-build-cu128-wheel
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
# However, this block can be uncommented to save some compute hours.
# - block: "Build CUDA 11.8 wheel"
# key: block-build-cu118-wheel
- label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image"
- block: "Build CUDA 12.6 wheel"
key: block-build-cu126-wheel
depends_on: ~
key: block-release-image-build
- label: "Build release image"
depends_on: block-release-image-build
- label: "Build wheel - CUDA 12.6"
depends_on: block-build-cu126-wheel
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build release image (x86)"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build release image (arm64)"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
- label: "Create multi-arch manifest"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-12-9
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "yes | docker system prune -a"
- "git fetch --all"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
- docker-login#v3.0.0:
username: vllm
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
- input: "Provide Release version here"
id: input-release-version
fields:
- text: "What is the release version?"
key: "release-version"
key: release-version
- block: "Build CPU release image"
key: block-cpu-release-image-build
@@ -82,7 +144,30 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker push vllm/vllm-openai:nightly"
- "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"

View File

@@ -1,90 +0,0 @@
#!/bin/bash
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
# Run multi-lora tests
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
}
# All CPU tests are expected to finish in less than 40 mins.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

View File

@@ -1,24 +0,0 @@
#!/bin/bash
# This script builds the HPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup
# certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use
# separate remove_docker_container and remove_docker_container_and_exit
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_container() { docker rm -f hpu-test || true; }
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
trap remove_docker_container_and_exit EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
EXITCODE=$?

View File

@@ -1,54 +0,0 @@
#!/bin/bash
# This script builds the Neuron docker image and runs the offline inference example and the Neuron tests inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e
set -v
image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune -f
echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() {
docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT
# Run the image
docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"

View File

@@ -1,38 +0,0 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo TEST_1 \
&& pytest /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
&& echo TEST_4 \
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@@ -1,31 +0,0 @@
#!/bin/bash
# This script builds the XPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
# Run the image and test offline inference/tensor parallel
docker run \
--device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
sh -c '
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'

View File

@@ -0,0 +1,31 @@
#!/bin/bash
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
# Abort before annotating: every command in the annotation below embeds this value.
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
fi
# Post an 'info' annotation (context 'release-workflow') with copy-paste commands for
# downloading the wheels and for pulling, re-tagging and pushing the release image.
# The heredoc delimiter is unquoted, so ${RELEASE_VERSION} and ${BUILDKITE_COMMIT} are
# expanded here; the backticks are escaped so they reach the annotation literally.
# NOTE(review): the wheel list below contains a +cu118 entry but no +cu128/+cu129 entry;
# confirm it still matches the wheels the release pipeline actually uploads.
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# A log file path is mandatory; print the usage line and fail without one.
if (( $# < 1 )); then
    echo "Usage: $0 ci.log"
    exit 1
fi

INPUT_FILE="$1"

# Both passes rewrite the file in place. The timestamp pass must run first:
# its pattern is anchored at the start of the line.

# Pass 1: drop the leading "[YYYY-MM-DDTHH:MM:SSZ] " prefix from each line.
sed -i \
    's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' \
    "$INPUT_FILE"

# Pass 2: drop ANSI escape sequences ending in 'm' (color) or 'K' (erase line).
sed -i -r \
    's/\x1B\[[0-9;]*[mK]//g' \
    "$INPUT_FILE"

View File

@@ -0,0 +1,97 @@
#!/bin/bash
set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# Get DockerHub token from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
exit 1
fi
# Function to get all tags from DockerHub
get_all_tags() {
    # Print the names of all "nightly-" tags of the repository, one per line,
    # ordered by last_updated timestamp (newest first).
    #
    # Reads:   REPO_API_URL, DOCKERHUB_TOKEN
    # Outputs: tag names on stdout (nothing if no matching tag exists or the
    #          API response could not be parsed).
    local page=1
    local all_tags=""
    local response tags next_page
    local trace_was_on=0

    # The script runs under `set -x`. Tracing the curl command would print the
    # bearer token into the build log, so pause tracing while talking to the API.
    if [[ $- == *x* ]]; then
        trace_was_on=1
    fi
    set +x

    while true; do
        response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100") || true

        # Get both last_updated timestamp and tag name, separated by |
        tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') || tags=""
        if [ -n "$tags" ]; then
            all_tags="$all_tags$tags"$'\n'
        fi

        # Stop on the last page, not on the first page without a nightly tag:
        # tags are not grouped by prefix, so a page with no match can still be
        # followed by pages that contain some. The list response carries a
        # `next` URL that is null on the final page (and is absent on errors).
        next_page=$(echo "$response" | jq -r '.next // empty') || next_page=""
        if [ -z "$next_page" ]; then
            break
        fi

        page=$((page + 1))
    done

    if [ "$trace_was_on" -eq 1 ]; then
        set -x
    fi

    # Sort by timestamp (newest first) and extract just the tag names
    echo "$all_tags" | sort -r | cut -d'|' -f2
}
delete_tag() {
    # Delete a single tag of vllm/vllm-openai through the DockerHub API.
    #
    # Arguments: $1 - name of the tag to delete
    # Reads:     DOCKERHUB_TOKEN
    # Returns:   always 0; a failed deletion is reported as a warning so that
    #            one bad tag does not abort the cleanup of the remaining ones.
    local tag_name="$1"
    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
    local response http_status body detail
    local trace_was_on=0

    echo "Deleting tag: $tag_name"

    # Keep the bearer token out of the `set -x` trace in the build log.
    if [[ $- == *x* ]]; then
        trace_was_on=1
    fi
    set +x

    # Append the HTTP status on its own last line so success is decided by the
    # status code. An empty body (e.g. curl could not connect) or an error
    # payload without a `.detail` field must not be mistaken for success.
    response=$(curl -s -w '\n%{http_code}' -X DELETE \
        -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url") || true

    if [ "$trace_was_on" -eq 1 ]; then
        set -x
    fi

    http_status="${response##*$'\n'}"
    body="${response%$'\n'*}"

    if [[ "$http_status" =~ ^2[0-9][0-9]$ ]]; then
        echo "Successfully deleted tag: $tag_name"
    else
        # NOTE(review): error payloads are assumed to carry `detail` or
        # `message`; confirm against the DockerHub API reference.
        detail=$(echo "$body" | jq -r '.detail // .message // empty' 2>/dev/null) || detail=""
        echo "Warning: Failed to delete tag $tag_name (HTTP ${http_status:-unknown}): $detail"
    fi
}
# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
echo "Fetching all tags from DockerHub..."
all_tags=$(get_all_tags)
if [ -z "$all_tags" ]; then
echo "No tags found to clean up"
exit 0
fi
# Count total tags
total_tags=$(echo "$all_tags" | wc -l)
echo "Found $total_tags tags"
# Keep only the last 14 builds (including the current one)
tags_to_keep=14
tags_to_delete=$((total_tags - tags_to_keep))
if [ $tags_to_delete -le 0 ]; then
echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
exit 0
fi
echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
# Get tags to delete (skip the first $tags_to_keep tags)
tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
if [ -z "$tags_to_delete_list" ]; then
echo "No tags to delete"
exit 0
fi
# Delete old tags
echo "Deleting old tags..."
while IFS= read -r tag; do
if [ -n "$tag" ]; then
delete_tag "$tag"
# Add a small delay to avoid rate limiting
sleep 1
fi
done <<< "$tags_to_delete_list"
echo "Cleanup completed successfully"

View File

@@ -3,6 +3,9 @@
# This script runs tests inside the corresponding ROCm docker container.
set -o pipefail
# Export Python path
export PYTHONPATH=".."
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
@@ -74,50 +77,102 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py \
--ignore=kernels/test_cascade_flash_attn.py \
--ignore=kernels/test_mamba_mixer2.py \
--ignore=kernels/test_aqlm.py \
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \
--ignore=kernels/test_block_fp8.py \
--ignore=kernels/test_permute_cols.py"
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_chat.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
@@ -126,6 +181,8 @@ fi
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# Check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
@@ -134,9 +191,10 @@ if [[ $commands == *"--shard-id="* ]]; then
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
@@ -145,6 +203,7 @@ if [[ $commands == *"--shard-id="* ]]; then
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
@@ -163,9 +222,10 @@ if [[ $commands == *"--shard-id="* ]]; then
fi
done
else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
@@ -174,6 +234,7 @@ else
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"

View File

@@ -0,0 +1,49 @@
#!/bin/bash
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Setup cleanup
remove_docker_container() {
# Cleanup routine: installed as the EXIT trap and also called once up front.
# On the up-front call container_id is still unset, so only the prune runs.
if [[ -n "$container_id" ]]; then
# Stops ALL podman containers (--all), not only this script's, with a zero-second timeout.
podman stop --all -t0
# Removal is best-effort: "|| true" keeps a failure here from failing the cleanup.
podman rm -f "$container_id" || true
fi
# Always prune unused podman data, whether or not a container was started.
podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
# Runs the smoke tests inside the container identified by $container_id.
# The function is exported and executed in a child `bash -c`, so $container_id must be exported as well.
# Each `podman exec` starts its own bash with `set -e`, so the first failing command fails that exec.
function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
# Test-only dependencies are installed inside the container right before the pytest invocations.
podman exec -it "$container_id" bash -c "
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests

View File

@@ -10,5 +10,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
docker build -t cpu-test -f docker/Dockerfile.s390x .

View File

@@ -0,0 +1,123 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
    # Force-remove both test containers (default build and AVX2 build) for this NUMA node.
    # The trailing `|| true` keeps cleanup from failing when a container does not exist.
    set -e
    docker rm -f "cpu-test-${NUMA_NODE}" "cpu-test-${NUMA_NODE}-avx2" || true
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
# Test driver, invoked as `cpu_tests CORE_RANGE NUMA_NODE` from a child `bash -c` under `timeout`.
#   $1 - core range; accepted for the call signature but not read in the body
#   $2 - NUMA node; selects the cpu-test-<node> and cpu-test-<node>-avx2 containers
# Every `docker exec` starts its own bash with `set -e`, and this function also runs under `set -e`,
# so the first failing step aborts the whole run.
function cpu_tests() {
set -e
export NUMA_NODE=$2
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
# The language generation suite runs twice: once with defaults and once with VLLM_CPU_SGL_KERNEL=1.
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
# Note: disable Bart until supports V1
pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
# VLLM_USE_V1=0 pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/lora/test_qwen2vl.py"
# online serving: tp+pp
# Single-quoted on purpose: $E2E_OMP_THREADS, $! and $server_pid are expanded by the bash inside
# the container, not by this script. The server is started in the background, polled for up to
# 600 seconds, benchmarked, and then sent SIGTERM.
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
# Same flow as the tp+pp case above, with -dp=2 in place of -pp=2.
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All CPU tests are expected to finish within the 2-hour timeout enforced below.
export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@@ -9,14 +9,14 @@ python3 use_existing_torch.py
# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t gh200-test \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
--build-arg torch_cuda_arch_list="9.0+PTX"
# Setup cleanup
remove_docker_container() { docker rm -f gh200-test || true; }

View File

@@ -0,0 +1,56 @@
#!/bin/bash
# This script builds the HPU plugin v1 test image and runs the vllm-gaudi upstream CI tests inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -exuo pipefail
# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
FROM gaudi-base-image:latest
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN git clone https://github.com/vllm-project/vllm-gaudi.git
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
EOF
# Setup cleanup
# Certain versions of the HPU software stack have a bug that can override
# the exit code of the script, so the EXIT trap exits explicitly with
# $EXITCODE instead of relying on the status of the last command.
# EXITCODE starts at 1 so that any unexpected early exit is reported as a failure.
EXITCODE=1
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
# Remove a container that may have been left behind by a previous run.
remove_docker_containers

echo "Running HPU plugin v1 test"
# Run the container as an `if` condition: with `set -e` active, a bare failing
# `docker run` would terminate the script before its exit code could be
# captured, hiding the real status and skipping the failure message below.
if docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
    -e HABANA_VISIBLE_DEVICES=all \
    hpu-plugin-v1-test-env \
    /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"; then
    EXITCODE=0
else
    EXITCODE=$?
fi

if [ $EXITCODE -eq 0 ]; then
    echo "Test with basic model passed"
else
    echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
fi
# The trap will handle the container removal and final exit.

View File

@@ -0,0 +1,167 @@
#!/bin/bash
set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
# Frees Docker disk space when the filesystem holding the Docker root directory
# is more than 70% full. Exits the script with status 1 when the Docker root
# directory cannot be determined; otherwise returns after an optional prune.
cleanup_docker() {
    # Get Docker's root directory
    docker_root=$(docker info -f '{{.DockerRootDir}}')
    if [ -z "$docker_root" ]; then
        echo "Failed to determine Docker root directory."
        exit 1
    fi
    echo "Docker root directory: $docker_root"
    # Check disk usage of the filesystem where Docker's root directory is located.
    # -P selects the POSIX output format, which guarantees a single line per
    # filesystem, so the capacity is always field 5 of the second line.
    disk_usage=$(df -P "$docker_root" | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
    # Cleanup is best-effort: skip it with a clear message when the value is not a number.
    if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
        echo "Could not parse disk usage for $docker_root; skipping Docker cleanup."
        return 0
    fi
    # Define the threshold
    threshold=70
    if [ "$disk_usage" -gt "$threshold" ]; then
        echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
        # Remove dangling images (those that are not tagged and not used by any container)
        docker image prune -f
        # Remove unused volumes / force the system prune for old images as well.
        docker volume prune -f && docker system prune --force --filter "until=72h" --all
        echo "Docker images and volumes cleanup completed."
    else
        echo "Disk usage is below $threshold%. No cleanup needed."
    fi
}
cleanup_docker
# For HF_TOKEN.
source /etc/environment
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
# run_test NUM NAME COMMAND
#   NUM     - test number, used in log markers and in the log file name
#   NAME    - human-readable test name used in messages
#   COMMAND - command string, executed with eval
# Output is appended to $RESULTS_DIR/test_NUM.log while still being shown on the console.
# Returns the exit status of COMMAND (0 on success).
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
# stdout and stderr each pass through their own tee (process substitution), so both streams
# stay on their original descriptors and are also appended to the same log file.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
# Capture the status immediately, before any other command overwrites $?.
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
# Replay the per-test log on stderr so the failure output is grouped in one place.
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
    # Arguments (test number, display name, command string) are forwarded unchanged to run_test.
    # run_test already logs the failure details, so a failure only flips the sticky overall flag.
    if ! run_test "$1" "$2" "$3"; then
        overall_script_exit_code=1
    fi
}
# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
run_and_track_test 7 "test_tpu_int8.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@@ -0,0 +1,175 @@
#!/bin/bash
set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
# Frees Docker disk space when the filesystem holding the Docker root directory
# is more than 70% full. Exits the script with status 1 when the Docker root
# directory cannot be determined; otherwise returns after an optional prune.
cleanup_docker() {
    # Get Docker's root directory
    docker_root=$(docker info -f '{{.DockerRootDir}}')
    if [ -z "$docker_root" ]; then
        echo "Failed to determine Docker root directory."
        exit 1
    fi
    echo "Docker root directory: $docker_root"
    # Check disk usage of the filesystem where Docker's root directory is located.
    # -P selects the POSIX output format, which guarantees a single line per
    # filesystem, so the capacity is always field 5 of the second line.
    disk_usage=$(df -P "$docker_root" | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
    # Cleanup is best-effort: skip it with a clear message when the value is not a number.
    if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
        echo "Could not parse disk usage for $docker_root; skipping Docker cleanup."
        return 0
    fi
    # Define the threshold
    threshold=70
    if [ "$disk_usage" -gt "$threshold" ]; then
        echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
        # Remove dangling images (those that are not tagged and not used by any container)
        docker image prune -f
        # Remove unused volumes / force the system prune for old images as well.
        docker volume prune -f && docker system prune --force --filter "until=72h" --all
        echo "Docker images and volumes cleanup completed."
    else
        echo "Disk usage is below $threshold%. No cleanup needed."
    fi
}
cleanup_docker
# For HF_TOKEN.
source /etc/environment
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
# run_test NUM NAME COMMAND
#   NUM     - test number, used in log markers and in the log file name
#   NAME    - human-readable test name used in messages
#   COMMAND - command string, executed with eval
# Output is appended to $RESULTS_DIR/test_NUM.log while still being shown on the console.
# Returns the exit status of COMMAND (0 on success).
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
# stdout and stderr each pass through their own tee (process substitution), so both streams
# stay on their original descriptors and are also appended to the same log file.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
# Capture the status immediately, before any other command overwrites $?.
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
# Replay the per-test log on stderr so the failure output is grouped in one place.
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
    # Arguments (test number, display name, command string) are forwarded unchanged to run_test.
    # run_test already logs the failure details, so a failure only flips the sticky overall flag.
    if ! run_test "$1" "$2" "$3"; then
        overall_script_exit_code=1
    fi
}
# --- Actual Test Execution ---
run_and_track_test 0 "test_perf.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@@ -0,0 +1,50 @@
#!/bin/bash
# This script builds the XPU docker image and runs offline inference and V1 tests inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {
    # Each step is best-effort (`|| true`) so that a failed cleanup step never
    # aborts the script under `set -e`: drop the container, then the image
    # built for this commit, then prune whatever else is unused.
    docker rm -f "${container_name}" || true
    docker image rm -f "${image_name}" || true
    docker system prune -f || true
}
trap remove_docker_container EXIT
# Run the image and test offline inference/tensor parallel
docker run \
--device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \
-e "HF_TOKEN=${HF_TOKEN}" \
-e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_serial_utils.py
pytest -v -s v1/test_utils.py
pytest -v -s v1/test_metrics_reader.py
'

View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Usage: ./rerun_test.sh path/to/test.py::test_name
#
# Re-runs a single pytest target for as long as it keeps passing and stops at
# the first failing run.

# A test selector is mandatory; print usage and bail out when it is missing.
if (( $# < 1 )); then
    echo "Usage: $0 path/to/test.py::test_name"
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
    exit 1
fi

TEST=$1
COUNT=1

# After every passing run, bump the counter and print it before the next run starts.
until ! pytest -sv "$TEST"; do
    (( COUNT += 1 ))
    echo "RUN NUMBER ${COUNT}"
done

View File

@@ -5,16 +5,16 @@
set -ex
set -o pipefail
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
# cd 2 levels into the working directory
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
vllm bench serve \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@@ -3,7 +3,7 @@
set -euox pipefail
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# Frees Docker disk space when the filesystem holding the Docker root directory
# is more than 70% full. Exits non-zero only when the Docker root directory
# cannot be determined or a docker/df command fails.
set -euo pipefail

docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
fi
echo "Docker root directory: $docker_root"

# Check disk usage of the filesystem where Docker's root directory is located.
# -P selects the POSIX output format, which guarantees a single line per
# filesystem, so the capacity is always field 5 of the second line.
disk_usage=$(df -P "$docker_root" | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
# Cleanup is best-effort: skip it with a clear message when the value is not a number.
if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
    echo "Could not parse disk usage for $docker_root; skipping Docker cleanup."
    exit 0
fi

# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=24h" --all
    echo "Docker images and volumes cleanup completed."
else
    echo "Disk usage is below $threshold%. No cleanup needed."
fi

View File

@@ -0,0 +1,14 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=tpu-test
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@@ -0,0 +1,90 @@
#!/bin/bash
# Usage: <this script> ENV_FILE
#
# Starts the TPU benchmark container described by ENV_FILE, runs
# .buildkite/scripts/tpu/run_bm.sh inside it, copies the logs back, and fails
# when the measured request throughput is missing or below EXPECTED_THROUGHPUT.
# ENV_FILE is sourced here and also passed to docker via --env-file.
# Note: `set -e` is NOT enabled in this script; failures are detected by the
# explicit checks below.

if [ ! -f "$1" ]; then
    echo "Error: The env file '$1' does not exist."
    exit 1  # Exit the script with a non-zero status to indicate an error
fi

ENV_FILE=$1

# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source "$ENV_FILE"

remove_docker_container() {
    docker rm -f "$CONTAINER_NAME" || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Without `set -e` a failing mktemp would go unnoticed, so check it explicitly.
LOG_ROOT=$(mktemp -d) || { echo "Error: failed to create a temporary log directory."; exit 1; }
echo "Results will be stored in: $LOG_ROOT"

if [ -z "$HF_TOKEN" ]; then
    echo "Error: HF_TOKEN is not set or is empty."
    exit 1
fi

# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
    exit 1
fi

echo "Run model $MODEL"
echo

echo "starting docker...$CONTAINER_NAME"
echo
# The container just idles (tail -f /dev/null); the benchmark is driven via docker exec below.
docker run \
    -v "$DOWNLOAD_DIR:$DOWNLOAD_DIR" \
    --env-file "$ENV_FILE" \
    -e HF_TOKEN="$HF_TOKEN" \
    -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
    -e MODEL="$MODEL" \
    -e WORKSPACE=/workspace \
    --name "$CONTAINER_NAME" \
    -d \
    --privileged \
    --network host \
    -v /dev/shm:/dev/shm \
    vllm/vllm-tpu-bm tail -f /dev/null

echo "run script..."
echo
# A failure here is not fatal by itself; it surfaces through the throughput checks at the end.
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"

echo "copy result back..."
VLLM_LOG="$LOG_ROOT/${TEST_NAME}_vllm_log.txt"
BM_LOG="$LOG_ROOT/${TEST_NAME}_bm_log.txt"
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"

# Keep only digits and dots from the "Request throughput (req/s):" line.
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"

if [ "$BUILDKITE" = "true" ]; then
    echo "Running inside Buildkite"
    buildkite-agent artifact upload "$VLLM_LOG"
    buildkite-agent artifact upload "$BM_LOG"
else
    echo "Not running inside Buildkite"
fi

#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo "Failed to get the throughput"
    exit 1
fi

# bc prints 1 when the comparison is true, which the arithmetic context treats as success.
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
    echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
    exit 1
fi

View File

@@ -0,0 +1,14 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=tpu-test
# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
MAX_NUM_SEQS=128
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=10.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@@ -0,0 +1,93 @@
#!/bin/bash
# Runs inside the benchmark container: starts `vllm serve` in the background,
# waits for it to come up, runs `vllm bench serve` against it and prints the
# request throughput.
# Required environment: WORKSPACE, MODEL, MAX_NUM_SEQS, MAX_NUM_BATCHED_TOKENS,
# TENSOR_PARALLEL_SIZE, DOWNLOAD_DIR, MAX_MODEL_LEN, INPUT_LEN, OUTPUT_LEN.
# Optional environment: TARGET_COMMIT (when non-empty it must match HEAD).

set -euo pipefail

VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"

# ${TARGET_COMMIT:-} keeps `set -u` from aborting when the variable is not set at all.
if [ -n "${TARGET_COMMIT:-}" ]; then
    head_hash=$(git rev-parse HEAD)
    if [ "$TARGET_COMMIT" != "$head_hash" ]; then
        echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
        exit 1
    fi
fi

echo "model: $MODEL"
echo

#
# create a log folder
#
# -p: do not fail (and abort via `set -e`) when the folder already exists.
mkdir -p "$WORKSPACE/log"

# TODO: Move to image building.
pip install pandas
pip install datasets

#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
    cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo

VLLM_USE_V1=1 vllm serve "$MODEL" \
    --seed 42 \
    --max-num-seqs "$MAX_NUM_SEQS" \
    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
    --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
    --no-enable-prefix-caching \
    --download_dir "$DOWNLOAD_DIR" \
    --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
server_pid=$!

echo "wait for 20 minutes.."
echo
# Poll the server log every 10 seconds, up to 120 times (20 minutes in total).
started=false
for _ in {1..120}; do
    # TODO: detect other type of errors.
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
        echo "Detected RuntimeError, exiting."
        exit 1
    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
        echo "Application started"
        started=true
        break
    elif ! kill -0 "$server_pid" 2>/dev/null; then
        # Signal 0 only checks that the process still exists; stop waiting once it is gone.
        echo "vllm server process exited before startup completed, exiting."
        exit 1
    else
        echo "wait for 10 seconds..."
        sleep 10
    fi
done

# Do not benchmark a server that never reported a successful startup.
if [ "$started" != true ]; then
    echo "Timed out waiting for vllm server startup, exiting."
    exit 1
fi

#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
vllm bench serve \
    --backend vllm \
    --model "$MODEL" \
    --dataset-name sonnet \
    --dataset-path benchmarks/sonnet_4x.txt \
    --sonnet-input-len "$INPUT_LEN" \
    --sonnet-output-len "$OUTPUT_LEN" \
    --ignore-eos > "$BM_LOG"

echo "completed..."
echo

# Keep only digits and dots from the "Request throughput (req/s):" line.
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

View File

@@ -14,8 +14,19 @@ fi
# Get the single wheel file
wheel="${wheel_files[0]}"
# Rename 'linux' to 'manylinux1' in the wheel filename
new_wheel="${wheel/linux/manylinux1}"
# Detect architecture and rename 'linux' to appropriate manylinux version
arch=$(uname -m)
if [[ $arch == "x86_64" ]]; then
manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
manylinux_version="manylinux2014"
else
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
manylinux_version="manylinux1"
fi
# Rename 'linux' to the appropriate manylinux version in the wheel filename
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
@@ -47,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu121 wheels"
if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
# if $normal_wheel matches cu128, do not upload the index.html
echo "Skipping index files for cu128 wheels"
else
# only upload index.html for cu124 wheels (default wheels)
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -63,15 +75,17 @@ fi
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu121 wheels"
if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
# if $normal_wheel matches cu128, do not upload the index.html
echo "Skipping index files for cu128 wheels"
else
# only upload index.html for cu124 wheels (default wheels)
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"

File diff suppressed because it is too large Load Diff

6
.gemini/config.yaml Normal file
View File

@@ -0,0 +1,6 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
comment_severity_threshold: HIGH # Reduce quantity of comments
pull_request_opened:
summary: false # Don't summarize the PR in a separate comment

24
.github/.bc-linter.yml vendored Normal file
View File

@@ -0,0 +1,24 @@
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
version: 1
paths:
# We temporarily disable globally, and will only enable with `annotations.include`
# include:
# - "vllm/v1/attention/*.py"
# - "vllm/v1/core/*.py"
exclude:
- "**/*.py"
scan:
functions: true # check free functions and methods
classes: true # check classes/dataclasses
public_only: true # ignore names starting with "_" at any level
annotations:
include: # decorators that force-include a symbol
- name: "bc_linter_include" # matched by simple name or dotted suffix
propagate_to_members: false # for classes, include methods/inner classes
exclude: # decorators that force-exclude a symbol
- name: "bc_linter_skip" # matched by simple name or dotted suffix
propagate_to_members: true # for classes, exclude methods/inner classes
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]

103
.github/CODEOWNERS vendored
View File

@@ -5,37 +5,100 @@
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
CMakeLists.txt @tlrmchlsmth
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to VllmConfig can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @heheda12345
/vllm/v1/kv_cache_interface.py @heheda12345
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multi_step @alexm-redhat @comaniac
/tests/multimodal @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb
/tests/weight_loading @mgoin @youkaichao
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @heheda12345
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
# Docs
/docs @hmellor
mkdocs.yaml @hmellor
# CPU
/vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU
/vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang
# Qwen-specific files
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
# MTP-specific files
/vllm/model_executor/models/deepseek_mtp.py @luccafong
# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep
# ROCm related: specify owner with write access to notify AMD folks for careful code review
/docker/Dockerfile.rocm* @gshtras
/vllm/v1/attention/backends/rocm*.py @gshtras
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
# TPU
/vllm/v1/worker/tpu* @NickLucche
/vllm/platforms/tpu.py @NickLucche
/vllm/v1/sample/tpu @NickLucche
/vllm/tests/v1/tpu @NickLucche

View File

@@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@@ -8,25 +8,35 @@ body:
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
attributes:
value: |
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
- Passwords or authentication credentials
- Private URLs or endpoints
- Personal or confidential data
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
- type: textarea
attributes:
label: Your current environment
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
<details>
<summary>The output of `python collect_env.py`</summary>
<summary>The output of <code>python collect_env.py</code></summary>
```text
Your output of `python collect_env.py` here
```
</details>
validations:
required: true
@@ -75,20 +85,20 @@ body:
```
```
The error message you got, with the full traceback.
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
```
validations:
required: true
- type: markdown
attributes:
value: >
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
value: |
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
Thanks for contributing 🎉!
Thanks for reporting 🙏!
- type: checkboxes
id: askllm
attributes:

View File

@@ -0,0 +1,69 @@
name: 🧪 CI failure report
description: Report a failing test.
title: "[CI Failure]: "
labels: ["ci-failure"]
body:
- type: markdown
attributes:
value: >
#### Include the name of the failing Buildkite step and test file in the title.
- type: input
attributes:
label: Name of failing test
description: |
Paste in the fully-qualified name of the failing test from the logs.
placeholder: |
`path/to/test_file.py::test_name[params]`
validations:
required: true
- type: checkboxes
attributes:
label: Basic information
description: Select all items that apply to the failing test.
options:
- label: Flaky test
- label: Can reproduce locally
- label: Caused by external libraries (e.g. bug in `transformers`)
- type: textarea
attributes:
label: 🧪 Describe the failing test
description: |
Please provide a clear and concise description of the failing test.
placeholder: |
A clear and concise description of the failing test.
```
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
```
validations:
required: true
- type: textarea
attributes:
label: 📝 History of failing test
description: |
Since when did the test start to fail?
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
placeholder: |
Approximate timeline and/or problematic PRs
A link to the Buildkite analytics of the failing test (if available)
validations:
required: true
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
- type: markdown
attributes:
value: >
Thanks for reporting 🙏!

View File

@@ -9,7 +9,7 @@ body:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.

View File

@@ -35,7 +35,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@@ -46,7 +46,7 @@ body:
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time. While most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:

View File

@@ -1,6 +1,21 @@
FILL IN THE PR DESCRIPTION HERE
<!-- markdownlint-disable -->
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
FIX #xxxx (*link existing issues this PR will resolve*)
## Purpose
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
## Test Plan
## Test Result
---
<details>
<summary> Essential Elements of an Effective PR Description Checklist </summary>
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
</details>
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

206
.github/mergify.yml vendored
View File

@@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/
- files~=^cmake/
- files=CMakeLists.txt
- files~=^Dockerfile
- files~=^docker/Dockerfile
- files~=^requirements.*\.txt
- files=setup.py
actions:
@@ -27,6 +27,22 @@ pull_request_rules:
add:
- ci/build
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/.*deepseek.*\.py
- files~=^vllm/model_executor/models/.*deepseek.*\.py
- files~=^vllm/reasoning/.*deepseek.*\.py
- files~=^vllm/transformers_utils/.*deepseek.*\.py
- title~=(?i)DeepSeek
actions:
label:
add:
- deepseek
- name: label-frontend
description: Automatically apply frontend label
conditions:
@@ -36,6 +52,21 @@ pull_request_rules:
add:
- frontend
- name: label-llama
description: Automatically apply llama label
conditions:
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
- title~=(?i)llama
actions:
label:
add:
- llama
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
@@ -43,23 +74,98 @@ pull_request_rules:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
- files~=^tests/models/multimodal/
- files~=^tests/models/*/audio_language/
- files~=^tests/models/*/vision_language/
- files=tests/models/test_vision.py
actions:
label:
add:
- multi-modality
- name: label-new-model
description: Automatically apply new-model label
conditions:
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
actions:
label:
add:
- new-model
- name: label-performance
description: Automatically apply performance label
conditions:
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
- files~=^tests/benchmarks/
- files~=^\.buildkite/nightly-benchmarks/
actions:
label:
add:
- performance
- name: label-qwen
description: Automatically apply qwen label
conditions:
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
actions:
label:
add:
- qwen
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- title~=(?i)gpt[-_]?oss
actions:
label:
add:
- gpt-oss
- name: label-rocm
description: Automatically apply rocm label
conditions:
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
- files~=^requirements/rocm.*\.txt
- files~=^vllm/attention/backends/rocm.*\.py
- files~=^vllm/attention/ops/rocm.*\.py
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
- files~=^tests/kernels/.*_rocm.*\.py
- files=vllm/platforms/rocm.py
- title~=(?i)AMD
- title~=(?i)ROCm
actions:
label:
add:
- rocm
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- or:
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files=benchmarks/benchmark_serving_guided.py
- files=benchmarks/benchmark_guided.py
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
add:
@@ -69,9 +175,12 @@ pull_request_rules:
description: Automatically apply speculative-decoding label
conditions:
- or:
- files~=^vllm/spec_decode/
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
- files~=^tests/spec_decode/
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/
- files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
- files~=^vllm/model_executor/models/.*eagle.*\.py
- files=vllm/model_executor/models/mlp_speculator.py
- files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
actions:
label:
add:
@@ -88,6 +197,56 @@ pull_request_rules:
add:
- v1
- name: label-tpu
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- or:
- files~=tpu.py
- files~=_tpu
- files~=tpu_
- files~=/tpu/
- files~=pallas
actions:
label:
add:
- tpu
- name: label-tpu-remove
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- and:
- -files~=tpu.py
- -files~=_tpu
- -files~=tpu_
- -files~=/tpu/
- -files~=pallas
actions:
label:
remove:
- tpu
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict
@@ -103,6 +262,31 @@ pull_request_rules:
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
- name: assign reviewer for tensorizer changes
conditions:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/tensorizer_loader/
actions:
assign:
users:
- "sangstar"
- name: assign reviewer for modelopt changes
conditions:
- or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
- files~=^tests/models/quantization/test_modelopt\.py$
- files~=^tests/quantization/test_modelopt\.py$
- files~=^tests/models/quantization/test_nvfp4\.py$
- files~=^docs/features/quantization/modelopt\.md$
actions:
assign:
users:
- "Edwardf0t1"
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict

21
.github/scale-config.yml vendored Normal file
View File

@@ -0,0 +1,21 @@
# scale-config.yml:
# Powers what instance types are available for GHA auto-scaled
# runners. Runners listed here will be available as self hosted
# runners, configuration is directly pulled from the main branch.
# runner_types:
# runner_label:
# instance_type: m4.large
# os: linux
# # min_available defaults to the global cfg in the ALI Terraform
# min_available: undefined
# # when max_available value is not defined, no max runners is enforced
# max_available: undefined
# disk_size: 50
# is_ephemeral: true
runner_types:
linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: true
os: linux

View File

@@ -15,18 +15,18 @@ NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
sed -i '/FIX #xxxx.*$/d' "${NEW}"
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
sed -i '/<!--.*-->$/d' "${NEW}"
# Remove "FILL IN THE PR DESCRIPTION HERE"
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
# Remove all lines after and including the line that contains "**BEFORE SUBMITTING, PLEASE READ ...**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF
import re
import regex as re
with open("${NEW}", "r") as file:
content = file.read()

View File

@@ -1,4 +1,6 @@
name: Add label on auto-merge enabled
permissions:
pull-requests: write
on:
pull_request_target:
types:
@@ -8,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
github.rest.issues.addLabels({

27
.github/workflows/bc-lint.yml vendored Normal file
View File

@@ -0,0 +1,27 @@
name: BC Lint
on:
pull_request:
types:
- opened
- synchronize
- reopened
jobs:
bc_lint:
if: github.repository_owner == 'vllm-project'
runs-on: ubuntu-latest
steps:
- name: Run BC Lint Action
uses: pytorch/test-infra/.github/actions/bc-lint@main
with:
repo: ${{ github.event.pull_request.head.repo.full_name }}
base_sha: ${{ github.event.pull_request.base.sha }}
head_sha: ${{ github.event.pull_request.head.sha }}
suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
config_dir: .github
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true

View File

@@ -16,11 +16,16 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
- name: Install Python dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install regex
- name: Update PR description
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"

309
.github/workflows/issue_autolabel.yml vendored Normal file
View File

@@ -0,0 +1,309 @@
name: Label issues based on keywords
on:
issues:
types: [opened, edited, reopened]
permissions:
issues: write # needed so the workflow can add labels
contents: read
concurrency:
group: issue-labeler-${{ github.event.issue.number }}
cancel-in-progress: true
jobs:
add-labels:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Label configuration: one entry per label, each listing the terms that trigger it.
// Every term carries a `searchIn` of "title", "body", or "both".
const labelConfig = {
  rocm: {
    // Whole-word terms (wrapped in word boundaries when matched)
    keywords: [
      { term: "composable kernel", searchIn: "both" },
      { term: "rccl", searchIn: "body" }, // body only
      { term: "migraphx", searchIn: "title" }, // title only
      { term: "hipgraph", searchIn: "both" },
      { term: "ROCm System Management Interface", searchIn: "body" },
    ],
    // Partial-match terms (may appear anywhere inside the text)
    substrings: [
      { term: "VLLM_ROCM_", searchIn: "both" },
      { term: "aiter", searchIn: "title" },
      { term: "rocm", searchIn: "title" },
      { term: "amd", searchIn: "title" },
      { term: "hip-", searchIn: "both" },
      { term: "gfx", searchIn: "both" },
      { term: "cdna", searchIn: "both" },
      { term: "rdna", searchIn: "both" },
      { term: "torch_hip", searchIn: "body" }, // body only
      { term: "_hip", searchIn: "both" },
      { term: "hip_", searchIn: "both" },
      // ROCm tools and libraries
      { term: "hipify", searchIn: "both" },
    ],
    // Raw regular expressions for anything the two lists above cannot express
    regexPatterns: [
      {
        pattern: "\\bmi\\d{3}[a-z]*\\b",
        description: "AMD GPU names (mi + 3 digits + optional letters)",
        flags: "gi",
        searchIn: "both", // "title", "body", or "both"
      },
    ],
  },
};
// Build a global, case-insensitive RegExp that matches `term` literally.
//   type 'keyword'   -> the term must stand alone as a whole word (\b ... \b)
//   type 'substring' -> the term may appear anywhere in the text
// Any other type throws an Error.
function createSearchRegex(term, type) {
  // Neutralise regex metacharacters so the term is never interpreted as a pattern
  const literal = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  if (type === 'keyword') {
    return new RegExp(`\\b${literal}\\b`, "gi");
  }
  if (type === 'substring') {
    return new RegExp(literal, "gi");
  }
  throw new Error(`Unknown search type: ${type}`);
}
// Helper function to find matching terms in text with line information.
//
// Parameters:
//   text           - string to scan; it is split on '\n' and searched line by line
//   searchTerms    - array of plain strings or config objects, either
//                    { term, searchIn } or { pattern, flags, description, searchIn }
//   searchType     - 'regex' compiles `pattern` directly; any other value is
//                    passed to createSearchRegex together with `term`
//   searchLocation - where `text` came from ('title' or 'body'); entries whose
//                    searchIn is neither 'both' nor this value are skipped
//
// Returns an array with one entry per term that matched at least once:
//   { term, searchType, searchLocation, searchIn, pattern, matches, count }
// where `matches` holds one record per individual hit (matched text, 1-based
// line number, trimmed line content and a context snippet).
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
  const matches = [];
  const lines = text.split('\n');
  for (const termConfig of searchTerms) {
    let regex;
    let term, searchIn, pattern, description, flags;
    // Handle different input formats (string or object)
    if (typeof termConfig === 'string') {
      term = termConfig;
      searchIn = 'both'; // default
    } else {
      term = termConfig.term;
      searchIn = termConfig.searchIn || 'both';
      pattern = termConfig.pattern;
      description = termConfig.description;
      flags = termConfig.flags;
    }
    // Skip if this term shouldn't be searched in the current location
    if (searchIn !== 'both' && searchIn !== searchLocation) {
      continue;
    }
    // Create appropriate regex
    if (searchType === 'regex') {
      // Always search globally: without "g" only the first hit per line would
      // be reported and capture groups would be mistaken for extra matches.
      const regexFlags = flags || "gi";
      regex = new RegExp(pattern, regexFlags.includes("g") ? regexFlags : `${regexFlags}g`);
    } else {
      regex = createSearchRegex(term, searchType);
    }
    const termMatches = [];
    // Check each line for matches
    lines.forEach((line, lineIndex) => {
      for (const found of line.matchAll(regex)) {
        const match = found[0];
        // Use the real position of this hit so that repeated matches on a long
        // line each get their own surrounding context.
        const start = found.index;
        termMatches.push({
          match: match,
          lineNumber: lineIndex + 1,
          lineContent: line.trim(),
          searchType: searchType,
          searchLocation: searchLocation,
          originalTerm: term || pattern,
          description: description,
          // Show context around the match in the line (30 chars either side
          // for long lines, the whole trimmed line otherwise)
          context: line.length > 100 ?
            line.substring(Math.max(0, start - 30), start + match.length + 30) + '...'
            : line.trim()
        });
      }
    });
    if (termMatches.length > 0) {
      matches.push({
        term: term || (description || pattern),
        searchType: searchType,
        searchLocation: searchLocation,
        searchIn: searchIn,
        pattern: pattern,
        matches: termMatches,
        count: termMatches.length
      });
    }
  }
  return matches;
}
// Helper function to check if label should be added
/**
 * Decide whether a label applies to the issue in the current event payload and attach it if so.
 *
 * The issue title and body are each scanned with the label's keyword, substring and regex
 * term lists. Any hit at all qualifies the label, which is then added through the GitHub API
 * unless the issue already carries it. Every step is reported through core.notice.
 *
 * @param {string} labelName - Label to evaluate.
 * @param {Object} config - Term lists for this label: optional `keywords`, `substrings` and
 *   `regexPatterns` arrays (each treated as empty when absent).
 * @returns {Promise<boolean>} true only when the label was newly added; false when nothing
 *   matched or the label was already on the issue.
 */
async function processLabel(labelName, config) {
  const { issue } = context.payload;
  const body = issue.body || "";
  const title = issue.title || "";
  core.notice(`Processing label: ${labelName}`);
  core.notice(`Issue Title: "${title}"`);
  core.notice(`Issue Body length: ${body.length} characters`);

  const keywords = config.keywords || [];
  const substrings = config.substrings || [];
  const regexPatterns = config.regexPatterns || [];
  core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);

  // Runs the three term lists over one piece of text, in keyword, substring, regex order
  const scan = (text, location) => [
    ...findMatchingTermsWithLines(text, keywords, 'keyword', location),
    ...findMatchingTermsWithLines(text, substrings, 'substring', location),
    ...findMatchingTermsWithLines(text, regexPatterns, 'regex', location),
  ];

  const allMatches = [];
  // Whitespace-only title or body is not searched at all
  if (title.trim()) {
    core.notice(`Searching in title: "${title}"`);
    allMatches.push(...scan(title, 'title'));
  }
  if (body.trim()) {
    core.notice(`Searching in body (${body.length} characters)`);
    allMatches.push(...scan(body, 'body'));
  }

  const shouldAddLabel = allMatches.length > 0;
  let reason = '';

  if (shouldAddLabel) {
    core.notice(`Found ${allMatches.length} matching term(s):`);
    for (const termMatch of allMatches) {
      const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
      const searchInText = termMatch.searchIn;
      const headline = termMatch.searchType === 'regex'
        ? ` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`
        : ` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`;
      core.notice(headline);

      // One detail group per individual hit
      for (const [index, match] of termMatch.matches.entries()) {
        core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
        if (match.description) {
          core.notice(` Description: ${match.description}`);
        }
        core.notice(` Context: ${match.context}`);
        // The full line is only worth printing when the context is a shortened view of it
        if (match.lineContent !== match.context) {
          core.notice(` Full line: ${match.lineContent}`);
        }
      }
    }

    // Adds up the per-term hit counts of the summaries accepted by `accept`
    const countWhere = (accept) => allMatches.filter(accept).reduce((sum, t) => sum + t.count, 0);
    const totalMatches = countWhere(() => true);
    const titleMatches = countWhere((t) => t.searchLocation === 'title');
    const bodyMatches = countWhere((t) => t.searchLocation === 'body');
    const keywordMatches = countWhere((t) => t.searchType === 'keyword');
    const substringMatches = countWhere((t) => t.searchType === 'substring');
    const regexMatches = countWhere((t) => t.searchType === 'regex');
    reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
  }

  core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
  core.notice(`Reason: ${reason || 'No matching terms found'}`);

  if (!shouldAddLabel) {
    core.notice(`No matching terms found for label "${labelName}".`);
    return false;
  }

  // Labels already on the issue according to the event payload
  const existingLabels = issue.labels.map(({ name }) => name);
  if (existingLabels.includes(labelName)) {
    core.notice(`Label "${labelName}" already present.`);
    return false;
  }

  await github.rest.issues.addLabels({
    owner: context.repo.owner,
    repo: context.repo.repo,
    issue_number: context.issue.number,
    labels: [labelName],
  });
  core.notice(`Label "${labelName}" added. ${reason}`);
  return true;
}
// Process all configured labels
// Start one processLabel call per configured label; they run concurrently and are awaited together
const processLabels = [];
for (const [labelName, config] of Object.entries(labelConfig)) {
  processLabels.push(processLabel(labelName, config));
}
const labelsAdded = await Promise.all(processLabels);
// Each result is true only when that label was newly added, so counting the true entries gives the total
const numLabelsAdded = labelsAdded.filter(Boolean).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);

Some files were not shown because too many files have changed in this diff Show More