Compare commits

..

372 Commits

Author SHA1 Message Date
Simon Mo
fd47e57f4b [Docs] Remove PDF build from Readthedocs (#9347)
2024-10-14 11:57:47 -07:00
Daniele
203ab8f80f [CI/Build] setuptools-scm fixes (#8900) 2024-10-14 11:34:47 -07:00
Kunshang Ji
4141608c6a [Hardware][intel GPU] add async output process for xpu (#8897) 2024-10-14 12:23:33 -06:00
Reza Salehi
dfe43a2071 [Model] Molmo vLLM Integration (#9016)
Co-authored-by: sanghol <sanghol@allenai.org>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-10-14 07:56:24 -07:00
Tyler Michael Smith
16b24e7dcd [Bugfix] Bandaid fix for speculative decoding tests (#9327) 2024-10-13 23:02:11 +00:00
Lily Liu
f519902c52 [CI] Fix merge conflict (#9317) 2024-10-13 06:41:23 +00:00
Jee Jee Li
250e26a63e [Bugfix]Fix MiniCPM's LoRA bug (#9286) 2024-10-12 09:36:47 -07:00
Yunmeng
2b184ddd4f [Misc][Installation] Improve source installation script and doc (#9309)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-10-12 09:36:40 -07:00
Xiang Xu
00298e092c [Bugfix] Fix bug of xformer prefill for encoder-decoder (#9026) 2024-10-12 15:00:43 +08:00
Lily Liu
89feb4c84d [SpecDec] Remove Batch Expansion (2/3) (#9298) 2024-10-12 05:13:37 +00:00
Maximilien de Bayser
ec10cb8511 [BugFix] Fix tool call finish reason in streaming case (#9209)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2024-10-11 18:24:26 -07:00
Prashant Gupta
d11b46f3a5 [bugfix] fix f-string for error (#9295)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
2024-10-11 17:03:48 -07:00
Allen Wang
c6cf9295e1 [Bugfix] Sets is_first_step_output for TPUModelRunner (#9202) 2024-10-11 13:28:10 -07:00
Lucas Wilkinson
de9fb4bef8 [Bugfix][CI/Build] Fix docker build where CUDA archs < 7.0 are being detected (#9254) 2024-10-11 15:57:39 -04:00
Wallas Henrique
8baf85e4e9 [Doc] Compatibility matrix for mutual exclusive features (#8512)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
2024-10-11 11:18:50 -07:00
homeffjy
1a1823871d [Doc] Remove outdated comment to avoid misunderstanding (#9287) 2024-10-11 18:02:03 +00:00
sixgod
6cf1167c1a [Model] Add GLM-4v support and meet vllm==0.6.2 (#9242) 2024-10-11 17:36:13 +00:00
Burkhard Ringlein
f710090d8e [Kernel] adding fused moe kernel config for L40S TP4 (#9245) 2024-10-11 08:54:22 -07:00
Tyler Michael Smith
7342a7d7f8 [Model] Support Mamba (#6484) 2024-10-11 15:40:06 +00:00
Sebastian Schoennenbeck
df3dcdf49d [Bugfix] Fix priority in multiprocessing engine (#9277) 2024-10-11 15:35:35 +00:00
Jee Jee Li
36ea79079b [Misc][LoRA] Support loading LoRA weights for target_modules in reg format (#9275) 2024-10-11 12:31:21 +00:00
Cyrus Leung
e808156f30 [Misc] Collect model support info in a single process per model (#9233) 2024-10-11 11:08:11 +00:00
youkaichao
cbc2ef5529 [misc] hide best_of from engine (#9261)
Co-authored-by: Brendan Wong <bjwpokemon@gmail.com>
2024-10-10 21:30:44 -07:00
Andy Dai
94bf9ae4e9 [Misc] Fix sampling from sonnet for long context case (#9235) 2024-10-11 00:33:16 +00:00
omrishiv
f990bab2a4 [Doc][Neuron] add note to neuron documentation about resolving triton issue (#9257)
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
2024-10-10 23:36:32 +00:00
youkaichao
e00c094f15 [torch.compile] generic decorators (#9258) 2024-10-10 15:54:23 -07:00
Kevin H. Luu
a78c6ba7c8 [ci/build] Add placeholder command for custom models test (#9262) 2024-10-10 15:45:09 -07:00
dependabot[bot]
fb870fd491 Bump actions/setup-python from 3 to 5 (#9195)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-10-10 13:30:46 -07:00
dependabot[bot]
270953bafb Bump actions/checkout from 3 to 4 (#9196)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-10-10 13:30:35 -07:00
dependabot[bot]
9cc811c4ff Bump actions/github-script from 6 to 7 (#9197)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-10-10 13:30:24 -07:00
youkaichao
e4d652ea3e [torch.compile] integration with compilation control (#9058) 2024-10-10 12:39:36 -07:00
Simon Mo
78c0b4166c Suggest codeowners for the core components (#9210) 2024-10-10 12:29:24 -07:00
jordanyono
21efb603f5 [CI/Build] Make the Dockerfile.cpu file's PIP_EXTRA_INDEX_URL Configurable as a Build Argument (#9252) 2024-10-10 18:18:18 +00:00
Rafael Vasquez
055f3270d4 [Doc] Improve debugging documentation (#9204)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
2024-10-10 10:48:51 -07:00
Lucas Wilkinson
18511aeda6 [Bugfix] Fix Machete unittests failing with NotImplementedError (#9218) 2024-10-10 17:39:56 +00:00
Ilya Lavrenov
83ea5c72b9 [OpenVINO] Use torch 2.4.0 and newer optimum version (#9121)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-10-10 11:18:58 -06:00
whyiug
04de9057ab [Model] support input image embedding for minicpmv (#9237) 2024-10-10 15:00:47 +00:00
Isotr0py
07c11cf4d4 [Bugfix] Fix lm_head weights tying with lora for llama (#9227) 2024-10-10 21:11:56 +08:00
sroy745
f3a507f1d3 [Core] Add an environment variable which needs to be set explicitly to allow BlockSpaceManagerV1 (#9149) 2024-10-10 14:17:17 +08:00
Lucas Wilkinson
a64e7b9407 [Bugfix] Machete garbage results for some models (large K dim) (#9212) 2024-10-10 14:16:17 +08:00
Michael Goin
ce00231a8b [Bugfix] Fix Weight Loading Multiple GPU Test - Large Models (#9213) 2024-10-10 14:15:40 +08:00
youkaichao
de895f1697 [misc] improve model support check in another process (#9208) 2024-10-09 21:58:27 -07:00
Russell Bryant
cf25b93bdd [Core] Fix invalid args to _process_request (#9201)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-10-10 12:10:09 +08:00
Michael Goin
d5fbb8706d [CI/Build] Update Dockerfile install+deploy image to ubuntu 22.04 (#9130)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-10-09 12:51:47 -06:00
Russell Bryant
cdca8994bd [CI/Build] mypy: check vllm/entrypoints (#9194)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-10-09 17:15:28 +00:00
Li, Jiang
ca77dd7a44 [Hardware][CPU] Support AWQ for CPU backend (#7515) 2024-10-09 10:28:08 -06:00
Ewout ter Hoeven
7dea289066 Add Dependabot configuration for GitHub Actions updates (#1217)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-10-09 08:16:26 -07:00
Cyrus Leung
cfaa6008e6 [Bugfix] Access get_vocab instead of vocab in tool parsers (#9188) 2024-10-09 08:59:57 -06:00
Ahmad Fahadh Ilyas
21906a6f50 [Bugfix] Fix lora loading for Compressed Tensors in #9120 (#9179) 2024-10-09 12:10:44 +00:00
Jiangtao Hu
dc4aea677a [Doc] Fix VLM prompt placeholder sample bug (#9170) 2024-10-09 08:59:42 +00:00
youkaichao
c8627cd41b [ci][test] use load dummy for testing (#9165) 2024-10-09 00:38:40 -07:00
Cyrus Leung
8bfaa4e31e [Bugfix] fix composite weight loading and EAGLE weight loading (#9160) 2024-10-09 00:36:55 -07:00
AlpinDale
0b5b5d767e [Frontend] Log the maximum supported concurrency (#8831) 2024-10-09 00:03:14 -07:00
Hui Liu
cdc72e3c80 [Model] Remap FP8 kv_scale in CommandR and DBRX (#9174) 2024-10-09 06:43:06 +00:00
Joe Rowell
7627172bf4 [Bugfix][Doc] Report neuron error in output (#9159) 2024-10-08 22:43:34 -07:00
Travis Johnson
480b7f40cf [Misc] Improve validation errors around best_of and n (#9167)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-10-09 04:54:48 +00:00
Yuan Tang
acce7630c1 Update link to KServe deployment guide (#9173) 2024-10-09 03:58:49 +00:00
Yuan Tang
ffc4b27ea8 Add classifiers in setup.py (#9171) 2024-10-08 19:30:48 -07:00
chenqianfzh
2f4117c38e support bitsandbytes quantization with more models (#9148) 2024-10-08 19:52:19 -06:00
Michael Goin
9ba0bd6aa6 Add lm-eval directly to requirements-test.txt (#9161) 2024-10-08 18:22:31 -07:00
Russell Bryant
2a131965a8 mypy: check additional directories (#9162)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-10-08 22:08:22 +00:00
bnellnm
bd37b9fbe2 [Bugfix] Try to handle older versions of pytorch (#9086) 2024-10-08 14:28:12 -07:00
Rafael Vasquez
de24046fcd [Doc] Improve contributing and installation documentation (#9132)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
2024-10-08 20:22:08 +00:00
Sayak Paul
1874c6a1b0 [Doc] Update vlm.rst to include an example on videos (#9155)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-10-08 18:12:29 +00:00
Daniele
9a94ca4a5d [Bugfix] fix OpenAI API server startup with --disable-frontend-multiprocessing (#8537) 2024-10-08 09:38:40 -07:00
Peter Pan
cfba685bd4 [CI/Build] Add examples folder into Docker image so that we can leverage the templates*.jinja when serving models (#8758)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2024-10-08 09:37:34 -07:00
Alex Brooks
069d3bd8d0 [Frontend] Add Early Validation For Chat Template / Tool Call Parser (#9151)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2024-10-08 14:31:26 +00:00
Alex Brooks
a3691b6b5e [Core][Frontend] Add Support for Inference Time mm_processor_kwargs (#9131)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2024-10-08 14:12:56 +00:00
Brendan Wong
8c746226c9 [Frontend] API support for beam search for MQLLMEngine (#9117) 2024-10-08 05:51:43 +00:00
youkaichao
e1faa2a598 [misc] improve ux on readme (#9147) 2024-10-07 22:26:25 -07:00
Kunshang Ji
80b57f00d5 [Intel GPU] Fix xpu decode input (#9145) 2024-10-08 03:51:14 +00:00
youkaichao
04c12f8157 [misc] update utils to support comparing multiple settings (#9140) 2024-10-08 02:51:49 +00:00
Simon Mo
8eeb857084 Add Slack to README (#9137) 2024-10-07 17:06:21 -07:00
youkaichao
fa45513a51 [misc] fix comment and variable name (#9139) 2024-10-07 16:07:05 -07:00
Kuntai Du
c0d9a98d0c [Doc] Include performance benchmark in README (#9135) 2024-10-07 15:04:06 -07:00
Russell Bryant
e0dbdb013d [CI/Build] Add linting for github actions workflows (#7876)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-10-07 21:18:10 +00:00
TimWang
93cf74a8a7 [Doc]: Add deploying_with_k8s guide (#8451) 2024-10-07 13:31:45 -07:00
Cyrus Leung
151ef4efd2 [Model] Support NVLM-D and fix QK Norm in InternViT (#9045)
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2024-10-07 11:55:12 +00:00
Isotr0py
f19da64871 [Core] Refactor GGUF parameters packing and forwarding (#8859) 2024-10-07 10:01:46 +00:00
Isotr0py
4f95ffee6f [Hardware][CPU] Cross-attention and Encoder-Decoder models support on CPU backend (#9089) 2024-10-07 06:50:35 +00:00
Cyrus Leung
8c6de96ea1 [Model] Explicit interface for vLLM models and support OOT embedding models (#9108) 2024-10-07 06:10:35 +00:00
youkaichao
18b296fdb2 [core] remove beam search from the core (#9105) 2024-10-07 05:47:04 +00:00
sroy745
c8f26bb636 [BugFix][Core] Fix BlockManagerV2 when Encoder Input is None (#9103) 2024-10-07 03:52:42 +00:00
Isotr0py
487678d046 [Bugfix][Hardware][CPU] Fix CPU model input for decode (#9044) 2024-10-06 19:14:27 -07:00
Varun Sundar Rabindranath
cb3b2b9ba4 [Bugfix] Fix incorrect updates to num_computed_tokens in multi-step scheduling (#9038)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-10-06 12:48:11 -07:00
Yanyi Liu
fdf59d30ea [Bugfix] fix tool_parser error handling when serve a model not support it (#8709) 2024-10-06 12:51:08 +00:00
Cyrus Leung
b22b798471 [Model] PP support for embedding models and update docs (#9090)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-10-06 16:35:27 +08:00
Cyrus Leung
f22619fe96 [Misc] Remove user-facing error for removed VLM args (#9104) 2024-10-06 01:33:52 -07:00
Brendan Wong
168cab6bbf [Frontend] API support for beam search (#9087)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-10-05 23:39:03 -07:00
TJian
23fea8714a [Bugfix] Fix try-catch conditions to import correct Flash Attention Backend in Draft Model (#9101) 2024-10-06 13:00:04 +08:00
youkaichao
f4dd830e09 [core] use forward context for flash infer (#9097) 2024-10-05 19:37:31 -07:00
Andy Dai
5df1834895 [Bugfix] Fix order of arguments matters in config.yaml (#8960) 2024-10-05 17:35:11 +00:00
Chen Zhang
cfadb9c687 [Bugfix] Deprecate registration of custom configs to huggingface (#9083) 2024-10-05 21:56:40 +08:00
Xin Yang
15986f598c [Model] Support Gemma2 embedding model (#9004) 2024-10-05 06:57:05 +00:00
hhzhang16
53b3a33027 [Bugfix] Fixes Phi3v & Ultravox Multimodal EmbeddingInputs (#8979) 2024-10-04 22:05:37 -07:00
Chen Zhang
dac914b0d6 [Bugfix] use blockmanagerv1 for encoder-decoder (#9084)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-10-05 04:45:38 +00:00
Zhuohan Li
a95354a36e [Doc] Update README.md with Ray summit slides (#9088) 2024-10-05 02:54:45 +00:00
youkaichao
663874e048 [torch.compile] improve allreduce registration (#9061) 2024-10-04 16:43:50 -07:00
Chongming Ni
cc90419e89 [Hardware][Neuron] Add on-device sampling support for Neuron (#8746)
Co-authored-by: Ashraf Mahgoub <ashymahg@amazon.com>
2024-10-04 16:42:20 -07:00
Cody Yu
27302dd584 [Misc] Fix CI lint (#9085) 2024-10-04 16:07:54 -07:00
Andy Dai
0cc566ca8f [Misc] Add random seed for prefix cache benchmark (#9081) 2024-10-04 21:58:57 +00:00
Andy Dai
05c531be47 [Misc] Improved prefix cache example (#9077) 2024-10-04 21:38:42 +00:00
Kuntai Du
fbb74420e7 [CI] Update performance benchmark: upgrade trt-llm to r24.07, and add SGLang (#7412) 2024-10-04 14:01:44 -07:00
ElizaWszola
05d686432f [Kernel] Zero point support in fused MarlinMoE kernel + AWQ Fused MoE (#8973)
Co-authored-by: Dipika <dipikasikka1@gmail.com>
Co-authored-by: Dipika Sikka <ds3822@columbia.edu>
2024-10-04 12:34:44 -06:00
Flávia Béo
0dcc8cbe5a Adds truncate_prompt_tokens param for embeddings creation (#8999)
Signed-off-by: Flavia Beo <flavia.beo@ibm.com>
2024-10-04 18:31:40 +00:00
Roger Wang
26aa325f4f [Core][VLM] Test registration for OOT multimodal models (#8717)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-10-04 10:38:25 -07:00
Varad Ahirwadkar
e5dc713c23 [Hardware][PowerPC] Make oneDNN dependency optional for Power (#9039)
Signed-off-by: Varad Ahirwadkar <varad.ahirwadkar1@ibm.com>
2024-10-04 17:24:42 +00:00
Simon Mo
36eecfbddb Remove AMD Ray Summit Banner (#9075) 2024-10-04 10:17:16 -07:00
Prashant Gupta
9ade8bbc8d [Model] add a bunch of supported lora modules for mixtral (#9008)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
2024-10-04 16:24:40 +00:00
Lucas Wilkinson
22482e495e [Bugfix] Flash attention arches not getting set properly (#9062) 2024-10-04 09:43:15 -06:00
whyiug
3d826d2c52 [Bugfix] Reshape the dimensions of the input image embeddings in Qwen2VL (#9071) 2024-10-04 14:34:58 +00:00
Cyrus Leung
0e36fd4909 [Misc] Move registry to its own file (#9064) 2024-10-04 10:01:37 +00:00
Murali Andoorveedu
0f6d7a9a34 [Models] Add remaining model PP support (#7168)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-10-04 10:56:58 +08:00
Michael Goin
303d44790a [Misc] Enable multi-step output streaming by default (#9047) 2024-10-03 22:55:42 -04:00
Lucas Wilkinson
aeb37c2a72 [CI/Build] Per file CUDA Archs (improve wheel size and dev build times) (#8845) 2024-10-03 22:55:25 -04:00
代君
3dbb215b38 [Frontend][Feature] support tool calling for internlm/internlm2_5-7b-chat model (#8405) 2024-10-04 10:36:39 +08:00
Domen Vreš
2838d6b38e [Bugfix] Weight loading fix for OPT model (#9042)
Co-authored-by: dvres <dvres@fri.uni-lj.si>
2024-10-03 19:53:29 -04:00
sroy745
91add85ec4 Fix failing spec decode test (#9054) 2024-10-03 23:07:29 +00:00
youkaichao
9aaf14c62e [misc] add forward context for attention (#9029) 2024-10-03 12:09:42 -07:00
xendo
63e39937f9 [Frontend] [Neuron] Parse literals out of override-neuron-config (#8959)
Co-authored-by: Jerzy Zagorski <jzagorsk@amazon.com>
2024-10-03 18:02:07 +00:00
sroy745
f5d72b2fc6 [Core] Make BlockSpaceManagerV2 the default BlockManager to use. (#8678) 2024-10-03 09:44:21 -07:00
Guillaume Calmettes
83caf35e08 [BugFix] Enforce Mistral ToolCall id constraint when using the Mistral tool call parser (#9020) 2024-10-03 16:44:52 +08:00
Divakar Verma
01843c89b8 [Misc] log when using default MoE config (#8971) 2024-10-03 04:31:07 +00:00
Travis Johnson
19a4dd0990 [Bugfix] example template should not add parallel_tool_prompt if tools is none (#9007) 2024-10-03 03:04:17 +00:00
Nick Hill
18c2e30c57 [Doc] Update Granite model docs (#9025) 2024-10-03 02:42:24 +00:00
Shawn Tan
19f0d25796 [Model] Adding Granite MoE. (#8206)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-10-03 09:33:57 +08:00
Sergey Shlyapnikov
f58d4fccc9 [OpenVINO] Enable GPU support for OpenVINO vLLM backend (#8192) 2024-10-02 17:50:01 -04:00
Varun Sundar Rabindranath
afb050b29d [Core] CUDA Graphs for Multi-Step + Chunked-Prefill (#8645)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-10-02 19:44:39 +00:00
Alex Brooks
7f60520deb [Misc] Update Default Image Mapper Error Log (#8977)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-10-02 11:44:38 +00:00
afeldman-nm
563649aafe [Core] Combined support for multi-step scheduling, chunked prefill & prefix caching (#8804)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Andrew Feldman <afeld2012@gmail.com>
2024-10-02 07:52:20 +00:00
Lily Liu
1570203864 [Spec Decode] (1/2) Remove batch expansion (#8839) 2024-10-01 16:04:42 -07:00
vlsav
22f5851b80 Update benchmark_serving.py to read and write json-datasets, results in UTF8, for better compatibility with Windows (#8997) 2024-10-01 11:07:06 -07:00
Cyrus Leung
4f341bd4bf [Doc] Update list of supported models (#8987) 2024-10-02 00:35:39 +08:00
Sebastian Schoennenbeck
35bd215168 [Core] [Frontend] Priority scheduling for embeddings and in the OpenAI-API (#8965) 2024-10-01 09:58:06 +00:00
Alex Brooks
1fe0a4264a [Bugfix] Fix Token IDs Reference for MiniCPM-V When Images are Provided With No Placeholders (#8991)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2024-10-01 09:52:44 +00:00
Isotr0py
bc4eb65b54 [Bugfix] Fix Fuyu tensor parallel inference (#8986) 2024-10-01 17:51:41 +08:00
Divakar Verma
82f3937e59 [Misc] add process_weights_after_loading for DummyLoader (#8969) 2024-10-01 03:46:41 +00:00
youkaichao
7da2487591 [torch.compile] fix tensor alias (#8982) 2024-10-01 03:40:48 +00:00
Kevin H. Luu
aaccca2b4d [CI/Build] Fix machete generated kernel files ordering (#8976)
Signed-off-by: kevin <kevin@anyscale.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-10-01 03:33:12 +00:00
Joe Runde
062c89e7c9 [Frontend][Core] Move guided decoding params into sampling params (#8252)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-10-01 09:34:25 +08:00
Lily Liu
bce324487a [CI][SpecDecode] Fix spec decode tests, use flash attention backend for spec decode CI tests. (#8975) 2024-10-01 00:51:40 +00:00
Kevin H. Luu
1425a1bcf9 [ci] Add CODEOWNERS for test directories (#8795)
Signed-off-by: kevin <kevin@anyscale.com>
2024-10-01 00:47:08 +00:00
Jee Jee Li
1cabfcefb6 [Misc] Adjust max_position_embeddings for LoRA compatibility (#8957) 2024-09-30 12:57:39 +00:00
Sebastian Schoennenbeck
be76e5aabf [Core] Make scheduling policy settable via EngineArgs (#8956) 2024-09-30 12:28:44 +00:00
Isotr0py
2ae25f79cf [Model] Expose InternVL2 max_dynamic_patch as a mm_processor_kwarg (#8946) 2024-09-30 13:01:20 +08:00
Jee Jee Li
8e60afa15e [Model][LoRA]LoRA support added for MiniCPMV2.6 (#8943)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-30 04:31:55 +00:00
Roger Wang
b6d7392579 [Misc][CI/Build] Include cv2 via mistral_common[opencv] (#8951) 2024-09-30 04:28:26 +00:00
whyiug
e01ab595d8 [Model] support input embeddings for qwen2vl (#8856) 2024-09-30 03:16:10 +00:00
Mor Zusman
f13a07b1f8 [Kernel][Model] Varlen prefill + Prefill chunking support for mamba kernels and Jamba model (#8533) 2024-09-29 17:35:58 -04:00
danieljannai21
6c9ba48fde [Frontend] Added support for HF's new continue_final_message parameter (#8942) 2024-09-29 17:59:47 +00:00
juncheoll
1fb9c1b0bf [Misc] Fix typo in BlockSpaceManagerV1 (#8944) 2024-09-29 15:05:54 +00:00
Nick Hill
31f46a0d35 [BugFix] Fix seeded random sampling with encoder-decoder models (#8870)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-09-29 09:43:14 +00:00
Jee Jee Li
3d49776bbb [Model][LoRA]LoRA support added for MiniCPMV2.5 (#7199) 2024-09-29 06:59:45 +00:00
Zilin Zhu
bc2ef1f77c [Model] Support Qwen2.5-Math-RM-72B (#8896) 2024-09-28 21:19:39 -07:00
Tyler Michael Smith
2e7fe7e79f [Build/CI] Set FETCHCONTENT_BASE_DIR to one location for better caching (#8930) 2024-09-29 03:13:01 +00:00
Cyrus Leung
26a68d5d7e [CI/Build] Add test decorator for minimum GPU memory (#8925) 2024-09-29 02:50:51 +00:00
ElizaWszola
d081da0064 [Bugfix] Fix Marlin MoE act order when is_k_full == False (#8741)
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-09-28 18:19:40 -07:00
sroy745
5bf8789b2a [Bugfix] Block manager v2 with preemption and lookahead slots (#8824) 2024-09-29 09:17:45 +08:00
Russell Bryant
d1537039ce [Core] Improve choice of Python multiprocessing method (#8823)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: youkaichao <youkaichao@126.com>
2024-09-29 09:17:07 +08:00
youkaichao
cc276443b5 [doc] organize installation doc and expose per-commit docker (#8931) 2024-09-28 17:48:41 -07:00
Chen Zhang
e585b583a9 [Bugfix] Support testing prefill throughput with benchmark_serving.py --hf-output-len 1 (#8891) 2024-09-28 18:51:22 +00:00
Edouard B.
090e945e36 [Frontend] Make beam search emulator temperature modifiable (#8928)
Co-authored-by: Eduard Balzin <nfunctor@yahoo.fr>
2024-09-28 11:30:21 -07:00
Cyrus Leung
e1a3f5e831 [CI/Build] Update models tests & examples (#8874)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-09-28 09:54:35 -07:00
Varun Sundar Rabindranath
19d02ff938 [Bugfix] Fix PP for Multi-Step (#8887) 2024-09-28 08:52:46 -07:00
tastelikefeet
39d3f8d94f [Bugfix] Fix code for downloading models from modelscope (#8443) 2024-09-28 08:24:12 -07:00
Cyrus Leung
b0298aa8cc [Misc] Remove vLLM patch of BaichuanTokenizer (#8921) 2024-09-28 08:11:25 +00:00
Tyler Titsworth
260024a374 [Bugfix][Intel] Fix XPU Dockerfile Build (#7824)
Signed-off-by: tylertitsworth <tyler.titsworth@intel.com>
Co-authored-by: youkaichao <youkaichao@126.com>
2024-09-27 23:45:50 -07:00
youkaichao
d86f6b2afb [misc] fix wheel name (#8919) 2024-09-27 22:10:44 -07:00
Sebastian Schoennenbeck
bd429f2b75 [Core] Priority-based scheduling in async engine (#8850) 2024-09-27 15:07:10 -07:00
youkaichao
18e60d7d13 [misc][distributed] add VLLM_SKIP_P2P_CHECK flag (#8911) 2024-09-27 14:27:56 -07:00
Varun Sundar Rabindranath
c2ec430ab5 [Core] Multi-Step + Single Step Prefills via Chunked Prefill code path (#8378)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-09-27 13:32:07 -07:00
Lucas Wilkinson
c5d55356f9 [Bugfix] fix for deepseek w4a16 (#8906)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-09-27 13:12:34 -06:00
Luka Govedič
172d1cd276 [Kernel] AQ AZP 4/4: Integrate asymmetric quantization to linear method (#7271) 2024-09-27 14:25:10 -04:00
youkaichao
a9b15c606f [torch.compile] use empty tensor instead of None for profiling (#8875) 2024-09-27 08:11:32 -07:00
Brittany
8df2dc3c88 [TPU] Update pallas.py to support trillium (#8871) 2024-09-27 01:16:55 -07:00
Isotr0py
6d792d2f31 [Bugfix][VLM] Fix Fuyu batching inference with max_num_seqs>1 (#8892) 2024-09-27 01:15:58 -07:00
Peter Pan
0e088750af [MISC] Fix invalid escape sequence '\' (#8830)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2024-09-27 01:13:25 -07:00
youkaichao
dc4e3df5c2 [misc] fix collect env (#8894) 2024-09-27 00:26:38 -07:00
Cyrus Leung
3b00b9c26c [Core] Rename PromptInputs and inputs (#8876) 2024-09-26 20:35:15 -07:00
Maximilien de Bayser
344cd2b6f4 [Feature] Add support for Llama 3.1 and 3.2 tool use (#8343)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2024-09-26 17:01:42 -07:00
Cyrus Leung
1b49148e47 [Installation] Allow lower versions of FastAPI to maintain Ray 2.9 compatibility (#8764) 2024-09-26 16:54:09 -07:00
Nick Hill
4b377d6feb [BugFix] Fix test breakages from transformers 4.45 upgrade (#8829) 2024-09-26 16:46:43 -07:00
Tyler Michael Smith
71d21c73ab [Bugfix] Fixup advance_step.cu warning (#8815) 2024-09-26 16:23:45 -07:00
Chirag Jain
ee2da3e9ef fix validation: Only set tool_choice auto if at least one tool is provided (#8568) 2024-09-26 16:23:17 -07:00
Tyler Michael Smith
e2f6f26e86 [Bugfix] Fix print_warning_once's line info (#8867) 2024-09-26 16:18:26 -07:00
Michael Goin
b28d2104de [Misc] Change dummy profiling and BOS fallback warns to log once (#8820) 2024-09-26 16:18:14 -07:00
Pernekhan Utemuratov
93d364da34 [Bugfix] Include encoder prompts len to non-stream api usage response (#8861) 2024-09-26 15:47:00 -07:00
Kevin H. Luu
d9cfbc891e [ci] Soft fail Entrypoints, Samplers, LoRA, Decoder-only VLM (#8872)
Signed-off-by: kevin <kevin@anyscale.com>
2024-09-26 15:02:16 -07:00
youkaichao
70de39f6b4 [misc][installation] build from source without compilation (#8818) 2024-09-26 13:19:04 -07:00
fyuan1316
68988d4e0d [CI/Build] Fix missing ci dependencies (#8834) 2024-09-26 11:04:39 -07:00
Michael Goin
520db4dbc1 [Docs] Add README to the build docker image (#8825) 2024-09-26 11:02:52 -07:00
Tyler Michael Smith
f70bccac75 [Build/CI] Upgrade to gcc 10 in the base build Docker image (#8814) 2024-09-26 10:07:18 -07:00
Roger Wang
4bb98f2190 [Misc] Update config loading for Qwen2-VL and remove Granite (#8837) 2024-09-26 07:45:30 -07:00
Michael Goin
7193774b1f [Misc] Support quantization of MllamaForCausalLM (#8822)
2024-09-25 14:46:22 -07:00
Roger Wang
e2c6e0a829 [Doc] Update doc for Transformers 4.45 (#8817) 2024-09-25 13:29:48 -07:00
Chen Zhang
770ec6024f [Model] Add support for the multi-modal Llama 3.2 model (#8811)
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chang Su <chang.s.su@oracle.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-09-25 13:29:32 -07:00
Simon Mo
4f1ba0844b Revert "rename PromptInputs and inputs with backward compatibility (#8760)" (#8810) 2024-09-25 10:36:26 -07:00
Michael Goin
873edda6cf [Misc] Support FP8 MoE for compressed-tensors (#8588) 2024-09-25 09:43:36 -07:00
科英
64840dfae4 [Frontend] MQLLMEngine supports profiling. (#8761) 2024-09-25 09:37:41 -07:00
Cyrus Leung
28e1299e60 rename PromptInputs and inputs with backward compatibility (#8760) 2024-09-25 09:36:47 -07:00
DefTruth
0c4d2ad5e6 [VLM][Bugfix] internvl with num_scheduler_steps > 1 (#8614) 2024-09-25 09:35:53 -07:00
Jee Jee Li
c6f2485c82 [Misc] Add extra deps for openai server image (#8792) 2024-09-25 09:35:23 -07:00
bnellnm
300da09177 [Kernel] Fullgraph and opcheck tests (#8479) 2024-09-25 08:35:52 -06:00
Hongxia Yang
1c046447a6 [CI/Build][Bugfix][Doc][ROCm] CI fix and doc update after ROCm 6.2 upgrade (#8777) 2024-09-25 22:26:37 +08:00
Woo-Yeon Lee
8fae5ed7f6 [Misc] Fix minor typo in scheduler (#8765) 2024-09-25 00:53:03 -07:00
David Newman
3368c3ab36 [Bugfix] Ray 2.9.x doesn't expose available_resources_per_node (#8767)
Signed-off-by: darthhexx <darthhexx@gmail.com>
2024-09-25 00:52:26 -07:00
Adam Tilghman
1ac3de09cd [Frontend] OpenAI server: propagate usage accounting to FastAPI middleware layer (#8672) 2024-09-25 07:49:26 +00:00
sohamparikh
3e073e66f1 [Bugfix] load fc bias from config for eagle (#8790) 2024-09-24 23:16:30 -07:00
Isotr0py
c23953675f [Hardware][CPU] Enable mrope and support Qwen2-VL on CPU backend (#8770) 2024-09-24 23:16:11 -07:00
zifeitong
e3dd0692fa [BugFix] Propagate 'trust_remote_code' setting in internvl and minicpmv (#8250) 2024-09-25 05:53:43 +00:00
sroy745
fc3afc20df Fix tests in test_chunked_prefill_scheduler which fail with BlockManager V2 (#8752) 2024-09-24 21:26:36 -07:00
sasha0552
b4522474a3 [Bugfix][Kernel] Implement acquire/release polyfill for Pascal (#8776) 2024-09-24 21:26:33 -07:00
sroy745
ee777d9c30 Fix test_schedule_swapped_simple in test_scheduler.py (#8780) 2024-09-24 21:26:18 -07:00
Joe Runde
6e0c9d6bd0 [Bugfix] Use heartbeats instead of health checks (#8583) 2024-09-24 20:37:38 -07:00
Archit Patke
6da1ab6b41 [Core] Adding Priority Scheduling (#5958) 2024-09-24 19:50:50 -07:00
Travis Johnson
01b6f9e1f0 [Core][Bugfix] Support prompt_logprobs returned with speculative decoding (#8047)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-09-24 17:29:56 -07:00
Jee Jee Li
13f9f7a3d0 [Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768) 2024-09-24 17:08:55 -07:00
youkaichao
1e7d5c01f5 [misc] soft drop beam search (#8763) 2024-09-24 15:48:39 -07:00
Daniele
2467b642dd [CI/Build] fix setuptools-scm usage (#8771) 2024-09-24 12:38:12 -07:00
Lucas Wilkinson
72fc97a0f1 [Bugfix] Fix torch dynamo fixes caused by replace_parameters (#8748) 2024-09-24 14:33:21 -04:00
Andy
2529d09b5a [Frontend] Batch inference for llm.chat() API (#8648)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-09-24 09:44:11 -07:00
ElizaWszola
a928ded995 [Kernel] Split Marlin MoE kernels into multiple files (#8661)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-09-24 09:31:42 -07:00
Hanzhi Zhou
cc4325b66a [Bugfix] Fix potentially unsafe custom allreduce synchronization (#8558) 2024-09-24 01:08:14 -07:00
Alex Brooks
8ff7ced996 [Model] Expose Phi3v num_crops as a mm_processor_kwarg (#8658)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-24 07:36:46 +00:00
Peter Salas
3f06bae907 [Core][Model] Support loading weights by ID within models (#7931) 2024-09-24 07:14:15 +00:00
Cody Yu
b8747e8a7c [MISC] Skip dumping inputs when unpicklable (#8744) 2024-09-24 06:10:03 +00:00
Simon Mo
3185fb0cca Revert "[Core] Rename PromptInputs to PromptType, and inputs to prompt" (#8750) 2024-09-24 05:45:20 +00:00
youkaichao
0250dd68c5 re-implement beam search on top of vllm core (#8726)
Co-authored-by: Brendan Wong <bjwpokemon@gmail.com>
2024-09-23 22:08:12 -07:00
sroy745
88577ac928 Fix tests in test_scheduler.py that fail with BlockManager V2 (#8728) 2024-09-24 04:43:13 +00:00
Hongxia Yang
530821d00c [Hardware][AMD] ROCm6.2 upgrade (#8674) 2024-09-23 18:52:39 -07:00
Alexander Matveev
1a2aef3e59 Add output streaming support to multi-step + async while ensuring RequestOutput obj reuse (#8335) 2024-09-23 15:38:04 -07:00
jiqing-feng
5f7bb58427 Fix typical acceptance sampler with correct recovered token ids (#8562) 2024-09-23 12:32:27 -07:00
Russell Bryant
b05f5c9238 [Core] Allow IPv6 in VLLM_HOST_IP with zmq (#8575)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-09-23 12:15:41 -07:00
Jee Jee Li
9b0e3ec970 [Kernel][LoRA] Add assertion for punica sgmv kernels (#7585) 2024-09-23 18:57:42 +00:00
Lucas Wilkinson
86e9c8df29 [Kernel] (2/N) Machete - Integrate into CompressedTensorsWNA16 and GPTQMarlin (#7701)
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-09-23 13:46:26 -04:00
Daniele
ee5f34b1c2 [CI/Build] use setuptools-scm to set __version__ (#4738)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-09-23 09:44:26 -07:00
Jani Monoses
f2bd246c17 [VLM] Fix paligemma, fuyu and persimmon with transformers 4.45 : use config.text_config.vocab_size (#8707) 2024-09-23 14:43:09 +00:00
Yanyi Liu
a79e522984 [Model] Support pp for qwen2-vl (#8696) 2024-09-23 13:46:59 +00:00
Li, Jiang
3e83c12b5c [Bugfix][CPU] fix missing input intermediate_tensors in the cpu_model_runner (#8733) 2024-09-23 13:15:16 +00:00
Isotr0py
e551ca1555 [Hardware][CPU] Refactor CPU model runner (#8729) 2024-09-23 20:12:20 +08:00
Alex Brooks
9b8c8ba119 [Core][Frontend] Support Passing Multimodal Processor Kwargs (#8657)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2024-09-23 07:44:48 +00:00
Yan Ma
d23679eb99 [Bugfix] fix docker build for xpu (#8652) 2024-09-22 22:54:18 -07:00
Luka Govedič
57a0702e63 [Bugfix] Fix CPU CMake build (#8723)
Co-authored-by: Yuan <yuan.zhou@intel.com>
2024-09-22 20:40:46 -07:00
Tyler Michael Smith
3dda7c2250 [Bugfix] Avoid some bogus messages RE CUTLASS's revision when building (#8702) 2024-09-22 22:24:59 -04:00
youkaichao
92ba7e7477 [misc] upgrade mistral-common (#8715) 2024-09-22 15:41:59 -07:00
youkaichao
d4a2ac8302 [build] enable existing pytorch (for GH200, aarch64, nightly) (#8713) 2024-09-22 12:47:54 -07:00
Lily Liu
c6bd70d772 [SpecDec][Misc] Cleanup, remove bonus token logic. (#8701) 2024-09-22 12:34:14 -07:00
litianjian
5b59532760 [Model][VLM] Add LLaVA-Onevision model support (#8486)
Co-authored-by: litianjian <litianjian@bytedance.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-22 10:51:44 -07:00
Huazhong Ji
ca2b628b3c [MISC] rename CudaMemoryProfiler to DeviceMemoryProfiler (#8703) 2024-09-22 10:44:09 -07:00
Alex Brooks
8ca5051b9a [Misc] Use NamedTuple in Multi-image example (#8705)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2024-09-22 20:56:20 +08:00
Cyrus Leung
06ed2815e2 [Model] Refactor BLIP/BLIP-2 to support composite model loading (#8407) 2024-09-22 12:24:21 +00:00
youkaichao
0e40ac9b7b [ci][build] fix vllm-flash-attn (#8699) 2024-09-21 23:24:58 -07:00
Isotr0py
13d88d4137 [Bugfix] Refactor composite weight loading logic (#8656) 2024-09-22 04:33:27 +00:00
Tyler Michael Smith
d66ac62854 [Kernel][Bugfix] Delete some more useless code in marlin_moe_ops.cu (#8643) 2024-09-21 23:45:02 +00:00
Divakar Verma
9dc7c6c7f3 [dbrx] refactor dbrx experts to extend FusedMoe class (#8518) 2024-09-21 15:09:39 -06:00
rasmith
ec4aaad812 [Kernel][Triton][AMD] Remove tl.atomic_add from awq_gemm_kernel, 2-5x speedup MI300, minor improvement for MI250 (#8646) 2024-09-21 09:20:54 +00:00
Andy Dai
4dfdf43196 [Doc] Fix typo in AMD installation guide (#8689) 2024-09-21 00:24:12 -07:00
Cyrus Leung
5e85f4f82a [VLM] Use SequenceData.from_token_counts to create dummy data (#8687) 2024-09-20 23:28:56 -07:00
Luka Govedič
71c60491f2 [Kernel] Build flash-attn from source (#8245) 2024-09-20 23:27:10 -07:00
youkaichao
0faab90eb0 [beam search] add output for manually checking the correctness (#8684) 2024-09-20 19:55:33 -07:00
Cyrus Leung
0455c46ed4 [Core] Factor out common code in SequenceData and Sequence (#8675) 2024-09-21 02:30:39 +00:00
Kunshang Ji
d4bf085ad0 [MISC] add support custom_op check (#8557)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-09-20 19:03:55 -07:00
Cyrus Leung
0057894ef7 [Core] Rename PromptInputs and inputs (#8673) 2024-09-20 19:00:54 -07:00
zyddnys
0f961b3ce9 [Bugfix] Fix incorrect llava next feature size calculation (#8496) 2024-09-20 22:48:32 +00:00
omrishiv
7f9c8902e3 [Hardware][AWS] update neuron to 2.20 (#8676)
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
2024-09-20 15:19:44 -07:00
omrishiv
7c8566aa4f [Doc] neuron documentation update (#8671)
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
2024-09-20 15:04:37 -07:00
Patrick von Platen
b4e4eda92e [Bugfix][Core] Fix tekken edge case for mistral tokenizer (#8640) 2024-09-20 14:33:03 -07:00
Pastel!
2874bac618 [Bugfix] Config got an unexpected keyword argument 'engine' (#8556) 2024-09-20 14:00:45 -07:00
Cyrus Leung
035fa895ec [Misc] Show AMD GPU topology in collect_env.py (#8649) 2024-09-20 13:52:19 -07:00
saumya-saran
b28298f2f4 [Bugfix] Validate SamplingParam n is an int (#8548) 2024-09-20 12:46:02 -07:00
Alexey Kondratiev(AMD)
2940afa04e [CI/Build] Removing entrypoints/openai/test_embedding.py test from ROCm build (#8670) 2024-09-20 10:27:44 -07:00
Niklas Muennighoff
3b63de9353 [Model] Add OLMoE (#7922) 2024-09-20 09:31:41 -07:00
Jiaxin Shan
260d40b5ea [Core] Support Lora lineage and base model metadata management (#6315) 2024-09-20 06:20:56 +00:00
William Lin
9e5ec35b1f [bugfix] [AMD] add multi-step advance_step to ROCmFlashAttentionMetadata (#8474) 2024-09-19 20:49:54 -07:00
Amit Garg
18ae428a0d [Bugfix] Fix Phi3.5 mini and MoE LoRA inference (#8571) 2024-09-20 08:54:02 +08:00
bnellnm
de6f90a13d [Misc] guard against change in cuda library name (#8609) 2024-09-20 06:36:30 +08:00
Alexey Kondratiev(AMD)
6cb748e190 [CI/Build] Re-enabling Entrypoints tests on ROCm, excluding ones that fail (#8551) 2024-09-19 13:06:32 -07:00
Simon Mo
9e99407e3c Create SECURITY.md (#8642) 2024-09-19 12:16:28 -07:00
Isotr0py
ea4647b7d7 [Doc] Add documentation for GGUF quantization (#8618) 2024-09-19 13:15:55 -06:00
盏一
e42c634acb [Core] simplify logits resort in _apply_top_k_top_p (#8619) 2024-09-19 18:28:25 +00:00
Charlie Fu
9cc373f390 [Kernel][Amd] Add fp8 kv cache support for rocm custom paged attention (#8577) 2024-09-19 17:37:57 +00:00
Nick Hill
76515f303b [Frontend] Use MQLLMEngine for embeddings models too (#8584) 2024-09-19 12:51:06 -04:00
Kunshang Ji
855c8ae2c9 [MISC] remove engine_use_ray in benchmark_throughput.py (#8615) 2024-09-18 22:33:20 -07:00
Kuntai Du
c52ec5f034 [Bugfix] fixing sonnet benchmark bug in benchmark_serving.py (#8616) 2024-09-19 05:24:24 +00:00
Roger Wang
02c9afa2d0 Revert "[Misc][Bugfix] Disable guided decoding for mistral tokenizer" (#8593) 2024-09-19 04:14:28 +00:00
sroy745
3118f63385 [Bugfix] [Encoder-Decoder] Bugfix for encoder specific metadata construction during decode of encoder-decoder models. (#8545) 2024-09-19 02:24:15 +00:00
Tyler Michael Smith
4c34ce8916 [Kernel] Remove marlin moe templating on thread_m_blocks (#8573)
Co-authored-by: lwilkinson@neuralmagic.com
2024-09-19 01:42:49 +00:00
Joe Runde
0d47bf3bf4 [Bugfix] add dead_error property to engine client (#8574)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-09-18 22:10:01 +00:00
Nick Hill
d9cd78eb71 [BugFix] Nonzero exit code if MQLLMEngine startup fails (#8572) 2024-09-18 20:17:55 +00:00
Tyler Michael Smith
db9120cded [Kernel] Change interface to Mamba selective_state_update for continuous batching (#8039) 2024-09-18 20:05:06 +00:00
Gregory Shtrasberg
b3195bc9e4 [AMD][ROCm]Quantization methods on ROCm; Fix _scaled_mm call (#8380)
Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-09-18 10:41:08 -07:00
Geun, Lim
e18749ff09 [Model] Support Solar Model (#8386)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-09-18 11:04:00 -06:00
Russell Bryant
d65798f78c [Core] zmq: bind only to 127.0.0.1 for local-only usage (#8543)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-09-18 16:10:27 +00:00
afeldman-nm
a8c1d161a7 [Core] *Prompt* logprobs support in Multi-step (#8199) 2024-09-18 08:38:43 -07:00
Alexander Matveev
7c7714d856 [Core][Bugfix][Perf] Introduce MQLLMEngine to avoid asyncio OH (#8157)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-09-18 13:56:58 +00:00
Aaron Pham
9d104b5beb [CI/Build] Update Ruff version (#8469)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-09-18 11:00:56 +00:00
Cyrus Leung
6ffa3f314c [CI/Build] Avoid CUDA initialization (#8534) 2024-09-18 10:38:11 +00:00
Jiaxin Shan
e351572900 [Misc] Add argument to disable FastAPI docs (#8554) 2024-09-18 09:51:59 +00:00
Daniele
95965d31b6 [CI/Build] fix Dockerfile.cpu on podman (#8540) 2024-09-18 10:49:53 +08:00
Tyler Michael Smith
8110e44529 [Kernel] Change interface to Mamba causal_conv1d_update for continuous batching (#8012) 2024-09-17 23:44:27 +00:00
Alexey Kondratiev(AMD)
09deb4721f [CI/Build] Excluding kernels/test_gguf.py from ROCm (#8520) 2024-09-17 16:40:29 -07:00
youkaichao
fa0c114fad [doc] improve installation doc (#8550)
Co-authored-by: Andy Dai <76841985+Imss27@users.noreply.github.com>
2024-09-17 16:24:06 -07:00
Joe Runde
98f9713399 [Bugfix] Fix TP > 1 for new granite (#8544)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-09-17 23:17:08 +00:00
Nick Hill
56c3de018c [Misc] Don't dump contents of kvcache tensors on errors (#8527) 2024-09-17 12:24:29 -07:00
Patrick von Platen
a54ed80249 [Model] Add mistral function calling format to all models loaded with "mistral" format (#8515)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-09-17 17:50:37 +00:00
chenqianfzh
9855b99502 [Feature][kernel] tensor parallelism with bitsandbytes quantization (#8434) 2024-09-17 08:09:12 -07:00
sroy745
1009e93c5d [Encoder decoder] Add cuda graph support during decoding for encoder-decoder models (#7631) 2024-09-17 07:35:01 -07:00
Isotr0py
1b6de8352b [Benchmark] Support sample from HF datasets and image input for benchmark_serving (#8495) 2024-09-17 07:34:27 +00:00
Rui Qiao
cbdb252259 [Misc] Limit to ray[adag] 2.35 to avoid backward incompatible change (#8509)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-09-17 00:06:26 -07:00
youkaichao
99aa4eddaf [torch.compile] register allreduce operations as custom ops (#8526) 2024-09-16 22:57:57 -07:00
Roger Wang
ee2bceaaa6 [Misc][Bugfix] Disable guided decoding for mistral tokenizer (#8521) 2024-09-16 22:22:45 -07:00
Alex Brooks
1c1bb388e0 [Frontend] Improve Nullable kv Arg Parsing (#8525)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2024-09-17 04:17:32 +00:00
Simon Mo
546034b466 [refactor] remove triton based sampler (#8524) 2024-09-16 20:04:48 -07:00
Joe Runde
cca61642e0 [Bugfix] Fix 3.12 builds on main (#8510)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-09-17 00:01:45 +00:00
Simon Mo
5ce45eb54d [misc] small qol fixes for release process (#8517) 2024-09-16 15:11:27 -07:00
Simon Mo
5478c4b41f [perf bench] set timeout to debug hanging (#8516) 2024-09-16 14:30:02 -07:00
Kevin Lin
47f5e03b5b [Bugfix] Bind api server port before starting engine (#8491) 2024-09-16 13:56:28 -07:00
youkaichao
2759a43a26 [doc] update doc on testing and debugging (#8514) 2024-09-16 12:10:23 -07:00
Luka Govedič
5d73ae49d6 [Kernel] AQ AZP 3/4: Asymmetric quantization kernels (#7270) 2024-09-16 11:52:40 -07:00
sasha0552
781e3b9a42 [Bugfix][Kernel] Fix build for sm_60 in GGUF kernel (#8506) 2024-09-16 12:15:57 -06:00
Nick Hill
acd5511b6d [BugFix] Fix clean shutdown issues (#8492) 2024-09-16 09:33:46 -07:00
lewtun
837c1968f9 [Frontend] Expose revision arg in OpenAI server (#8501) 2024-09-16 15:55:26 +00:00
ElizaWszola
a091e2da3e [Kernel] Enable 8-bit weights in Fused Marlin MoE (#8032)
Co-authored-by: Dipika <dipikasikka1@gmail.com>
2024-09-16 09:47:19 -06:00
Isotr0py
fc990f9795 [Bugfix][Kernel] Add IQ1_M quantization implementation to GGUF kernel (#8357) 2024-09-15 16:51:44 -06:00
Chris
3724d5f6b5 [Bugfix][Model] Fix Python 3.8 compatibility in Pixtral model by updating type annotations (#8490) 2024-09-15 04:20:05 +00:00
Woosuk Kwon
50e9ec41fc [TPU] Implement multi-step scheduling (#8489) 2024-09-14 16:58:31 -07:00
youkaichao
47790f3e32 [torch.compile] add a flag to disable custom op (#8488) 2024-09-14 13:07:16 -07:00
youkaichao
a36e070dad [torch.compile] fix functionalization (#8480) 2024-09-14 09:46:04 -07:00
ywfang
8a0cf1ddc3 [Model] support minicpm3 (#8297)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-14 14:50:26 +00:00
Charlie Fu
1ef0d2efd0 [Kernel][Hardware][Amd]Custom paged attention kernel for rocm (#8310) 2024-09-13 17:01:11 -07:00
Kunshang Ji
851725202a [Hardware][intel GPU] bump up ipex version to 2.3 (#8365)
Co-authored-by: Yan Ma <yan.ma@intel.com>
2024-09-13 16:54:34 -07:00
Simon Mo
9ba0817ff1 bump version to v0.6.1.post2 (#8473)
2024-09-13 11:35:00 -07:00
Nick Hill
18e9e1f7b3 [HotFix] Fix final output truncation with stop string + streaming (#8468) 2024-09-13 11:31:12 -07:00
Isotr0py
f57092c00b [Doc] Add oneDNN installation to CPU backend documentation (#8467) 2024-09-13 18:06:30 +00:00
Cyrus Leung
a84e598e21 [CI/Build] Reorganize models tests (#7820) 2024-09-13 10:20:06 -07:00
youkaichao
0a4806f0a9 [plugin][torch.compile] allow to add custom compile backend (#8445) 2024-09-13 09:32:42 -07:00
Cyrus Leung
ecd7a1d5b6 [Installation] Gate FastAPI version for Python 3.8 (#8456) 2024-09-13 09:02:26 -07:00
youkaichao
a2469127db [misc][ci] fix quant test (#8449) 2024-09-13 17:20:14 +08:00
Jee Jee Li
06311e2956 [Misc] Skip loading extra bias for Qwen2-VL GPTQ-Int8 (#8442) 2024-09-13 07:58:28 +00:00
youkaichao
cab69a15e4 [doc] recommend pip instead of conda (#8446) 2024-09-12 23:52:41 -07:00
Isotr0py
9b4a3b235e [CI/Build] Enable InternVL2 PP test only on single node (#8437) 2024-09-13 06:35:20 +00:00
Simon Mo
acda0b35d0 bump version to v0.6.1.post1 (#8440)
2024-09-12 21:39:49 -07:00
William Lin
ba77527955 [bugfix] torch profiler bug for single gpu with GPUExecutor (#8354) 2024-09-12 21:30:00 -07:00
Alexander Matveev
6821020109 [Bugfix] Fix async log stats (#8417) 2024-09-12 20:48:59 -07:00
Cyrus Leung
8427550488 [CI/Build] Update pixtral tests to use JSON (#8436) 2024-09-13 03:47:52 +00:00
Cyrus Leung
3f79bc3d1a [Bugfix] Bump fastapi and pydantic version (#8435) 2024-09-13 03:21:42 +00:00
shangmingc
40c396533d [Bugfix] Mapping physical device indices for e2e test utils (#8290) 2024-09-13 11:06:28 +08:00
Cyrus Leung
5ec9c0fb3c [Core] Factor out input preprocessing to a separate class (#7329) 2024-09-13 02:56:13 +00:00
Dipika Sikka
8f44a92d85 [BugFix] fix group_topk (#8430) 2024-09-13 09:23:42 +08:00
Roger Wang
360ddbd37e [Misc] Update Pixtral example (#8431) 2024-09-12 17:31:18 -07:00
Wenxiang
a480939e8e [Bugfix] Fix weight loading issue by rename variable. (#8293) 2024-09-12 19:25:00 -04:00
Patrick von Platen
d31174a4e1 [Hotfix][Pixtral] Fix multiple images bugs (#8415) 2024-09-12 15:21:51 -07:00
Roger Wang
b61bd98f90 [CI/Build] Disable multi-node test for InternVL2 (#8428) 2024-09-12 15:05:35 -07:00
Roger Wang
c16369455f [Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (#8425) 2024-09-12 14:06:51 -07:00
Alexander Matveev
019877253b [Bugfix] multi-step + flashinfer: ensure cuda graph compatible (#8427) 2024-09-12 21:01:50 +00:00
Nick Hill
551ce01078 [Core] Add engine option to return only deltas or final output (#7381) 2024-09-12 12:02:00 -07:00
William Lin
a6c0f3658d [multi-step] add flashinfer backend (#7928) 2024-09-12 11:16:22 -07:00
Joe Runde
f2e263b801 [Bugfix] Offline mode fix (#8376)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-09-12 11:11:57 -07:00
Luis Vega
1f0c75afa9 [BugFix] Fix Duplicate Assignment in Hermes2ProToolParser (#8423) 2024-09-12 11:10:11 -07:00
WANGWEI
8a23e93302 [BugFix] lazy init _copy_stream to avoid torch init wrong gpu instance (#8403) 2024-09-12 10:47:42 -07:00
Alex Brooks
c6202daeed [Model] Support multiple images for qwen-vl (#8247)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-12 10:10:54 -07:00
Isotr0py
e56bf27741 [Bugfix] Fix InternVL2 inference with various num_patches (#8375)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-12 10:10:35 -07:00
Roger Wang
520ca380ae [Hotfix][VLM] Fixing max position embeddings for Pixtral (#8399) 2024-09-12 09:28:37 -07:00
youkaichao
7de49aa86c [torch.compile] hide slicing under custom op for inductor (#8384) 2024-09-12 00:11:55 -07:00
Woosuk Kwon
42ffba11ad [Misc] Use RoPE cache for MRoPE (#8396) 2024-09-11 23:13:14 -07:00
Kevin Lin
295c4730a8 [Misc] Raise error when using encoder/decoder model with cpu backend (#8355) 2024-09-12 05:45:24 +00:00
Blueyo0
1bf2dd9df0 [Gemma2] add bitsandbytes support for Gemma2 (#8338) 2024-09-11 21:53:12 -07:00
tomeras91
5a60699c45 [Bugfix]: Fix the logic for deciding if tool parsing is used (#8366) 2024-09-12 03:55:30 +00:00
Michael Goin
b6c75e1cf2 Fix the AMD weight loading tests (#8390) 2024-09-11 20:35:33 -07:00
Woosuk Kwon
b71c956deb [TPU] Use Ray for default distributed backend (#8389) 2024-09-11 20:31:51 -07:00
youkaichao
f842a7aff1 [misc] remove engine_use_ray (#8126) 2024-09-11 18:23:36 -07:00
Cody Yu
a65cb16067 [MISC] Dump model runner inputs when crashing (#8305) 2024-09-12 01:12:25 +00:00
703 changed files with 46331 additions and 18371 deletions

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.764
- name: "exact_match,flexible-extract"
value: 0.764
limit: 250
num_fewshot: 5
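For orientation, the sketch below shows how a config in this shape could be consumed. This is a minimal sketch, assuming PyYAML and the field names visible in the hunk above; the path is a placeholder, not a file name from the repo.

```python
# Minimal sketch: load an lm-eval correctness config and walk the expected
# scores. Field names (model_name, tasks, name, metrics, value) come from
# the config above; the path is hypothetical.
import yaml

config_path = "path/to/eval-config.yaml"  # placeholder path
with open(config_path) as f:
    eval_config = yaml.safe_load(f)

print(eval_config["model_name"])
for task in eval_config["tasks"]:
    for metric in task["metrics"]:
        # Each metric records the ground-truth score a run must reproduce.
        print(f'{task["name"]} | {metric["name"]} -> {metric["value"]}')
```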

View File

@@ -1,6 +1,7 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml

View File

@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+# pip install lm-eval==0.4.4
 usage() {
     echo``

View File

@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.3
+# pip install lm-eval==0.4.4
 usage() {
     echo``
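Both baseline scripts pin `lm-eval==0.4.4`, which also exposes a Python entry point that returns the `results` dict the correctness test below indexes into. A hedged sketch of driving it from Python — the model choice and `model_args` string are illustrative, not copied from the scripts:

```python
# Sketch: run lm-eval 0.4.x from Python rather than the CLI the shell
# scripts use. simple_evaluate returns a dict whose results[task][metric]
# entries match the metric names in the YAML configs, e.g.
# "exact_match,strict-match". Model and model_args are illustrative.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
    tasks=["gsm8k"],
    num_fewshot=5,  # matches num_fewshot in the config above
    limit=250,      # matches limit in the config above
)
print(results["results"]["gsm8k"]["exact_match,strict-match"])
```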

View File

@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
     results = launch_lm_eval(eval_config)
 
     # Confirm scores match ground truth.
+    success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
+
+    # Assert at the end, print all scores even on failure for debugging.
+    assert success
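A note on the tolerance: `numpy.isclose(a, b, rtol=RTOL)` passes when `|a - b| <= atol + rtol * |b|` (with a small default `atol`), so the check accepts scores within a relative band around the measured value. A quick illustration with an assumed tolerance — `RTOL = 0.02` here is for demonstration only, not the constant from the test module:

```python
# Illustration of the rtol semantics used in the test above. RTOL is an
# assumed value for demonstration, not the constant from the test module.
import numpy

RTOL = 0.02
print(numpy.isclose(0.764, 0.758, rtol=RTOL))  # True: 0.006 <= 0.02 * 0.758
print(numpy.isclose(0.764, 0.700, rtol=RTOL))  # False: 0.064 > 0.02 * 0.700
```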

View File

@@ -8,8 +8,7 @@ steps:
   containers:
   - image: badouralix/curl-jq
     command:
-    - sh
-    - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+    - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
 - wait
 - label: "A100"
   agents:

View File

@@ -0,0 +1,28 @@
## Description
This file contains the downloading link for benchmarking results.
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
Please download the visualization scripts in the post
## Results reproduction
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code
```
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.

View File

@@ -1,45 +1,39 @@
# Nightly benchmark

-The main goal of this benchmarking is two-fold:
-- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
-- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
+This benchmark aims to:
+- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following the reproducing instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

-## Docker images
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
-- vllm/vllm-openai:v0.5.0.post1
-- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-- openmmlab/lmdeploy:v0.5.0
-- ghcr.io/huggingface/text-generation-inference:2.1
-<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
-
-## Hardware
-One AWS node with 8x NVIDIA A100 GPUs.
-
-## Workload description
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
-- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
-- Output length: the corresponding output length of these 500 prompts.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
-
-## Plots
-In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
-<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
-
-## Results
-{nightly_results_benchmarking_table}
+## Setup
+
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (on average 462 input tokens, 16 tokens as output)
+    - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+# Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support `ignore-eos` flag.
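
The QPS bullet above samples query arrivals from a Poisson process with a fixed seed. As a hedged illustration of that idea (not necessarily the benchmark's actual implementation), arrival times at a given QPS can be generated like this:

```
import numpy as np


def poisson_arrival_times(qps: float, num_requests: int, seed: int = 0) -> np.ndarray:
    """Sample request arrival times for a Poisson process at the given QPS.

    Inter-arrival gaps of a Poisson process are exponentially distributed
    with mean 1/qps; a fixed seed keeps the trace reproducible across runs.
    """
    rng = np.random.default_rng(seed)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)


print(poisson_arrival_times(qps=4, num_requests=5))
```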

View File

@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec
 common_container_settings: &common_container_settings
   command:
-  - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
   resources:
     limits:
       nvidia.com/gpu: 8
@@ -37,7 +37,10 @@ common_container_settings: &common_container_settings
 steps:
 - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-- label: "A100 trt benchmark"
+- label: "A100 vllm step 10"
   priority: 100
   agents:
     queue: A100
@@ -46,7 +49,21 @@ steps:
       podSpec:
         <<: *common_pod_spec
         containers:
-        - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+        - image: vllm/vllm-openai:v0.6.2
+          <<: *common_container_settings
+
+- label: "A100 sglang benchmark"
+  priority: 100
+  agents:
+    queue: A100
+  plugins:
+  - kubernetes:
+      podSpec:
+        <<: *common_pod_spec
+        containers:
+        - image: lmsysorg/sglang:v0.3.2-cu121
           <<: *common_container_settings
 - label: "A100 lmdeploy benchmark"
@@ -58,11 +75,13 @@ steps:
       podSpec:
         <<: *common_pod_spec
         containers:
-        - image: openmmlab/lmdeploy:v0.5.0
+        - image: openmmlab/lmdeploy:v0.6.1-cu12
           <<: *common_container_settings
-- label: "A100 vllm benchmark"
+- label: "A100 trt llama-8B"
   priority: 100
   agents:
     queue: A100
@@ -71,10 +90,25 @@ steps:
       podSpec:
         <<: *common_pod_spec
         containers:
-        - image: vllm/vllm-openai:latest
+        - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
           <<: *common_container_settings
+          env:
+          - name: VLLM_USAGE_SOURCE
+            value: ci-test
+          - name: HF_HOME
+            value: /root/.cache/huggingface
+          - name: VLLM_SOURCE_CODE_LOC
+            value: /workspace/build/buildkite/vllm/performance-benchmark
+          - name: HF_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: token
+          - name: TEST_SELECTOR
+            value: "llama8B"
-- label: "A100 tgi benchmark"
+- label: "A100 trt llama-70B"
   priority: 100
   agents:
     queue: A100
@@ -83,12 +117,54 @@ steps:
       podSpec:
         <<: *common_pod_spec
         containers:
-        - image: ghcr.io/huggingface/text-generation-inference:2.1
+        - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
           <<: *common_container_settings
+          env:
+          - name: VLLM_USAGE_SOURCE
+            value: ci-test
+          - name: HF_HOME
+            value: /root/.cache/huggingface
+          - name: VLLM_SOURCE_CODE_LOC
+            value: /workspace/build/buildkite/vllm/performance-benchmark
+          - name: HF_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: token
+          - name: TEST_SELECTOR
+            value: "llama70B"
+
+# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
+# - label: "A100 trt benchmark"
+#   priority: 100
+#   agents:
+#     queue: A100
+#   plugins:
+#   - kubernetes:
+#       podSpec:
+#         <<: *common_pod_spec
+#         containers:
+#         - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+#           <<: *common_container_settings
+
+# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+# - label: "A100 tgi benchmark"
+#   priority: 100
+#   agents:
+#     queue: A100
+#   plugins:
+#   - kubernetes:
+#       podSpec:
+#         <<: *common_pod_spec
+#         containers:
+#         - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+#           <<: *common_container_settings
+
 - wait
-- label: "Plot"
+- label: "Collect the results"
   priority: 100
   agents:
     queue: A100
@@ -117,4 +193,4 @@ steps:
             name: hf-token-secret
             key: token
-- wait
+- block: ":rocket: check the results!"

View File

@@ -1,76 +0,0 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"

View File

@@ -0,0 +1,95 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def get_perf(df, method, model, metric):
means = []
for qps in [2, 4, 8, 16, "inf"]:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
target = target & df['Test name'].str.contains("qps_" + str(qps))
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
else:
means.append(filtered_df[metric].values[0])
return np.array(means)
def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist()
std = get_perf(df, method, model, "Std " + metric + " (ms)")
if std.mean() == 0:
std = None
success = get_perf(df, method, model, "Successful req.")
if std is not None:
std = std / np.sqrt(success)
std = std.tolist()
else:
assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)")
mean = mean.tolist()
std = None
return mean, std
def main(args):
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
if __name__ == '__main__':
args = parse_arguments()
main(args)
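
The `std / np.sqrt(success)` step in `get_perf_w_std` above is the standard error of the mean, which is what the nightly plots use for error bars. A tiny worked example with made-up numbers:

```
import numpy as np

std_itl_ms = np.array([12.0, 9.5])           # hypothetical per-run std of ITL
successful_requests = np.array([500, 480])   # hypothetical request counts
sem = std_itl_ms / np.sqrt(successful_requests)
print(sem)  # per-run standard errors of the mean
```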

View File

@@ -0,0 +1,241 @@
#!/bin/bash
# Currently FP8 benchmark is NOT enabled.
set -x
server_params=$1
common_params=$2
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
launch_trt_server() {
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
# create model caching directory
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
# clone tensorrt backend
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
# build trtllm engine
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
trtllm-build \
--checkpoint_dir ${trt_model_path} \
--use_fused_mlp \
--reduce_fusion disable \
--workers 8 \
--gpt_attention_plugin ${model_dtype} \
--gemm_plugin ${model_dtype} \
--tp_size ${model_tp_size} \
--max_batch_size ${max_batch_size} \
--max_input_len ${max_input_len} \
--max_seq_len ${max_seq_len} \
--max_num_tokens ${max_num_tokens} \
--output_dir ${trt_engine_path}
# handle triton protobuf files and launch triton server
cd /tensorrtllm_backend
mkdir triton_model_repo
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
cd triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &
}
launch_tgi_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
echo "Server command: $server_command"
eval "$server_command" &
}
launch_lmdeploy_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Server command: $server_command"
bash -c "$server_command" &
}
launch_sglang_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
fi
# run the server
echo "Server command: $server_command"
eval "$server_command" &
}
launch_vllm_server() {
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Server command: $server_command"
eval "$server_command" &
}
main() {
if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
launch_trt_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
launch_tgi_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
launch_lmdeploy_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
launch_sglang_server
fi
if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
launch_vllm_server
fi
}
main
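
For readers more comfortable in Python, here is a sketch of what the `json2args` helper above does; the jq version is authoritative, and this simply mirrors its underscore-to-dash rewrite:

```
import json


def json2args(json_string: str) -> str:
    """Mirror of the shell/jq helper: turn a JSON object into CLI flags,
    replacing '_' with '-' in each key."""
    params = json.loads(json_string)
    return " ".join(f"--{key.replace('_', '-')} {value}"
                    for key, value in params.items())


print(json2args('{"model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1}'))
# --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
```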

View File

@@ -1,102 +0,0 @@
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameter inside tensorrt_demo is consistent to envvar
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &

View File

@@ -8,6 +8,7 @@ main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq) (which jq) || (apt-get update && apt-get -y install jq)
(which zip) || (apt-get install -y zip)
if [ ! -f /workspace/buildkite-agent ]; then if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results." echo "buildkite-agent binary not found. Skip plotting the results."
@@ -24,17 +25,54 @@ main() {
     ls
     ls results/

-    # generate figures
-    python3 -m pip install tabulate pandas matplotlib
-    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-        --description $description \
-        --results-folder results/
-
-    # upload results and figures
-    /workspace/buildkite-agent artifact upload "nightly_results.png"
-    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+    # upload benchmark results
+    zip -r results.zip results/
+    /workspace/buildkite-agent artifact upload "results.zip"
+
+    # upload benchmarking scripts
+    cd $VLLM_SOURCE_CODE_LOC/
+    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
+    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
+
+    cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+    # upload benchmarking pipeline
+    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
+
+    cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+
+    # The figures should be generated by a separate process outside the CI/CD pipeline
+    # # generate figures
+    # python3 -m pip install tabulate pandas matplotlib
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
+    #     --description $description \
+    #     --results-folder results/
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sharegpt
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_2048_128
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_128_2048
+    # # upload results and figures
+    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
 }
 main "$@"

View File

@@ -1,135 +0,0 @@
import argparse
import json
import math
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__':
args = parse_arguments()
main(args)

View File

@@ -1,218 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill lmdeploy || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append lmdeploy to the test name
test_name=lmdeploy_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
# prepare tokenizer
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "lmdeploy server is up and running."
else
echo ""
echo "lmdeploy failed to start within the timeout period."
break
fi
# get model name
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend lmdeploy \
--tokenizer /tokenizer_cache \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--model \"$model_name\" \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "lmdeploy" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -0,0 +1,357 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
get_current_llm_serving_engine() {
if which lmdeploy >/dev/null; then
echo "Container: lmdeploy"
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
return
fi
if [ -e /tgi-entrypoint.sh ]; then
echo "Container: tgi"
export CURRENT_LLM_SERVING_ENGINE=tgi
return
fi
if which trtllm-build >/dev/null; then
echo "Container: tensorrt-llm"
export CURRENT_LLM_SERVING_ENGINE=trt
return
fi
if [ -e /sgl-workspace ]; then
echo "Container: sglang"
export CURRENT_LLM_SERVING_ENGINE=sglang
return
fi
if [ -e /vllm-workspace ]; then
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid importing vllm from the current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
kill_gpu_processes() {
pkill -f python
pkill -f python3
pkill -f tritonserver
pkill -f pt_main_thread
pkill -f text-generation
pkill -f lmdeploy
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
sleep 1
done
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
if ! which $cmd >/dev/null; then
apt-get update && apt-get install -y $cmd
fi
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
# get client and server arguments
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if [[ $reuse_server == "true" ]]; then
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
"$server_params" "$common_params"
fi
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
break
fi
# prepare tokenizer
# this is required for lmdeploy.
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# change model name for lmdeploy (it will not follow standard hf name)
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ $backend = "trt" ]]; then
backend="tensorrt-llm"
fi
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
elif [[ "$dataset_name" = "sonnet" ]]; then
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--sonnet-input-len $sonnet_input_len \
--sonnet-output-len $sonnet_output_len \
--sonnet-prefix-len $sonnet_prefix_len \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command="None"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "$CURRENT_LLM_SERVING_ENGINE" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
done
kill_gpu_processes
}
prepare_dataset() {
# download sharegpt dataset
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() {
# check if the environment variable is successfully injected from yaml
check_gpus
check_hf_token
get_current_llm_serving_engine
pip install -U transformers
# check storage
df -h
ensure_installed wget
ensure_installed curl
ensure_installed jq
prepare_dataset
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
# run the test
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -1,216 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -1,214 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -1,221 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
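# Note: both branches assemble an identical server command; the fp8 case only
# differs in the $model value swapped in above.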
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# the @sh filter above wraps each qps value in single quotes; normalize "'inf'" to a literal inf
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter the vllm benchmarks directory
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -17,10 +17,17 @@ serving_column_mapping = {
"request_throughput": "Tput (req/s)", "request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)", "mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)", "std_ttft_ms": "Std TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)", "mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)", "std_itl_ms": "Std ITL (ms)",
"input_throughput": "Input Tput (tok/s)", "median_itl_ms": "Median ITL (ms)",
"mean_tpot_ms": "Mean TPOT (ms)",
"std_tpot_ms": "Std TPOT (ms)",
"median_tpot_ms": "Median TPOT (ms)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)", "output_throughput": "Output Tput (tok/s)",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
"engine": "Engine", "engine": "Engine",
} }
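For orientation, a mapping like the one above turns raw metric keys from the benchmark JSON into human-readable table headers. A minimal sketch of that renaming step in Python (assuming pandas and tabulate are installed, as main() installs them; the row data here is hypothetical and the real summary script may differ):

import pandas as pd

serving_column_mapping = {
    "request_throughput": "Tput (req/s)",
    "median_ttft_ms": "Median TTFT (ms)",
    "engine": "Engine",
}

# Hypothetical raw rows; the real script reads the *.json results files.
raw = [{"engine": "vllm", "request_throughput": 12.3, "median_ttft_ms": 45.6}]

df = pd.DataFrame(raw)
# Keep only mapped columns, then rename them to the display headers.
df = df[list(serving_column_mapping)].rename(columns=serving_column_mapping)
print(df.to_markdown(index=False))  # to_markdown relies on the tabulate package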

View File

@@ -2,9 +2,11 @@
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+ TIMEOUT_SECONDS=10
retries=0
while [ $retries -lt 1000 ]; do
- if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+ if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0
fi
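The --max-time bound added above keeps a single hung connection from stalling the retry loop indefinitely. The same poll-with-per-request-timeout pattern, sketched in Python for illustration (assuming the requests package; the token/URL handling mirrors the shell script, but the function name and sleep are hypothetical):

import time
import requests

def wait_for_manifest(url: str, token: str, timeout_seconds: int = 10, max_retries: int = 1000) -> bool:
    headers = {"Authorization": f"Bearer {token}"}
    for _ in range(max_retries):
        try:
            # Bound each probe so a hung connection cannot block the loop.
            if requests.get(url, headers=headers, timeout=timeout_seconds).status_code == 200:
                return True
        except requests.RequestException:
            pass  # treat network errors like a non-200 response and retry
        time.sleep(1)  # hypothetical pacing; the shell loop retries immediately
    return False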

View File

@@ -1,16 +1,18 @@
[
{
- "test_name": "llama8B_tp1",
+ "test_name": "llama8B_tp1_sharegpt",
- "qps_list": [4],
+ "qps_list": [4,8,16,32,"inf"],
"common_parameters": {
- "model": "meta-llama/Meta-Llama-3-8B",
+ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
- "port": 8000
+ "port": 8000,
+ "reuse_server": false
},
"lmdeploy_server_parameters": {
+ "dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
@@ -21,34 +23,158 @@
},
"trt_server_parameters": {
"model_type": "llama",
- "model_dtype": "float16",
+ "model_dtype": "bfloat16",
- "max_batch_size": 256,
+ "max_batch_size": 2048,
"max_input_len": 4096,
- "max_output_len": 4096,
+ "max_seq_len": 6144,
- "trt_llm_version": "r24.04"
+ "max_num_tokens": 16384,
+ "trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": ""
+ "disable_log_requests": "",
+ "gpu_memory_utilization": 0.9,
+ "num_scheduler_steps": 10,
+ "max_num_seqs": 512,
+ "dtype": "bfloat16"
},
"vllm_client_parameters": {
+ },
+ "sglang_server_parameters": {
+ "disable_radix_cache": "",
+ "enable_torch_compile": "",
+ "dtype": "bfloat16"
+ },
+ "sglang_client_parameters": {
}
},
{
- "test_name": "llama70B_tp4",
+ "test_name": "llama8B_tp1_sonnet_512_16",
- "qps_list": [2],
+ "qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama70B_tp4_sharegpt",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4, "tp": 4,
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"reuse_server": false
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
@@ -59,34 +185,50 @@
},
"trt_server_parameters": {
"model_type": "llama",
- "model_dtype": "float16",
+ "model_dtype": "bfloat16",
- "max_batch_size": 256,
+ "max_batch_size": 2048,
"max_input_len": 4096,
- "max_output_len": 4096,
+ "max_seq_len": 6144,
- "trt_llm_version": "r24.04"
+ "max_num_tokens": 16384,
+ "trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": ""
+ "disable_log_requests": "",
+ "gpu_memory_utilization": 0.9,
+ "num_scheduler_steps": 10,
+ "max_num_seqs": 512,
+ "dtype": "bfloat16"
},
"vllm_client_parameters": {
+ },
+ "sglang_server_parameters": {
+ "disable_radix_cache": "",
+ "dtype": "bfloat16"
+ },
+ "sglang_client_parameters": {
}
},
{
- "test_name": "mixtral8x7B_tp2",
+ "test_name": "llama70B_tp4_sonnet_512_16",
- "qps_list": [2],
+ "qps_list": [4,8,16,32,"inf"],
"common_parameters": {
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+ "model": "meta-llama/Meta-Llama-3-70B-Instruct",
- "tp": 2,
+ "tp": 4,
- "dataset_name": "sharegpt",
+ "dataset_name": "sonnet",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
- "port": 8000
+ "port": 8000,
+ "sonnet_input_len": 512,
+ "sonnet_output_len": 16,
+ "sonnet_prefix_len": 50,
+ "reuse_server": true
},
"lmdeploy_server_parameters": {
+ "dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
@@ -97,20 +239,85 @@
},
"trt_server_parameters": {
"model_type": "llama",
- "model_dtype": "float16",
+ "model_dtype": "bfloat16",
- "max_batch_size": 256,
+ "max_batch_size": 2048,
"max_input_len": 4096,
- "max_output_len": 4096,
+ "max_seq_len": 6144,
- "trt_llm_version": "r24.04"
+ "max_num_tokens": 16384,
+ "trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": ""
+ "disable_log_requests": "",
+ "gpu_memory_utilization": 0.9,
+ "num_scheduler_steps": 10,
+ "max_num_seqs": 512,
+ "dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama70B_tp4_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
}
]

View File

@@ -3,13 +3,14 @@ steps:
agents:
queue: cpu_queue
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+ - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+ - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+ - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
env:
DOCKER_BUILDKIT: "1"
@@ -21,7 +22,7 @@ steps:
agents:
queue: cpu_queue
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1

View File

@@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
+ --ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
@@ -93,6 +94,16 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_sampler.py"
fi
+ #ignore certain Entrypoints tests
+ if [[ $commands == *" entrypoints/openai "* ]]; then
+ commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+ --ignore=entrypoints/openai/test_accuracy.py \
+ --ignore=entrypoints/openai/test_audio.py \
+ --ignore=entrypoints/openai/test_encoder_decoder.py \
+ --ignore=entrypoints/openai/test_embedding.py \
+ --ignore=entrypoints/openai/test_oot_registration.py "}
+ fi
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then

View File

@@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
- pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+ pytest -v -s tests/models -m \"not vlm\" \
+ --ignore=tests/models/test_embedding.py \
+ --ignore=tests/models/test_oot_registration.py \
+ --ignore=tests/models/test_registry.py \
+ --ignore=tests/models/test_jamba.py \
+ --ignore=tests/models/test_mamba.py \
+ --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "

View File

@@ -22,19 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
- pip install pytest matplotlib einops transformers_stream_generator
+ pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
- pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
- --ignore=tests/models/test_oot_registration.py \
- --ignore=tests/models/test_registry.py \
- --ignore=tests/models/test_fp8.py \
- --ignore=tests/models/test_jamba.py \
- --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+ pytest -v -s tests/models/encoder_decoder/language
+ pytest -v -s tests/models/decoder_only/language \
+ --ignore=tests/models/test_fp8.py \
+ --ignore=tests/models/decoder_only/language/test_jamba.py \
+ --ignore=tests/models/decoder_only/language/test_mamba.py \
+ --ignore=tests/models/decoder_only/language/test_granitemoe.py \
+ --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
+ # docker exec cpu-test bash -c "
+ # pytest -s -v \
+ # tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+ # tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+ # Run AWQ test
docker exec cpu-test bash -c "
pytest -s -v \
- tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
- tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+ tests/quantization/test_ipex_quant.py"
# online inference
docker exec cpu-test bash -c "

View File

@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
- docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
+ docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py

View File

@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
+ # optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -39,17 +40,20 @@ steps:
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
- - label: Async Engine, Inputs, Utils, Worker Test # 15min
+ - label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true
source_file_dependencies:
- vllm/
+ - tests/mq_llm_engine
- tests/async_engine
- tests/test_inputs
- tests/multimodal
- tests/test_utils
- tests/worker
commands:
- - pytest -v -s async_engine # Async Engine
+ - pytest -v -s mq_llm_engine # MQLLMEngine
+ - pytest -v -s async_engine # AsyncLLMEngine
+ - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
@@ -60,14 +64,22 @@ steps:
fast_check: true
source_file_dependencies:
- vllm/
- - tests/basic_correctness
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_preemption
commands:
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+ - label: Chunked Prefill Test
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_chunked_prefill
+ commands:
+ - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min
mirror_hardwares: [amd]
fast_check: true
@@ -76,22 +88,29 @@ steps:
- vllm/distributed
- tests/core
commands:
- - pytest -v -s core
+ - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
+ - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
+ - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
+ - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
+ - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
- - label: Entrypoints Test # 20min
+ - label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
fast_check: true
- #mirror_hardwares: [amd]
+ mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- - pytest -v -s entrypoints/openai
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+ - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/test_chat_utils.py
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min
working_dir: "/vllm-workspace/tests"
@@ -102,7 +121,9 @@ steps:
- vllm/core/
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
+ - tests/compile
commands:
+ - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
@@ -130,7 +151,9 @@ steps:
source_file_dependencies:
- vllm/
- tests/test_regression
- command: pytest -v -s test_regression.py
+ commands:
+ - pip install modelscope
+ - pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min
@@ -144,7 +167,7 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- - label: Examples Test # 12min
+ - label: Examples Test # 15min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
@@ -162,39 +185,16 @@ steps:
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- - label: Models Test # 1hr10min
- source_file_dependencies:
- - vllm/
- - tests/models
- commands:
- - pip install -e ./plugins/vllm_add_dummy_model
- - pytest -v -s models/test_oot_registration.py # it needs a clean process
- - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
- - label: torch compile integration test
- source_file_dependencies:
- - vllm/
- commands:
- - pytest -v -s ./compile/test_full_graph.py
- - pytest -v -s ./compile/test_wrapper.py
- - label: Vision Language Models Test # 42min
- #mirror_hardwares: [amd]
- source_file_dependencies:
- - vllm/
- commands:
- - pytest -v -s models -m vlm
- - label: Prefix Caching Test # 7min
+ - label: Prefix Caching Test # 9min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands:
- - pytest -v -s prefix_caching
+ - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
+ - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
- - label: Samplers Test # 18min
+ - label: Samplers Test # 36min
source_file_dependencies:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
@@ -210,17 +210,16 @@ steps:
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py
- - label: Speculative decoding tests # 22min
+ - label: Speculative decoding tests # 30min
source_file_dependencies:
- vllm/spec_decode
- tests/spec_decode
commands:
- # See https://github.com/vllm-project/vllm/issues/5152
- - export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+ - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
- - label: LoRA Test %N # 30min each
+ - label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
@@ -228,7 +227,24 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
- - label: Kernels Test %N # 30min each
+ - label: "PyTorch Fullgraph Smoke Test" # 9min
+ fast_check: true
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_basic_correctness.py
+ # TODO: re-write in comparison tests, and fix symbolic shape
+ # for quantization ops.
+ # - label: "PyTorch Fullgraph Test" # 18min
+ # source_file_dependencies:
+ # - vllm/
+ # - tests/compile
+ # commands:
+ # - pytest -v -s compile/test_full_graph.py
+ - label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
@@ -258,12 +274,12 @@ steps:
- pip install aiohttp
- bash run-benchmarks.sh
- - label: Quantization Test # 15min
+ - label: Quantization Test # 33min
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
- command: pytest -v -s quantization
+ command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -271,10 +287,16 @@ steps:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- - pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
+ - label: Encoder Decoder tests # 5min
+ source_file_dependencies:
+ - vllm/
+ - tests/encoder_decoder
+ commands:
+ - pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min
fast_check: false
mirror_hardwares: [ amd ]
@@ -284,6 +306,56 @@ steps:
commands:
- pytest -v -s tool_use
##### models test #####
- label: Basic Models Test # 3min
source_file_dependencies:
- vllm/
- tests/models
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
- label: Decoder-only Language Models Test # 1h36min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language
- label: Decoder-only Multi-Modal Models Test # 1h31min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language
- pytest -v -s models/decoder_only/vision_language
- label: Other Models Test # 6min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/embedding/language
- tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/embedding/language
- pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
##### 1 GPU test #####
##### multi gpus test #####
@@ -309,13 +381,13 @@ steps:
- tests/distributed/
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- - label: Distributed Tests (2 GPUs) # 28min
+ - label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -325,19 +397,23 @@ steps:
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
+ - vllm/compilation
commands:
- - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
- - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
- - pytest -v -s distributed/test_chunked_prefill_distributed.py
- - pytest -v -s distributed/test_multimodal_broadcast.py
+ - pytest -v -s ./compile/test_basic_correctness.py
+ - pytest -v -s ./compile/test_wrapper.py
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
+ - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
+ # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+ - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+ - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- - label: Multi-step Tests (4 GPUs) # 21min
+ - label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -355,7 +431,7 @@ steps:
- pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py
- - label: Pipeline Parallelism Test # 23min
+ - label: Pipeline Parallelism Test # 45min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -381,7 +457,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py
- - label: Weight Loading Multiple GPU Test
+ - label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
@@ -414,7 +490,7 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
@@ -425,6 +501,5 @@ steps:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- - pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4

View File

@@ -1,4 +1,34 @@
- vllm/*.so
+ /.github/
/.venv
/build
dist
+ vllm/*.so
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.mypy_cache
# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

.github/CODEOWNERS vendored Normal file
View File

@@ -0,0 +1,30 @@
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
CMakeLists.txt @tlrmchlsmth @WoosukKwon
# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/quantization @mgoin @robertgshaw2-neuralmagic
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/multi_step @alexm-neuralmagic @comaniac
/tests/weight_loading @mgoin @youkaichao
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

View File

@@ -30,6 +30,15 @@ body:
</details>
validations:
required: true
- type: textarea
attributes:
label: Model Input Dumps
description: |
If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
placeholder: |
Upload the dumped input file.
validations:
required: false
- type: textarea
attributes:
label: 🐛 Describe the bug

.github/dependabot.yml vendored Normal file
View File

@@ -0,0 +1,7 @@
version: 2
updates:
# Maintain dependencies for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"

.github/workflows/actionlint.yml vendored Normal file
View File

@@ -0,0 +1,37 @@
name: Lint GitHub Actions workflows
on:
push:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
pull_request:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
with:
fetch-depth: 0
- name: "Run actionlint"
run: |
tools/actionlint.sh -color

View File

@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
- uses: actions/github-script@v5
+ uses: actions/github-script@v7
with:
script: |
github.rest.issues.addLabels({

View File

@@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.11"]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies

View File

@@ -0,0 +1,17 @@
{
"problemMatcher": [
{
"owner": "actionlint",
"pattern": [
{
"regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
"file": 1,
"line": 2,
"column": 3,
"message": 4,
"code": 5
}
]
}
]
}

View File

@@ -11,15 +11,15 @@ on:
- main
jobs:
- ruff:
+ mypy:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -32,15 +32,4 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
- mypy
+ tools/mypy.sh
- mypy tests --follow-imports skip
- mypy vllm/attention --follow-imports skip
- mypy vllm/distributed --follow-imports skip
- mypy vllm/engine --follow-imports skip
- mypy vllm/executor --follow-imports skip
- mypy vllm/lora --follow-imports skip
- mypy vllm/model_executor --follow-imports skip
- mypy vllm/prompt_adapter --follow-imports skip
- mypy vllm/spec_decode --follow-imports skip
- mypy vllm/worker --follow-imports skip

View File

@@ -21,16 +21,16 @@ jobs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Extract branch info
shell: bash
run: |
- echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+ echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
- name: Create Release
id: create_release
- uses: "actions/github-script@v6"
+ uses: "actions/github-script@v7"
env:
RELEASE_TAG: ${{ env.release_tag }}
with:
@@ -54,7 +54,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
@@ -68,7 +68,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
@@ -86,10 +86,10 @@ jobs:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
- wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+ wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
- echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+ echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
- echo "asset_name=${asset_name}" >> $GITHUB_ENV
+ echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
- name: Upload Release Asset
uses: actions/upload-release-asset@v1

View File

@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
- uses: actions/github-script@v6
+ uses: actions/github-script@v7
with:
script: |
github.rest.issues.createComment({

View File

@@ -17,18 +17,18 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+ pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
- ruff .
+ ruff check .
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml

View File

@@ -8,12 +8,12 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements
- $python_executable -m pip install wheel packaging
- $python_executable -m pip install -r requirements-cuda.txt
+ $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+ export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist

View File

@@ -16,9 +16,9 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies

.gitignore vendored
View File

@@ -1,5 +1,8 @@
- # vllm commit id, generated by setup.py
+ # version file generated by setuptools-scm
- vllm/commit_id.py
+ /vllm/_version.py
+ # vllm-flash-attn built from source
+ vllm/vllm_flash_attn/
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -12,6 +15,8 @@ __pycache__/
# Distribution / packaging
.Python
build/
+ cmake-build-*/
+ CMakeUserPresets.json
develop-eggs/
dist/
downloads/
@@ -28,6 +33,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
+ /.deps/
# PyInstaller
# Usually these files are written by a python script from a template
@@ -193,3 +199,6 @@ hip_compat.h
# Benchmark dataset
benchmarks/*.json
+ # Linting
+ actionlint

View File

@@ -13,10 +13,10 @@ sphinx:
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
- formats:
- - pdf
+ formats: []
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements-docs.txt

View File

@@ -1,5 +1,16 @@
cmake_minimum_required(VERSION 3.26)
# When building directly using CMake, make sure you run the install step
# (it places the .so files in the correct location).
#
# Example:
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
# cmake --build . --target install
#
# If you want to only build one target, make sure to install it manually:
# cmake --build . --target _C
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -13,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
+ # Prevent installation of dependencies (cutlass) by default.
+ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
@@ -70,19 +84,6 @@ endif()
find_package(Torch REQUIRED)
#
- # Add the `default` target which detects which extensions should be
- # built based on platform/architecture. This is the same logic that
- # setup.py uses to select which extensions should be built and should
- # be kept in sync.
- #
- # The `default` target makes direct use of cmake easier since knowledge
- # of which extensions are supported has been factored in, e.g.
- #
- # mkdir build && cd build
- # cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
- # cmake --build . --target default
- #
- add_custom_target(default)
message(STATUS "Enabling core extension.")
# Define _core_C extension
@@ -100,8 +101,6 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)
- add_dependencies(default _core_C)
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -144,14 +143,32 @@ else()
   message(FATAL_ERROR "Can't find CUDA or HIP installation.")
 endif()
 
-#
-# Override the GPU architectures detected by cmake/torch and filter them by
-# the supported versions for the current language.
-# The final set of arches is stored in `VLLM_GPU_ARCHES`.
-#
-override_gpu_arches(VLLM_GPU_ARCHES
-  ${VLLM_GPU_LANG}
-  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  #
+  # For cuda we want to be able to control which architectures we compile for on
+  # a per-file basis in order to cut down on compile time. So here we extract
+  # the set of architectures we want to compile for and remove them from the
+  # CMAKE_CUDA_FLAGS so that they are not applied globally.
+  #
+  clear_cuda_arches(CUDA_ARCH_FLAGS)
+  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
+  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+  # Filter the target architectures by the supported archs
+  # since for some files we will build for all CUDA_ARCHS.
+  cuda_archs_loose_intersection(CUDA_ARCHS
+    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
+  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+else()
+  #
+  # For other GPU targets override the GPU architectures detected by cmake/torch
+  # and filter them by the supported versions for the current language.
+  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
+  #
+  override_gpu_arches(VLLM_GPU_ARCHES
+    ${VLLM_GPU_LANG}
+    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+endif()
 
 #
 # Query torch for additional GPU compilation flags for the given
@@ -167,6 +184,17 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
+#
+# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
+# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
+#
+include(FetchContent)
+get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
+set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
+message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 #
 # Define other extension targets
 #
@@ -190,8 +218,11 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
FetchContent_Declare( FetchContent_Declare(
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
@@ -210,29 +241,89 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu" "csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_ARCHS}")
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
"in CUDA target architectures")
endif()
# #
# The CUTLASS kernels for Hopper require sm90a to be enabled. # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a. # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
# That adds an extra 17MB to compiled binary, so instead we selectively enable it. cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set_source_files_properties( set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" set_gencode_flags_for_srcs(
PROPERTIES SRCS "${SRCS}"
COMPILE_FLAGS CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
"-gencode arch=compute_90a,code=sm_90a") list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
else()
if (SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c2x as all archs are already built"
" for and covered by scaled_mm_c3x")
else()
message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
"in CUDA target architectures")
endif()
endif() endif()
@@ -240,49 +331,75 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Machete kernels
   # The machete kernels only work on hopper and require CUDA 12.0 or later.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+  # Only build Machete kernels if we are building for something compatible with sm90a
+  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
     #
     # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.
     # Generate sources:
-    execute_process(
-      COMMAND ${CMAKE_COMMAND} -E env
-      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
-      ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
-      RESULT_VARIABLE machete_generation_result
-      OUTPUT_VARIABLE machete_generation_output
-      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-    )
-    if (NOT machete_generation_result EQUAL 0)
-      message(FATAL_ERROR "Machete generation failed."
-              " Result: \"${machete_generation_result}\""
-              "\nCheck the log for details: "
-              "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
-    else()
-      message(STATUS "Machete generation completed successfully.")
-    endif()
+    set(MACHETE_GEN_SCRIPT
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
+    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
+
+    message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
+
+    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
+        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
+        RESULT_VARIABLE machete_generation_result
+        OUTPUT_VARIABLE machete_generation_output
+        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
+        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
+      )
+      if (NOT machete_generation_result EQUAL 0)
+        message(FATAL_ERROR "Machete generation failed."
+                " Result: \"${machete_generation_result}\""
+                "\nCheck the log for details: "
+                "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
+      else()
+        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run machete generate script hash" FORCE)
+        message(STATUS "Machete generation completed successfully.")
+      endif()
+    else()
+      message(STATUS "Machete generation script has not changed, skipping generation.")
+    endif()
 
     # Add machete generated sources
     file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
     list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
-    message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
 
-    set_source_files_properties(
-      ${MACHETE_GEN_SOURCES}
-      PROPERTIES
-      COMPILE_FLAGS
-      "-gencode arch=compute_90a,code=sm_90a")
-    list(APPEND VLLM_EXT_SRC
-      csrc/quantization/machete/machete_pytorch.cu)
+    # forward compatible
+    set_gencode_flags_for_srcs(
+      SRCS "${MACHETE_GEN_SOURCES}"
+      CUDA_ARCHS "${MACHETE_ARCHS}")
+
+    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+        AND MACHETE_ARCHS)
+      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
+                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+                     "later if you intend on running w4a16 quantized models on "
+                     "Hopper.")
+    else()
+      message(STATUS "Not building Machete kernels as no compatible archs "
+                     "found in CUDA target architectures")
+    endif()
   endif()
+
+  # Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
+  # raise an error telling the user that this was built with an incompatible
+  # CUDA version)
+  list(APPEND VLLM_EXT_SRC
+    csrc/quantization/machete/machete_pytorch.cu)
 endif() # if CUDA
 
+message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
   _C
   DESTINATION vllm
@@ -308,11 +425,36 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp" "csrc/moe/torch_bindings.cpp"
"csrc/moe/topk_softmax_kernels.cu") "csrc/moe/topk_softmax_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_MOE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
"csrc/moe/marlin_moe_ops.cu") if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
"in CUDA target architectures")
endif()
endif() endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target( define_gpu_extension_target(
_moe_C _moe_C
DESTINATION vllm DESTINATION vllm
@@ -323,13 +465,96 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
-  message(STATUS "Enabling C extension.")
-  add_dependencies(default _C)
-
-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
-endif()
+if(VLLM_GPU_LANG STREQUAL "HIP")
+  #
+  # _rocm_C extension
+  #
+  set(VLLM_ROCM_EXT_SRC
+    "csrc/rocm/torch_bindings.cpp"
+    "csrc/rocm/attention.cu")
+
+  define_gpu_extension_target(
+    _rocm_C
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES ${VLLM_ROCM_EXT_SRC}
+    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_GPU_ARCHES}
+    USE_SABI 3
+    WITH_SOABI)
+endif()
+
+# vllm-flash-attn currently only supported on CUDA
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+  return()
+endif ()
+
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
+# we need to manually set VLLM_GPU_ARCHES here.
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  foreach(_ARCH ${CUDA_ARCHS})
+    string(REPLACE "." "" _ARCH "${_ARCH}")
+    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
+  endforeach()
+endif()
+
+#
+# Build vLLM flash attention from source
+#
+# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
+# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
+# They should be identical, but if they aren't, this is a massive footgun.
+#
+# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
+# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+# If no component is specified, vllm-flash-attn is still installed.
+
+# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
+# This is to enable local development of vllm-flash-attn within vLLM.
+# It can be set as an environment variable or passed as a cmake argument.
+# The environment variable takes precedence.
+if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
+  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
+endif()
+
+if(VLLM_FLASH_ATTN_SRC_DIR)
+  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+else()
+  FetchContent_Declare(
+      vllm-flash-attn
+      GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
+      GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+      GIT_PROGRESS TRUE
+  )
+endif()
+
+# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
+set(VLLM_PARENT_BUILD ON)
+
+# Ensure the vllm/vllm_flash_attn directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
+
+# Make sure vllm-flash-attn install rules are nested under vllm/
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
+install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
+
+# Fetch the vllm-flash-attn library
+FetchContent_MakeAvailable(vllm-flash-attn)
+message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
+
+# Restore the install prefix
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+
+# Copy over the vllm-flash-attn python files
+install(
+  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+  DESTINATION vllm/vllm_flash_attn
+  COMPONENT vllm_flash_attn_c
+  FILES_MATCHING PATTERN "*.py"
+)
+
+# Nothing after vllm-flash-attn, see comment about macros above
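
The comments above double as a quick reference for building the extensions without setup.py. A minimal sketch of that workflow, assuming a CUDA toolchain, Ninja, and a Python environment with torch installed; the flash-attention checkout path is hypothetical:

```bash
# Sketch: configure, build, and install the vLLM extensions directly with
# CMake, as described at the top of this file.
mkdir build && cd build
cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
cmake --build . --target install

# Build and install only the _C extension instead:
cmake --build . --target _C
cmake --install . --component _C

# Optional: develop vllm-flash-attn locally instead of downloading it.
# The environment variable takes precedence over the CMake argument.
export VLLM_FLASH_ATTN_SRC_DIR=$HOME/src/flash-attention   # hypothetical path
```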

View File

@@ -1,30 +1,23 @@
 # Contributing to vLLM
 
-Thank you for your interest in contributing to vLLM!
-Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
-There are several ways you can contribute to the project:
+Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
 
 - Identify and report any issues or bugs.
-- Request or add a new model.
+- Request or add support for a new model.
 - Suggest or implement new features.
+- Improve documentation or contribute a how-to guide.
 
-However, remember that contributions aren't just about code. We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
+We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
 
-Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
-Talk about it in your blog posts, highlighting how it's driving your incredible projects.
-Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
+Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
 
-## Setup for development
+## Developing
 
-### Build from source
-```bash
-pip install -e .  # This may take several minutes.
-```
+Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
 
-### Testing
+## Testing
 
 ```bash
 pip install -r requirements-dev.txt
@@ -36,15 +29,16 @@ mypy
 # Unit tests
 pytest tests/
 ```
 
-**Note:** Currently, the repository does not pass the mypy tests.
+**Note:** Currently, the repository does not pass the ``mypy`` tests.
 
-## Contributing Guidelines
+## Contribution Guidelines
 
-### Issue Reporting
+### Issues
 
-If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
-If not, please file a new issue, providing as much relevant information as possible.
+If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
+
+> [!IMPORTANT]
+> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability).
 
 ### Pull Requests & Code Reviews
@@ -53,4 +47,4 @@ Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE
 ### Thank You
 
 Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
-Your contributions make vLLM a great tool for everyone!
+All of your contributions help make vLLM a great tool and community for everyone!
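
Putting the setup and testing steps together, a typical first-time development loop might look like this (a sketch; the editable install compiles the extensions and may take several minutes):

```bash
# Sketch: build vLLM from source in editable mode, then lint and test.
pip install -e .                     # may take several minutes
pip install -r requirements-dev.txt  # development dependencies

mypy          # static type checks (the repo does not currently pass these)
pytest tests/ # unit tests
```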

View File

@@ -27,6 +27,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image # this won't be needed for future versions of this docker image
@@ -48,6 +56,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# see https://github.com/pytorch/pytorch/pull/123243 # see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE #################### #################### WHEEL BUILD IMAGE ####################
@@ -60,14 +71,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt python3 -m pip install -r requirements-build.txt
# files and directories related to build wheels # files and directories related to build wheels
COPY csrc csrc COPY . .
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
# max jobs used by Ninja to build extensions # max jobs used by Ninja to build extensions
ARG max_jobs=2 ARG max_jobs=2
@@ -76,14 +80,13 @@ ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=8 ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads ENV NVCC_THREADS=$nvcc_threads
ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation # if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \ if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \ echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
@@ -92,6 +95,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \ && export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \ && export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \ && sccache --show-stats \
@@ -102,6 +106,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV CCACHE_DIR=/root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \ if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi fi
@@ -131,7 +136,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
@@ -169,6 +174,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \ . /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
COPY examples examples
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
@@ -180,10 +186,6 @@ FROM vllm-base AS test
ADD . /vllm-workspace/ ADD . /vllm-workspace/
# install development dependencies (for testing) # install development dependencies (for testing)
# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
# This installation must complete before the test dependencies are collected and installed.
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install "setuptools>=74.1.1"
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt python3 -m pip install -r requirements-dev.txt
@@ -202,7 +204,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server # install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
ENV VLLM_USAGE_SOURCE production-docker-image ENV VLLM_USAGE_SOURCE production-docker-image
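
For reference, the ARGs declared in this Dockerfile can be overridden at build time. A sketch, assuming this is the repository's main CUDA Dockerfile and BuildKit is available; the tag and the trimmed arch lists are illustrative:

```bash
# Sketch: build the OpenAI-serving image with a reduced arch list (smaller
# binaries, faster builds) and bounded build parallelism.
DOCKER_BUILDKIT=1 docker build . \
    --target vllm-openai \
    --build-arg torch_cuda_arch_list='8.0 9.0+PTX' \
    --build-arg vllm_fa_cmake_gpu_arches='80-real;90-real' \
    --build-arg max_jobs=8 \
    --build-arg nvcc_threads=2 \
    --tag vllm-openai:dev
```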

View File

@@ -22,9 +22,12 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 
-RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
+RUN pip install intel_extension_for_pytorch==2.4.0
 
-ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
@@ -60,8 +63,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl
+    pip install dist/*.whl && \
+    rm -rf dist
 
 WORKDIR /workspace/
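
The same wheel can be produced outside of Docker; a sketch, assuming the build requirements and requirements-cpu.txt are already installed on the host:

```bash
# Sketch: build and install a CPU-only vLLM wheel, mirroring the stage above.
export PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
pip install dist/*.whl && rm -rf dist
```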

View File

@@ -1,14 +1,17 @@
 # default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
 
 FROM $BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"
 
 # Install some basic utilities
-RUN apt-get update \
-    && apt-get install python3 python3-pip -y \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1
+RUN apt-get update && \
+    apt-get install -y \
+        git \
+        python3 \
+        python3-pip \
+        ffmpeg libsm6 libxext6 libgl1
 
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
@@ -20,19 +23,19 @@ RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 
-COPY ./vllm /app/vllm/vllm
-COPY ./setup.py /app/vllm/setup.py
-COPY ./requirements-common.txt /app/vllm/requirements-common.txt
-COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+COPY . /app/vllm
 
 RUN cd /app/vllm \
-    && python3 -m pip install -U -r requirements-neuron.txt
+    && python3 -m pip install -U \
+        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        -r requirements-neuron.txt
 
 ENV VLLM_TARGET_DEVICE neuron
-RUN cd /app/vllm \
-    && pip install -e . \
+RUN --mount=type=bind,source=.git,target=.git \
+    cd /app/vllm \
+    && pip install --no-build-isolation -v -e . \
     && cd ..
 
 CMD ["/bin/bash"]

View File

@@ -4,20 +4,12 @@
 FROM ubuntu:22.04 AS dev
 
 RUN apt-get update -y && \
-    apt-get install -y python3-pip git && \
-    apt-get install -y ffmpeg libsm6 libxext6 libgl1
+    apt-get install -y \
+        git python3-pip \
+        ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace
 
-# copy requirements
-COPY requirements-build.txt /workspace/vllm/
-COPY requirements-common.txt /workspace/vllm/
-COPY requirements-openvino.txt /workspace/vllm/
-COPY vllm/ /workspace/vllm/vllm
-COPY csrc/core /workspace/vllm/csrc/core
-COPY cmake/utils.cmake /workspace/vllm/cmake/
-COPY CMakeLists.txt /workspace/vllm/
-COPY setup.py /workspace/vllm/
+COPY . .
 
 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt

View File

@@ -16,9 +16,15 @@ COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
 
 # These packages will be in rocketce eventually
-RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        torch==2.3.1 \
+        -r requirements-cpu.txt \
+        xformers uvloop==0.20.0
 
-RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+RUN --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
 WORKDIR /workspace/

View File

@@ -1,5 +1,5 @@
-# Default ROCm 6.1 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+# Default ROCm 6.2 base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
 
 # Default ROCm ARCHes to build vLLM for.
 ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
@@ -7,18 +7,12 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
 # Whether to install CK-based flash-attention
 # If 0, will not install flash-attention
 ARG BUILD_FA="1"
-# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
-# If this succeeds, we use the downloaded wheel and skip building flash-attention.
-# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
-# architectures specified in `FA_GFX_ARCHS`
-ARG TRY_FA_WHEEL="1"
-ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="23a2b1c2"
+ARG FA_BRANCH="3cea2fb"
 
 # Whether to build triton on rocm
 ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="e0fc12c"
+ARG TRITON_BRANCH="e192dba"
 
 ### Base image build stage
 FROM $BASE_IMAGE AS base
@@ -50,14 +44,17 @@ RUN python3 -m pip install --upgrade pip
 # Remove sccache so it doesn't interfere with ccache
 # TODO: implement sccache support across components
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
-# Install torch == 2.5.0 on ROCm
-RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.1"*) \
+
+# Install torch == 2.6.0 on ROCm
+RUN --mount=type=cache,target=/root/.cache/pip \
+    case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.2"*) \
             python3 -m pip uninstall -y torch torchvision \
-            && python3 -m pip install --no-cache-dir --pre \
-                torch==2.5.0.dev20240726 \
-                torchvision==0.20.0.dev20240726 \
-                --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
+            && python3 -m pip install --pre \
+                torch==2.6.0.dev20240918 \
+                setuptools-scm>=8 \
+                torchvision==0.20.0.dev20240918 \
+                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
         *) ;; esac
 
 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
@@ -79,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi \
 ### Flash-Attention wheel build stage
 FROM base AS build_fa
 ARG BUILD_FA
-ARG TRY_FA_WHEEL
-ARG FA_WHEEL_URL
 ARG FA_GFX_ARCHS
 ARG FA_BRANCH
 # Build ROCm flash-attention wheel if `BUILD_FA = 1`
 RUN --mount=type=cache,target=${CCACHE_DIR} \
     if [ "$BUILD_FA" = "1" ]; then \
-        if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
-            # If a suitable wheel exists, we download it instead of building FA
-            mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
-        else \
-            mkdir -p libs \
-            && cd libs \
-            && git clone https://github.com/ROCm/flash-attention.git \
-            && cd flash-attention \
-            && git checkout "${FA_BRANCH}" \
-            && git submodule update --init \
-            && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
-        fi; \
+        mkdir -p libs \
+        && cd libs \
+        && git clone https://github.com/ROCm/flash-attention.git \
+        && cd flash-attention \
+        && git checkout "${FA_BRANCH}" \
+        && git submodule update --init \
+        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
     # Create an empty directory otherwise as later build stages expect one
     else mkdir -p /install; \
     fi
@@ -112,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
     if [ "$BUILD_TRITON" = "1" ]; then \
         mkdir -p libs \
         && cd libs \
+        && python3 -m pip install ninja cmake wheel pybind11 \
         && git clone https://github.com/OpenAI/triton.git \
         && cd triton \
         && git checkout "${TRITON_BRANCH}" \
@@ -129,7 +120,7 @@ COPY . .
 
 # Package upgrades for useful functionality or to avoid dependency issues
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
+    python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
 
 # Workaround for ray >= 2.10.0
@@ -138,15 +129,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false
 
 RUN --mount=type=cache,target=${CCACHE_DIR} \
+    --mount=type=bind,source=.git,target=.git \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -Ur requirements-rocm.txt \
-    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.1"*) \
-            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
-            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
-            # Prevent interference if torch bundles its own HIP runtime
-            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
-        *) ;; esac \
     && python3 setup.py clean --all \
     && python3 setup.py develop

View File

@@ -5,16 +5,25 @@ FROM $BASE_IMAGE
 WORKDIR /workspace
 
 # Install some basic utilities
-RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
+RUN apt-get update && apt-get install -y \
+    git \
+    ffmpeg libsm6 libxext6 libgl1
 
 # Install the TPU and Pallas dependencies.
-RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 
 # Build vLLM.
 COPY . /workspace/vllm
 ENV VLLM_TARGET_DEVICE="tpu"
-RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    cd /workspace/vllm && \
+    python3 -m pip install \
+        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        -r requirements-tpu.txt
 RUN cd /workspace/vllm && python3 setup.py develop
 
 CMD ["/bin/bash"]

View File

@@ -1,21 +1,55 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
     chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
     wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
     echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
     chmod 644 /usr/share/keyrings/intel-graphics.gpg
 
-RUN apt-get update -y \
-    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
-
-COPY ./ /workspace/vllm
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+        curl \
+        ffmpeg \
+        git \
+        libsndfile1 \
+        libsm6 \
+        libxext6 \
+        libgl1 \
+        lsb-release \
+        numactl \
+        python3 \
+        python3-dev \
+        python3-pip \
+        # vim \
+        wget
 
 WORKDIR /workspace/vllm
 
-COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
-COPY requirements-common.txt /workspace/vllm/requirements-common.txt
-
-RUN pip install -v -r requirements-xpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --no-cache-dir \
+        --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
+        -r requirements-xpu.txt
 
-RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
+COPY ./ /workspace/vllm
+
+ENV VLLM_TARGET_DEVICE=xpu
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    python3 setup.py install
 
 CMD ["/bin/bash"]
+
+FROM vllm-base AS vllm-openai
+
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+
+ENV VLLM_USAGE_SOURCE production-docker-image \
+    TRITON_XPU_PROFILE 1
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@@ -10,22 +10,13 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 
 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
----
-
-**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
-
-We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
-Join us to learn more about recent advancements of vLLM on MI300X.
-Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
-
----
-
 *Latest News* 🔥
+- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
+- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
 - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
 - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
 - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
@@ -51,7 +42,7 @@ vLLM is fast with:
 - Speculative decoding
 - Chunked prefill
 
-**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
 
 vLLM is flexible and easy to use with:

SECURITY.md (new file)
View File

@@ -0,0 +1,11 @@
# Security Policy
## Reporting a Vulnerability
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
---
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.

View File

@@ -23,8 +23,9 @@ class RequestFuncInput:
     output_len: int
     model: str
     best_of: int = 1
-    use_beam_search: bool = False
     logprobs: Optional[int] = None
+    multi_modal_content: Optional[dict] = None
+    ignore_eos: bool = False
 
 @dataclass
@@ -47,13 +48,13 @@ async def async_request_tgi(
     assert api_url.endswith("generate_stream")
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         params = {
             "best_of": request_func_input.best_of,
             "max_new_tokens": request_func_input.output_len,
             "do_sample": True,
             "temperature": 0.01,  # TGI does not accept 0.0 temperature.
             "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+            # TGI does not accept ignore_eos flag.
         }
         payload = {
             "inputs": request_func_input.prompt,
@@ -118,7 +119,6 @@ async def async_request_trt_llm(
     assert api_url.endswith("generate_stream")
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         assert request_func_input.best_of == 1
         payload = {
             "accumulate_tokens": True,
@@ -128,6 +128,8 @@ async def async_request_trt_llm(
             "max_tokens": request_func_input.output_len,
             "stream": True,
         }
+        if request_func_input.ignore_eos:
+            payload["min_length"] = request_func_input.output_len
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
@@ -182,7 +184,6 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert request_func_input.best_of == 1
-        assert not request_func_input.use_beam_search
 
         payload = {
             "prompt": request_func_input.prompt,
@@ -230,7 +231,6 @@ async def async_request_openai_completions(
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         payload = {
             "model": request_func_input.model,
             "prompt": request_func_input.prompt,
@@ -239,6 +239,7 @@ async def async_request_openai_completions(
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
             "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
             "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
@@ -311,18 +312,21 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        if request_func_input.multi_modal_content:
+            content.append(request_func_input.multi_modal_content)
         payload = {
             "model": request_func_input.model,
             "messages": [
                 {
                     "role": "user",
-                    "content": request_func_input.prompt,
+                    "content": content
                 },
             ],
             "temperature": 0.0,
             "max_tokens": request_func_input.output_len,
             "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
             "Content-Type": "application/json",
@@ -426,4 +430,5 @@ ASYNC_REQUEST_FUNCS = {
     "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
+    "sglang": async_request_openai_completions,
 }
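
The new ignore_eos and multi_modal_content fields flow directly into the OpenAI-style payloads built above. A sketch of the equivalent raw request against a locally running OpenAI-compatible server; the URL, model name, and image entry are illustrative:

```bash
# Sketch: what async_request_openai_chat_completions effectively sends when a
# multi-modal content entry is attached (the image_url item is illustrative).
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -d '{
          "model": "example-model",
          "messages": [{
            "role": "user",
            "content": [
              {"type": "text", "text": "Describe this image."},
              {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}
            ]
          }],
          "temperature": 0.0,
          "max_tokens": 128,
          "stream": true,
          "ignore_eos": true
        }'
```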

View File

@@ -11,7 +11,7 @@ from tqdm import tqdm
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
-from vllm.inputs import PromptInputs
+from vllm.inputs import PromptType
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
@@ -51,9 +51,8 @@ def main(args: argparse.Namespace):
     sampling_params = SamplingParams(
         n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
+        temperature=1.0,
         top_p=1.0,
-        use_beam_search=args.use_beam_search,
         ignore_eos=True,
         max_tokens=args.output_len,
     )
@@ -61,7 +60,7 @@ def main(args: argparse.Namespace):
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_inputs: List[PromptInputs] = [{
+    dummy_prompts: List[PromptType] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
@@ -74,13 +73,13 @@ def main(args: argparse.Namespace):
             ],
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
                 str(profile_dir))) as p:
-            llm.generate(dummy_inputs,
+            llm.generate(dummy_prompts,
                          sampling_params=sampling_params,
                          use_tqdm=False)
         print(p.key_averages())
     else:
         start_time = time.perf_counter()
-        llm.generate(dummy_inputs,
+        llm.generate(dummy_prompts,
                      sampling_params=sampling_params,
                      use_tqdm=False)
         end_time = time.perf_counter()
@@ -222,7 +221,9 @@ if __name__ == '__main__':
     parser.add_argument("--enable-prefix-caching",
                         action='store_true',
                         help="Enable automatic prefix caching")
-    parser.add_argument('--use-v2-block-manager', action='store_true')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        default=EngineArgs.use_v2_block_manager)
     parser.add_argument(
         "--ray-workers-use-nsight",
         action='store_true',
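With the rename from PromptInputs to PromptType, token-id prompts are passed to generate as plain dicts. A minimal sketch of that calling convention (the model name and token ids are placeholders):

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # placeholder model
    # Token-id prompts are plain dicts under the PromptType convention.
    dummy_prompts = [{"prompt_token_ids": [11, 42, 7, 256]}]
    params = SamplingParams(temperature=1.0, top_p=1.0, ignore_eos=True,
                            max_tokens=16)
    outputs = llm.generate(dummy_prompts, sampling_params=params,
                           use_tqdm=False)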

View File

@@ -33,6 +33,7 @@ from typing import List, Optional, Tuple
 from transformers import PreTrainedTokenizerBase
 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser
 try:
@@ -113,7 +114,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
 def main(args):
     tokenizer = get_tokenizer(args.model, trust_remote_code=True)
     input_length_range = tuple(map(int, args.input_length_range.split(':')))
+    random.seed(args.seed)
     if args.dataset_path is not None:
         print(f"Start to sample {args.num_prompts} prompts"
               "from {args.dataset_path}")
@@ -177,6 +178,7 @@ if __name__ == "__main__":
                         help='enable prefix caching')
     parser.add_argument('--use-v2-block-manager',
                         action='store_true',
+                        default=EngineArgs.use_v2_block_manager,
                         help='Use BlockSpaceMangerV2')
     parser.add_argument('--num-prompts',
                         type=int,
@@ -194,5 +196,9 @@ if __name__ == "__main__":
                         default='128:256',
                         help='Range of input lengths for sampling prompts,'
                         'specified as "min:max" (e.g., "128:256").')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=0,
+                        help='Random seed for reproducibility')
     args = parser.parse_args()
     main(args)
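Seeding the random module before sampling, as the new --seed flag does above, makes repeated benchmark runs draw the same prompts. A small illustration of why this gives reproducibility:

    import random

    random.seed(0)
    first = random.sample(range(1000), 5)
    random.seed(0)          # re-seeding replays the identical draw
    second = random.sample(range(1000), 5)
    assert first == second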

View File

@@ -0,0 +1,293 @@
"""Benchmark offline prioritization."""
import argparse
import json
import random
import time
from typing import List, Optional, Tuple
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
    # Filter out sequences that are too long or too short.
    filtered_dataset: List[Tuple[str, int, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
        # Select an equi-probable random priority
priority = 0 if random.random() < 0.5 else 1
filtered_dataset.append((prompt, prompt_len, output_len, priority))
return filtered_dataset
def run_vllm(
    requests: List[Tuple[str, int, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
disable_log_stats=False,
)
# Add the requests to the engine.
prompts = []
sampling_params = []
priority = []
for prompt, _, output_len, _priority in requests:
prompts.append(prompt)
priority.append(_priority)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
))
start = time.perf_counter()
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
end = time.perf_counter()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        # Synthesize a prompt with the given input length and give each
        # request an equi-probable random priority, matching the dataset path.
        prompt = "hi" * (args.input_len - 1)
        requests = [(prompt, args.input_len, args.output_len,
                     0 if random.random() < 0.5 else 1)
                    for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm":
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
args.quantization, args.tensor_parallel_size,
args.seed, args.n, args.trust_remote_code,
args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching,
args.enable_chunked_prefill,
args.max_num_batched_tokens,
args.gpu_memory_utilization, args.download_dir)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len, priority in requests)
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
default="vllm")
parser.add_argument("--dataset",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=200,
help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager",
action="store_true",
help="enforce eager execution")
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill",
action='store_true',
help="enable chunked prefill for vLLM backend.")
parser.add_argument('--max-num-batched-tokens',
type=int,
default=None,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
main(args)
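The script drives vLLM's priority scheduling by passing a per-request priority list to generate. A minimal sketch of that call, assuming an engine constructed with a priority scheduling policy (the model name and the scheduling_policy engine argument are assumptions here, not taken from this diff):

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m",     # placeholder model
              scheduling_policy="priority")  # assumed engine argument
    prompts = ["Summarize the news.", "Write a haiku."]
    params = [SamplingParams(max_tokens=32) for _ in prompts]
    # One priority value per prompt; the engine orders contended
    # requests by it.
    outputs = llm.generate(prompts, params, priority=[0, 1], use_tqdm=True)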

View File

@@ -1,4 +1,4 @@
-"""Benchmark online serving throughput.
+r"""Benchmark online serving throughput.
 On the server side, run one of the following commands:
     vLLM OpenAI API server
@@ -24,6 +24,8 @@ On the client side, run:
 """
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import random
@@ -31,11 +33,13 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                   RequestFuncOutput)
+from datasets import load_dataset
+from PIL.Image import Image
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
@@ -84,11 +88,9 @@ def sample_sharegpt_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, int, int]]:
-    if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
+) -> List[Tuple[str, int, int, None]]:
     # Load the dataset.
-    with open(dataset_path) as f:
+    with open(dataset_path, encoding='utf-8') as f:
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
@@ -113,13 +115,13 @@ def sample_sharegpt_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
+        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
             # Prune too short sequences.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
             # Prune too long sequences.
             continue
-        filtered_dataset.append((prompt, prompt_len, output_len))
+        filtered_dataset.append((prompt, prompt_len, output_len, None))
     return filtered_dataset
@@ -131,13 +133,13 @@ def sample_sonnet_requests(
     output_len: int,
     prefix_len: int,
     tokenizer: PreTrainedTokenizerBase,
-) -> List[Tuple[str, str, int, int]]:
+) -> List[Tuple[str, str, int, int, None]]:
     assert (
         input_len > prefix_len
     ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
     # Load the dataset.
-    with open(dataset_path) as f:
+    with open(dataset_path, encoding='utf-8') as f:
         poem_lines = f.readlines()
     # Tokenize the poem lines.
@@ -174,9 +176,9 @@ def sample_sonnet_requests(
     # Sample the rest of lines per request.
     sampled_requests: List[Tuple[str, int, int]] = []
     for _ in range(num_requests):
-        sampled_lines = "".join(
-            prefix_lines +
-            random.sample(poem_lines, num_input_lines - num_prefix_lines))
+        num_lines_needed = num_input_lines - num_prefix_lines
+        sampled_lines = "".join(prefix_lines +
+                                random.choices(poem_lines, k=num_lines_needed))
         prompt = f"{base_prompt}{sampled_lines}"
         message = [
@@ -189,7 +191,66 @@ def sample_sonnet_requests(
             message, add_generation_prompt=True, tokenize=False)
         prompt_len = len(tokenizer(prompt_formatted).input_ids)
         sampled_requests.append(
-            (prompt, prompt_formatted, prompt_len, output_len))
+            (prompt, prompt_formatted, prompt_len, output_len, None))
+    return sampled_requests
+
+
+def sample_hf_requests(
+    dataset_path: str,
+    dataset_subset: str,
+    dataset_split: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+    dataset = load_dataset(dataset_path,
+                           name=dataset_subset,
+                           split=dataset_split,
+                           streaming=True)
+    assert "conversations" in dataset.features, (
+        "HF Dataset must have 'conversations' column.")
+    filtered_dataset = dataset.shuffle().filter(
+        lambda x: len(x["conversations"]) >= 2)
+    sampled_requests: List[Tuple[str, int, int, Dict[str,
+                                                     Collection[str]]]] = []
+    for data in filtered_dataset:
+        if len(sampled_requests) == num_requests:
+            break
+        # Tokenize the prompts and completions.
+        prompt = data["conversations"][0]["value"]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = data["conversations"][1]["value"]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
+            # Prune too short sequences.
+            continue
+        if fixed_output_len is None and \
+                (prompt_len > 1024 or prompt_len + output_len > 2048):
+            # Prune too long sequences.
+            continue
+        if "image" in data and isinstance(data["image"], Image):
+            image: Image = data["image"]
+            image = image.convert("RGB")
+            image_data = io.BytesIO()
+            image.save(image_data, format='JPEG')
+            image_base64 = base64.b64encode(
+                image_data.getvalue()).decode("utf-8")
+            mm_content = {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{image_base64}"
+                },
+            }
+        else:
+            mm_content = None
+        sampled_requests.append((prompt, prompt_len, output_len, mm_content))
     return sampled_requests
@@ -223,8 +284,8 @@ def sample_random_requests(
             [(offsets[i] + i + j) % tokenizer.vocab_size
              for j in range(input_lens[i])])
-        input_requests.append(
-            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
+        input_requests.append((prompt, int(prefix_len + input_lens[i]),
+                               int(output_lens[i]), None))
     return input_requests
@@ -330,12 +391,12 @@ async def benchmark(
     input_requests: List[Tuple[str, int, int]],
     logprobs: Optional[int],
     best_of: int,
-    use_beam_search: bool,
     request_rate: float,
     disable_tqdm: bool,
     profile: bool,
     selected_percentile_metrics: List[str],
     selected_percentiles: List[str],
+    ignore_eos: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -343,7 +404,12 @@ async def benchmark(
         raise ValueError(f"Unknown backend: {backend}")
     print("Starting initial single prompt test run...")
-    test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
+        input_requests[0])
+    if backend != "openai-chat" and test_mm_content is not None:
+        # multi-modal benchmark is only available on OpenAI Chat backend.
+        raise ValueError(
+            "Multi-modal content is only supported on 'openai-chat' backend.")
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
@@ -352,7 +418,8 @@ async def benchmark(
         output_len=test_output_len,
         logprobs=logprobs,
         best_of=best_of,
-        use_beam_search=use_beam_search,
+        multi_modal_content=test_mm_content,
+        ignore_eos=ignore_eos,
     )
     test_output = await request_func(request_func_input=test_input)
     if not test_output.success:
@@ -372,7 +439,7 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
+            multi_modal_content=test_mm_content,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -385,7 +452,7 @@ async def benchmark(
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
-        prompt, prompt_len, output_len = request
+        prompt, prompt_len, output_len, mm_content = request
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
@@ -394,7 +461,7 @@ async def benchmark(
             output_len=output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
+            multi_modal_content=mm_content,
         )
         tasks.append(
             asyncio.create_task(
@@ -412,7 +479,6 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -470,7 +536,7 @@ async def benchmark(
         # E.g., "Time to First Token"
         metric_header: str,
     ):
-        # This function print and add statistics of the specified
+        # This function prints and adds statistics of the specified
         # metric.
         if metric_attribute_name not in selected_percentile_metrics:
             return
@@ -556,9 +622,9 @@ def main(args: argparse.Namespace):
             prefix_len=args.sonnet_prefix_len,
             tokenizer=tokenizer,
         )
-        input_requests = [(prompt, prompt_len, output_len)
-                          for prompt, prompt_formatted, prompt_len,
-                          output_len in input_requests]
+        input_requests = [(prompt, prompt_len, output_len, None)
+                          for prompt, prompt_formatted, prompt_len,
+                          output_len, _ in input_requests]
     else:
         assert (
             tokenizer.chat_template or tokenizer.default_chat_template
@@ -571,9 +637,19 @@ def main(args: argparse.Namespace):
             prefix_len=args.sonnet_prefix_len,
             tokenizer=tokenizer,
         )
-        input_requests = [(prompt_formatted, prompt_len, output_len)
-                          for prompt, prompt_formatted, prompt_len,
-                          output_len in input_requests]
+        input_requests = [(prompt_formatted, prompt_len, output_len, None)
+                          for prompt, prompt_formatted, prompt_len,
+                          output_len, _ in input_requests]
+    elif args.dataset_name == "hf":
+        input_requests = sample_hf_requests(
+            dataset_path=args.dataset_path,
+            dataset_subset=args.hf_subset,
+            dataset_split=args.hf_split,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.hf_output_len,
+        )
     elif args.dataset_name == "random":
         input_requests = sample_random_requests(
@@ -598,7 +674,6 @@ def main(args: argparse.Namespace):
         input_requests=input_requests,
         logprobs=args.logprobs,
         best_of=args.best_of,
-        use_beam_search=args.use_beam_search,
         request_rate=args.request_rate,
         disable_tqdm=args.disable_tqdm,
         profile=args.profile,
@@ -606,6 +681,7 @@ def main(args: argparse.Namespace):
         selected_percentiles=[
             float(p) for p in args.metric_percentiles.split(",")
         ],
+        ignore_eos=args.ignore_eos,
     ))
     # Save config and results to json
@@ -619,7 +695,6 @@ def main(args: argparse.Namespace):
         result_json["model_id"] = model_id
         result_json["tokenizer_id"] = tokenizer_id
         result_json["best_of"] = args.best_of
-        result_json["use_beam_search"] = args.use_beam_search
         result_json["num_prompts"] = args.num_prompts
         # Metadata
@@ -647,7 +722,7 @@ def main(args: argparse.Namespace):
             file_name = args.result_filename
         if args.result_dir:
             file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name, "w") as outfile:
+        with open(file_name, "w", encoding='utf-8') as outfile:
             json.dump(result_json, outfile)
@@ -685,13 +760,14 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "sonnet", "random"],
+        choices=["sharegpt", "sonnet", "random", "hf"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument("--dataset-path",
                         type=str,
                         default=None,
-                        help="Path to the dataset.")
+                        help="Path to the sharegpt/sonnet dataset. "
+                        "Or the huggingface dataset ID if using HF dataset.")
     parser.add_argument(
         "--model",
         type=str,
@@ -718,26 +794,6 @@ if __name__ == "__main__":
         default=1000,
         help="Number of prompts to process.",
     )
-    parser.add_argument(
-        "--sharegpt-output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the output length "
-        "from the ShareGPT dataset.")
-    parser.add_argument(
-        "--sonnet-input-len",
-        type=int,
-        default=550,
-        help=
-        "Number of input tokens per request, used only for sonnet dataset.",
-    )
-    parser.add_argument(
-        "--sonnet-output-len",
-        type=int,
-        default=150,
-        help=
-        "Number of output tokens per request, used only for sonnet dataset.",
-    )
     parser.add_argument(
         "--logprobs",
         type=int,
@@ -748,42 +804,6 @@ if __name__ == "__main__":
         "logprob is returned for each token; or (2) if beam search "
         "is enabled 1 logprob per token is computed"),
     )
-    parser.add_argument(
-        "--sonnet-prefix-len",
-        type=int,
-        default=200,
-        help=
-        "Number of prefix tokens per request, used only for sonnet dataset.",
-    )
-    parser.add_argument(
-        "--random-input-len",
-        type=int,
-        default=1024,
-        help=
-        "Number of input tokens per request, used only for random sampling.",
-    )
-    parser.add_argument(
-        "--random-output-len",
-        type=int,
-        default=128,
-        help=
-        "Number of output tokens per request, used only for random sampling.",
-    )
-    parser.add_argument(
-        "--random-range-ratio",
-        type=float,
-        default=1.0,
-        help="Range of sampled ratio of input/output length, "
-        "used only for random sampling.",
-    )
-    parser.add_argument(
-        "--random-prefix-len",
-        type=int,
-        default=0,
-        help="Number of fixed prefix tokens before random "
-        " context. The length range of context in a random "
-        " request is [random-prefix-len, "
-        " random-prefix-len + random-prefix-len * random-range-ratio).")
     parser.add_argument(
         "--request-rate",
         type=float,
@@ -839,6 +859,11 @@ if __name__ == "__main__":
         "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
         " format.",
     )
+    parser.add_argument(
+        "--ignore-eos",
+        action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request."
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
     parser.add_argument(
         "--percentile-metrics",
         type=str,
@@ -857,5 +882,85 @@ if __name__ == "__main__":
         "Use \"--percentile-metrics\" to select metrics.",
     )
+    # group for dataset specific arguments
+    sonnet_group = parser.add_argument_group("sonnet dataset options")
+    sonnet_group.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help=
+        "Number of input tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help=
+        "Number of output tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help=
+        "Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+
+    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
+    sharegpt_group.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.")
+
+    random_group = parser.add_argument_group("random dataset options")
+    random_group.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help=
+        "Number of input tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help=
+        "Number of output tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-prefix-len",
+        type=int,
+        default=0,
+        help="Number of fixed prefix tokens before random "
+        " context. The length range of context in a random "
+        " request is [random-prefix-len, "
+        " random-prefix-len + random-prefix-len * random-range-ratio).")
+
+    hf_group = parser.add_argument_group("hf dataset options")
+    hf_group.add_argument("--hf-subset",
+                          type=str,
+                          default=None,
+                          help="Subset of the HF dataset.")
+    hf_group.add_argument("--hf-split",
+                          type=str,
+                          default=None,
+                          help="Split of the HF dataset.")
+    hf_group.add_argument(
+        "--hf-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output lengths "
+        "from the sampled HF dataset.",
+    )
+
     args = parser.parse_args()
     main(args)
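sample_hf_requests embeds images as base64 data URLs inside OpenAI-style image_url content. A standalone sketch of that encoding step ('example.jpg' is a placeholder path):

    import base64
    import io

    from PIL import Image

    image = Image.open("example.jpg").convert("RGB")  # placeholder file
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    mm_content = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
    }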

View File

@@ -15,6 +15,7 @@ from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@@ -72,7 +73,6 @@ def run_vllm(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -125,16 +125,33 @@ def run_vllm(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
-    start = time.perf_counter()
-    llm.generate(prompts, sampling_params, use_tqdm=True)
-    end = time.perf_counter()
+    use_beam_search = False
+
+    if not use_beam_search:
+        start = time.perf_counter()
+        llm.generate(prompts, sampling_params, use_tqdm=True)
+        end = time.perf_counter()
+    else:
+        prompts = [prompt for prompt, _, _ in requests]
+        # output_len should be the same for all requests.
+        output_len = requests[0][2]
+        for prompt, input_len, _output_len in requests:
+            assert _output_len == output_len
+        start = time.perf_counter()
+        llm.beam_search(
+            prompts,
+            BeamSearchParams(
+                beam_width=n,
+                max_tokens=output_len,
+                ignore_eos=True,
+            ))
+        end = time.perf_counter()
     return end - start
@@ -146,7 +163,6 @@ async def run_vllm_async(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -191,7 +207,6 @@ async def run_vllm_async(
         use_v2_block_manager=use_v2_block_manager,
         disable_async_output_proc=disable_async_output_proc,
         worker_use_ray=False,
-        engine_use_ray=False,
         disable_log_requests=True,
     )
@@ -206,9 +221,8 @@ async def run_vllm_async(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
@@ -230,11 +244,9 @@ def run_hf(
     model: str,
     tokenizer: PreTrainedTokenizerBase,
     n: int,
-    use_beam_search: bool,
     max_batch_size: int,
     trust_remote_code: bool,
 ) -> float:
-    assert not use_beam_search
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
@@ -266,7 +278,7 @@ def run_hf(
             padding=True).input_ids
         llm_outputs = llm.generate(
             input_ids=input_ids.cuda(),
-            do_sample=not use_beam_search,
+            do_sample=True,
             num_return_sequences=n,
             temperature=1.0,
             top_p=1.0,
@@ -322,7 +334,7 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         run_args = [
             requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.tensor_parallel_size, args.seed, args.n,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
@@ -341,8 +353,7 @@ def main(args: argparse.Namespace):
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
@@ -396,7 +407,6 @@ if __name__ == "__main__":
                         type=int,
                         default=1,
                         help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument("--num-prompts",
                         type=int,
                         default=1000,
@@ -463,6 +473,7 @@ if __name__ == "__main__":
         help="Maximum number of forward steps per scheduler call.")
     parser.add_argument("--use-v2-block-manager",
                         action='store_true',
+                        default=EngineArgs.use_v2_block_manager,
                         help="Enable block manager v2.")
     parser.add_argument(
         "--enable-prefix-caching",
@@ -551,8 +562,6 @@ if __name__ == "__main__":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
-        if args.use_beam_search:
-            raise ValueError("Beam search is not supported for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
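With use_beam_search removed from SamplingParams, beam search goes through the dedicated beam_search entry point shown in the diff above. A minimal sketch of that call (the model name and prompt are placeholders):

    from vllm import LLM
    from vllm.sampling_params import BeamSearchParams

    llm = LLM(model="facebook/opt-125m")  # placeholder model
    outputs = llm.beam_search(
        ["The capital of France is"],
        BeamSearchParams(beam_width=4, max_tokens=32, ignore_eos=True),
    )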

View File

@@ -1,10 +1,10 @@
-import random
 import time
 import torch
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)
 @torch.inference_mode()
@@ -16,10 +16,7 @@ def main(num_tokens: int,
          do_profile: bool = False,
          num_warmup_iters: int = 5,
          num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device("cuda")
     layer = RMSNorm(hidden_size).to(dtype=dtype)
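Several benchmarks in this set swap three or four manual seeding calls for a single vllm.utils.seed_everything. A sketch of what such a helper consolidates, based on the calls it replaces here (the actual helper may also seed NumPy or differ in detail):

    import random

    import torch

    def seed_everything(seed: int) -> None:
        # Consolidates the per-library seeding the old code did inline.
        random.seed(seed)
        torch.random.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)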

View File

@@ -4,8 +4,10 @@ import itertools
 import math
 import pickle as pkl
 import time
-from typing import Callable, Iterable, List, Tuple
+from itertools import product
+from typing import Callable, Iterable, List, Optional, Tuple
+
+import pandas as pd
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
@@ -84,6 +86,10 @@ def loop_over_weights(
         fn(a, w_ref, w_q, w_s)
+_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+
+
 def bench(atype: torch.dtype,
           wtype: ScalarType,
           group_size: int,
@@ -94,6 +100,8 @@ def bench(atype: torch.dtype,
           sub_label: str,
           benchmark_marlinv1: bool = True,
           sweep_schedules: bool = True) -> Iterable[TMeasurement]:
+    global _SWEEP_SCHEDULES_RESULTS
+
     a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
     sub_label += f", L={len(weights)}"
@@ -163,6 +171,11 @@ def bench(atype: torch.dtype,
         best_schedule = None
         schedules = ops.machete_supported_schedules(wtype)
         for schedule in reversed(schedules):
+            schedule_M = int(schedule.split("_")[0].split("x")[1])
+
+            # Prune known bad schedules
+            if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
+                continue
             def run(a, _, w_q, w_s, schedule=schedule):
                 ops.machete_gemm(a,
@@ -175,6 +188,20 @@ def bench(atype: torch.dtype,
             res = bench_fn(label, sub_label, "machete_best",
                            lambda: loop_over_weights(a, weights_machete, run))
+            results_row = {
+                "M": m,
+                "K": k,
+                "N": n,
+                "group_size": group_size,
+                "schedule": schedule,
+                "median": res.median,
+            }
+            if _SWEEP_SCHEDULES_RESULTS is None:
+                _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(
+                    columns=results_row.keys())
+            _SWEEP_SCHEDULES_RESULTS.\
+                loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
+
             print(f" {res.median:5.5} ", schedule)
             if not best or res.median < best.median:
                 best = res
@@ -235,18 +262,22 @@ def run_square_bench(args):
     dim_sizes = list(
         range(args.dim_start, args.dim_end + 1, args.dim_increment))
     MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
     data = run(args.dtype, args.sweep_schedules, MKNs)
     make_output(data, MKNs, f"square_bench-{args.dtype}")
 def run_range_bench(args):
-    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
-    n = len(dim_sizes)
-    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
-    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
-    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
-    MKNs = list(zip(Ms, Ks, Ns))
+    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
+    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_increment, k_increment, n_increment = \
+        [int(x) for x in args.dim_increment.split(",")]
+    Ms = list(range(m_start, m_end + 1, m_increment))
+    Ks = list(range(k_start, k_end + 1, k_increment))
+    Ns = list(range(n_start, n_end + 1, n_increment))
+    MKNs = list(product(Ms, Ks, Ns))
     data = run(args.dtype, args.sweep_schedules, MKNs)
     make_output(data, MKNs, f"range_bench-{args.dtype}")
@@ -333,6 +364,9 @@ Benchmark Machete GEMM.
         action="store_true",
         help="Run a sweep over all supported schedules",
     )
+    parser.add_argument("--sweep-csv-out",
+                        help="CSV to store sweep results",
+                        default="sch_sweep_results.csv")
     subparsers = parser.add_subparsers(dest="cmd", required=True)
     square_parser = subparsers.add_parser("square_bench")
@@ -342,12 +376,21 @@ Benchmark Machete GEMM.
     square_parser.set_defaults(func=run_square_bench)
     range_parser = subparsers.add_parser("range_bench")
-    range_parser.add_argument("--dim-start", type=int, required=True)
-    range_parser.add_argument("--dim-end", type=int, required=True)
-    range_parser.add_argument("--dim-increment", type=int, required=True)
-    range_parser.add_argument("--m-constant", type=int, default=None)
-    range_parser.add_argument("--n-constant", type=int, default=None)
-    range_parser.add_argument("--k-constant", type=int, default=None)
+    range_parser.add_argument(
+        "--dim-start",
+        type=str,
+        required=True,
+        help="Start value for M,K,N as comma-separated list")
+    range_parser.add_argument(
+        "--dim-end",
+        type=str,
+        required=True,
+        help="End value (inclusive) for M,K,N as comma-separated list")
+    range_parser.add_argument(
+        "--dim-increment",
+        type=str,
+        required=True,
+        help="Increment value for M,K,N as comma-separated list")
     range_parser.set_defaults(func=run_range_bench)
     model_parser = subparsers.add_parser("model_bench")
@@ -369,4 +412,9 @@ Benchmark Machete GEMM.
     model_parser.set_defaults(func=run_model_bench)
     args = parser.parse_args()
+
+    _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
     args.func(args)
+
+    if _SWEEP_SCHEDULES_RESULTS is not None:
+        _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
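range_bench now sweeps M, K and N independently instead of moving them in lockstep; itertools.product expands the three ranges into every combination. A small illustration:

    from itertools import product

    Ms = [16, 32]
    Ks = [1024]
    Ns = [2048, 4096]
    MKNs = list(product(Ms, Ks, Ns))
    # [(16, 1024, 2048), (16, 1024, 4096),
    #  (32, 1024, 2048), (32, 1024, 4096)]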

View File

@@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig
 from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything
 class BenchmarkConfig(TypedDict):
@@ -166,7 +166,7 @@ class BenchmarkWorker:
     def __init__(self, seed: int) -> None:
         torch.set_default_device("cuda")
-        torch.cuda.manual_seed_all(seed)
+        seed_everything(seed)
         self.seed = seed
     def benchmark(
@@ -180,7 +180,7 @@ class BenchmarkWorker:
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
     ) -> Tuple[Dict[str, int], float]:
-        torch.cuda.manual_seed_all(self.seed)
+        seed_everything(self.seed)
         dtype_str = get_config_dtype_str(dtype,
                                          use_int8_w8a16=use_int8_w8a16,
                                          use_fp8_w8a8=use_fp8_w8a8)

View File

@@ -6,7 +6,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        create_kv_caches_with_random)
+                        create_kv_caches_with_random, seed_everything)
 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@@ -28,10 +28,7 @@ def main(
     device: str = "cuda",
     kv_cache_dtype: Optional[str] = None,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     scale = float(1.0 / (head_size**0.5))
     query = torch.empty(num_seqs,

View File

@@ -1,10 +1,10 @@
-import random
 import time
 import torch
 from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)
 @torch.inference_mode()
@@ -17,10 +17,7 @@ def main(num_tokens: int,
          do_profile: bool = False,
          num_warmup_iters: int = 5,
          num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device("cuda")
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)

View File

@@ -6,7 +6,7 @@ import torch
 from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                          get_rope)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything
 def benchmark_rope_kernels_multi_lora(
@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
     max_position: int = 8192,
     base: int = 10000,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     if rotary_dim is None:
         rotary_dim = head_size

View File

@@ -45,8 +45,7 @@ if __name__ == "__main__":
     rows = int(math.ceil(len(results) / 2))
     fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
     axs = axs.flatten()
-    axs_idx = 0
-    for shape, data in results.items():
+    for axs_idx, (shape, data) in enumerate(results.items()):
         plt.sca(axs[axs_idx])
         df = pd.DataFrame(data)
         sns.lineplot(data=df,
@@ -59,6 +58,5 @@ if __name__ == "__main__":
                      palette="Dark2")
         plt.title(f"Shape: {shape}")
         plt.ylabel("time (median, s)")
-        axs_idx += 1
     plt.tight_layout()
     plt.savefig("graph_machete_bench.pdf")
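The plotting loop drops its hand-maintained counter in favor of enumerate, which yields the index alongside each item:

    results = {"shape_a": [1, 2], "shape_b": [3, 4]}
    for axs_idx, (shape, data) in enumerate(results.items()):
        print(axs_idx, shape, data)  # 0 shape_a [1, 2], then 1 shape_b [3, 4]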

View File

@@ -0,0 +1 @@
pandas

View File

@@ -84,7 +84,12 @@ endif()
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
-list(APPEND LIBS dnnl numa)
+list(APPEND LIBS numa)
+
+# Append the dnnl library for AVX2 and AVX512, as it is not used by the
+# Power architecture.
+if (AVX2_FOUND OR AVX512_FOUND)
+    list(APPEND LIBS dnnl)
+endif()
 #
 # _C extension
@@ -120,4 +125,3 @@ define_gpu_extension_target(
 )
 message(STATUS "Enabling C extension.")
-add_dependencies(default _C)

View File

@@ -133,10 +133,181 @@ macro(string_to_ver OUT_VER IN_STR)
   string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
 endmacro()
+#
+# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
+# `CUDA_ARCH_FLAGS`.
+#
+# Example:
+#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
+#   clear_cuda_arches(CUDA_ARCH_FLAGS)
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
+#   CMAKE_CUDA_FLAGS="-Wall"
+#
+macro(clear_cuda_arches CUDA_ARCH_FLAGS)
+  # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
+  string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
+    ${CMAKE_CUDA_FLAGS})
+
+  # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be
+  # modified and passed back via the `CUDA_ARCHITECTURES` property.
+  string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
+    ${CMAKE_CUDA_FLAGS})
+endmacro()
+
+#
+# Extract unique CUDA architectures from a list of compute capability codes in
+# the form `<major><minor>[<letter>]`, convert them to the form
+# `<major>.<minor>`, dedupe them, sort them in ascending order, and store them
+# in `OUT_ARCHES`.
+#
+# Example:
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
+#   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
+#   OUT_ARCHES="7.5;...;9.0"
+function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
+  set(_CUDA_ARCHES)
+  foreach(_ARCH ${CUDA_ARCH_FLAGS})
+    string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
+    if (_COMPUTE)
+      set(_COMPUTE ${CMAKE_MATCH_1})
+    endif()
+
+    string_to_ver(_COMPUTE_VER ${_COMPUTE})
+    list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
+  endforeach()
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHES)
+  list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
+  set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
+endfunction()
+#
+# For a specific file set the `-gencode` flag in compile options conditionally
+# for the CUDA language.
+#
+# Example:
+#   set_gencode_flag_for_srcs(
+#     SRCS "foo.cu"
+#     ARCH "compute_75"
+#     CODE "sm_75")
+#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
+#   `foo.cu` (only for the CUDA language).
+#
+macro(set_gencode_flag_for_srcs)
+  set(options)
+  set(oneValueArgs ARCH CODE)
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+  set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
+  set_property(
+    SOURCE ${arg_SRCS}
+    APPEND PROPERTY
+    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
+  )
+
+  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
+endmacro(set_gencode_flag_for_srcs)
+
+#
+# For a list of source files set the `-gencode` flags in the file-specific
+# compile options (specifically for the CUDA language).
+#
+# Arguments are:
+#  SRCS: list of source files
+#  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
+#  BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
+#    for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in
+#    CUDA_ARCHS that is larger than BUILD_PTX_FOR_ARCH.
+#
+macro(set_gencode_flags_for_srcs)
+  set(options)
+  set(oneValueArgs BUILD_PTX_FOR_ARCH)
+  set(multiValueArgs SRCS CUDA_ARCHS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  foreach(_ARCH ${arg_CUDA_ARCHS})
+    string(REPLACE "." "" _ARCH "${_ARCH}")
+    set_gencode_flag_for_srcs(
+      SRCS ${arg_SRCS}
+      ARCH "compute_${_ARCH}"
+      CODE "sm_${_ARCH}")
+  endforeach()
+
+  if (${arg_BUILD_PTX_FOR_ARCH})
+    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
+    if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
+      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_PTX_ARCH}"
+        CODE "compute_${_PTX_ARCH}")
+    endif()
+  endif()
+endmacro()
+
+#
+# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
+# `<major>.<minor>[letter]` compute the "loose intersection" with the
+# `TGT_CUDA_ARCHS` list of gencodes.
+# The loose intersection is defined as:
+#   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
+# where `<=` is the version comparison operator.
+# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
+# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
+# We have special handling for 9.0a: if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
+# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
+# 9.0a to the result.
+# The result is stored in `OUT_CUDA_ARCHS`.
+#
+# Example:
+#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
+#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
+#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+#   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
+#
+function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+  list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
+
+  # If 9.0a is in SRC_CUDA_ARCHS and 9.0 is in TGT_CUDA_ARCHS then we should
+  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS.
+  set(_CUDA_ARCHS)
+  if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
+    list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
+    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+      set(_CUDA_ARCHS "9.0a")
+    endif()
+  endif()
+
+  list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
+  # For each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS
+  # that is less or equal to ARCH.
+  foreach(_ARCH ${TGT_CUDA_ARCHS})
+    set(_TMP_ARCH)
+    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
+      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
+        set(_TMP_ARCH ${_SRC_ARCH})
+      else()
+        break()
+      endif()
+    endforeach()
+    if (_TMP_ARCH)
+      list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
+    endif()
+  endforeach()
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHS)
+  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
+endfunction()
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`.
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
+# the architectures on a per file basis.
 #
 # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
 #
@@ -174,109 +345,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
       "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
       " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
     endif()
elseif(${GPU_LANG} STREQUAL "CUDA")
#
# Setup/process CUDA arch flags.
#
# The torch cmake setup hardcodes the detected architecture flags in
# `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
# can't be modified on a per-target basis.
# So, all the `-gencode` flags need to be extracted and removed from
# `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
# Since it's not possible to use `target_compile_options` for adding target
# specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
# must be used instead. This requires repackaging the architecture flags
# into a format that cmake expects for `CUDA_ARCHITECTURES`.
#
# This is a bit fragile in that it depends on torch using `-gencode` as opposed
# to one of the other nvcc options to specify architectures.
#
# Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
# detected architectures.
#
message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
# Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
${CMAKE_CUDA_FLAGS})
# Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
# and passed back via the `CUDA_ARCHITECTURES` property.
string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
${CMAKE_CUDA_FLAGS})
# If this error is triggered, it might mean that torch has changed how it sets
# up nvcc architecture code generation flags.
if (NOT _CUDA_ARCH_FLAGS)
message(FATAL_ERROR
"Could not find any architecture related code generation flags in "
"CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
endif()
message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")
# Initialize the architecture lists to empty.
set(${GPU_ARCHES})
# Process each `gencode` flag.
foreach(_ARCH ${_CUDA_ARCH_FLAGS})
# For each flag, extract the version number and whether it refers to PTX
# or native code.
# Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
# for that match.
string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
if (_COMPUTE)
set(_COMPUTE ${CMAKE_MATCH_1})
endif()
string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
if (_SM)
set(_SM ${CMAKE_MATCH_1})
endif()
string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
if (_CODE)
set(_CODE ${CMAKE_MATCH_1})
endif()
# Make sure the virtual architecture can be matched.
if (NOT _COMPUTE)
message(FATAL_ERROR
"Could not determine virtual architecture from: ${_ARCH}.")
endif()
# One of sm_ or compute_ must exist.
if ((NOT _SM) AND (NOT _CODE))
message(FATAL_ERROR
"Could not determine a codegen architecture from: ${_ARCH}.")
endif()
if (_SM)
# The -real suffix makes CMake generate only ELF (SASS) code for the kernels;
# we want this, since the PTX added by default would increase binary size.
set(_VIRT "-real")
set(_CODE_ARCH ${_SM})
else()
# The -virtual suffix makes CMake generate PTX code for the kernels.
set(_VIRT "-virtual")
set(_CODE_ARCH ${_CODE})
endif()
# Check if the current version is in the supported arch list.
string_to_ver(_CODE_VER ${_CODE_ARCH})
if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
message(STATUS "discarding unsupported CUDA arch ${_VER}.")
continue()
endif()
# Add it to the arch list.
list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
endforeach()
  endif()

  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
endmacro()
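As a rough illustration of the flag surgery performed by the CUDA branch removed above, the following Python sketch (a hypothetical flag string, not vLLM code) extracts -gencode flags with the same regexes and repackages them in CUDA_ARCHITECTURES notation:

import re

# Hypothetical CMAKE_CUDA_FLAGS as torch might populate them.
flags = ("-O3 -gencode arch=compute_80,code=sm_80 "
         "-gencode arch=compute_90,code=compute_90")

arch_flags = re.findall(r"-gencode arch=[^ ]+", flags)
remaining = re.sub(r"-gencode arch=[^ ]+ *", "", flags)

cuda_architectures = []
for f in arch_flags:
    sm = re.search(r"code=sm_([0-9]+a?)", f)
    code = re.search(r"code=compute_([0-9]+a?)", f)
    # sm_XX -> "XX-real" (ELF/SASS only); compute_XX -> "XX-virtual" (PTX only)
    cuda_architectures.append(f"{sm.group(1)}-real" if sm
                              else f"{code.group(1)}-virtual")

print(remaining.strip())   # -O3
print(cuda_architectures)  # ['80-real', '90-virtual']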
#
@@ -350,18 +419,19 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
                             ${GPU_INCLUDE_DIRECTORIES})

  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})

  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (GPU_LANGUAGE STREQUAL "CUDA")
    if ("${CUDA_CUDA_LIB}" STREQUAL "")
      set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
    endif()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
                          ${CUDA_LIBRARIES})
  else()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()

  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
endfunction()


@@ -267,13 +267,16 @@ def get_neuron_sdk_version(run_lambda):

def get_vllm_version():
    from vllm import __version__, __version_tuple__

    if __version__ == "dev":
        return "N/A (dev)"

    if len(__version_tuple__) == 4:  # dev build
        git_sha = __version_tuple__[-1][1:]  # type: ignore
        return f"{__version__} (git sha: {git_sha})"

    return __version__
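For context, the new logic assumes a setuptools-scm style version tuple whose last element carries a g-prefixed git sha on dev builds; the tuple below is a made-up illustration of that assumed shape, not taken from vLLM:

# Hypothetical dev-build metadata in the shape the code above expects.
__version__ = "0.6.3.dev173"
__version_tuple__ = (0, 6, 3, "g1a2b3c4")  # assumed shape; len == 4 marks a dev build

git_sha = __version_tuple__[-1][1:]  # strip the leading "g"
print(f"{__version__} (git sha: {git_sha})")  # 0.6.3.dev173 (git sha: 1a2b3c4)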
def summarize_vllm_build_flags():
    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
@@ -285,9 +288,14 @@ def summarize_vllm_build_flags():
def get_gpu_topo(run_lambda):
    output = None

    if get_platform() == 'linux':
        output = run_and_read_all(run_lambda, 'nvidia-smi topo -m')
        if output is None:
            output = run_and_read_all(run_lambda, 'rocm-smi --showtopo')

    return output
# example outputs of CPU infos

csrc/core/exception.hpp Normal file

@@ -0,0 +1,3 @@
#pragma once
#define VLLM_IMPLIES(p, q) (!(p) || (q))
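The macro encodes material implication, p -> q == !p || q; a quick Python sanity check of the identity it relies on:

def implies(p: bool, q: bool) -> bool:
    # Material implication is false only when p holds and q does not.
    return (not p) or q

assert implies(False, False) and implies(False, True) and implies(True, True)
assert not implies(True, False)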


@@ -12,6 +12,11 @@
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)

// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
  TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)

// REGISTER_EXTENSION allows the shared library to be loaded and initialized
// via python's import statement.
#define REGISTER_EXTENSION(NAME) \


@@ -257,11 +257,13 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
// static-per-tensor quantization.
void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                              const torch::Tensor& input,  // [..., hidden_size]
                              const torch::Tensor& scale,
                              c10::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scale.numel() == 1);
  TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");

  const int hidden_size = input.size(-1);
  const int num_tokens = input.numel() / hidden_size;
@@ -277,11 +279,12 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
void dynamic_scaled_int8_quant(
    torch::Tensor& out,          // [..., hidden_size]
    const torch::Tensor& input,  // [..., hidden_size]
    torch::Tensor& scale,        // [..., 1]
    c10::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");

  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;


@@ -94,13 +94,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#ifdef __AVX512F__
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
      "Tensor? azp) -> ()");
  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

  // Compute int8 quantized tensor and scaling factor
  ops.def(
      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
      "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);

  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
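For intuition, here is a reference sketch of the two quantization flavors these schemas register. The semantics are assumed from the standard symmetric int8 scheme (names are illustrative; azp stays unused, matching the CPU restriction above):

import numpy as np

def static_scaled_int8_quant_ref(x: np.ndarray, scale: float) -> np.ndarray:
    # Static: the scale is precomputed; round to nearest, saturate to int8.
    return np.clip(np.rint(x / scale), -128, 127).astype(np.int8)

def dynamic_scaled_int8_quant_ref(x: np.ndarray):
    # Dynamic: derive a per-token scale from the current tensor (amax / 127),
    # matching the "Tensor! scale" output of shape [..., 1] in the schema.
    scale = np.abs(x).max(axis=-1, keepdims=True) / 127.0
    return np.clip(np.rint(x / scale), -128, 127).astype(np.int8), scale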


@@ -55,18 +55,6 @@ bool _is_weak_contiguous(torch::Tensor& t) {
                            t.numel() * t.element_size());
}
bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
bool full_nvlink) {
auto inp_size = inp.numel() * inp.element_size();
  // custom allreduce requires the input byte size to be a multiple of 16
if (inp_size % 16 != 0) return false;
if (!_is_weak_contiguous(inp)) return false;
if (world_size == 2 || full_nvlink) return inp_size <= max_size;
// for 4 or more non NVLink-capable GPUs, custom allreduce provides little
// performance improvement over NCCL.
return false;
}
void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                 cudaStream_t stream) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);


@@ -6,6 +6,7 @@
#include <cuda_runtime.h>

#include <iostream>
#include <array>
#include <limits>
#include <map>
#include <unordered_map>
@@ -23,17 +24,23 @@
namespace vllm {

constexpr int kMaxBlocks = 36;

// Counter may overflow, but it's fine since unsigned int overflow is
// well-defined behavior.
using FlagType = uint32_t;

struct Signal {
  alignas(128) FlagType self_counter[kMaxBlocks][8];
  // Two sets of peer counters are needed for two syncs: it is possible for a
  // peer GPU block to arrive at the second sync point while the current GPU
  // block has not yet passed the first sync point. The peer GPU may then
  // write counter+1 while the current GPU is still busy-waiting on counter,
  // so we alternate between the two counter arrays to avoid this race.
  alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};

struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };

struct __align__(16) RankSignals { Signal* signals[8]; };

// like std::array, but aligned
template <typename T, int sz>
@@ -123,47 +130,71 @@ DINLINE O downcast(array_t<float, O::size> val) {
  }
}
static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
               "l"(flag_addr));
#else
  asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
               "l"(flag_addr));
#endif
}

static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
  FlagType flag;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
               : "=r"(flag)
               : "l"(flag_addr));
#else
  asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
               : "=r"(flag)
               : "l"(flag_addr));
#endif
  return flag;
}

static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) {
  asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
}

static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
  FlagType flag;
  asm volatile("ld.volatile.global.u32 %0, [%1];"
               : "=r"(flag)
               : "l"(flag_addr));
  return flag;
}

// is_start: whether this is the very first synchronization barrier.
// need_fence: whether a memory fence is needed. If true, a release-acquire
// semantic is used to enforce memory access order before and after this
// barrier.
template <int ngpus, bool is_start, bool need_fence = false>
DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
                               int rank) {
  if constexpr (!is_start) __syncthreads();
  static_assert(
      !(is_start && need_fence));  // Start barrier shouldn't need fence.
  if (threadIdx.x < ngpus) {
    // Increment the counter. Technically we only need one counter, but we use
    // multiple per block to eliminate the need to share the counter via smem.
    auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
    // Write the expected counter value to peer and wait for correct value
    // from peer.
    auto peer_counter_ptr =
        &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
    auto self_counter_ptr =
        &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
    if constexpr (need_fence) {
      st_flag_release(peer_counter_ptr, val);
      while (ld_flag_acquire(self_counter_ptr) != val);
    } else {
      st_flag_volatile(peer_counter_ptr, val);
      while (ld_flag_volatile(self_counter_ptr) != val);
    }
  }
  if constexpr (is_start || need_fence) __syncthreads();
}
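The alternating-counter trick above is subtle, so here is a minimal Python analogy (threads standing in for GPU blocks, plain lists for the signal buffers; none of CUDA's memory-ordering concerns are modeled) showing why the two mailbox sets indexed by val % 2 keep consecutive barriers from clobbering each other:

import threading

NUM_RANKS = 2
ITERS = 100

# mailbox[phase][dst][src]: counter value that rank `src` posted to rank `dst`.
mailbox = [[[0] * NUM_RANKS for _ in range(NUM_RANKS)] for _ in range(2)]
self_counter = [0] * NUM_RANKS

def barrier(rank: int) -> None:
    self_counter[rank] += 1
    val = self_counter[rank]
    phase = val % 2  # barrier N and barrier N+1 use different mailbox sets
    for peer in range(NUM_RANKS):
        mailbox[phase][peer][rank] = val          # post expected value to each peer
    for peer in range(NUM_RANKS):
        while mailbox[phase][rank][peer] != val:  # spin until each peer arrives
            pass

def worker(rank: int) -> None:
    for _ in range(ITERS):
        barrier(rank)

threads = [threading.Thread(target=worker, args=(r,)) for r in range(NUM_RANKS)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(f"{NUM_RANKS} ranks completed {ITERS} barriers")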
template <typename P, int ngpus, typename A>
@@ -178,33 +209,31 @@ DINLINE P packed_reduce(const P* ptrs[], int idx) {
template <typename T, int ngpus>
__global__ void __launch_bounds__(512, 1)
    cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg,
                               T* __restrict__ result, int rank, int size) {
  using P = typename packed_t<T>::P;
  using A = typename packed_t<T>::A;
  // note: we don't reorder the address so the accumulation order is the same
  // for all ranks, ensuring bitwise identical results
  auto dp = *_dp;
  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
  // do the actual reduction
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
  }
  multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
}
template <typename P>
DINLINE P* get_tmp_buf(Signal* sg) {
  return (P*)(((Signal*)sg) + 1);
}
template <typename T, int ngpus>
__global__ void __launch_bounds__(512, 1)
    cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg,
                               T* __restrict__ result, int rank, int size) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  using P = typename packed_t<T>::P;
@@ -222,12 +251,12 @@ __global__ void __launch_bounds__(512, 1)
    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
  }
  auto tmp_out = tmps[0];
  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);

  // stage 1: reduce scatter
  for (int idx = start + tid; idx < end; idx += stride) {
    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
  }
  multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);

  // stage 2: allgather. Note: it's important to match the tid between
  // the two stages, because visibility across devices is only guaranteed
@@ -437,6 +466,8 @@ class CustomAllreduce {
#define KL(ngpus, name)                                                       \
  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                 rank_, size);

  // TODO(hanzhi713): Threshold is different for A100 and H100.
  // Add per device threshold.

#define REDUCE_CASE(ngpus)                                                    \
  case ngpus: {                                                               \
    if (world_size_ == 2) {                                                   \


@@ -1,15 +1,15 @@
/**
 * This is a standalone test for custom allreduce.
 * To compile, make sure you have MPI and NCCL installed on your system.
 * export MPI_HOME=xxx
 * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
 * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
 *
 * Warning: this C++ test is not designed to be very readable and was used
 * during the rapid prototyping process.
 *
 * To run:
 * mpirun --allow-run-as-root -np 8 ./custom_all_reduce_test
 */
#include <cuda.h>
#include <curand_kernel.h>
@@ -44,7 +44,14 @@
  } while (0)

__global__ void dummy_kernel() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  for (int i = 0; i < 100; i++) __nanosleep(1000000);  // 100ms
#else
  for (int i = 0; i < 100; i++) {
    long long int start = clock64();
    while (clock64() - start < 150000000);  // approximately 98.4ms on P40
  }
#endif
}
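For reference, the fallback constant is plain cycle arithmetic: at a GPU clock of roughly 1.52 GHz, 150,000,000 cycles / 1.52e9 Hz ≈ 98.5 ms, which is where the comment's "approximately 98.4ms on P40" figure comes from (the exact number depends on the actual boost clock).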
template <typename T>
@@ -302,15 +309,19 @@ int main(int argc, char** argv) {
  bool performance_test = true;
  cudaProfilerStart();
  // Uncomment to scan through different block size configs.
  // for (int threads : {256, 512, 1024}) {
  //   for (int block_limit = 16; block_limit < 112; block_limit += 4) {
  //     run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
  //     performance_test);
  //   }
  // }
  // Scan through different sizes to test performance.
  for (int sz = 512; sz <= (8 << 20); sz *= 2) {
    run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
  }

  cudaProfilerStop();
  MPICHECK(MPI_Finalize());
  return EXIT_SUCCESS;
}


@@ -68,7 +68,13 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
name, ".stride(", idx, ") to be ", StrideEle::value); name, ".stride(", idx, ") to be ", StrideEle::value);
return StrideEle{}; return StrideEle{};
} else { } else {
return tensor.stride(idx); if (tensor.size(idx) == 1) {
// use 0 stride for dim with size 1, this is easier for
// cute/cutlass to optimize (helps the TMA code flatten dims)
return StrideEle{0};
} else {
return tensor.stride(idx);
}
} }
} else { } else {
// Extra strides are assumed to be 0 or 1 // Extra strides are assumed to be 0 or 1
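The size-1/stride-0 canonicalization above has a handy NumPy analogy (a sketch of the layout argument, not cutlass semantics): a dimension of extent 1 never uses its stride to advance, so 0 describes the same memory layout:

import numpy as np
from numpy.lib.stride_tricks import as_strided

x = np.arange(6, dtype=np.float32).reshape(1, 6)  # leading dim has size 1
# The stride of a size-1 dim is never used to step, so 0 is an equally
# valid choice and gives downstream code one canonical form to match on.
y = as_strided(x, shape=x.shape, strides=(0, x.strides[1]))
assert (x == y).all()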


@@ -39,8 +39,6 @@
template<typename input_t, typename weight_t>
void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);

template <typename input_t, typename weight_t>
void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);

template<typename input_t, typename weight_t>
void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream);
@@ -55,8 +53,11 @@ void set_conv_params_fwd(ConvParamsBase &params,
                         const at::Tensor x,
                         const at::Tensor weight,
                         const at::Tensor out,
                         const c10::optional<at::Tensor>& bias,
                         bool silu_activation,
                         const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
                         const c10::optional<at::Tensor>& cache_indices = std::nullopt,
                         const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {

    // Reset the parameters
    memset(&params, 0, sizeof(params));
@@ -71,26 +72,31 @@ void set_conv_params_fwd(ConvParamsBase &params,
    // Set the pointers and strides.
    params.x_ptr = x.data_ptr();
    params.weight_ptr = weight.data_ptr();
    params.bias_ptr = bias.has_value() ? bias.value().data_ptr() : nullptr;
    params.out_ptr = out.data_ptr();
    // All strides are in elements, not bytes.
    params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
    params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
    params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
    const bool varlen = params.query_start_loc_ptr != nullptr;
    params.x_batch_stride = x.stride(varlen ? 1 : 0);
    params.x_c_stride = x.stride(varlen ? 0 : 1);
    params.x_l_stride = x.stride(varlen ? 1 : -1);
    params.weight_c_stride = weight.stride(0);
    params.weight_width_stride = weight.stride(1);
    params.out_batch_stride = out.stride(varlen ? 1 : 0);
    params.out_c_stride = out.stride(varlen ? 0 : 1);
    params.out_l_stride = out.stride(varlen ? 1 : -1);
}
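To see why the stride indices flip in the varlen case, here is a small NumPy sketch of the two layouts (shapes illustrative): batched input is (batch, dim, seqlen), while varlen input concatenates all sequences into (dim, total_seqlen) and locates batch b via query_start_loc:

import numpy as np

dim, seqlens = 4, [5, 2, 7]

# varlen layout: x is (dim, total_seqlen); there is no batch axis.
x_varlen = np.zeros((dim, sum(seqlens)), dtype=np.float32)
query_start_loc = np.concatenate(([0], np.cumsum(seqlens)))  # shape (batch + 1,)

# sequence b occupies columns [query_start_loc[b], query_start_loc[b + 1])
b = 1
seq = x_varlen[:, query_start_loc[b]:query_start_loc[b + 1]]
assert seq.shape == (dim, seqlens[b])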
at::Tensor
causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
                  const c10::optional<at::Tensor> &bias_,
                  const c10::optional<at::Tensor> &conv_states,
                  const c10::optional<at::Tensor> &query_start_loc,
                  const c10::optional<at::Tensor> &cache_indices,
                  const c10::optional<at::Tensor> &has_initial_state,
                  bool silu_activation) {
    auto input_type = x.scalar_type();
    auto weight_type = weight.scalar_type();
@@ -100,23 +106,21 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
    TORCH_CHECK(x.is_cuda());
    TORCH_CHECK(weight.is_cuda());

    const bool varlen = query_start_loc.has_value() ? true : false;
    const auto sizes = x.sizes();
    const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
    const int dim = varlen ? sizes[0] : sizes[1];
    const int seqlen = varlen ? sizes[1] : sizes[2];
    const int width = weight.size(-1);
    if (varlen){
        CHECK_SHAPE(x, dim, seqlen);
    }
    else {
        CHECK_SHAPE(x, batch_size, dim, seqlen);
    }
    CHECK_SHAPE(weight, dim, width);

    TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");

    if (bias_.has_value()) {
        auto bias = bias_.value();
@@ -126,56 +130,50 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
        CHECK_SHAPE(bias, dim);
    }
    if (has_initial_state.has_value()) {
        auto has_initial_state_ = has_initial_state.value();
        TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
        TORCH_CHECK(has_initial_state_.is_cuda());
        CHECK_SHAPE(has_initial_state_, batch_size);
    }

    if (query_start_loc.has_value()) {
        auto query_start_loc_ = query_start_loc.value();
        TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
        TORCH_CHECK(query_start_loc_.is_cuda());
    }

    if (cache_indices.has_value()) {
        auto cache_indices_ = cache_indices.value();
        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
        TORCH_CHECK(cache_indices_.is_cuda());
        CHECK_SHAPE(cache_indices_, batch_size);
    }
    at::Tensor out = torch::empty_like(x);

    ConvParamsBase params;
    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
                        bias_,
                        silu_activation,
                        query_start_loc,
                        cache_indices,
                        has_initial_state
                        );
    if (conv_states.has_value()) {
        auto conv_states_ = conv_states.value();
        TORCH_CHECK(conv_states_.scalar_type() == input_type);
        TORCH_CHECK(conv_states_.is_cuda());
        params.conv_states_ptr = conv_states_.data_ptr();
        params.conv_states_batch_stride = conv_states_.stride(0);
        params.conv_states_c_stride = conv_states_.stride(1);
        params.conv_states_l_stride = conv_states_.stride(2);
    } else {
        params.conv_states_ptr = nullptr;
    }
    // Otherwise the kernel will be launched from cuda:0 device
@@ -183,11 +181,7 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
        causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
    });
    return out;
}
@@ -198,7 +192,9 @@ causal_conv1d_update(const at::Tensor &x,
                     const at::Tensor &conv_state,
                     const at::Tensor &weight,
                     const c10::optional<at::Tensor> &bias_,
                     bool silu_activation,
                     const c10::optional<at::Tensor> &cache_seqlens_,
                     const c10::optional<at::Tensor> &conv_state_indices_) {
    auto input_type = x.scalar_type();
    auto weight_type = weight.scalar_type();
    TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -213,10 +209,12 @@ causal_conv1d_update(const at::Tensor &x,
    const auto sizes = x.sizes();
    const int batch_size = sizes[0];
    const int dim = sizes[1];
    const int seqlen = sizes[2];
    const int width = weight.size(-1);
    const int conv_state_len = conv_state.size(2);
    TORCH_CHECK(conv_state_len >= width - 1);

    CHECK_SHAPE(x, batch_size, dim, seqlen);
    CHECK_SHAPE(weight, dim, width);

    TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
@@ -232,15 +230,43 @@ causal_conv1d_update(const at::Tensor &x,
    at::Tensor out = torch::empty_like(x);

    ConvParamsBase params;
    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
                        bias_,
                        silu_activation);

    params.conv_state_ptr = conv_state.data_ptr();
    params.conv_state_len = conv_state_len;
    // All strides are in elements, not bytes.
    params.conv_state_batch_stride = conv_state.stride(0);
    params.conv_state_c_stride = conv_state.stride(1);
    params.conv_state_l_stride = conv_state.stride(2);

    if (cache_seqlens_.has_value()) {
        auto cache_seqlens = cache_seqlens_.value();
        TORCH_CHECK(cache_seqlens.scalar_type() == torch::kInt32);
        TORCH_CHECK(cache_seqlens.is_cuda());
        TORCH_CHECK(cache_seqlens.stride(-1) == 1);
        CHECK_SHAPE(cache_seqlens, batch_size);
        params.cache_seqlens = cache_seqlens.data_ptr<int32_t>();
    } else {
        params.cache_seqlens = nullptr;
    }

    if (conv_state_indices_.has_value()) {
        auto conv_state_indices = conv_state_indices_.value();
        TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32);
        TORCH_CHECK(conv_state_indices.is_cuda());
        TORCH_CHECK(conv_state_indices.stride(0) == 1);
        CHECK_SHAPE(conv_state_indices, batch_size);

        int conv_state_entries = conv_state.size(0);
        CHECK_SHAPE(conv_state, conv_state_entries, dim, conv_state_len);

        params.conv_state_indices_ptr = conv_state_indices.data_ptr<int32_t>();
    } else {
        CHECK_SHAPE(conv_state, batch_size, dim, conv_state_len);
        params.conv_state_indices_ptr = nullptr;
    }
    // Otherwise the kernel will be launched from cuda:0 device
    // Cast to char to avoid compiler warning about narrowing
    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
@@ -280,7 +306,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    constexpr int kWidth = Ktraits::kWidth;
    constexpr int kNThreads = Ktraits::kNThreads;
    constexpr int kNElts = Ktraits::kNElts;
    constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
    using input_t = typename Ktraits::input_t;
    using vec_t = typename Ktraits::vec_t;
    using weight_t = typename Ktraits::weight_t;
@@ -293,20 +319,39 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
    vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize);

    const bool kVarlen = params.query_start_loc_ptr != nullptr;
    const int tidx = threadIdx.x;
    const int batch_id = blockIdx.x;
    const int channel_id = blockIdx.y;
    const int *query_start_loc = kVarlen ? reinterpret_cast<int *>(params.query_start_loc_ptr) : nullptr;
    const int sequence_start_index = kVarlen ? query_start_loc[batch_id] : batch_id;
    const int seqlen = kVarlen ? query_start_loc[batch_id + 1] - sequence_start_index : params.seqlen;

    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + sequence_start_index * params.x_batch_stride
        + channel_id * params.x_c_stride;
    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
        + channel_id * params.out_c_stride;
    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);

    bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];

    int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
        : reinterpret_cast<int *>(params.cache_indices_ptr);
    int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];

    input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr
        : reinterpret_cast<input_t *>(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride;

    // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0.
    if (tidx == 0) {
        input_t initial_state[kNElts] = {0};
        if (has_initial_state) {
            #pragma unroll
            for (int w = 0; w < kWidth - 1; ++w){ initial_state[kNElts - 1 - (kWidth - 2) + w ] = conv_states[w]; }
        }
        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(initial_state)[0];
    }
    float weight_vals[kWidth];
@@ -314,14 +359,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }

    constexpr int kChunkSize = kNThreads * kNElts;
    const int n_chunks = (seqlen + kChunkSize - 1) / kChunkSize;
    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        input_t x_vals_load[2 * kNElts] = {0};
        if constexpr(kIsVecLoad) {
            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (seqlen - chunk * kChunkSize) / kNElts);
        } else {
            __syncthreads();
            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), seqlen - chunk * kChunkSize);
        }
        x += kChunkSize;
        __syncthreads();
@@ -359,19 +404,57 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
        #pragma unroll
        for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; }
        if constexpr(kIsVecLoad) {
            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (seqlen - chunk * kChunkSize) / kNElts);
        } else {
            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
        }
        out += kChunkSize;
    }
    // The final state is stored in smem_exchange's last token slot.
    // If seqlen < kWidth, part of the final state must come from the
    // initial state, which is stored in conv_states;
    // if seqlen >= kWidth, we load the last kWidth - 1 input elements
    // into conv_states accordingly.
int last_thread = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize) / kNElts;
if (conv_states != nullptr && tidx == last_thread) {
input_t x_vals_load[kNElts * 2] = {0};
// in case we are on the first kWidth tokens
if (last_thread == 0 && seqlen < kWidth){
// Need to take the initial state
reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[0];
const int offset = seqlen - (kWidth - 1);
#pragma unroll
for (int w = 0; w < kWidth - 1; ++w){
// pad the existing state
if ((w - seqlen) >= 0 && has_initial_state) { conv_states[w - seqlen] = conv_states[w]; }
else if ((w - seqlen) >= 0 && !has_initial_state) { conv_states[w - seqlen] = input_t(0.0f); }
}
#pragma unroll
for (int w = 0; w < kWidth - 1; ++w){
if (offset + w >= 0)
conv_states[w] = x_vals_load[offset + w ];
}
}
else {
// in case the final state is in between the threads data
reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
#pragma unroll
for (int w = 0; w < kWidth - 1; ++w){
conv_states[w] = x_vals_load[offset + w ];
}
}
}
}
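The write-back above can be restated in a few lines of NumPy (a sketch of the intended semantics for one channel, not the thread-level indexing): the final conv state is the last kWidth - 1 inputs, left-filled from the previous state (or zeros) when the sequence is shorter:

import numpy as np

def final_conv_state(x, prev_state, width):
    # x: (seqlen,) new tokens for one channel; prev_state: (width - 1,) or None.
    pad = prev_state if prev_state is not None else np.zeros(width - 1, x.dtype)
    return np.concatenate([pad, x])[-(width - 1):]

x = np.array([1., 2.], dtype=np.float32)  # seqlen = 2 < width - 1 = 3
print(final_conv_state(x, None, 4))       # [0. 1. 2.]
print(final_conv_state(x, np.array([7., 8., 9.], dtype=np.float32), 4))  # [9. 1. 2.]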
template<int kNThreads, int kWidth, typename input_t, typename weight_t>
void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
    static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
    const bool kVarlen = params.query_start_loc_ptr != nullptr;
    BOOL_SWITCH(params.seqlen % kNElts == 0 && !kVarlen, kIsVecLoad, [&] {
        using Ktraits = Causal_conv1d_fwd_kernel_traits<kNThreads, kWidth, kIsVecLoad, input_t, weight_t>;
        constexpr int kSmemSize = Ktraits::kSmemSize;
        dim3 grid(params.batch, params.dim);
@@ -406,220 +489,11 @@ void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
    }
}
template<int kNThreads_, int kWidth_, int kChunkSizeL_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
struct Causal_conv1d_channellast_fwd_kernel_traits {
// The cache line is 128 bytes, and we try to read 16 bytes per thread.
// So we have 8 threads per "row", so 32 or 64 elements in the channel dimension.
// That leaves 4 columns per warp, and so 16 columns per block (assuming each block has 128
// threads). Each load is 16 x 32|64 elements in the L x C dimensions.
using input_t = input_t_;
using weight_t = weight_t_;
static constexpr int kNThreads = kNThreads_;
static_assert(kNThreads % 32 == 0);
static constexpr int kNWarps = kNThreads / 32;
static constexpr int kWidth = kWidth_;
static constexpr int kChunkSizeL = kChunkSizeL_;
static constexpr int kNBytes = sizeof(input_t);
static_assert(kNBytes == 2 || kNBytes == 4);
static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
static constexpr int kNEltsPerRow = 128 / kNBytes;
static constexpr int kNThreadsPerRow = kNEltsPerRow / kNElts; // Always 8 for now
static_assert(kNThreadsPerRow * kNBytes * kNElts == 128);
static constexpr int kNColsPerWarp = 32 / kNThreadsPerRow; // Always 4 for now
static_assert(kNColsPerWarp * kNThreadsPerRow == 32);
static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps;
static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad;
static_assert(kNLoads * kNColsPerLoad == kChunkSizeL);
static constexpr bool kIsVecLoad = kIsVecLoad_;
using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
// using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
// using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems, cub::BLOCK_STORE_WARP_TRANSPOSE>;
// static constexpr int kSmemSize = std::max({sizeof(typename BlockLoadT::TempStorage),
// sizeof(typename BlockStoreT::TempStorage)});
// static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes;
};
template<typename Ktraits, bool kHasSeqIdx>
__global__ __launch_bounds__(Ktraits::kNThreads)
void causal_conv1d_channellast_fwd_kernel(ConvParamsBase params) {
constexpr int kWidth = Ktraits::kWidth;
constexpr int kNThreads = Ktraits::kNThreads;
constexpr int kNElts = Ktraits::kNElts;
constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow;
constexpr int kLPerLoad = Ktraits::kNColsPerLoad;
constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
using input_t = typename Ktraits::input_t;
using vec_t = typename Ktraits::vec_t;
using weight_t = typename Ktraits::weight_t;
// Shared memory.
__shared__ input_t x_smem[kWidth - 1 + kChunkSizeL][kChunkSizeC + kNElts];
const int batch_id = blockIdx.x;
const int chunk_l_id = blockIdx.y;
const int chunk_c_id = blockIdx.z;
const int tid = threadIdx.x;
const int l_idx = tid / kNThreadsPerC;
const int c_idx = tid % kNThreadsPerC;
input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
+ (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr)
+ chunk_c_id * kChunkSizeC * params.weight_c_stride;
input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+ (chunk_l_id * kChunkSizeL + l_idx) * params.out_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
int *seq_idx = !kHasSeqIdx ? nullptr : reinterpret_cast<int *>(params.seq_idx_ptr)
+ batch_id * params.seqlen + chunk_l_id * kChunkSizeL;
input_t *initial_states = params.initial_states_ptr == nullptr || chunk_l_id > 0 ? nullptr
: reinterpret_cast<input_t *>(params.initial_states_ptr) + batch_id * params.initial_states_batch_stride + l_idx * params.initial_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
// The last L-chunk will also have enough info to write to final states, since it also contain a few x values
// from the previous L-chunk.
input_t *final_states = params.final_states_ptr == nullptr || chunk_l_id < gridDim.y - 1 ? nullptr
: reinterpret_cast<input_t *>(params.final_states_ptr) + batch_id * params.final_states_batch_stride + l_idx * params.final_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
#pragma unroll
for (int l = 0; l < Ktraits::kNLoads; ++l) {
input_t x_vals_load[kNElts] = {0};
if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
&& chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x + l * kLPerLoad * params.x_l_stride);
}
reinterpret_cast<vec_t *>(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
}
// Load the elements from the previous chunk that are needed for convolution.
if (l_idx < kWidth - 1) {
input_t x_vals_load[kNElts] = {0};
if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0
&& chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen
&& chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x - (kWidth - 1) * params.x_l_stride);
} else if (initial_states != nullptr
&& chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < 0
&& chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(initial_states);
}
reinterpret_cast<vec_t *>(x_smem[l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
}
__syncthreads();
if (final_states != nullptr
&& l_idx < kWidth - 1
&& chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
// x_smem[0] contains element at index chunk_l_id * kChunkSizeL - (kWidth - 1)
// So last few elements (index params.seqlen - kWidth + 1 + l_idx) are stored in x_smem[params.seqlen - kWidth + 1 + l_idx - (chunk_l_id * kChunkSizeL - kWidth + 1)][c_idx]
*reinterpret_cast<vec_t *>(final_states) = reinterpret_cast<vec_t *>(x_smem[params.seqlen + l_idx - chunk_l_id * kChunkSizeL])[c_idx];
}
constexpr int kLPerThread = constexpr_min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL);
static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC);
constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread;
static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL);
// kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for simplicity
static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0);
static_assert((kLPerThread & (kLPerThread - 1)) == 0);
static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0);
static_assert(kNThreadsPerRow <= 32);
const int row_idx = tid / kNThreadsPerRow;
const int col_idx = tid % kNThreadsPerRow;
float bias_val = params.bias_ptr == nullptr || chunk_c_id * kChunkSizeC + row_idx >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]);
float weight_vals[kWidth] = {0};
if (chunk_c_id * kChunkSizeC + row_idx < params.dim) {
#pragma unroll
for (int w = 0; w < kWidth; ++w) {
weight_vals[w] = weight[row_idx * params.weight_c_stride + w * params.weight_width_stride];
}
}
float x_vals[kWidth - 1 + kLPerThread];
#pragma unroll
for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
}
int seq_idx_thread[kWidth - 1 + kLPerThread];
if constexpr (kHasSeqIdx) {
#pragma unroll
for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
seq_idx_thread[i] = chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i - (kWidth - 1) >= 0 ? seq_idx[col_idx * kLPerThread + i - (kWidth - 1)] : -1;
}
}
float out_vals[kLPerThread];
#pragma unroll
for (int i = 0; i < kLPerThread; ++i) {
out_vals[i] = bias_val;
const int seq_idx_cur = !kHasSeqIdx ? 0 : seq_idx_thread[i + kWidth - 1];
#pragma unroll
for (int w = 0; w < kWidth; ++w) {
if constexpr (!kHasSeqIdx) {
out_vals[i] += weight_vals[w] * x_vals[i + w];
} else {
out_vals[i] += seq_idx_thread[i + w] == seq_idx_cur ? weight_vals[w] * x_vals[i + w] : 0.f;
}
}
if (params.silu_activation) {out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i])); }
}
__syncthreads();
#pragma unroll
for (int i = 0; i < kLPerThread; ++i) { x_smem[col_idx * kLPerThread + i][row_idx] = out_vals[i]; }
__syncthreads();
#pragma unroll
for (int l = 0; l < Ktraits::kNLoads; ++l) {
input_t out_vals_store[kNElts];
reinterpret_cast<vec_t *>(out_vals_store)[0] = reinterpret_cast<vec_t *>(x_smem[l * kLPerLoad + l_idx])[c_idx];
if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
&& chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
*reinterpret_cast<vec_t *>(out + l * kLPerLoad * params.out_l_stride) = reinterpret_cast<vec_t *>(out_vals_store)[0];
}
}
}
template<int kNThreads, int kWidth, typename input_t, typename weight_t>
void causal_conv1d_channellast_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
BOOL_SWITCH(params.seq_idx_ptr != nullptr, kHasSeqIdx, [&] {
using Ktraits = Causal_conv1d_channellast_fwd_kernel_traits<kNThreads, kWidth, 64, true, input_t, weight_t>;
// constexpr int kSmemSize = Ktraits::kSmemSize;
constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL;
const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC;
dim3 grid(params.batch, n_chunks_L, n_chunks_C);
dim3 block(Ktraits::kNThreads);
auto kernel = &causal_conv1d_channellast_fwd_kernel<Ktraits, kHasSeqIdx>;
// if (kSmemSize >= 48 * 1024) {
// C10_CUDA_CHECK(cudaFuncSetAttribute(
// kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
// }
// kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
}
template<typename input_t, typename weight_t>
void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
if (params.width == 2) {
causal_conv1d_channellast_fwd_launch<128, 2, input_t, weight_t>(params, stream);
} else if (params.width == 3) {
causal_conv1d_channellast_fwd_launch<128, 3, input_t, weight_t>(params, stream);
} else if (params.width == 4) {
causal_conv1d_channellast_fwd_launch<128, 4, input_t, weight_t>(params, stream);
}
}
template void causal_conv1d_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
template void causal_conv1d_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
template void causal_conv1d_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
template void causal_conv1d_channellast_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
template void causal_conv1d_channellast_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
///////
@@ -633,7 +507,7 @@ struct Causal_conv1d_update_kernel_traits {
     static_assert(kNBytes == 2 || kNBytes == 4);
 };
 
-template<typename Ktraits>
+template<typename Ktraits, bool kIsCircularBuffer>
 __global__ __launch_bounds__(Ktraits::kNThreads)
 void causal_conv1d_update_kernel(ConvParamsBase params) {
     constexpr int kWidth = Ktraits::kWidth;
@@ -644,42 +518,87 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
     const int tidx = threadIdx.x;
     const int batch_id = blockIdx.x;
     const int channel_id = blockIdx.y * kNThreads + tidx;
+    if (channel_id >= params.dim) return;
+
     input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
         + channel_id * params.x_c_stride;
-    input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr) + batch_id * params.conv_state_batch_stride
+
+    // If params.conv_state_batch_indices is set, then the conv state is gathered from the conv state tensor
+    // along the batch axis. Otherwise, the conv state coordinate is the same as the batch id.
+    const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr
+        ? batch_id
+        : params.conv_state_indices_ptr[batch_id];
+    input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr)
+        + conv_state_batch_coord * params.conv_state_batch_stride
         + channel_id * params.conv_state_c_stride;
+
     weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
     input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
         + channel_id * params.out_c_stride;
-    float bias_val = params.bias_ptr == nullptr || channel_id >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+
+    int state_len = params.conv_state_len;
+    int advance_len = params.seqlen;
+    int cache_seqlen = kIsCircularBuffer ? params.cache_seqlens[batch_id] % state_len : 0;
+    int update_idx = cache_seqlen - (kWidth - 1);
+    update_idx = update_idx < 0 ? update_idx + state_len : update_idx;
 
     float weight_vals[kWidth] = {0};
-    if (channel_id < params.dim) {
-        #pragma unroll
-        for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
-    }
+    #pragma unroll
+    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
 
     float x_vals[kWidth] = {0};
-    if (channel_id < params.dim) {
+    if constexpr (!kIsCircularBuffer) {
+        #pragma unroll 2
+        for (int i = 0; i < state_len - advance_len - (kWidth - 1); ++i) {
+            conv_state[i * params.conv_state_l_stride] = conv_state[(i + advance_len) * params.conv_state_l_stride];
+        }
         #pragma unroll
-        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = float(conv_state[(i + 1) * params.conv_state_l_stride]); }
-        x_vals[kWidth - 1] = float(x[0]);
+        for (int i = 0; i < kWidth - 1; ++i) {
+            input_t state_val = conv_state[(state_len - (kWidth - 1) + i) * params.conv_state_l_stride];
+            if (i < advance_len + (kWidth - 1) && state_len - advance_len - (kWidth - 1) + i >= 0) {
+                conv_state[(state_len - advance_len - (kWidth - 1) + i) * params.conv_state_l_stride] = state_val;
+            }
+            x_vals[i] = float(state_val);
+        }
+    } else {
         #pragma unroll
-        for (int i = 0; i < kWidth; ++i) { conv_state[i * params.conv_state_l_stride] = input_t(x_vals[i]); }
+        for (int i = 0; i < kWidth - 1; ++i, update_idx = update_idx + 1 >= state_len ? update_idx + 1 - state_len : update_idx + 1) {
+            input_t state_val = conv_state[update_idx * params.conv_state_l_stride];
+            x_vals[i] = float(state_val);
+        }
     }
 
-    float out_val = bias_val;
-    #pragma unroll
-    for (int i = 0; i < kWidth; ++i) { out_val += weight_vals[i] * x_vals[i]; }
-    if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
-    if (channel_id < params.dim) { out[0] = input_t(out_val); }
+    #pragma unroll 2
+    for (int i = 0; i < params.seqlen; ++i) {
+        input_t x_val = x[i * params.x_l_stride];
+        if constexpr (!kIsCircularBuffer) {
+            if (i < advance_len && state_len - advance_len + i >= 0) {
+                conv_state[(state_len - advance_len + i) * params.conv_state_l_stride] = x_val;
+            }
+        } else {
+            conv_state[update_idx * params.conv_state_l_stride] = x_val;
+            ++update_idx;
+            update_idx = update_idx >= state_len ? update_idx - state_len : update_idx;
+        }
+        x_vals[kWidth - 1] = float(x_val);
+        float out_val = bias_val;
+        #pragma unroll
+        for (int j = 0; j < kWidth; ++j) { out_val += weight_vals[j] * x_vals[j]; }
+        if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
+        out[i * params.out_l_stride] = input_t(out_val);
+        // Shift the input buffer by 1
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = x_vals[i + 1]; }
+    }
 }
 
 template<int kNThreads, int kWidth, typename input_t, typename weight_t>
 void causal_conv1d_update_launch(ConvParamsBase &params, cudaStream_t stream) {
     using Ktraits = Causal_conv1d_update_kernel_traits<kNThreads, kWidth, input_t, weight_t>;
     dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads);
-    auto kernel = &causal_conv1d_update_kernel<Ktraits>;
+    auto kernel = params.cache_seqlens == nullptr
+        ? &causal_conv1d_update_kernel<Ktraits, false>
+        : &causal_conv1d_update_kernel<Ktraits, true>;
     kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
     C10_CUDA_KERNEL_LAUNCH_CHECK();
 }


@@ -24,6 +24,7 @@ struct ConvParamsBase {
     index_t out_c_stride;
     index_t out_l_stride;
 
+    int conv_state_len;
     index_t conv_state_batch_stride;
     index_t conv_state_c_stride;
     index_t conv_state_l_stride;
@@ -35,6 +36,14 @@ struct ConvParamsBase {
     void *__restrict__ out_ptr;
 
     void *__restrict__ conv_state_ptr;
+    void *__restrict__ query_start_loc_ptr;
+    void *__restrict__ has_initial_state_ptr;
+    void *__restrict__ cache_indices_ptr;
+    int32_t *__restrict__ cache_seqlens;
+
+    // For the continuous batching case. Makes it so that the mamba state for
+    // the current batch doesn't need to be a contiguous tensor.
+    int32_t *__restrict__ conv_state_indices_ptr;
 
     void *__restrict__ seq_idx_ptr;
@@ -48,6 +57,11 @@ struct ConvParamsBase {
     index_t final_states_batch_stride;
     index_t final_states_l_stride;
     index_t final_states_c_stride;
+
+    void * conv_states_ptr;
+    index_t conv_states_batch_stride;
+    index_t conv_states_l_stride;
+    index_t conv_states_c_stride;
 };
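The comment above is the key idea behind `conv_state_indices_ptr`: under continuous batching, request `b` in the running batch may own an arbitrary slot of the persistent state cache rather than slot `b` itself. A small sketch of the lookup in plain C++ (hypothetical helper name, mirroring the gather in the kernel):

```cpp
// Hypothetical illustration of the batch -> cache-slot indirection: with no
// index tensor the mapping is the identity, otherwise it is a gather.
int conv_state_slot(const int* conv_state_indices, int batch_id) {
    return conv_state_indices == nullptr ? batch_id
                                         : conv_state_indices[batch_id];
}
```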


@@ -54,10 +54,14 @@ struct SSMParamsBase {
     void *__restrict__ delta_ptr;
     void *__restrict__ delta_bias_ptr;
     void *__restrict__ out_ptr;
-    void *__restrict__ x_ptr;
+    void *__restrict__ ssm_states_ptr;
     void *__restrict__ z_ptr;
     void *__restrict__ out_z_ptr;
-    void *__restrict__ index_ptr;
+
+    void *__restrict__ query_start_loc_ptr;
+    void *__restrict__ cache_indices_ptr;
+    void *__restrict__ has_initial_state_ptr;
 };
@@ -201,7 +205,7 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
                                   typename Ktraits::input_t (&u_vals)[Ktraits::kNItems],
                                   typename Ktraits::BlockLoadT::TempStorage &smem_load,
                                   int seqlen) {
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_load);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockLoadVecT(smem_load_vec).Load(
@@ -217,21 +221,6 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
     }
 }
 
-template<typename Ktraits>
-inline __device__ void load_index(int *u,
-                                  int (&u_vals)[Ktraits::kNItems],
-                                  typename Ktraits::BlockLoadIndexT::TempStorage &smem_load_index,
-                                  int seqlen) {
-    if constexpr (Ktraits::kIsEvenLen) {
-        auto& smem_load_index_vec = reinterpret_cast<typename Ktraits::BlockLoadIndexVecT::TempStorage&>(smem_load_index);
-        Ktraits::BlockLoadIndexVecT(smem_load_index_vec).Load(
-            reinterpret_cast<uint4*>(u),
-            reinterpret_cast<uint4(&)[Ktraits::kNLoadsIndex]>(u_vals)
-        );
-    } else {
-        Ktraits::BlockLoadIndexT(smem_load_index).Load(u, u_vals, seqlen, 0);
-    }
-}
-
 template<typename Ktraits>
 inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
@@ -240,7 +229,7 @@ inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
                                    int seqlen) {
     constexpr int kNItems = Ktraits::kNItems;
     typename Ktraits::input_t B_vals_load[kNItems];
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_load_weight_vec = reinterpret_cast<typename Ktraits::BlockLoadWeightVecT::TempStorage&>(smem_load_weight);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load(
@@ -263,7 +252,7 @@ inline __device__ void store_output(typename Ktraits::input_t *out,
     typename Ktraits::input_t write_vals[Ktraits::kNItems];
     #pragma unroll
     for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; }
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_store);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockStoreVecT(smem_store_vec).Store(


@@ -23,7 +23,7 @@
 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
          bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kUseIndex_, typename input_t_, typename weight_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
 struct Selective_Scan_fwd_kernel_traits {
     static_assert(kNItems_ % 4 == 0);
     using input_t = input_t_;
@@ -38,22 +38,19 @@ struct Selective_Scan_fwd_kernel_traits {
     static constexpr int kNElts = kNBytes == 4 ? 4 : constexpr_min(8, kNItems);
     static_assert(kNItems % kNElts == 0);
     static constexpr int kNLoads = kNItems / kNElts;
-    static constexpr bool kIsEvenLen = kIsEvenLen_;
+    static constexpr bool kIsEvenLen = kVarlen_ ? false : kIsEvenLen_;
     static constexpr bool kIsVariableB = kIsVariableB_;
     static constexpr bool kIsVariableC = kIsVariableC_;
     static constexpr bool kHasZ = kHasZ_;
-    static constexpr bool kUseIndex = kUseIndex_;
-    static constexpr bool kDirectIO = kIsEvenLen && kNLoads == 1;
+    static constexpr bool kVarlen = kVarlen_;
+    static constexpr bool kDirectIO = kVarlen_ ? false : kIsEvenLen && kNLoads == 1;
     static constexpr int kNLoadsIndex = kNItems / 4;
     using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
     using scan_t = float2;
     using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
     using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads,
                                          !kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
-    using BlockLoadIndexT = cub::BlockLoad<int, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
-    using BlockLoadIndexVecT = cub::BlockLoad<uint4, kNThreads, kNLoadsIndex,
-                                              !(kIsEvenLen && kNLoadsIndex == 1) ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
     using BlockLoadWeightT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
     using BlockLoadWeightVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads,
                                                !kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
@@ -65,8 +62,6 @@ struct Selective_Scan_fwd_kernel_traits {
     using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_WARP_SCANS>;
     static constexpr int kSmemIOSize = custom_max({sizeof(typename BlockLoadT::TempStorage),
                                                    sizeof(typename BlockLoadVecT::TempStorage),
-                                                   sizeof(typename BlockLoadIndexT::TempStorage),
-                                                   sizeof(typename BlockLoadIndexVecT::TempStorage),
                                                    (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage),
                                                    (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage),
                                                    sizeof(typename BlockStoreT::TempStorage),
@@ -80,7 +75,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     constexpr bool kIsVariableB = Ktraits::kIsVariableB;
     constexpr bool kIsVariableC = Ktraits::kIsVariableC;
     constexpr bool kHasZ = Ktraits::kHasZ;
-    constexpr bool kUseIndex = Ktraits::kUseIndex;
+    constexpr bool kVarlen = Ktraits::kVarlen;
     constexpr int kNThreads = Ktraits::kNThreads;
     constexpr int kNItems = Ktraits::kNItems;
     constexpr int kNRows = Ktraits::kNRows;
@@ -97,7 +92,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // auto& smem_load = reinterpret_cast<typename BlockLoadT::TempStorage&>(smem_loadstorescan);
     auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
     auto& smem_load_weight = reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage&>(smem_);
-    auto& smem_load_index = reinterpret_cast<typename Ktraits::BlockLoadIndexT::TempStorage&>(smem_);
     auto& smem_load_weight1 = *reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage*>(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage));
     auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
     auto& smem_scan = *reinterpret_cast<typename Ktraits::BlockScanT::TempStorage*>(smem_ + Ktraits::kSmemIOSize);
@@ -108,17 +102,29 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     const int batch_id = blockIdx.x;
     const int dim_id = blockIdx.y;
     const int group_id = dim_id / (params.dim_ngroups_ratio);
-    input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + batch_id * params.u_batch_stride
+    int seqlen = params.seqlen;
+    int sequence_start_index = batch_id;
+    if constexpr (kVarlen){
+        int *query_start_loc = reinterpret_cast<int *>(params.query_start_loc_ptr);
+        sequence_start_index = query_start_loc[batch_id];
+        seqlen = query_start_loc[batch_id + 1] - sequence_start_index;
+    }
+    const bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
+        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];
+    const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
+        : reinterpret_cast<int *>(params.cache_indices_ptr);
+    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+
+    input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + sequence_start_index * params.u_batch_stride
         + dim_id * kNRows * params.u_d_stride;
-    input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + batch_id * params.delta_batch_stride
+    input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + sequence_start_index * params.delta_batch_stride
         + dim_id * kNRows * params.delta_d_stride;
     weight_t *A = reinterpret_cast<weight_t *>(params.A_ptr) + dim_id * kNRows * params.A_d_stride;
     weight_t *B = reinterpret_cast<weight_t *>(params.B_ptr) + dim_id * kNRows * params.B_d_stride;
-    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + batch_id * params.B_batch_stride + group_id * params.B_group_stride;
+    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
-    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride;
-    scan_t *x = reinterpret_cast<scan_t *>(params.x_ptr) + (batch_id * params.dim + dim_id * kNRows) * params.n_chunks * params.dstate;
-    int *index = !kUseIndex ? nullptr : reinterpret_cast<int *>(params.index_ptr) + batch_id * params.seqlen;
+    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
+    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate;
 
     float D_val[kNRows] = {0};
     if (params.D_ptr != nullptr) {
@@ -142,9 +148,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // }
 
     constexpr int kChunkSize = kNThreads * kNItems;
-    for (int chunk = 0; chunk < params.n_chunks; ++chunk) {
+    const int n_chunks = (seqlen + 2048 - 1) / 2048;
+    for (int chunk = 0; chunk < n_chunks; ++chunk) {
         input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
-        int index_vals_load[kNRows][kNItems];
 
         __syncthreads();
         #pragma unroll
@@ -152,15 +158,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (!kDirectIO) {
                 if (r > 0) { __syncthreads(); }
             }
-            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, params.seqlen - chunk * kChunkSize);
+            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, seqlen - chunk * kChunkSize);
             if constexpr (!kDirectIO) { __syncthreads(); }
-            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, params.seqlen - chunk * kChunkSize);
-            if constexpr (kUseIndex) {
-                load_index<Ktraits>(index + r * params.delta_d_stride, index_vals_load[r], smem_load_index, params.seqlen - chunk * kChunkSize);
-            }
-        }
-        if constexpr (kUseIndex) {
-            index += kChunkSize;
+            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, seqlen - chunk * kChunkSize);
         }
         u += kChunkSize;
         delta += kChunkSize;
@@ -195,9 +195,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             // If both B and C vary, this is unused.
             weight_t BC_val[kNRows];
             weight_t B_vals[kNItems], C_vals[kNItems];
             if constexpr (kIsVariableB) {
                 load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
-                    smem_load_weight, (params.seqlen - chunk * kChunkSize) * (1));
+                    smem_load_weight, (seqlen - chunk * kChunkSize) * (1));
                 if constexpr (!kIsVariableC) {
                     #pragma unroll
                     for (int r = 0; r < kNRows; ++r) {
@@ -208,7 +208,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (kIsVariableC) {
                 auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
                 load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-                    smem_load_weight_C, (params.seqlen - chunk * kChunkSize) * (1));
+                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1));
                 if constexpr (!kIsVariableB) {
                     #pragma unroll
                     for (int r = 0; r < kNRows; ++r) {
@@ -232,24 +232,16 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                     thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                                  !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
-                    // Reset A bar for cumulative sequences (Real)
-                    if constexpr (kUseIndex) {
-                        if (index_vals_load[r][i] == 0) {
-                            thread_data[i].x = 0.f;
-                        }
-                    }
-                    if constexpr (!Ktraits::kIsEvenLen) {  // So that the last state is correct
-                        if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) {
+                    if (seqlen % (kNItems * kNThreads) != 0) {  // So that the last state is correct
+                        if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
                             thread_data[i] = make_float2(1.f, 0.f);
                         }
                     }
                 }
                 // Initialize running total
-                scan_t running_prefix;
-                // If we use WARP_SCAN then all lane 0 of all warps (not just thread 0) needs to read
-                running_prefix = chunk == 0 ? x[(r * params.n_chunks) * params.dstate + state_idx] : (threadIdx.x % 32 == 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.f, 0.f));
-                // running_prefix = chunk > 0 && threadIdx.x == 0 ? smem_running_prefix[state_idx] : make_float2(1.f, 0.f);
+                scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]) : 0.0);
+
                 SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
                 typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
                     thread_data, thread_data, SSMScanOp<weight_t>(), prefix_op
@@ -258,7 +250,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 // Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
                 if (threadIdx.x == 0) {
                     smem_running_prefix[state_idx] = prefix_op.running_prefix;
-                    x[(r * params.n_chunks + chunk) * params.dstate + state_idx] = prefix_op.running_prefix;
+                    if (chunk == n_chunks - 1) {
+                        ssm_states[state_idx] = input_t(prefix_op.running_prefix.y);
+                    }
                 }
                 #pragma unroll
                 for (int i = 0; i < kNItems; ++i) {
@@ -270,7 +264,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             }
         }
 
-        input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+        input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
             + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
         __syncthreads();
         #pragma unroll
@@ -278,26 +272,26 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (!kDirectIO) {
                 if (r > 0) { __syncthreads(); }
             }
-            store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
+            store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
         }
 
         if constexpr (kHasZ) {
-            input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + batch_id * params.z_batch_stride
+            input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + sequence_start_index * params.z_batch_stride
                 + dim_id * kNRows * params.z_d_stride + chunk * kChunkSize;
-            input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + batch_id * params.out_z_batch_stride
+            input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + sequence_start_index * params.out_z_batch_stride
                 + dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize;
             #pragma unroll
             for (int r = 0; r < kNRows; ++r) {
                 input_t z_vals[kNItems];
                 __syncthreads();
-                load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, params.seqlen - chunk * kChunkSize);
+                load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, seqlen - chunk * kChunkSize);
                 #pragma unroll
                 for (int i = 0; i < kNItems; ++i) {
                     float z_val = z_vals[i];
                     out_vals[r][i] *= z_val / (1 + expf(-z_val));
                 }
                 __syncthreads();
-                store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
+                store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
             }
         }
@@ -316,8 +310,8 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     constexpr bool kIsVariableC = true;
     constexpr bool kHasZ = true;
     BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
-        BOOL_SWITCH(params.index_ptr != nullptr, kUseIndex, [&] {
-            using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kUseIndex, input_t, weight_t>;
+        BOOL_SWITCH(params.query_start_loc_ptr != nullptr, kVarlen, [&] {
+            using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kVarlen, input_t, weight_t>;
             constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
             dim3 grid(params.batch, params.dim / kNRows);
             auto kernel = &selective_scan_fwd_kernel<Ktraits>;
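`BOOL_SWITCH` is what lets a runtime flag (here, whether `query_start_loc_ptr` is set) select a fully specialized kernel template. A sketch of the idiom, assuming the macro expands roughly as in the upstream `static_switch.h` header (illustrative, not necessarily the exact vLLM macro):

```cpp
// Illustrative expansion of the BOOL_SWITCH idiom used above: a runtime
// condition picks one of two constexpr branches, so the lambda body can use
// CONST_NAME as a compile-time template argument.
#define BOOL_SWITCH(COND, CONST_NAME, ...)  \
  [&] {                                     \
    if (COND) {                             \
      constexpr bool CONST_NAME = true;     \
      return __VA_ARGS__();                 \
    } else {                                \
      constexpr bool CONST_NAME = false;    \
      return __VA_ARGS__();                 \
    }                                       \
  }()
```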
@@ -405,12 +399,15 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                         const torch::Tensor out,
                         const torch::Tensor z,
                         const torch::Tensor out_z,
-                        void* D_ptr,
-                        void* delta_bias_ptr,
-                        void* x_ptr,
+                        const c10::optional<at::Tensor>& D,
+                        const c10::optional<at::Tensor>& delta_bias,
+                        const torch::Tensor ssm_states,
                         bool has_z,
                         bool delta_softplus,
-                        void* index_ptr) {
+                        const c10::optional<at::Tensor>& query_start_loc,
+                        const c10::optional<at::Tensor>& cache_indices,
+                        const c10::optional<at::Tensor>& has_initial_state,
+                        bool varlen) {
 
     // Reset the parameters
     memset(&params, 0, sizeof(params));
@@ -434,55 +431,83 @@ void set_ssm_params_fwd(SSMParamsBase &params,
params.A_ptr = A.data_ptr(); params.A_ptr = A.data_ptr();
params.B_ptr = B.data_ptr(); params.B_ptr = B.data_ptr();
params.C_ptr = C.data_ptr(); params.C_ptr = C.data_ptr();
params.D_ptr = D_ptr; params.D_ptr = D.has_value() ? D.value().data_ptr() : nullptr;
params.delta_bias_ptr = delta_bias_ptr; params.delta_bias_ptr = delta_bias.has_value() ? delta_bias.value().data_ptr() : nullptr;
params.out_ptr = out.data_ptr(); params.out_ptr = out.data_ptr();
params.x_ptr = x_ptr; params.ssm_states_ptr = ssm_states.data_ptr();
params.z_ptr = has_z ? z.data_ptr() : nullptr; params.z_ptr = has_z ? z.data_ptr() : nullptr;
params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr; params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr;
params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
params.index_ptr = index_ptr;
// All stride are in elements, not bytes. // All stride are in elements, not bytes.
params.A_d_stride = A.stride(0); params.A_d_stride = A.stride(0);
params.A_dstate_stride = A.stride(1); params.A_dstate_stride = A.stride(1);
if (!is_variable_B) {
params.B_d_stride = B.stride(0); if (varlen){
} else { params.B_batch_stride = B.stride(2);
params.B_batch_stride = B.stride(0); params.B_group_stride = B.stride(0);
params.B_group_stride = B.stride(1); params.B_dstate_stride = B.stride(1);
params.C_batch_stride = C.stride(2);
params.C_group_stride = C.stride(0);
params.C_dstate_stride = C.stride(1);
params.u_batch_stride = u.stride(1);
params.u_d_stride = u.stride(0);
params.delta_batch_stride = delta.stride(1);
params.delta_d_stride = delta.stride(0);
if (has_z) {
params.z_batch_stride = z.stride(1);
params.z_d_stride = z.stride(0);
params.out_z_batch_stride = out_z.stride(1);
params.out_z_d_stride = out_z.stride(0);
}
params.out_batch_stride = out.stride(1);
params.out_d_stride = out.stride(0);
} }
params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2); else{
if (!is_variable_C) { if (!is_variable_B) {
params.C_d_stride = C.stride(0); params.B_d_stride = B.stride(0);
} else { } else {
params.C_batch_stride = C.stride(0); params.B_batch_stride = B.stride(0);
params.C_group_stride = C.stride(1); params.B_group_stride = B.stride(1);
}
params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2);
if (!is_variable_C) {
params.C_d_stride = C.stride(0);
} else {
params.C_batch_stride = C.stride(0);
params.C_group_stride = C.stride(1);
}
params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2);
params.u_batch_stride = u.stride(0);
params.u_d_stride = u.stride(1);
params.delta_batch_stride = delta.stride(0);
params.delta_d_stride = delta.stride(1);
if (has_z) {
params.z_batch_stride = z.stride(0);
params.z_d_stride = z.stride(1);
params.out_z_batch_stride = out_z.stride(0);
params.out_z_d_stride = out_z.stride(1);
}
params.out_batch_stride = out.stride(0);
params.out_d_stride = out.stride(1);
} }
params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2);
params.u_batch_stride = u.stride(0);
params.u_d_stride = u.stride(1);
params.delta_batch_stride = delta.stride(0);
params.delta_d_stride = delta.stride(1);
if (has_z) {
params.z_batch_stride = z.stride(0);
params.z_d_stride = z.stride(1);
params.out_z_batch_stride = out_z.stride(0);
params.out_z_d_stride = out_z.stride(1);
}
params.out_batch_stride = out.stride(0);
params.out_d_stride = out.stride(1);
} }
-std::vector<torch::Tensor>
-selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
+void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                    const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
                    const c10::optional<torch::Tensor> &D_,
                    const c10::optional<torch::Tensor> &z_,
                    const c10::optional<torch::Tensor> &delta_bias_,
                    bool delta_softplus,
-                   const c10::optional<torch::Tensor> &index_,
-                   const c10::optional<torch::Tensor> &x) {
+                   const c10::optional<torch::Tensor> &query_start_loc,
+                   const c10::optional<torch::Tensor> &cache_indices,
+                   const c10::optional<torch::Tensor> &has_initial_state,
+                   const torch::Tensor &ssm_states) {
     auto input_type = u.scalar_type();
     auto weight_type = A.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -505,23 +530,37 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
     TORCH_CHECK(delta.stride(-1) == 1 || delta.size(-1) == 1);
 
     const auto sizes = u.sizes();
-    const int batch_size = sizes[0];
-    const int dim = sizes[1];
-    const int seqlen = sizes[2];
+    const bool varlen = query_start_loc.has_value();
+    const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
+    const int dim = varlen ? sizes[0] : sizes[1];
+    const int seqlen = varlen ? sizes[1] : sizes[2];
     const int dstate = A.size(1);
-    const int n_groups = is_variable_B ? B.size(1) : 1;
+    const int n_groups = varlen ? B.size(0) : B.size(1);
 
     TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256");
 
-    CHECK_SHAPE(u, batch_size, dim, seqlen);
-    CHECK_SHAPE(delta, batch_size, dim, seqlen);
+    if (varlen) {
+        CHECK_SHAPE(u, dim, seqlen);
+        CHECK_SHAPE(delta, dim, seqlen);
+    } else {
+        CHECK_SHAPE(u, batch_size, dim, seqlen);
+        CHECK_SHAPE(delta, batch_size, dim, seqlen);
+    }
     CHECK_SHAPE(A, dim, dstate);
     TORCH_CHECK(is_variable_B, "is_variable_B = False is disabled in favor of reduced binary size")
-    CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen);
+    if (varlen) {
+        CHECK_SHAPE(B, n_groups, dstate, seqlen);
+    } else {
+        CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen);
+    }
     TORCH_CHECK(B.stride(-1) == 1 || B.size(-1) == 1);
     TORCH_CHECK(is_variable_C, "is_variable_C = False is disabled in favor of reduced binary size")
-    CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen);
+    if (varlen) {
+        CHECK_SHAPE(C, n_groups, dstate, seqlen);
+    } else {
+        CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen);
+    }
     TORCH_CHECK(C.stride(-1) == 1 || C.size(-1) == 1);
 
     if (D_.has_value()) {
@@ -539,13 +578,31 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
         TORCH_CHECK(delta_bias.stride(-1) == 1 || delta_bias.size(-1) == 1);
         CHECK_SHAPE(delta_bias, dim);
     }
-    if (index_.has_value()) {
-        auto index = index_.value();
-        TORCH_CHECK(index.scalar_type() == at::ScalarType::Int);
-        TORCH_CHECK(index.is_cuda());
-        CHECK_SHAPE(index, batch_size, seqlen);
+
+    if (has_initial_state.has_value()) {
+        auto has_initial_state_ = has_initial_state.value();
+        TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
+        TORCH_CHECK(has_initial_state_.is_cuda());
+        CHECK_SHAPE(has_initial_state_, batch_size);
     }
+
+    if (query_start_loc.has_value()) {
+        auto query_start_loc_ = query_start_loc.value();
+        TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(query_start_loc_.is_cuda());
+    }
+
+    if (cache_indices.has_value()) {
+        auto cache_indices_ = cache_indices.value();
+        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(cache_indices_.is_cuda());
+        CHECK_SHAPE(cache_indices_, batch_size);
+    }
+
     at::Tensor z, out_z;
     const bool has_z = z_.has_value();
     TORCH_CHECK(has_z, "has_z = False is disabled in favor of reduced binary size")
@@ -553,31 +610,38 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
     TORCH_CHECK(z.scalar_type() == input_type);
     TORCH_CHECK(z.is_cuda());
     TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1);
-    CHECK_SHAPE(z, batch_size, dim, seqlen);
-    out_z = torch::empty_like(z);
+    if (varlen){
+        CHECK_SHAPE(z, dim, seqlen);
+    } else {
+        CHECK_SHAPE(z, batch_size, dim, seqlen);
+    }
+    out_z = z;
 
     const int n_chunks = (seqlen + 2048 - 1) / 2048;
     // const int n_chunks = (seqlen + 1024 - 1) / 1024;
     // at::Tensor out = torch::empty_like(u);
     // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
-    at::Tensor out = torch::empty_like(delta);
-    if (x.has_value()){
-        auto _x = x.value();
-        TORCH_CHECK(_x.scalar_type() == weight_type);
-        TORCH_CHECK(_x.is_cuda());
-        TORCH_CHECK(_x.stride(-1) == 1);
-        CHECK_SHAPE(_x, batch_size, dim, n_chunks, dstate * 2);
-    }
+    at::Tensor out = delta;
+    TORCH_CHECK(ssm_states.scalar_type() == input_type);
+    TORCH_CHECK(ssm_states.is_cuda());
+    TORCH_CHECK(ssm_states.stride(-1) == 1);
+    CHECK_SHAPE(ssm_states, batch_size, dim, dstate);
 
     SSMParamsBase params;
     set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
                        u, delta, A, B, C, out, z, out_z,
-                       D_.has_value() ? D_.value().data_ptr() : nullptr,
-                       delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr,
-                       x.value().data_ptr(),
+                       D_,
+                       delta_bias_,
+                       ssm_states,
                        has_z,
                        delta_softplus,
-                       index_.has_value() ? index_.value().data_ptr() : nullptr);
+                       query_start_loc,
+                       cache_indices,
+                       has_initial_state,
+                       varlen
+                       );
     // Otherwise the kernel will be launched from cuda:0 device
     // Cast to char to avoid compiler warning about narrowing
@@ -586,8 +650,5 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
     DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
         selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
     });
-    std::vector<at::Tensor> result = {out, x.value()};
-    if (has_z) { result.push_back(out_z); }
-    return result;
 }
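To make the varlen convention above concrete: `query_start_loc` holds cumulative sequence offsets into the packed `(dim, total_seqlen)` tensors, so batch entry `b` spans `[query_start_loc[b], query_start_loc[b+1])`. A small sketch with hypothetical values:

```cpp
#include <cassert>

int main() {
    // Three packed sequences of lengths 5, 7 and 2: offsets are cumulative,
    // and batch_size == len(query_start_loc) - 1, as checked above.
    const int query_start_loc[] = {0, 5, 12, 14};
    const int batch_size = 3;

    for (int b = 0; b < batch_size; ++b) {
        const int sequence_start_index = query_start_loc[b];
        const int seqlen = query_start_loc[b + 1] - sequence_start_index;
        assert(seqlen > 0);
        // The kernel offsets u/delta/B/C/out by sequence_start_index times the
        // batch stride and scans seqlen elements, as in the kVarlen branch above.
    }
    return 0;
}
```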

File diff suppressed because it is too large.


@@ -0,0 +1,31 @@
#include "marlin_moe_kernel_ku4.h"
namespace marlin_moe {
// We return bool so we can create these different kernel calls as a sequence
// of if-elseif's.
bool call_marlin_moe_kernel_ku4(
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
bool has_act_order, int group_blocks, int num_threads, int blocks,
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
int m_block, int max_par, int cfg_max_m_blocks) {
bool has_zp = true;
if (false) {
}
AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256)
AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256)
AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128)
AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128)
else {
return false;
}
return true;
}
} // namespace marlin_moe
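As the comment above notes, each `call_marlin_moe_kernel_*` entry point reports success via its `bool` return so the caller can chain quantization types as a flat if / else-if sequence. A simplified sketch of that dispatch pattern (hypothetical stand-in functions; the real entry points take the full argument pack shown above):

```cpp
#include <stdexcept>

// Hypothetical, simplified stand-ins for the call_marlin_moe_kernel_* entry
// points: each tries its own if / else-if ladder of template instantiations
// and returns false when no configuration matches.
bool try_ku4b8(int cfg)   { return cfg == 0; }
bool try_ku8b128(int cfg) { return cfg == 1; }
bool try_ku4(int cfg)     { return cfg == 2; }

void dispatch(int cfg) {
    if (try_ku4b8(cfg)) return;
    if (try_ku8b128(cfg)) return;
    if (try_ku4(cfg)) return;
    throw std::runtime_error("unsupported marlin_moe kernel configuration");
}
```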


@@ -0,0 +1,20 @@
#pragma once
#include "marlin_moe_kernel.h"
namespace marlin_moe {
// We return bool so we can create these different kernel calls as a sequence
// of if-elseif's.
bool call_marlin_moe_kernel_ku4(
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
bool has_act_order, int group_blocks, int num_threads, int blocks,
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
int m_block, int max_par, int cfg_max_m_blocks);
} // namespace marlin_moe


@@ -0,0 +1,31 @@
#include "marlin_moe_kernel_ku4b8.h"
namespace marlin_moe {
// We return bool so we can create these different kernel calls as a sequence
// of if-elseif's.
bool call_marlin_moe_kernel_ku4b8(
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
bool has_act_order, int group_blocks, int num_threads, int blocks,
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
int m_block, int max_par, int cfg_max_m_blocks) {
bool has_zp = false;
if (false) {
}
GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
else {
return false;
}
return true;
}
} // namespace marlin_moe


@@ -0,0 +1,20 @@
#pragma once
#include "marlin_moe_kernel.h"
namespace marlin_moe {
// We return bool so we can create these different kernel calls as a sequence
// of if-elseif's.
bool call_marlin_moe_kernel_ku4b8(
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
bool has_act_order, int group_blocks, int num_threads, int blocks,
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
int m_block, int max_par, int cfg_max_m_blocks);
} // namespace marlin_moe


@@ -0,0 +1,31 @@
#include "marlin_moe_kernel_ku8b128.h"
namespace marlin_moe {
// We return bool so we can create these different kernel calls as a sequence
// of if-elseif's.
bool call_marlin_moe_kernel_ku8b128(
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
bool has_act_order, int group_blocks, int num_threads, int blocks,
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
int m_block, int max_par, int cfg_max_m_blocks) {
bool has_zp = false;
if (false) {
}
GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
else {
return false;
}
return true;
}
} // namespace marlin_moe


@@ -0,0 +1,18 @@
#pragma once
#include "marlin_moe_kernel.h"
namespace marlin_moe {
bool call_marlin_moe_kernel_ku8b128(
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
bool has_act_order, int group_blocks, int num_threads, int blocks,
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
int m_block, int max_par, int cfg_max_m_blocks);
}

File diff suppressed because it is too large.


@@ -1,12 +0,0 @@
#pragma once
#include <torch/all.h>
torch::Tensor marlin_gemm_moe(
const torch::Tensor& a, const torch::Tensor& b_q_weights,
const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
const torch::Tensor& g_idx, const torch::Tensor& perm,
torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
bool replicate_input, bool apply_weights);


@@ -1,6 +1,5 @@
 #include "core/registration.h"
 #include "moe_ops.h"
-#include "marlin_moe_ops.h"
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   // Apply topk softmax to the gating outputs.
@@ -13,10 +12,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   m.def(
       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
-      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
-      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
-      "bool replicate_input, bool apply_weights) -> Tensor");
-  m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
+      "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
+      "int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
+      "int moe_block_size, bool replicate_input, bool apply_weights)"
+      " -> Tensor");
+  // conditionally compiled so impl registration is in source file
 #endif
 }


@@ -54,10 +54,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);
 void gelu_quick(torch::Tensor& out, torch::Tensor& input);
 
-void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
-                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
-                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
-                  torch::Tensor& slot_mapping, torch::Tensor& block_tables);
+void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
+                            int64_t block_size, torch::Tensor& input_tokens,
+                            torch::Tensor& sampled_token_ids,
+                            torch::Tensor& input_positions,
+                            torch::Tensor& seq_lens,
+                            torch::Tensor& slot_mapping,
+                            torch::Tensor& block_tables);
+
+void advance_step_flashinfer(
+    int64_t num_seqs, int64_t num_queries, int64_t block_size,
+    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
+    torch::Tensor& input_positions, torch::Tensor& seq_lens,
+    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
+    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
+    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
 
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
@@ -79,60 +90,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
                              torch::Tensor _zeros, int64_t split_k_iters,
                              int64_t thx, int64_t thy);
 
-torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
-                          torch::Tensor& b_scales, torch::Tensor& workspace,
-                          int64_t size_m, int64_t size_n, int64_t size_k);
-
-namespace machete {
-
-std::vector<std::string> supported_schedules(
-    vllm::ScalarTypeTorchPtr const& btype);
-
-torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B,
-                   vllm::ScalarTypeTorchPtr const& btype,
-                   c10::optional<torch::Tensor> const& scales,
-                   c10::optional<torch::Tensor> const& zeros,
-                   c10::optional<int64_t> group_size,
-                   c10::optional<torch::Tensor> const& C,
-                   c10::optional<double> alpha, c10::optional<double> beta,
-                   c10::optional<std::string> schedule);
-
-torch::Tensor prepack_B(torch::Tensor const& B,
-                        vllm::ScalarTypeTorchPtr const& btype);
-
-};  // namespace machete
-
-torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
-                                  torch::Tensor& b_meta,
-                                  torch::Tensor& b_scales,
-                                  torch::Tensor& workspace,
-                                  vllm::ScalarTypeTorchPtr const& b_q_type,
-                                  int64_t size_m, int64_t size_n,
-                                  int64_t size_k);
-
-torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
-                               torch::Tensor& b_scales, torch::Tensor& b_zeros,
-                               torch::Tensor& g_idx, torch::Tensor& perm,
-                               torch::Tensor& workspace,
-                               vllm::ScalarTypeTorchPtr const& b_q_type,
-                               int64_t size_m, int64_t size_n, int64_t size_k,
-                               bool is_k_full, bool has_zp,
-                               bool use_fp32_reduce);
-
-torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
-                                 int64_t size_k, int64_t size_n,
-                                 int64_t num_bits);
-
-torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
-                                      torch::Tensor& perm, c10::SymInt size_k,
-                                      c10::SymInt size_n, int64_t num_bits);
-
-torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
-                                int64_t size_n, int64_t num_bits);
-
-torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
-                                     c10::SymInt size_k, c10::SymInt size_n,
-                                     int64_t num_bits);
+torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
 
 torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
                               int64_t n);
@@ -143,11 +101,6 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
 torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
                               int64_t row);
 
-torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
-                              torch::Tensor& b_scales, torch::Tensor& workspace,
-                              int64_t num_bits, int64_t size_m, int64_t size_n,
-                              int64_t size_k);
-
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
 
 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
@@ -162,21 +115,15 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& azp_adj,
                            c10::optional<torch::Tensor> const& azp,
                            c10::optional<torch::Tensor> const& bias);
 
-torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
-                              torch::Tensor const& b_q_weight,
-                              torch::Tensor const& s_tok,
-                              torch::Tensor const& s_ch,
-                              torch::Tensor const& s_group,
-                              torch::Tensor& workspace, int64_t size_m,
-                              int64_t size_n, int64_t size_k);
 #endif
 
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
-                              torch::Tensor const& scale);
+                              torch::Tensor const& scale,
+                              c10::optional<torch::Tensor> const& azp);
 
 void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
-                               torch::Tensor& scales);
+                               torch::Tensor& scales,
+                               c10::optional<torch::Tensor> const& azp);
 
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
@@ -200,26 +147,30 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
torch::Tensor experts_ids, torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad); torch::Tensor num_tokens_post_pad);
// old signature:
std::vector<torch::Tensor> selective_scan_fwd(
    const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
    const torch::Tensor& B, const torch::Tensor& C,
    const c10::optional<torch::Tensor>& D_,
    const c10::optional<torch::Tensor>& z_,
    const c10::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
    const c10::optional<torch::Tensor>& index_,
    const c10::optional<torch::Tensor>& x);
// new signature:
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                        const torch::Tensor& A, const torch::Tensor& B,
                        const torch::Tensor& C,
                        const c10::optional<torch::Tensor>& D_,
                        const c10::optional<torch::Tensor>& z_,
                        const c10::optional<torch::Tensor>& delta_bias_,
                        bool delta_softplus,
                        const c10::optional<torch::Tensor>& query_start_loc,
                        const c10::optional<torch::Tensor>& cache_indices,
                        const c10::optional<torch::Tensor>& has_initial_state,
                        const torch::Tensor& ssm_states);

// old signature:
at::Tensor causal_conv1d_update(const at::Tensor& x,
                                const at::Tensor& conv_state,
                                const at::Tensor& weight,
                                const c10::optional<at::Tensor>& bias_,
                                bool silu_activation);
// new signature:
at::Tensor causal_conv1d_update(
    const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight,
    const c10::optional<at::Tensor>& bias_, bool silu_activation,
    const c10::optional<at::Tensor>& cache_seqlens_,
    const c10::optional<at::Tensor>& conv_state_indices_);

// old signature:
at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                             const c10::optional<at::Tensor>& bias_,
                             const c10::optional<at::Tensor>& seq_idx_,
                             const c10::optional<at::Tensor>& initial_states_,
                             const c10::optional<at::Tensor>& final_states_out_,
                             bool silu_activation);
// new signature:
at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                             const c10::optional<at::Tensor>& bias_,
                             const c10::optional<at::Tensor>& conv_states,
                             const c10::optional<at::Tensor>& query_start_loc,
                             const c10::optional<at::Tensor>& cache_indices,
                             const c10::optional<at::Tensor>& has_initial_state,
                             bool silu_activation);
#ifndef USE_ROCM
@@ -228,8 +179,6 @@ fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                     const std::vector<std::string>& handles,
                     const std::vector<int64_t>& offsets, int64_t rank,
                     bool full_nvlink);
bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
bool full_nvlink);
void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                      torch::Tensor& out);

csrc/permute_cols.cu (new file)

@@ -0,0 +1,88 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_fp16.h>
static constexpr int default_threads = 256;
static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
// For a given "a" of size [M,K] performs a permutation of the K columns based
// on the given "perm" indices.
// Currently only supports 16bit types (since we permute half types)
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr, int size_m,
int size_k, int block_rows) {
int start_row = block_rows * blockIdx.x;
int finish_row = start_row + block_rows;
if (finish_row > size_m) {
finish_row = size_m;
}
int cur_block_rows = std::max(finish_row - start_row, 0);
int row_stride = size_k * sizeof(half) / 16;
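  // The pointers below are advanced in int4 (16-byte) units, i.e. 8 halfs at
  // a time, so row_stride counts int4 elements per row.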
auto permute_row = [&](int row) {
int iters = size_k / default_threads;
int rest = size_k % default_threads;
int offset = row * row_stride;
half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);
int base_k = 0;
for (int i = 0; i < iters; i++) {
int cur_k = base_k + threadIdx.x;
int src_pos = perm_int_ptr[cur_k];
out_half[cur_k] = a_row_half[src_pos];
base_k += default_threads;
}
if (rest) {
if (threadIdx.x < rest) {
int cur_k = base_k + threadIdx.x;
int src_pos = perm_int_ptr[cur_k];
out_half[cur_k] = a_row_half[src_pos];
}
}
};
for (int i = 0; i < cur_block_rows; i++) {
int cur_row = start_row + i;
if (cur_row < size_m) {
permute_row(cur_row);
}
}
}
// More efficient version of A[..., perm]
// taken from gptq_marlin.cu
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
auto dev = A.get_device();
auto stream = at::cuda::getCurrentCUDAStream(dev);
TORCH_CHECK(A.scalar_type() == at::kHalf || A.scalar_type() == at::kBFloat16,
"Currently only 16bit types are supported");
TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
TORCH_CHECK(A.size(-1) % 8 == 0,
"A columns must be a multiple of 8 (128bits)");
auto A_2d = A.view({-1, A.size(-1)});
torch::Tensor D = torch::empty_like(A);
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
int block_rows = div_ceil(A_2d.size(0), sms);
permute_cols_kernel<<<sms, default_threads, 0, stream>>>(
reinterpret_cast<int4 const*>(A_2d.const_data_ptr()),
perm.const_data_ptr<int>(), reinterpret_cast<int4*>(D.mutable_data_ptr()),
A_2d.size(0), A_2d.size(1), block_rows);
return D;
}
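To make the contract above concrete, here is a minimal host-side usage sketch. It assumes this file is linked into a LibTorch + CUDA program; permute_cols_demo is a hypothetical helper, and the index_select call is only the slow reference behavior the kernel replaces.

// Minimal usage sketch (assumes LibTorch + CUDA; permute_cols_demo is
// hypothetical and not part of this diff).
#include <torch/torch.h>

torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);

void permute_cols_demo() {
  auto opts = torch::dtype(torch::kHalf).device(torch::kCUDA);
  torch::Tensor A = torch::randn({8, 128}, opts);  // K = 128, a multiple of 8
  torch::Tensor perm =
      torch::randperm(128, torch::dtype(torch::kLong).device(torch::kCUDA))
          .to(torch::kInt);

  torch::Tensor D = permute_cols(A, perm);

  // Reference semantics: the same column permutation done the slow way.
  torch::Tensor ref = A.index_select(-1, perm.to(torch::kLong));
  TORCH_CHECK(D.equal(ref), "permute_cols mismatch");
}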


@@ -12,13 +12,22 @@ namespace prepare_inputs {
//
template <int const num_threads>
__global__ void advance_step_flashattn_kernel(
    int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
    long const* sampled_token_ids_ptr, long* input_positions_ptr,
    int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
    int64_t const block_tables_stride) {
  int const n_pad = num_seqs - num_queries;
  if (n_pad && blockIdx.x == 0) {
    // Handle cuda graph padding
    int const offset = num_queries;
    for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
      input_tokens_ptr[offset + i] = 0;
      input_positions_ptr[offset + i] = 0;
      slot_mapping_ptr[offset + i] = -1;
    }
  }

  int num_query_blocks = div_ceil(num_queries, num_threads);

  if (blockIdx.x >= num_query_blocks) {
@@ -54,7 +63,7 @@ __global__ void advance_step_kernel(int num_seqs, int num_queries,
    slot_mapping_ptr[cur_query_id] = slot_num;
}

inline void verify_tensor(std::string const& name, torch::Tensor const& t,
                          int64_t const size_0, int64_t const size_1,
                          c10::ScalarType const type) {
  bool size_0_cond = true;
@@ -79,16 +88,91 @@ inline void verify_tensor(std::string const& name, torch::Tensor& t,
  }
}
// removed in this change (its body now lives in advance_step_flashattn below):
void advance_step(int num_seqs, int num_queries, int block_size,
                  torch::Tensor& input_tokens,       // type: long
                  torch::Tensor& sampled_token_ids,  // type: long
                  torch::Tensor& input_positions,    // type: long
                  torch::Tensor& seq_lens,           // type: int
                  torch::Tensor& slot_mapping,       // type: long
                  torch::Tensor& block_tables) {     // type: int

// added in this change:
__global__ void advance_step_flashinfer_kernel(
    int num_threads, int num_seqs, int num_queries, int block_size,
    long* input_tokens_ptr, long const* sampled_token_ids_ptr,
    long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
    int const* block_tables_ptr, int64_t const block_tables_stride,
    int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
  int num_query_blocks = div_ceil(num_queries, num_threads);
if (blockIdx.x < num_query_blocks) {
int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
if (cur_query_id < num_queries) {
// Update input_tokens
input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
int seq_len = seq_lens_ptr[cur_query_id];
int next_seq_len = seq_len + 1;
int next_input_pos = next_seq_len - 1;
// Update seq_lens
seq_lens_ptr[cur_query_id] = next_seq_len;
// Update input_positions
input_positions_ptr[cur_query_id] = next_input_pos;
int const* seq_block_tables_ptr =
block_tables_ptr + block_tables_stride * cur_query_id;
int block_index = next_input_pos / block_size;
int block_offset = next_input_pos % block_size;
// Update paged_kv_last_page_len
paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1;
int slot_num =
seq_block_tables_ptr[block_index] * block_size + block_offset;
// Update slot_mapping
slot_mapping_ptr[cur_query_id] = slot_num;
block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size);
}
}
}
__global__ void advance_step_flashinfer_indptr_kernel(
int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
int* block_table_bound_ptr) {
int idx = blockIdx.x * num_threads + threadIdx.x;
// Update paged_kv_indptr
if (idx < num_queries) {
int sum = 0;
for (int i = 0; i <= idx; ++i) {
sum += block_table_bound_ptr[i];
}
paged_kv_indptr_ptr[idx + 1] = sum;
}
}
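// Worked example for the kernel above: with three queries and
// block_table_bound = [2, 1, 3], thread idx computes the inclusive prefix sum
// of the bounds up to idx, so paged_kv_indptr becomes [0, 2, 3, 6] (entry 0
// is not written here and is assumed to already hold 0). Each thread rescans
// from 0, i.e. O(num_queries) work per thread, which is cheap at these sizes.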
__global__ void advance_step_flashinfer_indices_kernel(
int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr,
int64_t const block_tables_stride, int* paged_kv_indices_ptr,
int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
int idx = blockIdx.x * num_threads + threadIdx.x;
int row = idx / block_tables_stride;
int col = idx % block_tables_stride;
if (row < num_queries && col < block_table_bound_ptr[row]) {
paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] =
block_tables_ptr[row * block_tables_stride + col];
}
// if cudagraph, fill padded seqs with the last valid seq's indptr
if (num_queries < row && row <= num_seqs) {
paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries];
}
}
void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
torch::Tensor& input_tokens, // type: long
torch::Tensor& sampled_token_ids, // type: long
torch::Tensor& input_positions, // type: long
torch::Tensor& seq_lens, // type: int
torch::Tensor& slot_mapping, // type: long
torch::Tensor& block_tables) { // type: int
  if (logging) {
    printf("advance_step_flashattn:\n");
    printf("  num_seqs = %d\n", num_seqs);
    printf("  num_queries = %d\n", num_queries);
    printf("  block_size = %d\n", block_size);
@@ -108,24 +192,126 @@ void advance_step(int num_seqs, int num_queries, int block_size,
  int blocks;
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);

  advance_step_flashattn_kernel<max_threads>
      <<<blocks, max_threads, 0, stream>>>(
          num_seqs, num_queries, block_size,
          reinterpret_cast<long*>(input_tokens.data_ptr()),
          reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
          reinterpret_cast<long*>(input_positions.data_ptr()),
          reinterpret_cast<int*>(seq_lens.data_ptr()),
          reinterpret_cast<long*>(slot_mapping.data_ptr()),
          reinterpret_cast<int const*>(block_tables.data_ptr()),
          block_tables.stride(0));
}
void advance_step_flashinfer(
int num_seqs, int num_queries, int block_size,
torch::Tensor& input_tokens, // type: long
torch::Tensor& sampled_token_ids, // type: long
torch::Tensor& input_positions, // type: long
torch::Tensor& seq_lens, // type: int
torch::Tensor& slot_mapping, // type: long
torch::Tensor& block_tables, // type: int
torch::Tensor& paged_kv_indices, // type: int
torch::Tensor& paged_kv_indptr, // type: int
torch::Tensor& paged_kv_last_page_len, // type: int
torch::Tensor& block_table_bound) { // type: int
if (logging) {
printf("advance_step_flashinfer:\n");
printf(" num_seqs = %d\n", num_seqs);
printf(" num_queries = %d\n", num_queries);
printf(" block_size = %d\n", block_size);
printf(" block_tables.stride(0) = %zu\n", block_tables.stride(0));
}
// Verify all tensors
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
// verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
// at::kLong);
verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
at::kInt);
verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
int dev = sampled_token_ids.get_device();
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
int blocks;
int threads;
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
if (logging) {
printf("launching kernel with %d blocks\n", blocks);
}
// TODO(will): support arbitrary block_tables stride
if ((blocks * threads) / block_tables.stride(0) < num_queries) {
    TORCH_CHECK(false,
                "multi-step: not enough threads to map block_table to "
                "FlashInfer's paged_kv_indices on GPU. Try reducing the number "
                "of seqs, increasing the block size, or taking smaller steps. "
                "num_queries = ", num_queries,
                " block_tables.stride(0) = ", block_tables.stride(0),
                " blocks = ", blocks, " max_threads = ", threads);
}
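  // Rationale for the check above: advance_step_flashinfer_indices_kernel
  // maps one thread per (row, col) slot of the block table, so the grid's
  // blocks * threads must cover num_queries * block_tables.stride(0) entries.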
advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
threads, num_seqs, num_queries, block_size,
      reinterpret_cast<long*>(input_tokens.data_ptr()),
      reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
      reinterpret_cast<long*>(input_positions.data_ptr()),
      reinterpret_cast<int*>(seq_lens.data_ptr()),
      reinterpret_cast<long*>(slot_mapping.data_ptr()),
      reinterpret_cast<int const*>(block_tables.data_ptr()),
      block_tables.stride(0),
reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
reinterpret_cast<int*>(block_table_bound.data_ptr()));
advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
threads, num_seqs, num_queries,
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
reinterpret_cast<int*>(block_table_bound.data_ptr()));
advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
threads, num_seqs, num_queries,
reinterpret_cast<int const*>(block_tables.data_ptr()),
block_tables.stride(0),
reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
reinterpret_cast<int*>(block_table_bound.data_ptr()));
}

}  // namespace prepare_inputs
// removed in this change:
void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
                  torch::Tensor& slot_mapping, torch::Tensor& block_tables) {
  prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens,
                               sampled_token_ids, input_positions, seq_lens,
                               slot_mapping, block_tables);
}

// added in this change:
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
                            int64_t block_size, torch::Tensor& input_tokens,
                            torch::Tensor& sampled_token_ids,
                            torch::Tensor& input_positions,
                            torch::Tensor& seq_lens,
                            torch::Tensor& slot_mapping,
                            torch::Tensor& block_tables) {
  prepare_inputs::advance_step_flashattn(
      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
      input_positions, seq_lens, slot_mapping, block_tables);
}

void advance_step_flashinfer(
    int64_t num_seqs, int64_t num_queries, int64_t block_size,
    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
    torch::Tensor& input_positions, torch::Tensor& seq_lens,
    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
  prepare_inputs::advance_step_flashinfer(
      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
      input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices,
      paged_kv_indptr, paged_kv_last_page_len, block_table_bound);
}


@@ -14,12 +14,17 @@
static inline __device__ int8_t float_to_int8_rn(float x) {
#ifdef USE_ROCM
  static constexpr auto i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
  static constexpr auto i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
  // To match the rounding mode of CUDA, we use nearbyint.
  // It uses the current rounding mode, which is always FE_TONEAREST on HIP.
  // If that changes in the future, we may need to set the rounding mode
  // explicitly, either at runtime or compile time.
  float dst = std::nearbyint(x);
  // saturate
  dst = std::clamp(dst, i8_min, i8_max);
  return static_cast<int8_t>(dst);
@@ -31,6 +36,59 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
#endif
}
static inline __device__ int32_t float_to_int32_rn(float x) {
#ifdef USE_ROCM
// int32_max is not exactly representable as float.
// Therefore, we need to be careful and manually return int32_max on overflow.
// For symmetry, we also do the same for int32_min, even though it is exactly
// representable as float and the conversion should be exact.
static constexpr auto i32_min = std::numeric_limits<int32_t>::min();
static constexpr auto i32_min_f = static_cast<float>(i32_min);
static constexpr auto i32_max = std::numeric_limits<int32_t>::max();
static constexpr auto i32_max_f = static_cast<float>(i32_max);
// To match the rounding mode of CUDA, we use nearbyint.
// It uses the current rounding mode, which is always FE_TONEAREST on HIP.
// If that changes in the future, we may need to set the rounding mode
// explicitly, either at runtime or compile time.
float dst = std::nearbyint(x);
// saturate on the higher end.
if (dst >= i32_max_f) {
return i32_max;
}
// saturate on the lower end.
if (dst <= i32_min_f) {
return i32_min;
}
return static_cast<int32_t>(dst);
#else
// CUDA path
uint32_t dst;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(dst) : "f"(x));
return reinterpret_cast<const int32_t&>(dst);
#endif
}
static inline __device__ int8_t int32_to_int8(int32_t x) {
#ifdef USE_ROCM
static constexpr auto i8_min =
static_cast<int32_t>(std::numeric_limits<int8_t>::min());
static constexpr auto i8_max =
static_cast<int32_t>(std::numeric_limits<int8_t>::max());
// saturate
int32_t dst = std::clamp(x, i8_min, i8_max);
return static_cast<int8_t>(dst);
#else
// CUDA path
uint32_t dst;
asm volatile("cvt.sat.s8.s32 %0, %1;" : "=r"(dst) : "r"(x));
return reinterpret_cast<const int8_t&>(dst);
#endif
}
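// Sanity examples for the helpers above (the ROCm and CUDA paths should
// agree):
//   float_to_int32_rn(2147483648.0f) -> 2147483647 (saturates at int32 max)
//   float_to_int32_rn(0.5f) -> 0, float_to_int32_rn(1.5f) -> 2
//       (round-to-nearest-even)
//   int32_to_int8(300) -> 127, int32_to_int8(-300) -> -128 (saturating)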
namespace vllm {

template <typename scalar_t, typename scale_type>
@@ -47,6 +105,23 @@ __global__ void static_scaled_int8_quant_kernel(
  }
}
template <typename scalar_t, typename scale_type, typename azp_type>
__global__ void static_scaled_int8_azp_quant_kernel(
scalar_t const* __restrict__ input, int8_t* __restrict__ out,
scale_type const* scale_ptr, azp_type const* azp_ptr,
const int hidden_size) {
int const tid = threadIdx.x;
int const token_idx = blockIdx.x;
scale_type const scale = *scale_ptr;
azp_type const azp = *azp_ptr;
for (int i = tid; i < hidden_size; i += blockDim.x) {
auto const val = static_cast<float>(input[token_idx * hidden_size + i]);
auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp);
out[token_idx * hidden_size + i] = quant_val;
}
}
template <typename scalar_t, typename scale_type>
__global__ void dynamic_scaled_int8_quant_kernel(
    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
@@ -80,14 +155,68 @@ __global__ void dynamic_scaled_int8_quant_kernel(
  }
}
template <typename scalar_t, typename scale_type, typename azp_type>
__global__ void dynamic_scaled_int8_azp_quant_kernel(
scalar_t const* __restrict__ input, int8_t* __restrict__ out,
scale_type* scale, azp_type* azp, const int hidden_size) {
int const token_idx = blockIdx.x;
// Scan for the min and max value for this token
float max_val = std::numeric_limits<float>::min();
float min_val = std::numeric_limits<float>::max();
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
auto val = static_cast<float>(input[token_idx * hidden_size + i]);
max_val = std::max(max_val, val);
min_val = std::min(min_val, val);
}
// Reduce the max and min values across the block
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStorage;
max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x);
__syncthreads(); // Make sure min doesn't mess with max shared memory
min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x);
__shared__ scale_type scale_sh;
__shared__ azp_type azp_sh;
// Compute the scale and zero point and store them, only on the first thread
if (threadIdx.x == 0) {
float const scale_val = (max_val - min_val) / 255.0f;
// Use rounding to even (same as torch.round)
auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val);
auto const azp_val = static_cast<azp_type>(azp_float);
// Store the scale and azp into shared and global
scale[token_idx] = scale_sh = scale_val;
azp[token_idx] = azp_sh = azp_val;
}
// Wait for the scale and azp to be computed
__syncthreads();
float const scale_val = scale_sh;
azp_type const azp_val = azp_sh;
// Quantize the values
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
auto const val = static_cast<float>(input[token_idx * hidden_size + i]);
auto const quant_val =
int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val);
out[token_idx * hidden_size + i] = quant_val;
}
}
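// Worked example for the scale/azp math above: for a token with
// min_val = -1.0f and max_val = 3.0f, scale = 4/255 ~= 0.01569 and
// azp = nearbyint(-128 - (-1.0 / scale)) = nearbyint(-64.25) = -64. The
// extremes then map to round(-1.0 / scale) + azp = -64 + (-64) = -128 and
// round(3.0 / scale) + azp = 191 + (-64) = 127, i.e. the full int8 range.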
}  // namespace vllm

void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                              torch::Tensor const& input,  // [..., hidden_size]
                              torch::Tensor const& scale,
                              c10::optional<torch::Tensor> const& azp) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scale.numel() == 1);
  TORCH_CHECK(!azp || azp->numel() == 1);

  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
@@ -96,19 +225,29 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
        if (!azp) {
          vllm::static_scaled_int8_quant_kernel<scalar_t, float>
              <<<grid, block, 0, stream>>>(
                  input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
                  scale.data_ptr<float>(), hidden_size);
        } else {
          vllm::static_scaled_int8_azp_quant_kernel<scalar_t, float, int32_t>
              <<<grid, block, 0, stream>>>(
                  input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
                  scale.data_ptr<float>(), azp->data_ptr<int32_t>(),
                  hidden_size);
        }
      });
}
void dynamic_scaled_int8_quant(
    torch::Tensor& out,          // [..., hidden_size]
    torch::Tensor const& input,  // [..., hidden_size]
    torch::Tensor& scales, c10::optional<torch::Tensor> const& azp) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scales.is_contiguous());
  TORCH_CHECK(!azp || azp->is_contiguous());

  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
@@ -117,9 +256,17 @@ void dynamic_scaled_int8_quant(
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
        if (!azp) {
          vllm::dynamic_scaled_int8_quant_kernel<scalar_t, float>
              <<<grid, block, 0, stream>>>(
                  input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
                  scales.data_ptr<float>(), hidden_size);
        } else {
          vllm::dynamic_scaled_int8_azp_quant_kernel<scalar_t, float, int32_t>
              <<<grid, block, 0, stream>>>(
                  input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
                  scales.data_ptr<float>(), azp->data_ptr<int32_t>(),
                  hidden_size);
        }
      });
}
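For orientation, a hedged host-side sketch of calling the two entry points above; int8_quant_demo is a hypothetical helper, the shapes follow the TORCH_CHECKs in this file, and the azp tensors use int32 as the kernels expect.

// Hedged usage sketch (assumes LibTorch + CUDA; int8_quant_demo is
// hypothetical and not part of this diff).
#include <torch/torch.h>

void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                              torch::Tensor const& scale,
                              c10::optional<torch::Tensor> const& azp);
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                               torch::Tensor& scales,
                               c10::optional<torch::Tensor> const& azp);

void int8_quant_demo() {
  auto f32 = torch::dtype(torch::kFloat).device(torch::kCUDA);
  torch::Tensor input = torch::randn({16, 4096}, f32);
  torch::Tensor out = torch::empty_like(input, torch::kInt8);

  // Static path: one per-tensor scale, optional per-tensor zero point.
  torch::Tensor scale = torch::full({1}, 0.02f, f32);
  torch::Tensor azp =
      torch::full({1}, 3, torch::dtype(torch::kInt).device(torch::kCUDA));
  static_scaled_int8_quant(out, input, scale, azp);
  static_scaled_int8_quant(out, input, scale, c10::nullopt);  // symmetric

  // Dynamic path: the kernel writes one scale (and zero point) per token.
  torch::Tensor scales = torch::empty({16, 1}, f32);
  torch::Tensor azps =
      torch::empty({16, 1}, torch::dtype(torch::kInt).device(torch::kCUDA));
  dynamic_scaled_int8_quant(out, input, scales, azps);
}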

Some files were not shown because too many files have changed in this diff.