[Core] Use flashinfer sampling kernel when available (#7137)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-19 11:24:03 +08:00
parent ff7ec82c4d
commit f710fb5265
5 changed files with 129 additions and 27 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -192,7 +192,9 @@ steps:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
-  command: pytest -v -s samplers
+  commands:
+    - pytest -v -s samplers
+    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

 - label: LogitsProcessor Test # 5min
  mirror_hardwares: [amd]