[Frontend] [Core] perf: Automatically detect vLLM-tensorized model, update tensorizer to version 2.9.0 (#4208)

This commit is contained in:
Sanger Steel
2024-05-13 17:57:07 -04:00
committed by GitHub
parent 0fca3cdcf2
commit 8bc68e198c
10 changed files with 259 additions and 523 deletions

View File

@@ -60,11 +60,13 @@ steps:
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
- pip install awscli
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- label: Kernels Test %N
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT