[Frontend] [Core] perf: Automatically detect vLLM-tensorized model, update tensorizer to version 2.9.0 (#4208)

2024-05-13 17:57:07 -04:00
parent 0fca3cdcf2
commit 8bc68e198c
10 changed files with 259 additions and 523 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -60,11 +60,13 @@ steps:
  mirror_hardwares: [amd]
  commands:
    # install aws cli for llava_example.py
-    - pip install awscli
+    # install tensorizer for tensorize_vllm_model.py
+    - pip install awscli tensorizer
    - python3 offline_inference.py
    - python3 offline_inference_with_prefix.py
    - python3 llm_engine_example.py
    - python3 llava_example.py
+    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

 - label: Kernels Test %N
  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT