[Performance] Optimize e2e overheads: Reduce python allocations (#7162)

2024-08-09 00:34:28 -04:00
parent 73388c07a4
commit e02ac55617
11 changed files with 549 additions and 124 deletions
--- a/vllm/model_executor/__init__.py
+++ b/vllm/model_executor/__init__.py
@@ -1,10 +1,12 @@
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                           PackedvLLMParameter)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.sampling_metadata import (SamplingMetadata,
+                                                   SamplingMetadataCache)
 from vllm.model_executor.utils import set_random_seed

 __all__ = [
    "SamplingMetadata",
+    "SamplingMetadataCache",
    "set_random_seed",
    "BasevLLMParameter",
    "PackedvLLMParameter",