[Misc] refactor context extension (#19246)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
@@ -1,8 +1,18 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This script demonstrates how to extend the context length
|
||||
of a Qwen model using the YaRN method (rope_scaling)
|
||||
and run a simple chat example.
|
||||
|
||||
Usage:
|
||||
python examples/offline_inference/context_extension.py
|
||||
"""
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
|
||||
def create_llm():
|
||||
rope_theta = 1000000
|
||||
original_max_position_embeddings = 32768
|
||||
factor = 4.0
|
||||
@@ -19,7 +29,10 @@ hf_overrides = {
|
||||
}
|
||||
|
||||
llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
|
||||
return llm
|
||||
|
||||
|
||||
def run_llm_chat(llm):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
@@ -32,6 +45,7 @@ conversation = [
|
||||
{"role": "assistant", "content": "Hello! How can I assist you today?"},
|
||||
]
|
||||
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
|
||||
return outputs
|
||||
|
||||
|
||||
def print_outputs(outputs):
|
||||
@@ -44,4 +58,11 @@ def print_outputs(outputs):
|
||||
print("-" * 80)
|
||||
|
||||
|
||||
def main():
    """Run the example end to end.

    Builds the model via ``create_llm``, passes it to ``run_llm_chat``,
    and hands the resulting outputs to ``print_outputs``.
    """
    engine = create_llm()
    chat_results = run_llm_chat(engine)
    print_outputs(chat_results)
|
||||
|
||||
|
||||
# Script entry point: run the demo only when this file is executed directly,
# so importing the module does not trigger model construction.
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user