[Misc] refactor context extension (#19246)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-07 13:13:21 +08:00
parent cf02f9b283
commit 122cdca5f6
1 changed files with 47 additions and 26 deletions
--- a/examples/offline_inference/context_extension.py
+++ b/examples/offline_inference/context_extension.py
@@ -1,8 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This script demonstrates how to extend the context length
+of a Qwen model using the YaRN method (rope_scaling)
+and run a simple chat example.
+
+Usage:
+    python examples/offline_inference/context_extension.py
+"""

 from vllm import LLM, SamplingParams

+
+def create_llm():
    rope_theta = 1000000
    original_max_position_embeddings = 32768
    factor = 4.0
@@ -19,7 +29,10 @@ hf_overrides = {
    }

    llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
+    return llm

+
+def run_llm_chat(llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
@@ -32,6 +45,7 @@ conversation = [
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+    return outputs


 def print_outputs(outputs):
@@ -44,4 +58,11 @@ def print_outputs(outputs):
        print("-" * 80)


+def main():
+    llm = create_llm()
+    outputs = run_llm_chat(llm)
    print_outputs(outputs)
+
+
+if __name__ == "__main__":
+    main()