From 550a04a0caa5a351239968897ff29f676e7501e1 Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Tue, 28 Apr 2026 02:10:48 +0000
Subject: [PATCH] Add super_v3 (Nemotron) reasoning parser for custom weights

---
 vllm/Dockerfile                   |  3 +++
 vllm/super_v3_reasoning_parser.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 vllm/super_v3_reasoning_parser.py

diff --git a/vllm/Dockerfile b/vllm/Dockerfile
index c1216d0..3030348 100644
--- a/vllm/Dockerfile
+++ b/vllm/Dockerfile
@@ -235,6 +235,9 @@ RUN apt install -y --no-install-recommends tmux cmake
 # Deprecated cleanup
 RUN pip uninstall -y pynvml && pip install nvidia-ml-py
 
+# Copy over the Nemotron reasoning parser
+COPY ./super_v3_reasoning_parser.py /opt/super_v3_reasoning_parser.py
+
 # Copy vLLM shim that intercepts --model to download custom weights from URLs
 COPY vllm_shim_module.py /opt/vllm-shim/vllm_shim_module.py
 
diff --git a/vllm/super_v3_reasoning_parser.py b/vllm/super_v3_reasoning_parser.py
new file mode 100644
index 0000000..9a98bf7
--- /dev/null
+++ b/vllm/super_v3_reasoning_parser.py
@@ -0,0 +1,28 @@
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+
+
+@ReasoningParserManager.register_module("super_v3")
+class SuperV3ReasoningParser(DeepSeekR1ReasoningParser):
+    def extract_reasoning_content(self, model_output, request):
+        reasoning_content, final_content = super().extract_reasoning_content(
+            model_output, request
+        )
+        if (
+            hasattr(request, "chat_template_kwargs")
+            and request.chat_template_kwargs
+            and (
+                request.chat_template_kwargs.get("enable_thinking") is False
+                or request.chat_template_kwargs.get("force_nonempty_content") is True
+            )
+            and final_content is None
+        ):
+            # Why: the `deepseek_r1` parser this inherits from puts the entire
+            # output into reasoning_content when it finds no `</think>` token.
+            # 1. Nemotron Nano/Super have a thinking-off mode selected via the
+            #    chat-template kwarg `enable_thinking=False`; in that mode the
+            #    whole output is answer content, not reasoning.
+            # 2. Reasoning can exceed max length and never emit `</think>`;
+            #    `force_nonempty_content=True` opts into content over reasoning.
+            reasoning_content, final_content = None, reasoning_content
+
+        return reasoning_content, final_content