[Doc]: Update OpenAI-Compatible Server documents (#12082)
@@ -238,7 +238,7 @@ class EngineArgs:
             choices=get_args(TaskOption),
             help='The task to use the model for. Each vLLM instance only '
             'supports one task, even if the same model can be used for '
-            'multiple tasks. When the model only supports one task, "auto" '
+            'multiple tasks. When the model only supports one task, ``"auto"`` '
             'can be used to select it; otherwise, you must specify explicitly '
             'which task to use.')
         parser.add_argument(
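As an aside on the choices=get_args(TaskOption) pattern in this hunk: typing.get_args() unpacks the values of a Literal alias into a tuple that argparse can use directly as choices. A minimal sketch with illustrative task names (a stand-in, not vLLM's actual TaskOption definition):

import argparse
from typing import Literal, get_args

# Illustrative stand-in for vLLM's TaskOption Literal alias.
TaskOption = Literal["auto", "generate", "embed"]

parser = argparse.ArgumentParser()
parser.add_argument("--task", choices=get_args(TaskOption), default="auto")
print(parser.parse_args(["--task", "generate"]).task)  # -> "generate"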
@@ -250,7 +250,7 @@ class EngineArgs:
         parser.add_argument(
             '--skip-tokenizer-init',
             action='store_true',
-            help='Skip initialization of tokenizer and detokenizer')
+            help='Skip initialization of tokenizer and detokenizer.')
         parser.add_argument(
             '--revision',
             type=nullable_str,
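nullable_str is a vLLM helper that this hunk only references, not defines. As an assumption (not the actual implementation), it can be read as an argparse type converter that maps an empty value or the literal string "None" to Python None:

from typing import Optional

def nullable_str(val: str) -> Optional[str]:
    # Hypothetical sketch of the converter referenced above: treat "" and
    # the literal string "None" as null; pass everything else through.
    if not val or val == "None":
        return None
    return val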
@@ -401,7 +401,7 @@ class EngineArgs:
         parser.add_argument(
             '--worker-use-ray',
             action='store_true',
-            help='Deprecated, use --distributed-executor-backend=ray.')
+            help='Deprecated, use ``--distributed-executor-backend=ray``.')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,
@@ -430,7 +430,7 @@ class EngineArgs:
                             choices=[8, 16, 32, 64, 128],
                             help='Token block size for contiguous chunks of '
                             'tokens. This is ignored on neuron devices and '
-                            'set to max-model-len. On CUDA devices, '
+                            'set to ``--max-model-len``. On CUDA devices, '
                             'only block sizes up to 32 are supported. '
                             'On HPU devices, block size defaults to 128.')

@@ -439,12 +439,12 @@ class EngineArgs:
             action=argparse.BooleanOptionalAction,
             default=EngineArgs.enable_prefix_caching,
             help="Enables automatic prefix caching. "
-            "Use --no-enable-prefix-caching to disable explicitly.",
+            "Use ``--no-enable-prefix-caching`` to disable explicitly.",
         )
         parser.add_argument('--disable-sliding-window',
                             action='store_true',
                             help='Disables sliding window, '
-                            'capping to sliding window size')
+                            'capping to sliding window size.')
         parser.add_argument('--use-v2-block-manager',
                             action='store_true',
                             default=True,
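The --no-enable-prefix-caching flag mentioned in that help string is not declared separately: argparse.BooleanOptionalAction (Python 3.9+) generates the negated form automatically. A minimal sketch outside of vLLM:

import argparse

parser = argparse.ArgumentParser()
# BooleanOptionalAction registers both --enable-prefix-caching and
# --no-enable-prefix-caching from this single add_argument call.
parser.add_argument("--enable-prefix-caching",
                    action=argparse.BooleanOptionalAction,
                    default=True)
args = parser.parse_args(["--no-enable-prefix-caching"])
print(args.enable_prefix_caching)  # -> False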
@@ -861,7 +861,7 @@ class EngineArgs:
             "of the provided names. The model name in the model "
             "field of a response will be the first name in this "
             "list. If not specified, the model name will be the "
-            "same as the `--model` argument. Noted that this name(s) "
+            "same as the ``--model`` argument. Noted that this name(s) "
             "will also be used in `model_name` tag content of "
             "prometheus metrics, if multiple names provided, metrics "
             "tag will take the first one.")
@@ -881,7 +881,7 @@ class EngineArgs:
             default=None,
             help="Valid choices are " +
             ",".join(ALLOWED_DETAILED_TRACE_MODULES) +
-            ". It makes sense to set this only if --otlp-traces-endpoint is"
+            ". It makes sense to set this only if ``--otlp-traces-endpoint`` is"
             " set. If set, it will collect detailed traces for the specified "
             "modules. This involves use of possibly costly and or blocking "
             "operations and hence might have a performance impact.")
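The common thread in these hunks is switching flag references in help strings from plain text or single backticks to double backticks, which is reStructuredText inline-literal markup, presumably so the generated OpenAI-compatible server documentation renders the flags as code literals while the plain --help output stays readable. A quick way to see the effect, assuming docutils is installed (this is not vLLM's doc pipeline, just a demonstration):

from docutils.core import publish_parts

help_text = 'Deprecated, use ``--distributed-executor-backend=ray``.'
# The double-backtick span becomes an inline literal in the rendered HTML.
print(publish_parts(help_text, writer_name="html")["fragment"])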