Disable outlines cache by default (#14837)

This commit is contained in:
Russell Bryant
2025-03-14 23:57:55 -04:00
committed by GitHub
parent ccf02fcbae
commit 776dcec8fe
2 changed files with 16 additions and 1 deletions

View File

@@ -95,6 +95,7 @@ if TYPE_CHECKING:
    VLLM_DP_MASTER_IP: str = ""
    VLLM_DP_MASTER_PORT: int = 0
    VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
    VLLM_V0_USE_OUTLINES_CACHE: bool = False

def get_default_cache_root():
@@ -623,6 +624,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Whether to use atomicAdd reduce in gptq/awq marlin kernel.
    "VLLM_MARLIN_USE_ATOMIC_ADD":
    lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",

    # Whether to turn on the outlines cache for V0
    # This cache is unbounded and on disk, so it's not safe to use in
    # an environment with potentially malicious users.
    "VLLM_V0_USE_OUTLINES_CACHE":
    lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",
}

# end-env-vars-definition

View File

@@ -24,7 +24,7 @@ from typing import Callable, DefaultDict, Dict, List, Optional, Union
import numpy as np
import torch
from outlines import grammars
from outlines.caching import cache, disable_cache
from outlines.fsm.guide import (CFGGuide, CFGState, Generate, Guide,
                                RegexGuide, Write)
from outlines.fsm.parsing import PartialLark
@@ -32,12 +32,20 @@ from outlines_core.fsm.json_schema import build_regex_from_schema
from pydantic import BaseModel
from transformers import PreTrainedTokenizerBase

import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.guided_decoding.reasoner import Reasoner
from vllm.platforms import current_platform

logger = init_logger(__name__)
if envs.VLLM_V0_USE_OUTLINES_CACHE:
logger.warning("Enabling outlines cache. This is an unbounded on-disk "
"cache. It may consume a lot of disk space and should "
"not be used with untrusted clients.")
else:
disable_cache()
class BaseLogitsProcessor: