Disable outlines cache by default (#14837)
This commit is contained in:
@@ -95,6 +95,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_DP_MASTER_IP: str = ""
|
VLLM_DP_MASTER_IP: str = ""
|
||||||
VLLM_DP_MASTER_PORT: int = 0
|
VLLM_DP_MASTER_PORT: int = 0
|
||||||
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
|
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
|
||||||
|
VLLM_V0_USE_OUTLINES_CACHE: bool = False
|
||||||
|
|
||||||
|
|
||||||
def get_default_cache_root():
|
def get_default_cache_root():
|
||||||
@@ -623,6 +624,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# Whether to use atomicAdd reduce in gptq/awq marlin kernel.
|
# Whether to use atomicAdd reduce in gptq/awq marlin kernel.
|
||||||
"VLLM_MARLIN_USE_ATOMIC_ADD":
|
"VLLM_MARLIN_USE_ATOMIC_ADD":
|
||||||
lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",
|
lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",
|
||||||
|
|
||||||
|
# Whether to turn on the outlines cache for V0
|
||||||
|
# This cache is unbounded and on disk, so it's not safe to use in
|
||||||
|
# an environment with potentially malicious users.
|
||||||
|
"VLLM_V0_USE_OUTLINES_CACHE":
|
||||||
|
lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",
|
||||||
}
|
}
|
||||||
|
|
||||||
# end-env-vars-definition
|
# end-env-vars-definition
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from typing import Callable, DefaultDict, Dict, List, Optional, Union
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from outlines import grammars
|
from outlines import grammars
|
||||||
from outlines.caching import cache
|
from outlines.caching import cache, disable_cache
|
||||||
from outlines.fsm.guide import (CFGGuide, CFGState, Generate, Guide,
|
from outlines.fsm.guide import (CFGGuide, CFGState, Generate, Guide,
|
||||||
RegexGuide, Write)
|
RegexGuide, Write)
|
||||||
from outlines.fsm.parsing import PartialLark
|
from outlines.fsm.parsing import PartialLark
|
||||||
@@ -32,12 +32,20 @@ from outlines_core.fsm.json_schema import build_regex_from_schema
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.guided_decoding.reasoner import Reasoner
|
from vllm.model_executor.guided_decoding.reasoner import Reasoner
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
if envs.VLLM_V0_USE_OUTLINES_CACHE:
|
||||||
|
logger.warning("Enabling outlines cache. This is an unbounded on-disk "
|
||||||
|
"cache. It may consume a lot of disk space and should "
|
||||||
|
"not be used with untrusted clients.")
|
||||||
|
else:
|
||||||
|
disable_cache()
|
||||||
|
|
||||||
|
|
||||||
class BaseLogitsProcessor:
|
class BaseLogitsProcessor:
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user