[Misc] Move dynamic seed initialization to EngineArgs (#29165)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -146,9 +146,12 @@ class ModelConfig:
|
|||||||
- "bfloat16" for a balance between precision and range.\n
|
- "bfloat16" for a balance between precision and range.\n
|
||||||
- "float" is shorthand for FP32 precision.\n
|
- "float" is shorthand for FP32 precision.\n
|
||||||
- "float32" for FP32 precision."""
|
- "float32" for FP32 precision."""
|
||||||
seed: int | None = None
|
seed: int = 0
|
||||||
"""Random seed for reproducibility. Initialized to None in V0, but
|
"""Random seed for reproducibility.
|
||||||
initialized to 0 in V1."""
|
|
||||||
|
We must set the global seed because otherwise,
|
||||||
|
different tensor parallel workers would sample different tokens,
|
||||||
|
leading to inconsistent results."""
|
||||||
hf_config: PretrainedConfig = field(init=False)
|
hf_config: PretrainedConfig = field(init=False)
|
||||||
"""The Hugging Face config of the model."""
|
"""The Hugging Face config of the model."""
|
||||||
hf_text_config: PretrainedConfig = field(init=False)
|
hf_text_config: PretrainedConfig = field(init=False)
|
||||||
@@ -415,7 +418,7 @@ class ModelConfig:
|
|||||||
def __post_init__(
|
def __post_init__(
|
||||||
self,
|
self,
|
||||||
# Multimodal config init vars
|
# Multimodal config init vars
|
||||||
limit_mm_per_prompt: dict[str, int] | None,
|
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
|
||||||
enable_mm_embeds: bool | None,
|
enable_mm_embeds: bool | None,
|
||||||
media_io_kwargs: dict[str, dict[str, Any]] | None,
|
media_io_kwargs: dict[str, dict[str, Any]] | None,
|
||||||
mm_processor_kwargs: dict[str, Any] | None,
|
mm_processor_kwargs: dict[str, Any] | None,
|
||||||
@@ -428,23 +431,6 @@ class ModelConfig:
|
|||||||
skip_mm_profiling: bool | None,
|
skip_mm_profiling: bool | None,
|
||||||
video_pruning_rate: float | None,
|
video_pruning_rate: float | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
# Set the default seed to 0 in V1.
|
|
||||||
# NOTE(woosuk): In V1, we use separate processes for workers (unless
|
|
||||||
# VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
|
|
||||||
# doesn't affect the user process. However, without a consistent seed,
|
|
||||||
# different tensor parallel workers would sample different tokens,
|
|
||||||
# leading to inconsistent results.
|
|
||||||
if self.seed is None:
|
|
||||||
self.seed = 0
|
|
||||||
if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
|
|
||||||
logger.warning(
|
|
||||||
"The global random seed is set to %d. Since "
|
|
||||||
"VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
|
|
||||||
"affect the random state of the Python process that "
|
|
||||||
"launched vLLM.",
|
|
||||||
self.seed,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Keep the served_model_name assignment before maybe_model_redirect(self.model)
|
# Keep the served_model_name assignment before maybe_model_redirect(self.model)
|
||||||
self.served_model_name = get_served_model_name(
|
self.served_model_name = get_served_model_name(
|
||||||
self.model, self.served_model_name
|
self.model, self.served_model_name
|
||||||
@@ -1151,12 +1137,6 @@ class ModelConfig:
|
|||||||
self,
|
self,
|
||||||
parallel_config: ParallelConfig,
|
parallel_config: ParallelConfig,
|
||||||
) -> None:
|
) -> None:
|
||||||
if parallel_config.distributed_executor_backend == "external_launcher":
|
|
||||||
assert self.seed is not None, (
|
|
||||||
"Seed must be set when using external launcher backend to "
|
|
||||||
"make sure sampling results are the same across workers."
|
|
||||||
)
|
|
||||||
|
|
||||||
total_num_attention_heads = getattr(
|
total_num_attention_heads = getattr(
|
||||||
self.hf_text_config, "num_attention_heads", 0
|
self.hf_text_config, "num_attention_heads", 0
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from pydantic import Field, SkipValidation, model_validator
|
|||||||
from pydantic.dataclasses import dataclass
|
from pydantic.dataclasses import dataclass
|
||||||
from typing_extensions import Self
|
from typing_extensions import Self
|
||||||
|
|
||||||
|
from vllm.config.model import ModelConfig
|
||||||
from vllm.config.parallel import ParallelConfig
|
from vllm.config.parallel import ParallelConfig
|
||||||
from vllm.config.utils import config
|
from vllm.config.utils import config
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@@ -18,10 +19,8 @@ if TYPE_CHECKING:
|
|||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
import vllm.model_executor.layers.quantization as me_quant
|
import vllm.model_executor.layers.quantization as me_quant
|
||||||
from vllm.config import ModelConfig
|
|
||||||
else:
|
else:
|
||||||
PretrainedConfig = Any
|
PretrainedConfig = Any
|
||||||
ModelConfig = Any
|
|
||||||
|
|
||||||
me_quant = LazyLoader(
|
me_quant = LazyLoader(
|
||||||
"model_executor", globals(), "vllm.model_executor.layers.quantization"
|
"model_executor", globals(), "vllm.model_executor.layers.quantization"
|
||||||
@@ -316,10 +315,6 @@ class SpeculativeConfig:
|
|||||||
self.prompt_lookup_min = 0
|
self.prompt_lookup_min = 0
|
||||||
|
|
||||||
if self.model is not None:
|
if self.model is not None:
|
||||||
# TODO: Move this import to the top once `ModelConfig`
|
|
||||||
# lives in `vllm.config.model`.
|
|
||||||
from vllm.config import ModelConfig
|
|
||||||
|
|
||||||
self.draft_model_config = ModelConfig(
|
self.draft_model_config = ModelConfig(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
runner="draft",
|
runner="draft",
|
||||||
|
|||||||
@@ -367,7 +367,7 @@ class EngineArgs:
|
|||||||
config_format: str = ModelConfig.config_format
|
config_format: str = ModelConfig.config_format
|
||||||
dtype: ModelDType = ModelConfig.dtype
|
dtype: ModelDType = ModelConfig.dtype
|
||||||
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
|
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
|
||||||
seed: int | None = ModelConfig.seed
|
seed: int | None = None
|
||||||
max_model_len: int | None = ModelConfig.max_model_len
|
max_model_len: int | None = ModelConfig.max_model_len
|
||||||
cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes
|
cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes
|
||||||
cudagraph_capture_sizes: list[int] | None = (
|
cudagraph_capture_sizes: list[int] | None = (
|
||||||
@@ -1188,6 +1188,20 @@ class EngineArgs:
|
|||||||
if check_gguf_file(self.model):
|
if check_gguf_file(self.model):
|
||||||
self.quantization = self.load_format = "gguf"
|
self.quantization = self.load_format = "gguf"
|
||||||
|
|
||||||
|
# NOTE(woosuk): In V1, we use separate processes for workers (unless
|
||||||
|
# VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
|
||||||
|
# doesn't affect the user process.
|
||||||
|
if self.seed is None:
|
||||||
|
self.seed = 0
|
||||||
|
if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
|
||||||
|
logger.warning(
|
||||||
|
"The global random seed is set to %d. Since "
|
||||||
|
"VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
|
||||||
|
"affect the random state of the Python process that "
|
||||||
|
"launched vLLM.",
|
||||||
|
self.seed,
|
||||||
|
)
|
||||||
|
|
||||||
if self.disable_mm_preprocessor_cache:
|
if self.disable_mm_preprocessor_cache:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"`--disable-mm-preprocessor-cache` is deprecated "
|
"`--disable-mm-preprocessor-cache` is deprecated "
|
||||||
|
|||||||
@@ -106,9 +106,6 @@ class TPUWorker:
|
|||||||
"Profiling enabled. Traces will be saved to: %s", self.profile_dir
|
"Profiling enabled. Traces will be saved to: %s", self.profile_dir
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.model_config.seed is None:
|
|
||||||
self.model_config.seed = 0
|
|
||||||
|
|
||||||
def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
|
def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
|
||||||
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
||||||
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
||||||
|
|||||||
Reference in New Issue
Block a user