[Bug] Revert torch warning fix (#31585)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Wentao Ye
2026-01-05 17:31:21 -05:00
committed by GitHub
parent 276e03b92c
commit af9a7ec255
3 changed files with 6 additions and 8 deletions

View File

@@ -154,7 +154,7 @@ def run_tests(
with monkeypatch.context() as m:
# lock matmul precision to full FP32 (IEEE)
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
# m.setenv("VLLM_BATCH_INVARIANT", "1")
outputs: list[tuple[str, list, list]] = []
for n, (

View File

@@ -75,7 +75,7 @@ if TYPE_CHECKING:
VLLM_MEDIA_CONNECTOR: str = "http"
VLLM_TARGET_DEVICE: str = "cuda"
VLLM_MAIN_CUDA_VERSION: str = "12.9"
VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee"
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
MAX_JOBS: str | None = None
NVCC_THREADS: str | None = None
VLLM_USE_PRECOMPILED: bool = False
@@ -459,13 +459,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
or "12.9",
# Controls PyTorch float32 matmul precision mode within vLLM workers.
# Accepted values:
# - "ieee" (default): force full IEEE FP32 matmul precision.
# - "tf32": enable TensorFloat32-based fast matmul.
# Valid options mirror torch.set_float32_matmul_precision
"VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
"VLLM_FLOAT32_MATMUL_PRECISION",
"ieee",
["ieee", "tf32"],
"highest",
["highest", "high", "medium"],
case_sensitive=False,
),
# Maximum number of compilation jobs to run in parallel.

View File

@@ -84,7 +84,7 @@ class Worker(WorkerBase):
# configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.backends.cuda.matmul.fp32_precision = precision
torch.set_float32_matmul_precision(precision)
if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing