Reduce max_model_len to 32768 (876544 requires 204 GiB KV cache)

This commit is contained in:
2026-05-19 09:13:33 +00:00
parent e91421f06e
commit bcfbd1e25b

View File

@@ -24,7 +24,7 @@ services:
- --reasoning-parser=deepseek_v4
- --moe-backend=cutedsl
- --gpu-memory-utilization=0.9
- - --max-model-len=876544
+ - --max-model-len=32768
- --host=0.0.0.0
- --port=8000
deploy: