Switch to cudagraph_mode=NONE (not enforce-eager) for real inference testing
This commit is contained in:
@@ -16,7 +16,7 @@ services:
|
||||
- --trust-remote-code
|
||||
- --enable-expert-parallel
|
||||
- --tensor-parallel-size=8
|
||||
-- --enforce-eager
|
||||
+- --compilation-config={"cudagraph_mode":"NONE","custom_ops":["all"]}
|
||||
- --tokenizer-mode=deepseek_v4
|
||||
- --tool-call-parser=deepseek_v4
|
||||
- --enable-auto-tool-choice
|
||||
|
||||
Reference in New Issue
Block a user