diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md index bc5f46022..262782243 100644 --- a/docs/design/debug_vllm_compile.md +++ b/docs/design/debug_vllm_compile.md @@ -282,6 +282,15 @@ If vLLM's compile cache is wrong, this usually means that a factor is missing. Please see [this example](https://github.com/vllm-project/vllm/blob/18b39828d90413d05d770dfd2e2f48304f4ca0eb/vllm/config/model.py#L310) of how vLLM computes part of the cache key. +vLLM's compilation cache requires that the code being compiled ends up being serializable. +If this is not the case, then it will error out on save. Usually the fixes are to either: + +- rewrite the non-serializable pieces (perhaps difficult because it's difficult to + tell right now what is serializable and what isn't) +- file a bug report +- ignore the error by setting `VLLM_DISABLE_COMPILE_CACHE=1` (note that this will + make warm server starts a lot slower). + ## Debugging CUDAGraphs CUDAGraphs is a feature that allows one to: diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 331063ff1..606503539 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -273,7 +273,26 @@ class InductorStandaloneAdaptor(CompilerInterface): assert key is not None path = os.path.join(self.cache_dir, key) + def is_saveable_2_10(compiled_artifact): + # can just use compiled_artifact.is_saveable in 2.11 + if compiled_artifact._artifacts is None: + return False + _, cache_info = compiled_artifact._artifacts + return len(cache_info.aot_autograd_artifacts) == 1 + if is_compile_cache_enabled(compiler_config): + if not is_saveable_2_10(compiled_graph): + raise RuntimeError( + "The compiled artifact is not serializable. This usually means " + "that the model code has something that is not serializable " + "by torch.compile in it. 
You can fix this by either " "figuring out what is not serializable and rewriting it, " "filing a bug report, " "or suppressing this error by " "disabling vLLM's compilation cache via " "VLLM_DISABLE_COMPILE_CACHE=1 " "(this will greatly increase vLLM server warm start times)." ) compiled_graph.save(path=path, format=self.save_format) compilation_counter.num_compiled_artifacts_saved += 1 return compiled_graph, (key, path)