[Bugfix][FP8] Fix dynamic FP8 Marlin quantization (#7219)

This commit is contained in:
Michael Goin
2024-08-07 14:23:12 -04:00
committed by GitHub
parent fde47d3bc2
commit 5223199e03
3 changed files with 33 additions and 5 deletions

View File

@@ -52,6 +52,7 @@ if TYPE_CHECKING:
CMAKE_BUILD_TYPE: Optional[str] = None
VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_TEST_FORCE_FP8_MARLIN: bool = False
def get_default_cache_root():
@@ -342,6 +343,13 @@ environment_variables: Dict[str, Callable[[], Any]] = {
lambda:
(os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
("1", "true")),
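# If set to "1" or "true" (case-insensitive, surrounding whitespace ignored),
# forces FP8 Marlin to be used for FP8 quantization regardless of the
# hardware support for FP8 compute. Any other value leaves it disabled.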
"VLLM_TEST_FORCE_FP8_MARLIN":
lambda:
(os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
("1", "true")),
}
# end-env-vars-definition