[Doc] Compatibility matrix for mutually exclusive features (#8512)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
This commit is contained in:
Wallas Henrique
2024-10-11 15:18:50 -03:00
committed by GitHub
parent 1a1823871d
commit 8baf85e4e9
13 changed files with 467 additions and 0 deletions

View File

@@ -359,6 +359,8 @@ class ModelConfig:
self.use_async_output_proc = False
return
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
# If the feature combo becomes valid
if device_config.device_type not in ("cuda", "tpu"):
logger.warning(
"Async output processing is only supported for CUDA or TPU. "
@@ -372,6 +374,8 @@ class ModelConfig:
self.use_async_output_proc = False
return
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
# If the feature combo becomes valid
if device_config.device_type == "cuda" and self.enforce_eager:
logger.warning(
"To see benefits of async output processing, enable CUDA "
@@ -385,6 +389,8 @@ class ModelConfig:
if self.embedding_mode:
self.use_async_output_proc = False
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
# If the feature combo becomes valid
if speculative_config:
logger.warning("Async output processing is not supported with"
" speculative decoding currently.")
@@ -1200,6 +1206,8 @@ class SpeculativeConfig:
"speculative decoding is > 1, but got "
f"{speculative_disable_by_batch_size=}")
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
# If the feature combo becomes valid
if enable_chunked_prefill:
raise ValueError(
"Speculative decoding and chunked prefill are "
@@ -1561,6 +1569,8 @@ class LoRAConfig:
model_config.quantization)
def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
# If the feature combo becomes valid
if scheduler_config.chunked_prefill_enabled:
raise ValueError("LoRA is not supported with chunked prefill yet.")