diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index c1e7a7d49..a3131aa38 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -680,7 +680,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
                 batch.
 
         Info:
-            [Blip2ImageInputs][]
+            [`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs]
         """
 
         if intermediate_tensors is not None:
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 9591deea0..4f15e1b57 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -737,7 +737,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
             inputs_embeds: Optional tensor of input embeddings.
 
         Info:
-            [LlavaImageInputs][]
+            [`LlavaImageInputs`][vllm.model_executor.models.llava.LlavaImageInputs]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 5e82f9799..beb3c3310 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -527,7 +527,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
         Unlike in LLaVA-1.5, the number of image tokens inputted to the language
         model depends on the original size of the input image. Including the
         original image token in the input, the required number of image tokens
-        is given by [get_llava_next_image_feature_size][].
+        is given by [`LlavaNextProcessingInfo.get_num_image_tokens`][vllm.\
+model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
 
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
@@ -540,7 +541,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
             inputs_embeds: Optional tensor of input embeddings.
 
         Info:
-            [LlavaNextImageInputs][]
+            [`LlavaNextImageInputs`][vllm.model_executor.models.llava_next.LlavaNextImageInputs]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None