diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index c1e7a7d49..a3131aa38 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -680,7 +680,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
                 batch.
 
         Info:
-            [Blip2ImageInputs][]
+            [`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs]
         """
 
         if intermediate_tensors is not None:
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 9591deea0..4f15e1b57 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -737,7 +737,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
             inputs_embeds: Optional tensor of input embeddings.
 
         Info:
-            [LlavaImageInputs][]
+            [`LlavaImageInputs`][vllm.model_executor.models.llava.LlavaImageInputs]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 5e82f9799..beb3c3310 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -527,7 +527,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
         Unlike in LLaVA-1.5, the number of image tokens inputted to the language
         model depends on the original size of the input image. Including the
         original image token in the input, the required number of image tokens
-        is given by [get_llava_next_image_feature_size][].
+        is given by [`LlavaNextProcessingInfo.get_num_image_tokens`][vllm.\
+model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
 
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
@@ -540,7 +541,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
             inputs_embeds: Optional tensor of input embeddings.
 
         Info:
-            [LlavaNextImageInputs][]
+            [`LlavaNextImageInputs`][vllm.model_executor.models.llava_next.LlavaNextImageInputs]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None