[torch.compile] support all attention backends (#10558)
Signed-off-by: youkaichao <youkaichao@gmail.com>
vllm/model_executor/models/internlm2.py
@@ -1,5 +1,5 @@
 from functools import partial
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union

 import torch
 from torch import nn
@@ -250,7 +250,12 @@ class InternLMDecoderLayer(nn.Module):

 @support_torch_compile
 class InternLM2Model(nn.Module):

-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+    def __init__(
+            self,
+            *,
+            vllm_config: VllmConfig,
+            prefix: str = "",
+            layer_type: Type[InternLMDecoderLayer] = InternLMDecoderLayer):
         super().__init__()

         config = vllm_config.model_config.hf_config
@@ -266,7 +271,7 @@ class InternLM2Model(nn.Module):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: InternLMDecoderLayer(
+            lambda prefix: layer_type(
                 config, cache_config, quant_config, prefix=prefix),
             prefix=f"{prefix}.layers")
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -316,14 +321,18 @@ class InternLM2Model(nn.Module):

 class InternLM2ForCausalLM(nn.Module, SupportsPP):

-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = "",
+                 model_type: Type[InternLM2Model] = InternLM2Model):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = InternLM2Model(vllm_config=vllm_config,
-                                    prefix=maybe_prefix(prefix, "model"))
+        self.model = model_type(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
         self.output = ParallelLMHead(config.vocab_size,
                                      config.hidden_size,
                                      quant_config=quant_config,
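The new keyword-only layer_type and model_type parameters let a derived model reuse this stack without copying its constructor. Below is a minimal sketch of how a downstream model might plug in a custom decoder layer; MyDecoderLayer, MyModel, and MyForCausalLM are hypothetical names for illustration and are not part of this commit.

    from vllm.config import VllmConfig
    from vllm.model_executor.models.internlm2 import (InternLM2ForCausalLM,
                                                      InternLM2Model,
                                                      InternLMDecoderLayer)


    class MyDecoderLayer(InternLMDecoderLayer):
        # Hypothetical layer variant. It must keep the base constructor
        # signature (config, cache_config, quant_config, prefix=prefix),
        # since that is how the `lambda prefix: layer_type(...)` factory
        # in InternLM2Model invokes it.
        pass


    class MyModel(InternLM2Model):

        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
            # Every layer built by make_layers() is now a MyDecoderLayer.
            super().__init__(vllm_config=vllm_config,
                             prefix=prefix,
                             layer_type=MyDecoderLayer)


    class MyForCausalLM(InternLM2ForCausalLM):

        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
            # The causal-LM wrapper instantiates MyModel in place of
            # InternLM2Model via the new model_type hook.
            super().__init__(vllm_config=vllm_config,
                             prefix=prefix,
                             model_type=MyModel)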