[Neuron] Support inference with transformers-neuronx (#2569)

Author: Liangfu Chen
Date: 2024-02-28 09:34:34 -08:00
Committed by: GitHub
Parent: e46fa5d52e
Commit: 3b7178cfa4
18 changed files with 516 additions and 42 deletions
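The hunks below are the engine-argument side of the change: the default device becomes 'auto', a 128-token block size is allowed, and '--device neuron' becomes a valid choice. As a minimal usage sketch of what this enables (the model name and sampling settings are illustrative, not from this commit; `vllm.LLM` forwards keyword arguments to `EngineArgs`):

```python
# Sketch only: model and prompt are examples, not taken from this diff.
from vllm import LLM, SamplingParams

# With this commit, device='neuron' selects the transformers-neuronx backend;
# the default 'auto' picks a backend based on the available hardware.
llm = LLM(model="openlm-research/open_llama_3b", device="neuron")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```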


@@ -44,7 +44,7 @@ class EngineArgs:
     lora_extra_vocab_size: int = 256
     lora_dtype = 'auto'
     max_cpu_loras: Optional[int] = None
-    device: str = 'cuda'
+    device: str = 'auto'
 
     def __post_init__(self):
         if self.tokenizer is None:
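Switching the default from 'cuda' to 'auto' implies the concrete device is detected elsewhere in the commit. A minimal sketch of that kind of resolution, assuming a helper that probes for the transformers-neuronx package (the helper name and its placement are assumptions, not shown in this hunk):

```python
# Hypothetical sketch of 'auto' device resolution; names are assumptions.
def is_neuron() -> bool:
    # Treat the presence of transformers-neuronx as "running on Neuron".
    try:
        import transformers_neuronx  # noqa: F401
        return True
    except ImportError:
        return False

def resolve_device(device: str) -> str:
    if device == 'auto':
        return 'neuron' if is_neuron() else 'cuda'
    return device
```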
@@ -171,7 +171,7 @@ class EngineArgs:
         parser.add_argument('--block-size',
                             type=int,
                             default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
+                            choices=[8, 16, 32, 128],
                             help='token block size')
         parser.add_argument('--seed',
                             type=int,
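The extra 128 choice is presumably aimed at Neuron, where larger contiguous KV-cache blocks are preferable to the small block sizes used by paged attention on CUDA. A quick parse-level check of the widened choices, assuming the surrounding method is the usual `EngineArgs.add_cli_args` (the method name is inferred from context, not visible in this hunk):

```python
import argparse
from vllm.engine.arg_utils import EngineArgs

parser = EngineArgs.add_cli_args(argparse.ArgumentParser())
args = parser.parse_args(['--block-size', '128'])  # rejected before this commit
assert args.block_size == 128
```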
@@ -264,13 +264,11 @@ class EngineArgs:
                      help=('Maximum number of LoRAs to store in CPU memory. '
                            'Must be >= than max_num_seqs. '
                            'Defaults to max_num_seqs.'))
-        parser.add_argument(
-            "--device",
-            type=str,
-            default=EngineArgs.device,
-            choices=["cuda"],
-            help=('Device type for vLLM execution. '
-                  'Currently, only CUDA-compatible devices are supported.'))
+        parser.add_argument("--device",
+                            type=str,
+                            default=EngineArgs.device,
+                            choices=["auto", "cuda", "neuron"],
+                            help='Device type for vLLM execution.')
         return parser
 
     @classmethod
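The reworked --device flag can be exercised the same way: 'neuron' is newly accepted, the default is now 'auto', and anything outside the choices list still fails fast (again assuming the method is `EngineArgs.add_cli_args`):

```python
import argparse
from vllm.engine.arg_utils import EngineArgs

parser = EngineArgs.add_cli_args(argparse.ArgumentParser())
assert parser.parse_args(['--device', 'neuron']).device == 'neuron'
assert parser.parse_args([]).device == 'auto'    # new default from this commit
# parser.parse_args(['--device', 'tpu'])         # would exit: not in choices
```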