Use slow tokenizer for LLaMA (#84)
@@ -7,12 +7,12 @@ from typing import List, Dict, Optional
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 import ray
-from transformers import AutoTokenizer
 import uvicorn
 
 from cacheflow.core.server import (Server, add_server_arguments,
                                    process_server_arguments,
                                    initialize_cluster)
+from cacheflow.frontend.utils import get_tokenizer
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
 from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
@@ -44,7 +44,7 @@ class FastAPIServer:
     ):
         self.block_size = block_size
 
-        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.tokenizer = get_tokenizer(model)
         self.seq_group_counter = Counter()
         self.seq_counter = Counter()
         if server_use_ray:
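
The diff replaces the direct AutoTokenizer.from_pretrained call with a get_tokenizer helper from cacheflow.frontend.utils. The helper's body is not shown in this commit; below is a minimal sketch of what such a helper plausibly does, assuming (per the commit title) that it detects LLaMA models and forces the slow tokenizer while passing everything else through unchanged. The name-based "llama" check is an assumption for illustration, not the confirmed implementation.

from typing import Union

from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)


def get_tokenizer(
    model_name: str,
    *args,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    # Assumption: LLaMA models are identified by name. The commit title
    # asks for the slow (SentencePiece-based) tokenizer for LLaMA, so we
    # disable the fast tokenizer for those models only.
    if "llama" in model_name.lower():
        kwargs["use_fast"] = False
    # All other models keep the default AutoTokenizer behavior.
    return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)

With a wrapper like this, the call site shown in the diff, self.tokenizer = get_tokenizer(model), behaves exactly as before for non-LLaMA models, and the LLaMA special case stays in one place instead of leaking into the frontend server code.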