Use slow tokenizer for LLaMA (#84)

Woosuk Kwon
2023-05-09 16:03:44 -07:00
committed by GitHub
parent add055e151
commit 85eb631839
3 changed files with 26 additions and 5 deletions


@@ -7,12 +7,12 @@ from typing import List, Dict, Optional
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 import ray
-from transformers import AutoTokenizer
 import uvicorn
 from cacheflow.core.server import (Server, add_server_arguments,
                                    process_server_arguments,
                                    initialize_cluster)
+from cacheflow.frontend.utils import get_tokenizer
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
 from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
@@ -44,7 +44,7 @@ class FastAPIServer:
     ):
         self.block_size = block_size
-        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.tokenizer = get_tokenizer(model)
         self.seq_group_counter = Counter()
         self.seq_counter = Counter()
         if server_use_ray:
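
The new helper lives in cacheflow/frontend/utils.py (per the import above; the file itself is one of the three changed files but is not expanded in this view). Below is a minimal sketch of what get_tokenizer plausibly does given the commit title: force the slow (SentencePiece-based) tokenizer for LLaMA models and delegate everything else to AutoTokenizer. The exact signature and the substring check on the model name are assumptions, not the file's verified contents.

# Sketch of cacheflow/frontend/utils.py (assumed contents).
from typing import Union

from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)


def get_tokenizer(
    model_name: str,
    *args,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Load a tokenizer, using the slow tokenizer for LLaMA models."""
    if "llama" in model_name.lower():
        # Per the commit title: the fast LLaMA tokenizer was problematic
        # at the time, so fall back to the slow implementation.
        kwargs["use_fast"] = False
    return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)

Routing tokenizer construction through this single helper means the frontend no longer calls AutoTokenizer.from_pretrained directly, so any future per-model tokenizer workarounds can be added in one place.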