[Model][Misc] Add e5-mistral-7b-instruct and Embedding API (#3734)
This commit is contained in:
@@ -23,6 +23,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
|
||||
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
|
||||
# generation quality here
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora"
|
||||
@@ -121,7 +122,7 @@ def zephyr_lora_files():
|
||||
return snapshot_download(repo_id=LORA_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@pytest.fixture(scope="module")
|
||||
def server(zephyr_lora_files):
|
||||
ray.init()
|
||||
server_runner = ServerRunner.remote([
|
||||
@@ -150,6 +151,25 @@ def server(zephyr_lora_files):
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def embedding_server(zephyr_lora_files):
|
||||
ray.shutdown()
|
||||
ray.init()
|
||||
server_runner = ServerRunner.remote([
|
||||
"--model",
|
||||
EMBEDDING_MODEL_NAME,
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
"--enforce-eager",
|
||||
])
|
||||
ray.get(server_runner.ready.remote())
|
||||
yield server_runner
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def client():
|
||||
client = openai.AsyncOpenAI(
|
||||
@@ -890,5 +910,79 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
|
||||
or "less_than_equal" in exc_info.value.message)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[EMBEDDING_MODEL_NAME],
|
||||
)
|
||||
async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
input = [
|
||||
"The chef prepared a delicious meal.",
|
||||
]
|
||||
|
||||
# test single embedding
|
||||
embeddings = await client.embeddings.create(
|
||||
model=model_name,
|
||||
input=input,
|
||||
encoding_format="float",
|
||||
)
|
||||
assert embeddings.id is not None
|
||||
assert embeddings.data is not None and len(embeddings.data) == 1
|
||||
assert len(embeddings.data[0].embedding) == 4096
|
||||
assert embeddings.usage.completion_tokens == 0
|
||||
assert embeddings.usage.prompt_tokens == 9
|
||||
assert embeddings.usage.total_tokens == 9
|
||||
|
||||
# test using token IDs
|
||||
input = [1, 1, 1, 1, 1]
|
||||
embeddings = await client.embeddings.create(
|
||||
model=model_name,
|
||||
input=input,
|
||||
encoding_format="float",
|
||||
)
|
||||
assert embeddings.id is not None
|
||||
assert embeddings.data is not None and len(embeddings.data) == 1
|
||||
assert len(embeddings.data[0].embedding) == 4096
|
||||
assert embeddings.usage.completion_tokens == 0
|
||||
assert embeddings.usage.prompt_tokens == 5
|
||||
assert embeddings.usage.total_tokens == 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[EMBEDDING_MODEL_NAME],
|
||||
)
|
||||
async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
# test List[str]
|
||||
inputs = [
|
||||
"The cat sat on the mat.", "A feline was resting on a rug.",
|
||||
"Stars twinkle brightly in the night sky."
|
||||
]
|
||||
embeddings = await client.embeddings.create(
|
||||
model=model_name,
|
||||
input=inputs,
|
||||
encoding_format="float",
|
||||
)
|
||||
assert embeddings.id is not None
|
||||
assert embeddings.data is not None and len(embeddings.data) == 3
|
||||
assert len(embeddings.data[0].embedding) == 4096
|
||||
|
||||
# test List[List[int]]
|
||||
inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
|
||||
[25, 32, 64, 77]]
|
||||
embeddings = await client.embeddings.create(
|
||||
model=model_name,
|
||||
input=inputs,
|
||||
encoding_format="float",
|
||||
)
|
||||
assert embeddings.id is not None
|
||||
assert embeddings.data is not None and len(embeddings.data) == 4
|
||||
assert len(embeddings.data[0].embedding) == 4096
|
||||
assert embeddings.usage.completion_tokens == 0
|
||||
assert embeddings.usage.prompt_tokens == 17
|
||||
assert embeddings.usage.total_tokens == 17
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__])
|
||||
|
||||
Reference in New Issue
Block a user