Remove all references to yapf as it's no longer used (#26251)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -21,8 +21,6 @@ from vllm.utils import FlexibleArgumentParser
 logger = logging.getLogger()
 
 
-# yapf conflicts with isort for this docstring
-# yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and
 deserialize vLLM models. These models can be loaded using tensorizer
@@ -132,7 +130,8 @@ def get_parser():
 "can be loaded using tensorizer directly to the GPU "
 "extremely quickly. Tensor encryption and decryption is "
 "also supported, although libsodium must be installed to "
-"use it.")
+"use it."
+)
 parser = EngineArgs.add_cli_args(parser)
 
 parser.add_argument(
@@ -144,13 +143,14 @@ def get_parser():
 "along with the model by instantiating a TensorizerConfig object, "
 "creating a dict from it with TensorizerConfig.to_serializable(), "
 "and passing it to LoRARequest's initializer with the kwarg "
-"tensorizer_config_dict."
+"tensorizer_config_dict.",
 )
 
-subparsers = parser.add_subparsers(dest='command', required=True)
+subparsers = parser.add_subparsers(dest="command", required=True)
 
 serialize_parser = subparsers.add_parser(
-'serialize', help="Serialize a model to `--serialized-directory`")
+"serialize", help="Serialize a model to `--serialized-directory`"
+)
 
 serialize_parser.add_argument(
 "--suffix",
@@ -163,7 +163,9 @@ def get_parser():
 "`--suffix` is `v1`, the serialized model tensors will be "
 "saved to "
 "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
-"If none is provided, a random UUID will be used."))
+"If none is provided, a random UUID will be used."
+),
+)
 serialize_parser.add_argument(
 "--serialized-directory",
 type=str,
@@ -175,108 +177,127 @@ def get_parser():
 "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
 "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
 "where `suffix` is given by `--suffix` or a random UUID if not "
-"provided.")
+"provided.",
+)
 
 serialize_parser.add_argument(
 "--serialization-kwargs",
 type=tensorizer_kwargs_arg,
 required=False,
-help=("A JSON string containing additional keyword arguments to "
-"pass to Tensorizer's TensorSerializer during "
-"serialization."))
+help=(
+"A JSON string containing additional keyword arguments to "
+"pass to Tensorizer's TensorSerializer during "
+"serialization."
+),
+)
 
 serialize_parser.add_argument(
 "--keyfile",
 type=str,
 required=False,
-help=("Encrypt the model weights with a randomly-generated binary key,"
-" and save the key at this path"))
+help=(
+"Encrypt the model weights with a randomly-generated binary key,"
+" and save the key at this path"
+),
+)
 
 deserialize_parser = subparsers.add_parser(
-'deserialize',
-help=("Deserialize a model from `--path-to-tensors`"
-" to verify it can be loaded and used."))
+"deserialize",
+help=(
+"Deserialize a model from `--path-to-tensors`"
+" to verify it can be loaded and used."
+),
+)
 
 deserialize_parser.add_argument(
 "--path-to-tensors",
 type=str,
 required=False,
-help="The local path or S3 URI to the model tensors to deserialize. ")
+help="The local path or S3 URI to the model tensors to deserialize. ",
+)
 
 deserialize_parser.add_argument(
 "--serialized-directory",
 type=str,
 required=False,
 help="Directory with model artifacts for loading. Assumes a "
-"model.tensors file exists therein. Can supersede "
-"--path-to-tensors.")
+"model.tensors file exists therein. Can supersede "
+"--path-to-tensors.",
+)
 
 deserialize_parser.add_argument(
 "--keyfile",
 type=str,
 required=False,
-help=("Path to a binary key to use to decrypt the model weights,"
-" if the model was serialized with encryption"))
+help=(
+"Path to a binary key to use to decrypt the model weights,"
+" if the model was serialized with encryption"
+),
+)
 
 deserialize_parser.add_argument(
 "--deserialization-kwargs",
 type=tensorizer_kwargs_arg,
 required=False,
-help=("A JSON string containing additional keyword arguments to "
-"pass to Tensorizer's `TensorDeserializer` during "
-"deserialization."))
+help=(
+"A JSON string containing additional keyword arguments to "
+"pass to Tensorizer's `TensorDeserializer` during "
+"deserialization."
+),
+)
 
 TensorizerArgs.add_cli_args(deserialize_parser)
 
 return parser
 
-def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
-cfg: TensorizerConfig):
+
+def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
 for k, v in extra_cfg.items():
 if hasattr(cfg, k):
 setattr(cfg, k, v)
 logger.info(
 "Updating TensorizerConfig with %s from "
-"--model-loader-extra-config provided", k
+"--model-loader-extra-config provided",
+k,
 )
 
+
 def deserialize(args, tensorizer_config):
 if args.lora_path:
 tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
-llm = LLM(model=args.model,
-load_format="tensorizer",
-tensor_parallel_size=args.tensor_parallel_size,
-model_loader_extra_config=tensorizer_config,
-enable_lora=True,
+llm = LLM(
+model=args.model,
+load_format="tensorizer",
+tensor_parallel_size=args.tensor_parallel_size,
+model_loader_extra_config=tensorizer_config,
+enable_lora=True,
 )
 sampling_params = SamplingParams(
-temperature=0,
-max_tokens=256,
-stop=["[/assistant]"]
+temperature=0, max_tokens=256, stop=["[/assistant]"]
 )
 
 # Truncating this as the extra text isn't necessary
-prompts = [
-"[user] Write a SQL query to answer the question based on ..."
-]
+prompts = ["[user] Write a SQL query to answer the question based on ..."]
 
 # Test LoRA load
 print(
 llm.generate(
-prompts,
-sampling_params,
-lora_request=LoRARequest("sql-lora",
-1,
-args.lora_path,
-tensorizer_config_dict = tensorizer_config
-.to_serializable())
+prompts,
+sampling_params,
+lora_request=LoRARequest(
+"sql-lora",
+1,
+args.lora_path,
+tensorizer_config_dict=tensorizer_config.to_serializable(),
+),
 )
 )
 else:
-llm = LLM(model=args.model,
-load_format="tensorizer",
-tensor_parallel_size=args.tensor_parallel_size,
-model_loader_extra_config=tensorizer_config
+llm = LLM(
+model=args.model,
+load_format="tensorizer",
+tensor_parallel_size=args.tensor_parallel_size,
+model_loader_extra_config=tensorizer_config,
 )
 return llm
 
@@ -285,17 +306,20 @@ def main():
 parser = get_parser()
 args = parser.parse_args()
 
-s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-or os.environ.get("S3_ACCESS_KEY_ID", None))
-s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-or os.environ.get("S3_SECRET_ACCESS_KEY", None))
-s3_endpoint = (getattr(args, 's3_endpoint', None)
-or os.environ.get("S3_ENDPOINT_URL", None))
+s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
+"S3_ACCESS_KEY_ID", None
+)
+s3_secret_access_key = getattr(
+args, "s3_secret_access_key", None
+) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
+s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
+"S3_ENDPOINT_URL", None
+)
 
 credentials = {
 "s3_access_key_id": s3_access_key_id,
 "s3_secret_access_key": s3_secret_access_key,
-"s3_endpoint": s3_endpoint
+"s3_endpoint": s3_endpoint,
 }
 
 model_ref = args.model
@@ -309,25 +333,25 @@ def main():
 if args.model_loader_extra_config:
 extra_config = json.loads(args.model_loader_extra_config)
 
-
-tensorizer_dir = (args.serialized_directory or
-extra_config.get("tensorizer_dir"))
-tensorizer_uri = (getattr(args, "path_to_tensors", None)
-or extra_config.get("tensorizer_uri"))
+tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
+tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
+"tensorizer_uri"
+)
 
 if tensorizer_dir and tensorizer_uri:
-parser.error("--serialized-directory and --path-to-tensors "
-"cannot both be provided")
+parser.error(
+"--serialized-directory and --path-to-tensors cannot both be provided"
+)
 
 if not tensorizer_dir and not tensorizer_uri:
-parser.error("Either --serialized-directory or --path-to-tensors "
-"must be provided")
-
+parser.error(
+"Either --serialized-directory or --path-to-tensors must be provided"
+)
 
 if args.command == "serialize":
 engine_args = EngineArgs.from_cli_args(args)
 
-input_dir = tensorizer_dir.rstrip('/')
+input_dir = tensorizer_dir.rstrip("/")
 suffix = args.suffix if args.suffix else uuid.uuid4().hex
 base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
 if engine_args.tensor_parallel_size > 1:
@@ -339,15 +363,14 @@ def main():
 tensorizer_uri=model_path,
 encryption_keyfile=keyfile,
 serialization_kwargs=args.serialization_kwargs or {},
-**credentials
+**credentials,
 )
 
 if args.lora_path:
 tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
 tensorize_lora_adapter(args.lora_path, tensorizer_config)
 
-merge_extra_config_with_tensorizer_config(extra_config,
-tensorizer_config)
+merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
 tensorize_vllm_model(engine_args, tensorizer_config)
 
 elif args.command == "deserialize":
@@ -356,11 +379,10 @@ def main():
 tensorizer_dir=args.serialized_directory,
 encryption_keyfile=keyfile,
 deserialization_kwargs=args.deserialization_kwargs or {},
-**credentials
+**credentials,
 )
 
-merge_extra_config_with_tensorizer_config(extra_config,
-tensorizer_config)
+merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
 deserialize(args, tensorizer_config)
 else:
 raise ValueError("Either serialize or deserialize must be specified.")
Reference in New Issue
Block a user