# tensorize_vllm_model.py
import argparse
import dataclasses
3
import json
4
5
6
7
import os
import uuid
from functools import partial

8
from tensorizer import stream_io
9

10
11
12
from vllm import LLM
from vllm.distributed import (init_distributed_environment,
                              initialize_model_parallel)
13
14
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
15
16
17
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
                                                         TensorizerConfig,
                                                         serialize_vllm_model)
18
19
20
21
22

# yapf conflicts with isort for this docstring
# yapf: disable
"""
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer
to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
or locally. Tensor encryption and decryption are also supported, although
libsodium must be installed to use them. Install vllm with tensorizer support
using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
https://github.com/coreweave/tensorizer

To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:

python -m examples.tensorize_vllm_model \
   --model facebook/opt-125m \
   serialize \
   --serialized-directory s3://my-bucket \
   --suffix v1
   
This downloads the model from HuggingFace, loads it into vLLM, serializes it,
and saves it to your S3 bucket. A local directory can also be used. This
assumes your S3 credentials are specified as environment variables
in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
as CLI args to this script.

You can also encrypt the model weights with a randomly-generated key by 
providing a `--keyfile` argument.

To deserialize a model, you can run something like this from the root
level of this repository:

python -m examples.tensorize_vllm_model \
   --model EleutherAI/gpt-j-6B \
   --dtype float16 \
   deserialize \
   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors

This downloads the model tensors from your S3 bucket and deserializes them.

You can also provide a `--keyfile` argument to decrypt the model weights if 
they were serialized with encryption.

For more information on the available arguments for serializing, run
`python -m examples.tensorize_vllm_model serialize --help`.

Or for deserializing:

`python -m examples.tensorize_vllm_model deserialize --help`.

Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:

    llm = LLM(model="facebook/opt-125m",
              load_format="tensorizer",
              model_loader_extra_config=TensorizerConfig(
                    tensorizer_uri = path_to_tensors,
                    num_readers=3,
                    )
              )
            
A serialized model can be used during model loading for the vLLM OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
TensorizerConfig arguments desired.

In order to see all of the available arguments usable to configure 
loading with tensorizer that are given to `TensorizerConfig`, run:

`python -m examples.tensorize_vllm_model deserialize --help`

under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
"""


def parse_args(argv=None):
    """Build the command-line parser for this script and parse the arguments.

    The top-level parser carries every option registered by
    ``EngineArgs.add_cli_args`` plus two subcommands, whose name is stored in
    ``command``:

    * ``serialize`` -- ``--serialized-directory`` (required), ``--suffix``
      and ``--keyfile``.
    * ``deserialize`` -- ``--path-to-tensors`` (required), ``--keyfile`` and
      whatever options ``TensorizerArgs.add_cli_args`` registers.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``, which is what the
            script relies on when run from the command line.

    Returns:
        The populated ``argparse.Namespace``. The subcommand is not marked as
        required, so ``command`` is ``None`` when none is given and the
        subcommand-specific attributes are then absent.
    """
    parser = argparse.ArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize vLLM models. These models "
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption is "
        "also supported, although libsodium must be installed to "
        "use it.")
    parser = EngineArgs.add_cli_args(parser)
    subparsers = parser.add_subparsers(dest='command')

    # --- `serialize` subcommand -------------------------------------------
    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))
    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")

    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    # --- `deserialize` subcommand -----------------------------------------
    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize. ")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    # Tensorizer-specific options are only registered on `deserialize`.
    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args(argv)
165
166
167
168



def deserialize():
    """Construct an ``LLM`` that loads its weights via the tensorizer format.

    Takes no parameters: the model reference comes from the module-level
    ``args`` namespace and the loader settings from the module-level
    ``tensorizer_config``, so both must be assigned before this is called.

    Returns:
        The ``LLM`` instance created with ``load_format="tensorizer"``.
    """
    llm_kwargs = {
        "model": args.model,
        "load_format": "tensorizer",
        "model_loader_extra_config": tensorizer_config,
    }
    return LLM(**llm_kwargs)
174
175
176
177
178



# Script body: everything below runs at module execution time.
args = parse_args()

# Validate the subcommand before any stateful setup. Without a subcommand the
# namespace has no `keyfile` attribute, so checking late would surface as an
# AttributeError (after distributed init) instead of this error.
if args.command not in ("serialize", "deserialize"):
    raise ValueError("Either serialize or deserialize must be specified.")

# S3 settings: a value given on the command line wins, otherwise fall back to
# the environment. `getattr` is used because these options are only
# registered on the `deserialize` subparser.
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                    or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                        or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
               or os.environ.get("S3_ENDPOINT_URL", None))

credentials = {
    "s3_access_key_id": s3_access_key_id,
    "s3_secret_access_key": s3_secret_access_key,
    "s3_endpoint": s3_endpoint
}

# Stream openers pre-bound to the credentials above, one per open mode.
_read_stream, _write_stream = (partial(
    stream_io.open_stream,
    mode=mode,
    s3_access_key_id=s3_access_key_id,
    s3_secret_access_key=s3_secret_access_key,
    s3_endpoint=s3_endpoint,
) for mode in ("rb", "wb+"))

model_ref = args.model

# Last path component of the model reference. Taking the final component
# (rather than index 1) also works for references without an "org/" prefix,
# which previously raised IndexError.
model_name = model_ref.rstrip("/").split("/")[-1]

# Single-process setup: world size 1, rank 0, rendezvous on localhost.
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8080"

init_distributed_environment(world_size=1, rank=0, local_rank=0)
initialize_model_parallel()

# Both subcommands define `--keyfile`; normalise an empty value to None.
keyfile = args.keyfile or None

if args.command == "serialize":
    # Rebuild an EngineArgs from only the fields it declares, leaving the
    # subcommand-specific options on `args` out of the namespace.
    eng_args_dict = {f.name: getattr(args, f.name) for f in
                     dataclasses.fields(EngineArgs)}

    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
    engine = LLMEngine.from_engine_args(engine_args)

    # Destination: <dir>/vllm/<model_ref>/<suffix>/model.tensors, with a
    # random UUID as the suffix when none was given.
    input_dir = args.serialized_directory.rstrip('/')
    suffix = args.suffix if args.suffix else uuid.uuid4().hex
    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
    model_path = f"{base_path}/model.tensors"
    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        **credentials)
    serialize_vllm_model(engine, tensorizer_config, keyfile)
else:
    # args.command == "deserialize" (validated above).
    # Always build `tensorizer_config`, since deserialize() reads it as a
    # module global. Previously it was left undefined whenever
    # `--model-loader-extra-config` was supplied, causing a NameError.
    config_kwargs = {"encryption_keyfile": keyfile, **credentials}
    if args.model_loader_extra_config:
        # Options given explicitly in the JSON string override the defaults
        # derived from `--keyfile` and the S3 settings.
        config_kwargs.update(json.loads(args.model_loader_extra_config))
    # `--path-to-tensors` always decides where the tensors are read from.
    config_kwargs["tensorizer_uri"] = args.path_to_tensors
    tensorizer_config = TensorizerConfig(**config_kwargs)
    deserialize()