deploy.py 39.1 KB
Newer Older
1
2
3
4
5
6
7
# Copyright (c) OpenMMLab. All rights reserved.
import configparser
import json
import os
import os.path as osp
import re
import shutil
8
import sys
9
10
11
12
from pathlib import Path

import safetensors
import torch
Chen Xin's avatar
Chen Xin committed
13
from safetensors.torch import load_file
14
15
from sentencepiece import SentencePieceProcessor

16
import lmdeploy
17
18
from lmdeploy.model import MODELS

19
supported_formats = ['llama', 'hf', 'awq', 'qwen']
20
21


22
def get_package_root_path():
23
24
    import lmdeploy
    return Path(lmdeploy.__file__).parent
25
26


27
def create_workspace(_path: str):
lvhan028's avatar
lvhan028 committed
28
29
30
31
32
33
34
    """Create a workspace.

    Args:
        _path (str): the path of the workspace
    Returns:
        bool: success or not
    """
35
36
37
38
39
40
41
42
43
44
45
46
    try:
        if osp.exists(_path):
            shutil.rmtree(_path)
        os.makedirs(_path)
        print(f'create workspace in directory {_path}')
        return True
    except Exception as e:
        print(f'create workspace in {_path} failed: {e}')
        return False


def destroy_workspace(_path: str):
lvhan028's avatar
lvhan028 committed
47
48
49
50
51
52
53
    """destroy workspace.

    Args:
        _path(str): the path of the workspace
    Returns:
        bool: success or not
    """
54
55
56
57
58
    try:
        shutil.rmtree(_path)
        print(f'destroy workspace in directory {_path}')
        return True
    except Exception as e:
59
        print(f'destroy workspace in {_path} failed: {e}')
60
61
62
63
        return False


def copy_triton_model_templates(_path: str):
lvhan028's avatar
lvhan028 committed
64
65
66
67
68
69
70
    """copy triton model templates to the specified path.

    Args:
        _path (str): the target path
    Returns:
        str: the path of the triton models
    """
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
    try:
        cur_path = osp.abspath(__file__)
        dir_path = osp.dirname(cur_path)
        triton_models_path = osp.join(dir_path, 'triton_models')
        dst_path = osp.join(_path, 'triton_models')
        shutil.copytree(triton_models_path, dst_path, symlinks=True)
        print(f'copy triton model templates from "{triton_models_path}" to '
              f'"{dst_path}" successfully')
        shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path)
        return dst_path
    except Exception as e:
        print(f'copy triton model templates from "{triton_models_path}"'
              f' to "{dst_path}" failed: {e}')
        return None


87
def tokenizer_info_sp(model_path: str):
lvhan028's avatar
lvhan028 committed
88
89
90
91
92
93
94
    """Return the vocabulary size, bos token id and eos token id.

    Args:
        model_path (str): the tokenizer model's path
    Returns:
        tuple: vocabulary size, bos token id and eos token id
    """
95
96
97
98
99
100
101
102
103
    assert os.path.isfile(model_path), model_path
    sp_model = SentencePieceProcessor(model_file=model_path)
    # BOS / EOS token IDs
    n_words = sp_model.vocab_size()
    bos_id = sp_model.bos_id()
    eos_id = sp_model.eos_id()
    return n_words, bos_id, eos_id


104
105
106
107
108
109
110
def tokenizer_info_qwen(model_dir: str):
    n_words = 151851
    bos_id = 0
    eos_id = 151643
    return n_words, bos_id, eos_id


Chen Xin's avatar
Chen Xin committed
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def load_checkpoint(model_path):
    """Load checkpoint files into torch format.

    Args:
        model_path (str): the checkpoint folder
    Returns:
        Dict[str, torch.Tensor]: weight in torch format
    """
    suffixes = ['.safetensors', '.bin']
    for suffix in suffixes:
        files = [
            file for file in os.listdir(model_path) if file.endswith(suffix)
        ]
        if len(files) > 0:
            break

    assert len(files) > 0, f'could not find checkpoints in {model_path}'
    files = sorted(files)
    print(files)
    params = {}
    for file in files:
        if file.endswith('.bin'):
            tmp = torch.load(osp.join(model_path, file), map_location='cpu')
        else:
            tmp = load_file(osp.join(model_path, file))
        params.update(tmp)
    return params


140
141
142
def export(model_name: str,
           num_layer: int,
           norm_eps: float,
143
           kv_head_num: int,
144
145
146
147
           model_params: dict,
           tokenizer_path: str,
           out_dir: str,
           tp: int,
148
149
           size_per_head: int = 128,
           group_size: int = 0,
150
151
152
153
           weight_type: str = 'fp16',
           max_position_embeddings: int = 0,
           use_dynamic_ntk: int = 0,
           use_logn_attn: int = 0,
Lyu Han's avatar
Lyu Han committed
154
           rope_theta: float = 10000.0,
155
           tokenizer_info=tokenizer_info_sp):
lvhan028's avatar
lvhan028 committed
156
157
158
159
160
161
162
163
164
165
166
167
    """Export deploying information to a config file.

    Args:
        model_name (str): model's name
        num_layer (int): the number of transformer blocks
        norm_eps (float): norm epsilon
        model_params (dict): parameters of a model
        tokenizer_path (str): the tokenizer model's path
        out_dir (str): the path of the output directory
        tp (int): the number of tensor parallelism
        size_per_head (int): the dimension of each head
    """
168
169
170
171
172
173
174
    out_dir = osp.join(out_dir, 'weights')
    os.makedirs(out_dir, exist_ok=True)

    def save_bin(param: torch.Tensor, name):
        print(name, param.shape)
        if param.dtype in [torch.float, torch.bfloat16]:
            param = param.half()
175
        param.contiguous().cpu().numpy().tofile(osp.join(out_dir, name))
176

Li Zhang's avatar
Li Zhang committed
177
    attn_bias = False
178
    inter_size = 0
Li Zhang's avatar
Li Zhang committed
179

180
181
182
183
184
185
186
187
188
189
190
191
192
193
    tok_embeddings = model_params['tok_embeddings.weight']
    _vocab_size, dim = tok_embeddings.shape
    head_num = dim // size_per_head
    if _vocab_size % tp != 0:
        # Resolve https://github.com/InternLM/lmdeploy/issues/266
        # Pad tok_embeddings and output weights, making their shape divisible by TP # noqa: E501
        pad_size = (_vocab_size + tp - 1) // tp * tp - _vocab_size
        # Pad weight at the bottom of dim 0
        model_params['tok_embeddings.weight'] = torch.nn.functional.pad(
            tok_embeddings, (0, 0, 0, pad_size), 'constant', 0)
        # Pad output weight at the bottom of dim 0
        model_params['output.weight'] = torch.nn.functional.pad(
            model_params['output.weight'], (0, 0, 0, pad_size), 'constant', 0)

194
195
196
197
    # reverse the splitting axes since the weights are transposed above
    for param_name, param_data in model_params.items():
        split_dim = None
        key, ext = param_name.split('.')[-2:]
Li Zhang's avatar
Li Zhang committed
198
199
        if key == 'w_qkv' and ext == 'bias':
            attn_bias = True
200
        copy = False
Li Zhang's avatar
Li Zhang committed
201
        if key in ['w1', 'w3', 'w13', 'w_qkv']:
202
            split_dim = -1
203
            # TODO: move parameter extraction outside of the loop
204
            if key == 'w1':
205
206
207
                inter_size = max(inter_size, param_data.shape[-1])
            elif key == 'w13':
                inter_size = max(inter_size, param_data.shape[-1] // 2)
208
        elif key in ['w2', 'wo']:
Li Zhang's avatar
Li Zhang committed
209
            if ext in ['bias']:
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
                copy = True
            else:
                split_dim = 0
        if split_dim is not None:
            print(f'*** splitting {param_name}, shape={param_data.shape}, '
                  f'split_dim={split_dim}')
            assert param_data.shape[split_dim] % tp == 0
            split_size = param_data.shape[split_dim] // tp
            splits = torch.split(param_data, split_size, dim=split_dim)
            for i, split in enumerate(splits):
                prefix, ext = osp.splitext(param_name)
                save_bin(split, f'{prefix}.{i}{ext}')
        elif copy:
            print(f'### copying {param_name}, shape={param_data.shape}')
            copies = [param_data] * tp
            for i, copy in enumerate(copies):
                prefix, ext = osp.splitext(param_name)
                save_bin(copy, f'{prefix}.{i}{ext}')
        else:
            save_bin(param_data, param_name)

231
232
    assert inter_size > 0

233
    # export config and save it to {out_dir}/config.ini
234
    model = MODELS.get(model_name)()
235
    vocab_size, bos_id, eos_id = tokenizer_info(tokenizer_path)
236
    assert _vocab_size >= vocab_size, \
237
        f'different vocab size {_vocab_size} vs {vocab_size}'
lvhan028's avatar
lvhan028 committed
238
239
240
    cfg = dict(llama=dict(
        model_name=model_name,
        head_num=head_num,
241
        kv_head_num=kv_head_num,
lvhan028's avatar
lvhan028 committed
242
        size_per_head=size_per_head,
243
        vocab_size=_vocab_size,
lvhan028's avatar
lvhan028 committed
244
245
        num_layer=num_layer,
        rotary_embedding=size_per_head,
Lyu Han's avatar
Lyu Han committed
246
        rope_theta=rope_theta,
lvhan028's avatar
lvhan028 committed
247
248
        inter_size=inter_size,
        norm_eps=norm_eps,
249
        attn_bias=int(attn_bias),
lvhan028's avatar
lvhan028 committed
250
251
        start_id=bos_id,
        end_id=eos_id,
252
253
        weight_type=weight_type,
        group_size=group_size,
254
        # parameters for turbomind
lvhan028's avatar
lvhan028 committed
255
256
        max_batch_size=32,
        max_context_token_num=4,
257
        session_len=model.session_len + 8,
lvhan028's avatar
lvhan028 committed
258
259
        step_length=1,
        cache_max_entry_count=48,
260
        cache_chunk_size=1,
q.yao's avatar
q.yao committed
261
        use_context_fmha=1,
262
        quant_policy=0,
263
264
265
266
        tensor_para_size=tp,
        # extra attention params
        max_position_embeddings=max_position_embeddings,
        use_dynamic_ntk=int(use_dynamic_ntk),
Lyu Han's avatar
Lyu Han committed
267
268
        use_logn_attn=int(use_logn_attn),
    ))
269
270
271
272
273
274
275
276
277
278
279

    config = configparser.ConfigParser()
    for section, key_values in cfg.items():
        config[section] = key_values

    config_path = osp.join(out_dir, 'config.ini')
    with open(config_path, 'w') as f:
        config.write(f)
    return True


280
281
282
283
284
285
def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
              dim: int):

    def reshape(x):
        return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1)

Li Zhang's avatar
Li Zhang committed
286
287
288
289
    qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1)

    # (input_dim, head_num + 2 * kv_head_num)
    return qkv.view(q.size(0), -1)
290
291


292
293
def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
                 triton_models_path: str, tp: int):
lvhan028's avatar
lvhan028 committed
294
295
296
297
298
299
300
301
302
303
    """Deploy a model with huggingface transformers' format.

    Args:
        model_name (str): the name of the to-be-deployed model
        model_path (str): the path of the directory where the model weight
          files are
        tokenizer_path (str): the path of the tokenizer model path
        triton_models_path (str): the path of the exported triton models
        tp (int): the number of tensor parallelism
    """
304
305
306
    if osp.exists(tokenizer_path):
        shutil.copy(tokenizer_path,
                    osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
307
        with get_package_root_path() as root_path:
308
            shutil.copy(osp.join(root_path, 'tokenizer.py'),
309
                        osp.join(triton_models_path, 'tokenizer'))
310
    else:
q.yao's avatar
q.yao committed
311
        print(f'tokenizer model {tokenizer_path} does not exist')
312
313
314
315
316
317
318
319
        return False
    # read model arguments from params.json
    try:
        params_path = osp.join(model_path, 'params.json')
        with open(params_path) as f:
            model_arg = json.load(f)
            num_layer = model_arg['n_layers']
            norm_eps = model_arg['norm_eps']
320
321
            head_num = model_arg.get('n_heads', 32)
            kv_head_num = model_arg.get('n_kv_heads', head_num)
322
323
324
325
    except Exception as e:
        print(f'get "n_layers" and "norm_eps" from {params_path} failed: {e}')
        return False

326
    # convert weights from llama to turbomind format
327
328
329
330
331
332
333
334
335
336
    checkpoints = []
    for pattern in ['*.pth', '*.pt']:
        checkpoints += sorted(Path(model_path).glob(pattern))
    print(checkpoints)
    n_ckpt = len(checkpoints)
    model_params = {}

    def get_param(_name, _size):
        print(_name, _size)
        if _name not in model_params:
lvhan028's avatar
lvhan028 committed
337
338
339
            model_params[_name] = torch.zeros(_size,
                                              dtype=torch.float16,
                                              device='cpu')
340
341
342
343
344
        return model_params[_name]

    for i, ckpt_path in enumerate(checkpoints):
        ckpt = torch.load(ckpt_path, map_location='cpu')
        for param_name, param_data in ckpt.items():
Li Zhang's avatar
Li Zhang committed
345
            key, ext = param_name.split('.')[-2:]
346
347
348
            # column-parallel
            if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']:
                size = param_data.size(0)
Li Zhang's avatar
Li Zhang committed
349
350
                if ext == 'weight':
                    param = get_param(
q.yao's avatar
q.yao committed
351
352
                        param_name,
                        [size * n_ckpt, param_data.size(1)])
Li Zhang's avatar
Li Zhang committed
353
354
355
356
                    param.data[size * i:size * (i + 1), :] = param_data
                else:  # bias
                    param = get_param(param_name, [size * n_ckpt])
                    param.data[size * i:size * (i + 1)] = param_data
357
358
359
            # row-parallel
            elif key in ['w2', 'wo', 'tok_embeddings']:
                size = param_data.size(-1)
Li Zhang's avatar
Li Zhang committed
360
361
362
363
364
365
366
                if ext == 'weight':
                    param = get_param(param_name,
                                      [param_data.size(0), size * n_ckpt])
                    param.data[:, size * i:size * (i + 1)] = param_data
                else:  # bias
                    param = get_param(param_name, [size])
                    param.data = param_data
367
368
369
370
371
372
            elif i == 0:
                param = get_param(param_name, param_data.size())
                param.data = param_data
        del ckpt

    for name, param in model_params.items():
373
        # transpose all weights as TurboMind is expecting column-major
374
375
376
377
378
379
        # weights: (output_dims, input_dims) -> (input_dims, output_dims)
        key = name.split('.')[-2]
        if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
            param.data = param.data.t()

    # concat qkv projection
Li Zhang's avatar
Li Zhang committed
380
381
    for t in ['weight', 'bias']:
        for i in range(1000):
q.yao's avatar
q.yao committed
382
383
384
            _qkv = [
                f'layers.{i}.attention.{k}.{t}' for k in ['wq', 'wk', 'wv']
            ]
Li Zhang's avatar
Li Zhang committed
385
386
387
388
            try:
                qkv = tuple(map(model_params.pop, _qkv))
            except KeyError:
                break
389
390
            # concat by heads
            qkv = merge_qkv(*qkv, tp, dim=2 if t == 'weight' else 1)
Li Zhang's avatar
Li Zhang committed
391
392
            print(f'layers.{i}.attention.w_qkv.{t}', qkv.shape)
            model_params[f'layers.{i}.attention.w_qkv.{t}'] = qkv
393

394
    assert i == 0 or num_layer == i, f'miss matched layers: {num_layer} vs {i}'
395

396
    return export(model_name, num_layer, norm_eps, kv_head_num, model_params,
397
398
399
400
401
                  tokenizer_path, triton_models_path, tp)


def permute(x: torch.Tensor):
    SIZE_PER_HEAD = 128
402
    if x.shape[-1] > 1:
403
404
405
406
407
408
409
410
411
412
413
414
415
        dim = x.shape[-1]
        n_heads = dim // SIZE_PER_HEAD
        return x.view(-1, n_heads, 2,
                      dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
    else:  # scales, zeros
        dim = x.shape[0]
        n_heads = dim // SIZE_PER_HEAD
        return x.view(n_heads, 2, dim // n_heads // 2,
                      1).transpose(1, 2).reshape(dim, 1)


def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
              triton_models_path: str, tp: int):
lvhan028's avatar
lvhan028 committed
416
417
418
419
420
421
422
423
424
425
    """Deploy a model with huggingface transformers' format.

    Args:
        model_name (str): the name of the to-be-deployed model
        model_path (str): the path of the directory where the model weight
          files are
        tokenizer_path (str): the path of the tokenizer model path
        triton_models_path (str): the path of the exported triton models
        tp (int): the number of tensor parallelism
    """
426
427
428
429
430
    if tokenizer_path is None:
        tokenizer_path = osp.join(model_path, 'tokenizer.model')
    if osp.exists(tokenizer_path):
        shutil.copy(tokenizer_path,
                    osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
431
432
433
434
435
        for _file in os.listdir(model_path):
            if _file.endswith('.json') or _file.endswith('.py'):
                json_path = osp.join(model_path, _file)
                shutil.copy(json_path,
                            osp.join(triton_models_path, 'tokenizer', _file))
436
        with get_package_root_path() as root_path:
437
            shutil.copy(osp.join(root_path, 'tokenizer.py'),
438
                        osp.join(triton_models_path, 'tokenizer'))
439
    else:
q.yao's avatar
q.yao committed
440
        print(f'tokenizer model {tokenizer_path} does not exist')
441
442
443
444
445
446
447
448
449
        exit(-1)

    # read model arguments from params.json
    try:
        params_path = osp.join(model_path, 'config.json')
        with open(params_path) as f:
            model_arg = json.load(f)
            num_layer = model_arg['num_hidden_layers']
            norm_eps = model_arg['rms_norm_eps']
Lyu Han's avatar
Lyu Han committed
450
451
452
453
            rope_theta = float(model_arg.get('rope_theta', 10000.0))
            max_position_embeddings = int(
                model_arg.get('max_position_embeddings', 0))
            repo_scaling = bool(model_arg.get('rope_scaling', False))
454
455
456
457
            if 'num_key_value_heads' in model_arg:
                kv_head_num = model_arg['num_key_value_heads']
            else:
                kv_head_num = model_arg['num_attention_heads']
458
459
460
461
462
    except Exception as e:
        print(f'get "num_hidden_layers" and "rms_norm_eps" from '
              f'{params_path} failed: {e}')
        return False

463
    # convert weights from hf to turbomind
464
465
466
    model_params = {}

    _qweight = 'weight'
Li Zhang's avatar
Li Zhang committed
467
    _suffixes = [_qweight, 'bias']
468

Chen Xin's avatar
Chen Xin committed
469
    _params = load_checkpoint(model_path)
470
471

    def get_tensor(name):
lvhan028's avatar
lvhan028 committed
472
        """return tensor according its name."""
473
474
        return _params[name]

Li Zhang's avatar
Li Zhang committed
475
    def get_tensor_transposed(name: str):
lvhan028's avatar
lvhan028 committed
476
        """return a transposed tensor according its name."""
477
        if name not in _params and name.find('bias'):
Li Zhang's avatar
Li Zhang committed
478
            return None
479
        return _params[name].t()
480

481
482
483
    w_pack = False
    if 'model.layers.0.self_attn.W_pack.weight' in _params:
        w_pack = True
484
485
486
487
488

    for i in range(1000):
        try:
            # attention weights
            for suffix in _suffixes:
489
                if w_pack:
490
491
492
493
                    _qkvo = [
                        f'model.layers.{i}.self_attn.{t}'
                        for t in ['W_pack', 'o_proj']
                    ]
494
                    qkv, o = map(get_tensor_transposed,
495
                                 map(('{}.' + suffix).format, _qkvo))
496
497
498
499
500
501
502
503
504
505

                    if qkv is None:
                        continue
                    _shape = qkv.shape[1] // 3
                    _qkv = torch.split(qkv, [_shape, _shape, _shape], dim=1)
                    q = _qkv[0]
                    k = _qkv[1]
                    v = _qkv[2]

                else:
506
507
508
                    _qkvo = [
                        f'model.layers.{i}.self_attn.{t}_proj' for t in 'qkvo'
                    ]
509
                    q, k, v, o = map(get_tensor_transposed,
510
                                     map(('{}.' + suffix).format, _qkvo))
Li Zhang's avatar
Li Zhang committed
511
512
513
514
515
516
517
                if q is None:
                    continue
                # q, k has different layout for fb & hf, convert to fb's
                # layout
                q = permute(q)
                k = permute(k)
                if suffix == _qweight:  # weight, qweight
518
519
                    qkv = merge_qkv(q, k, v, tp, dim=2)
                    print(suffix, qkv.shape)
Li Zhang's avatar
Li Zhang committed
520
                else:  # scales, zeros, bias
521
                    qkv = merge_qkv(q, k, v, tp, dim=1)
Li Zhang's avatar
Li Zhang committed
522
523
524
                    print(suffix, qkv.shape)
                for k, v in [('w_qkv', qkv), ('wo', o)]:
                    model_params[f'layers.{i}.attention.{k}.{suffix}'] = v
525
526
527
528
529
530
531
532
            # ffn weights
            _w123 = [
                f'model.layers.{i}.mlp.{t}_proj'
                for t in ['gate', 'down', 'up']
            ]
            for suffix in _suffixes:
                w1, w2, w3 = map(get_tensor_transposed,
                                 map(('{}.' + suffix).format, _w123))
Li Zhang's avatar
Li Zhang committed
533
534
535
536
537
538
                if w1 is None:
                    continue
                if suffix in ['scales', 'zeros', 'bias']:
                    w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), [w1, w2, w3])
                for k, v in [('w1', w1), ('w2', w2), ('w3', w3)]:
                    model_params[f'layers.{i}.feed_forward.{k}.{suffix}'] = v
539
540
541
542
543
544
545
546
547
548
            other = [('attention_norm.weight', 'input_layernorm.weight'),
                     ('ffn_norm.weight', 'post_attention_layernorm.weight')]
            for ft, hf in other:
                model_params[f'layers.{i}.' +
                             ft] = get_tensor(f'model.layers.{i}.' + hf)
        except safetensors.SafetensorError:
            break
        except KeyError:
            break

Li Zhang's avatar
Li Zhang committed
549
    assert num_layer == i, f'miss matched layers: {num_layer} vs {i}'
550
551
552
553
554
555
556

    other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
             ('norm.weight', 'model.norm.weight'),
             ('output.weight', 'lm_head.weight')]
    for ft, hf in other:
        model_params[ft] = get_tensor(hf)

Lyu Han's avatar
Lyu Han committed
557
558
    if model_name == 'baichuan2-7b':
        # https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/modeling_baichuan.py#L507
559
560
561
562
        # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
        model_params['output.weight'] = torch.nn.functional.normalize(
            model_params['output.weight'])

Lyu Han's avatar
Lyu Han committed
563
564
565
566
567
568
569
570
571
572
573
    return export(model_name,
                  num_layer,
                  norm_eps,
                  kv_head_num,
                  model_params,
                  tokenizer_path,
                  triton_models_path,
                  tp,
                  max_position_embeddings=max_position_embeddings,
                  use_dynamic_ntk=repo_scaling,
                  rope_theta=rope_theta)
574
575


576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
def deploy_awq(model_name: str, model_path: str, tokenizer_path: str,
               triton_models_path: str, tp: int, quant_path: str,
               group_size: int):
    """Deploy a model with huggingface transformers' format.

    Args:
        model_name (str): the name of the to-be-deployed model
        model_path (str): the path of the directory where the model weight
          files are
        tokenizer_path (str): the path of the tokenizer model path
        triton_models_path (str): the path of the exported triton models
        tp (int): the number of tensor parallelism
        quant_path (str): path of the quantized model, which can be None
        group_size (int): a parameter used in AWQ to quantize fp16 weights
            to 4 bits
    """
    if tokenizer_path is None:
        tokenizer_path = osp.join(model_path, 'tokenizer.model')
    if osp.exists(tokenizer_path):
        shutil.copy(tokenizer_path,
                    osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
        for _file in os.listdir(model_path):
            if _file.endswith('.json') or _file.endswith('.py'):
                json_path = osp.join(model_path, _file)
                shutil.copy(json_path,
                            osp.join(triton_models_path, 'tokenizer', _file))
        with get_package_root_path() as root_path:
603
            shutil.copy(osp.join(root_path, 'tokenizer.py'),
604
605
606
607
608
609
610
611
612
613
614
615
                        osp.join(triton_models_path, 'tokenizer'))
    else:
        print(f'tokenizer model {tokenizer_path} does not exist')
        exit(-1)

    # read model arguments from params.json
    try:
        params_path = osp.join(model_path, 'config.json')
        with open(params_path) as f:
            model_arg = json.load(f)
            num_layer = model_arg['num_hidden_layers']
            norm_eps = model_arg['rms_norm_eps']
Lyu Han's avatar
Lyu Han committed
616
            rope_theta = float(model_arg.get('rope_theta', 10000.0))
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
            if 'num_key_value_heads' in model_arg:
                kv_head_num = model_arg['num_key_value_heads']
            else:
                kv_head_num = model_arg['num_attention_heads']
    except Exception as e:
        print(f'get "num_hidden_layers" and "rms_norm_eps" from '
              f'{params_path} failed: {e}')
        return False

    # convert weights from hf to turbomind
    if quant_path is None:
        _files = [
            osp.join(model_path, file) for file in os.listdir(model_path)
            if file.endswith('.bin')
        ]
        _files = sorted(_files)
    else:
        _files = [quant_path]

    model_params = {}

    _params = {}
    for _file in _files:
        _tmp = torch.load(_file, map_location='cpu')
        _params.update(_tmp)

    def get_tensor(name):
        """return tensor according its name."""
        return _params[name].cuda().contiguous()

    # import _turbomind as _tm
    # TODO: find another way import _turbomind
    lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
    sys.path.append(osp.join(lmdeploy_dir, 'lib'))
    import _turbomind as _tm  # noqa: E402

Li Zhang's avatar
Li Zhang committed
653
    def transpose_qk_s4(src: torch.Tensor):
654
655
656
657
658
659
        assert src.is_contiguous()
        dst = torch.zeros_like(src)
        _tm.transpose_qk_s4_k_m8(src, dst,
                                 src.size(-1) * 8, src.size(0), group_size)
        return dst

Li Zhang's avatar
Li Zhang committed
660
661
662
    def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor,
                      w1_s: torch.Tensor, w3_qw: torch.Tensor,
                      w3_qz: torch.Tensor, w3_s: torch.Tensor):
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683

        def fuse(a: torch.Tensor, b: torch.Tensor):
            ab = torch.cat((a, b)).contiguous()
            _ab = torch.zeros_like(ab)
            _tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0))
            return _ab.view(a.size(0), -1)

        w13_qw = fuse(w1_qw, w3_qw)
        w13_qz = fuse(w1_qz, w3_qz)

        w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1)
        w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1)

        return w13_qw, w13_qz, w13_s

    def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor,
                   group_size: int):
        assert qw.is_contiguous()
        assert qz.is_contiguous()
        assert s.is_contiguous()
        _qw = torch.zeros_like(qw)
Li Zhang's avatar
Li Zhang committed
684
        _sz = torch.zeros_like(s, dtype=torch.int32)  # half2
685
686
687
688
689
        _ws = torch.zeros_like(s)
        _tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz,
                            qw.size(-1) * 8, qw.size(0), group_size)
        return _qw, _sz

Li Zhang's avatar
Li Zhang committed
690
691
692
693
    def tp_m_s4(x: torch.Tensor, tp: int):
        return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3,
                                                            1).contiguous()

694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
    attn_bias = False

    for i in range(num_layer):
        print(i)

        # attention weights
        q_qw = get_tensor(f'model.layers.{i}.self_attn.q_proj.qweight')
        k_qw = get_tensor(f'model.layers.{i}.self_attn.k_proj.qweight')
        v_qw = get_tensor(f'model.layers.{i}.self_attn.v_proj.qweight')
        o_qw = get_tensor(f'model.layers.{i}.self_attn.o_proj.qweight')

        q_qz = get_tensor(f'model.layers.{i}.self_attn.q_proj.qzeros')
        k_qz = get_tensor(f'model.layers.{i}.self_attn.k_proj.qzeros')
        v_qz = get_tensor(f'model.layers.{i}.self_attn.v_proj.qzeros')
        o_qz = get_tensor(f'model.layers.{i}.self_attn.o_proj.qzeros')

        q_s = get_tensor(f'model.layers.{i}.self_attn.q_proj.scales')
        k_s = get_tensor(f'model.layers.{i}.self_attn.k_proj.scales')
        v_s = get_tensor(f'model.layers.{i}.self_attn.v_proj.scales')
        o_s = get_tensor(f'model.layers.{i}.self_attn.o_proj.scales')

        try:
            q_b = get_tensor(f'model.layers.{i}.self_attn.q_proj.bias')
            k_b = get_tensor(f'model.layers.{i}.self_attn.k_proj.bias')
            v_b = get_tensor(f'model.layers.{i}.self_attn.v_proj.bias')
            o_b = get_tensor(f'model.layers.{i}.self_attn.o_proj.bias')
            attn_bias = True
        except:  # noqa: E722
            pass

Li Zhang's avatar
Li Zhang committed
724
725
726
727
        q_qw = transpose_qk_s4(q_qw)
        k_qw = transpose_qk_s4(k_qw)
        q_qz = transpose_qk_s4(q_qz)
        k_qz = transpose_qk_s4(k_qz)
728
729
730
731
732
733
734
735
736
        q_s = permute(q_s)
        k_s = permute(k_s)

        qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2)
        qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
        qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)

        qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)

Li Zhang's avatar
Li Zhang committed
737
738
        qkv_qw = tp_m_s4(qkv_qw, tp)

739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
        model_params[f'layers.{i}.attention.w_qkv.qweight'] = qkv_qw
        model_params[f'layers.{i}.attention.w_qkv.scales_zeros'] = qkv_sz

        o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)

        model_params[f'layers.{i}.attention.wo.qweight'] = o_qw
        model_params[f'layers.{i}.attention.wo.scales_zeros'] = o_sz

        if attn_bias:
            q_b = permute(q_b)
            k_b = permute(k_b)
            qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
            model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b
            model_params[f'layers.{i}.attention.wo.bias'] = o_b

        # ffn weights
        w1_qw = get_tensor(f'model.layers.{i}.mlp.gate_proj.qweight')
        w2_qw = get_tensor(f'model.layers.{i}.mlp.down_proj.qweight')
        w3_qw = get_tensor(f'model.layers.{i}.mlp.up_proj.qweight')

        w1_qz = get_tensor(f'model.layers.{i}.mlp.gate_proj.qzeros')
        w2_qz = get_tensor(f'model.layers.{i}.mlp.down_proj.qzeros')
        w3_qz = get_tensor(f'model.layers.{i}.mlp.up_proj.qzeros')

        w1_s = get_tensor(f'model.layers.{i}.mlp.gate_proj.scales')
        w2_s = get_tensor(f'model.layers.{i}.mlp.down_proj.scales')
        w3_s = get_tensor(f'model.layers.{i}.mlp.up_proj.scales')

Li Zhang's avatar
Li Zhang committed
767
768
        w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
                                              w3_s)
769
770
771
772

        w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
        w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)

Li Zhang's avatar
Li Zhang committed
773
774
        w13_qw = tp_m_s4(w13_qw, tp)

775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
        model_params[f'layers.{i}.feed_forward.w13.qweight'] = w13_qw
        model_params[f'layers.{i}.feed_forward.w13.scales_zeros'] = w13_sz

        model_params[f'layers.{i}.feed_forward.w2.qweight'] = w2_qw
        model_params[f'layers.{i}.feed_forward.w2.scales_zeros'] = w2_sz

        # norm weights
        attn_norm = get_tensor(f'model.layers.{i}.input_layernorm.weight')
        ffn_norm = get_tensor(
            f'model.layers.{i}.post_attention_layernorm.weight')

        model_params[f'layers.{i}.attention_norm.weight'] = attn_norm
        model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm

    other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
             ('norm.weight', 'model.norm.weight'),
             ('output.weight', 'lm_head.weight')]
    for ft, hf in other:
        model_params[ft] = get_tensor(hf)

    return export(model_name,
                  num_layer,
                  norm_eps,
                  kv_head_num,
                  model_params,
                  tokenizer_path,
                  triton_models_path,
                  tp,
                  weight_type='int4',
Lyu Han's avatar
Lyu Han committed
804
805
                  group_size=group_size,
                  rope_theta=rope_theta)
806
807


808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str,
                triton_models_path: str, tp: int):
    """Deploy a model with huggingface transformers' format.

    Args:
        model_name (str): the name of the to-be-deployed model
        model_path (str): the path of the directory where the model weight
          files are
        tokenizer_path (str): the path of the tokenizer model path
        triton_models_path (str): the path of the exported triton models
        tp (int): the number of tensor parallelism
        quant_path (str): path of the quantized model, which can be None
        group_size (int): a parameter used in AWQ to quantize fp16 weights
            to 4 bits
    """

    if osp.exists(model_path):
        shutil.copy(osp.join(model_path, 'qwen.tiktoken'),
                    osp.join(triton_models_path, 'tokenizer'))
        for _file in os.listdir(model_path):
            if _file.endswith('.json') or _file.endswith('.py'):
                json_path = osp.join(model_path, _file)
                shutil.copy(json_path,
                            osp.join(triton_models_path, 'tokenizer', _file))
        with get_package_root_path() as root_path:
833
            shutil.copy(osp.join(root_path, 'tokenizer.py'),
834
835
836
837
838
839
840
841
842
843
844
845
                        osp.join(triton_models_path, 'tokenizer'))
    else:
        print(f'tokenizer model {tokenizer_path} does not exist')
        exit(-1)

    # read model arguments from params.json
    try:
        params_path = osp.join(model_path, 'config.json')
        with open(params_path) as f:
            config = json.load(f)
            num_layer = config['num_hidden_layers']
            norm_eps = config['layer_norm_epsilon']
Lyu Han's avatar
Lyu Han committed
846
            rope_theta = float(config.get('rotary_emb_base', 10000.0))
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
            if 'num_key_value_heads' in config:
                kv_head_num = config['num_key_value_heads']
            else:
                kv_head_num = config['num_attention_heads']
            seq_length = config['seq_length']
            use_dynamic_ntk = config['use_dynamic_ntk']
            use_logn_attn = config['use_logn_attn']
    except Exception as e:
        print(f'get "num_hidden_layers" and "layer_norm_epsilon" from '
              f'{params_path} failed: {e}')
        return False

    # convert weights from hf to turbomind
    model_params = {}

Chen Xin's avatar
Chen Xin committed
862
    _params = load_checkpoint(model_path)
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926

    def get_tensor(name, trans=True):
        """return a transposed tensor according its name."""
        if trans:
            return _params[name].cuda().t()
        else:
            return _params[name].cuda()

    for i in range(num_layer):
        print(i)

        # qkv weights
        qkv_w = get_tensor(f'transformer.h.{i}.attn.c_attn.weight')
        q_w, k_w, v_w = torch.split(qkv_w, qkv_w.size(-1) // 3, dim=-1)
        q_w, k_w = permute(q_w), permute(k_w)
        qkv_w = merge_qkv(q_w, k_w, v_w, tp, dim=2)
        model_params[f'layers.{i}.attention.w_qkv.weight'] = qkv_w

        # qkv bias
        qkv_b = get_tensor(f'transformer.h.{i}.attn.c_attn.bias')
        q_b, k_b, v_b = torch.split(qkv_b, qkv_b.size(-1) // 3)
        q_b, k_b = permute(q_b), permute(k_b)
        qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
        model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b

        # o weights
        o_w = get_tensor(f'transformer.h.{i}.attn.c_proj.weight')
        model_params[f'layers.{i}.attention.wo.weight'] = o_w
        model_params[f'layers.{i}.attention.wo.bias'] = torch.zeros_like(q_b)

        # ffn weights
        # ours: w2(silu(w1(x)) * w3(x))
        # qwen: c_proj(w1(x) * silu(w2(x)))
        w1 = get_tensor(f'transformer.h.{i}.mlp.w2.weight')
        w3 = get_tensor(f'transformer.h.{i}.mlp.w1.weight')
        w2 = get_tensor(f'transformer.h.{i}.mlp.c_proj.weight')
        model_params[f'layers.{i}.feed_forward.w1.weight'] = w1
        model_params[f'layers.{i}.feed_forward.w2.weight'] = w2
        model_params[f'layers.{i}.feed_forward.w3.weight'] = w3

        # norm weights
        attn_norm = get_tensor(f'transformer.h.{i}.ln_1.weight')
        ffn_norm = get_tensor(f'transformer.h.{i}.ln_2.weight')

        model_params[f'layers.{i}.attention_norm.weight'] = attn_norm
        model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm

    other = [('tok_embeddings.weight', 'transformer.wte.weight'),
             ('norm.weight', 'transformer.ln_f.weight'),
             ('output.weight', 'lm_head.weight')]
    for ft, hf in other:
        model_params[ft] = get_tensor(hf, trans=False)

    return export(model_name,
                  num_layer,
                  norm_eps,
                  kv_head_num,
                  model_params,
                  model_path,
                  triton_models_path,
                  tp,
                  max_position_embeddings=seq_length,
                  use_dynamic_ntk=use_dynamic_ntk,
                  use_logn_attn=use_logn_attn,
Lyu Han's avatar
Lyu Han committed
927
                  rope_theta=rope_theta,
928
929
930
                  tokenizer_info=tokenizer_info_qwen)


931
def pack_model_repository(workspace_path: str):
lvhan028's avatar
lvhan028 committed
932
933
934
935
936
    """package the model repository.

    Args:
        workspace_path: the path of workspace
    """
937
938
939
940
941
942
943
944
945
    os.symlink(src='../../tokenizer',
               dst=osp.join(workspace_path, 'triton_models', 'preprocessing',
                            '1', 'tokenizer'))
    os.symlink(src='../../tokenizer',
               dst=osp.join(workspace_path, 'triton_models', 'postprocessing',
                            '1', 'tokenizer'))
    os.symlink(src='../../weights',
               dst=osp.join(workspace_path, 'triton_models', 'interactive',
                            '1', 'weights'))
946
947
    model_repo_dir = osp.join(workspace_path, 'model_repository')
    os.makedirs(model_repo_dir, exist_ok=True)
lvhan028's avatar
lvhan028 committed
948
    os.symlink(src=osp.join('../triton_models/interactive'),
949
               dst=osp.join(model_repo_dir, 'turbomind'))
lvhan028's avatar
lvhan028 committed
950
951
952
953
    os.symlink(src=osp.join('../triton_models/preprocessing'),
               dst=osp.join(model_repo_dir, 'preprocessing'))
    os.symlink(src=osp.join('../triton_models/postprocessing'),
               dst=osp.join(model_repo_dir, 'postprocessing'))
954
955
956
957


def main(model_name: str,
         model_path: str,
958
         model_format: str = None,
959
960
         tokenizer_path: str = None,
         dst_path: str = './workspace',
961
962
963
         tp: int = 1,
         quant_path: str = None,
         group_size: int = 0):
964
    """deploy llama family models via turbomind.
965
966
967

    Args:
        model_name (str): the name of the to-be-deployed model, such as
968
            llama-7b, llama-13b, vicuna-7b and etc
969
970
971
972
973
        model_path (str): the directory path of the model
        model_format (str): the format of the model, fb or hf. 'fb' stands for
            META's llama format, and 'hf' means huggingface format
        tokenizer_path (str): the path of tokenizer model
        dst_path (str): the destination path that saves outputs
Chen Xin's avatar
Chen Xin committed
974
        tp (int): the number of GPUs used for tensor parallelism, should be 2^n
975
976
977
        quant_path (str): path of the quantized model, which can be None
        group_size (int): a parameter used in AWQ to quantize fp16 weights
            to 4 bits
978
    """
979
980
981
    assert model_name in MODELS.module_dict.keys(), \
        f"'{model_name}' is not supported. " \
        f'The supported models are: {MODELS.module_dict.keys()}'
982

Chen Xin's avatar
Chen Xin committed
983
984
    assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'

985
    if model_format is None:
986
        model_format = 'qwen' if model_name.startswith('qwen') else 'hf'
987

988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
    if model_format not in supported_formats:
        print(f'the model format "{model_format}" is not supported. '
              f'The supported format are: {supported_formats}')
        exit(-1)

    if model_format == 'llama' and tokenizer_path is None:
        print('The model is llama. Its tokenizer model path should be '
              'specified')
        exit(-1)

    if not create_workspace(dst_path):
        exit(-1)

    triton_models_path = copy_triton_model_templates(dst_path)
    if triton_models_path is None:
        exit(-1)

    if model_format == 'llama':
        res = deploy_llama(model_name, model_path, tokenizer_path,
                           triton_models_path, tp)
1008
    elif model_format == 'hf':
1009
1010
        res = deploy_hf(model_name, model_path, tokenizer_path,
                        triton_models_path, tp)
1011
1012
1013
    elif model_format == 'awq':
        res = deploy_awq(model_name, model_path, tokenizer_path,
                         triton_models_path, tp, quant_path, group_size)
1014
1015
1016
    elif model_format == 'qwen':
        res = deploy_qwen(model_name, model_path, tokenizer_path,
                          triton_models_path, tp)
1017
1018
1019
1020

    # update `tensor_para_size` in `triton_models/interactive/config.pbtxt`
    with open(osp.join(triton_models_path, 'interactive/config.pbtxt'),
              'a') as f:
1021
1022
1023
1024
1025
        param = \
            'parameters {\n  key: "tensor_para_size"\n  value: {\n    ' \
            'string_value: ' + f'"{tp}"\n' + '  }\n}\n' + \
            'parameters {\n  key: "model_name"\n  value: {\n    ' \
            'string_value: ' + f'"{model_name}"\n' + '  }\n}\n'
1026
1027
        f.write(param)
    if not res:
1028
        print(f'deploy model "{model_name}" via turbomind failed')
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
        destroy_workspace(dst_path)
        exit(-1)

    # pack model repository for triton inference server
    pack_model_repository(dst_path)

    # update the value of $TP in `service_docker_up.sh`
    file_path = osp.join(dst_path, 'service_docker_up.sh')
    with open(file_path, 'r') as f:
        content = f.read()
        content = re.sub('TP=1', f'TP={tp}', content)
    with open(file_path, 'w') as f:
        f.write(content)


if __name__ == '__main__':
1045
1046
    import fire

1047
    fire.Fire(main)