# jiuge.py — example script: run Llama-family model inference with InfiniLM
# on CPU or on a vendor accelerator (NVIDIA / Metax / Moore / Iluvatar / Cambricon).
import argparse
import os
import sys
import time

# Make the in-repo package importable BEFORE importing infinilm/infinicore.
# (Originally this insert ran after the imports, so it could never help them resolve.)
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))

import infinicore
import infinilm
from infinilm.distributed import DistConfig
from infinilm.modeling_utils import load_model_state_dict_by_file
from tokenizers import decoders as _dec
from transformers import AutoTokenizer


def get_args():
    """Parse command-line arguments for the example runner.

    Returns:
        argparse.Namespace with:
            cpu/nvidia/metax/moore/iluvatar/cambricon (bool): device selector flags.
            model_path (str): path to the model directory (required).
            max_new_tokens (int): generation length cap, default 100.
            backend (str): "python" or "cpp" model implementation, default "cpp".
            batch_size (int): number of prompts in a batch, default 1.
            prompt (str): input prompt, default "How are you".
            tp (int): total rank for tensor parallelism, default 1.
    """
    parser = argparse.ArgumentParser(description="run Llama args")

    # Device selector flags; exactly one is expected (enforced by the caller).
    for device_flag in ("cpu", "nvidia", "metax", "moore", "iluvatar", "cambricon"):
        parser.add_argument(
            f"--{device_flag}",
            action="store_true",
            help=f"Run {device_flag} test",
        )

    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="model_path",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=100,
        help="max_new_tokens",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="cpp",
        help="python or cpp model",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        help="number of prompts in a batch",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="How are you",
        help="input prompt",
    )
    parser.add_argument(
        "--tp",
        type=int,
        default=1,
        help="total rank for tensor parallel",
    )

    return parser.parse_args()


def test(
    prompts: str | list[str],
    model_path,
    max_new_tokens=100,
    infini_device=infinicore.device("cpu", 0),
    backend="python",
    tp=1,
):
    """Load a Llama model, tokenize the prompt(s), and run autoregressive generation.

    Args:
        prompts: a single prompt string or a list of prompt strings (a batch).
        model_path: model directory; "~" is expanded.
        max_new_tokens: generation length cap.
        infini_device: infinicore device to run on (default CPU rank 0;
            note the default is evaluated once at import time).
        backend: "python" or "cpp" model implementation.
        tp: total rank for tensor parallelism.
    """
    model_path = os.path.expanduser(model_path)
    # ---------------------------------------------------------------------------- #
    #                        Create the model
    # ---------------------------------------------------------------------------- #
    model = infinilm.AutoLlamaModel.from_pretrained(
        model_path,
        device=infini_device,
        backend=backend,
        distributed_config=DistConfig(tp),
    )

    # ---------------------------------------------------------------------------- #
    #                        Load the weights
    # ---------------------------------------------------------------------------- #
    load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype)

    # ---------------------------------------------------------------------------- #
    #                        Create the tokenizer
    # ---------------------------------------------------------------------------- #
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # SentencePiece-style llama tokenizers pair a "Prepend" normalizer with a
    # "Strip" decoder, which eats the leading space when tokens are decoded
    # incrementally during streaming.  Replace the decoder so spaces survive.
    # (Renamed from `backend` to avoid shadowing the function parameter.)
    if "llama" == model.config.model_type:
        tok_backend = getattr(tokenizer, "backend_tokenizer", None)
        target = getattr(tok_backend, "_tokenizer", tok_backend)
        norm = getattr(target, "normalizer", None)
        dec = getattr(target, "decoder", None)
        sn = repr(norm)[:800] if norm is not None else ""
        sd = repr(dec)[:800] if dec is not None else ""
        has_prepend = "Prepend" in sn
        has_strip = "Strip" in sd
        if has_prepend and has_strip:
            target.decoder = _dec.Sequence(
                [
                    _dec.Replace("▁", " "),
                    _dec.ByteFallback(),
                    _dec.Fuse(),
                ]
            )

    # ---------------------------------------------------------------------------- #
    #                        Token encoding
    # ---------------------------------------------------------------------------- #
    if isinstance(prompts, str):
        prompts = [prompts]
    input_contents = [
        tokenizer.apply_chat_template(
            conversation=[{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            tokenize=False,
        )
        for prompt in prompts
    ]

    input_ids_list = tokenizer.batch_encode_plus(input_contents)[
        "input_ids"
    ]  # List: [[1, 1128, 526, 366, 29892]]

    # Size the KV cache from batch size and (input + output) length.
    # `prompts` is guaranteed to be a list here, so len() is always valid
    # (the original guard `prompts is str` could never be True).
    # NOTE(review): uses the FIRST prompt's length only — assumes all prompts
    # in a batch tokenize to the same length; confirm for ragged batches.
    model.reset_cache(
        len(prompts),
        max_new_tokens + len(input_ids_list[0]),
    )

    # ---------------------------------------------------------------------------- #
    #                        Autoregressive generation
    # ---------------------------------------------------------------------------- #
    print(input_contents[0], end="", flush=True)

    input_ids_infini = infinicore.from_list(input_ids_list)

    t1 = time.time()
    print("=================== start generate ====================")
    model.generate(
        input_ids_infini,
        max_new_tokens=max_new_tokens,
        tokenizer=tokenizer,
    )
    t2 = time.time()

    print(
        f"total_time: {round((t2 - t1) * 1000, 2)} ms",
    )


if __name__ == "__main__":
    args = get_args()
    print(args)

    # Map the selected vendor flag to an infinicore device string.
    # Metax and Iluvatar expose CUDA-compatible runtimes, hence "cuda".
    device_str = "cpu"
    if args.cpu:
        device_str = "cpu"
    elif args.nvidia:
        device_str = "cuda"
    elif args.metax:
        device_str = "cuda"
    elif args.moore:
        device_str = "musa"
    elif args.iluvatar:
        device_str = "cuda"
    elif args.cambricon:
        device_str = "mlu"
    else:
        # Usage now lists --cambricon too: it is accepted by the parser and
        # handled above, but was missing from the original message.
        print(
            "Usage:  python examples/jiuge.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path=<path/to/model_dir>\n"
            "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
        )
        sys.exit(1)
    # Duplicate the single prompt batch_size times.
    prompts = [args.prompt for _ in range(args.batch_size)]

    model_path = args.model_path
    max_new_tokens = args.max_new_tokens
    backend = args.backend
    tp = args.tp

    infini_device = infinicore.device(device_str, 0)

    test(
        prompts,
        model_path,
        max_new_tokens,
        infini_device=infini_device,
        backend=backend,
        tp=tp,
    )