'''Copyright The Microsoft DeepSpeed Team'''

import os
import torch
import time
import deepspeed
import argparse
from transformers import pipeline
from deepspeed.accelerator import get_accelerator

parser = argparse.ArgumentParser()
parser.add_argument("--model", "-m", type=str, help="hf model name")
parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
parser.add_argument("--dtype",
                    type=str,
                    default="fp16",
                    choices=["fp16",
                             "fp32",
                             "int8"],
                    help="int8, fp16, or fp32")
parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
parser.add_argument("--local_rank",
                    type=int,
                    default=int(os.getenv("LOCAL_RANK",
                                          "0")),
                    help="local rank")
parser.add_argument("--world_size",
                    type=int,
                    default=int(os.getenv("WORLD_SIZE",
                                          "1")),
                    help="world size")
parser.add_argument("--trials", type=int, default=30, help="number of trials")
args = parser.parse_args()
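
# Example launch (a sketch: assumes the DeepSpeed launcher, which sets LOCAL_RANK and
# WORLD_SIZE; "gpt2" is just a placeholder HF model name):
#   deepspeed --num_gpus 1 gpt-bench.py -m gpt2 --dtype fp16 --deepspeed --kernel-inject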


def print_latency(latency_set, title, warmup=3):
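    """Report average and P50/P90/P95/P99/P999 latencies (in ms) for a list of
    per-trial timings in seconds, after dropping the first `warmup` trials."""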
    # trim warmup queries
    latency_set = list(latency_set)
    latency_set = latency_set[warmup:]
    count = len(latency_set)
    if count > 0:
        latency_set.sort()
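        # 1-based percentile positions into the sorted timings (index truncated below)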
        n50 = (count - 1) * 0.5 + 1
        n90 = (count - 1) * 0.9 + 1
        n95 = (count - 1) * 0.95 + 1
        n99 = (count - 1) * 0.99 + 1
        n999 = (count - 1) * 0.999 + 1

        avg = sum(latency_set) / count
        p50 = latency_set[int(n50) - 1]
        p90 = latency_set[int(n90) - 1]
        p95 = latency_set[int(n95) - 1]
        p99 = latency_set[int(n99) - 1]
        p999 = latency_set[int(n999) - 1]

        print(f"====== latency stats {title} ======")
        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
        print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
        print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
        print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
        print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
        print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000))


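# Initialize the distributed backend so multi-GPU (tensor-parallel) runs can coordinate.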
deepspeed.init_distributed()

if args.local_rank == 0:
    print("BENCHMARK SETTINGS:")
    print(f"\tMODEL: {args.model}")
    print(f"\tMAX_TOKENS: {args.max_tokens}")
    print(f"\tDTYPE: {args.dtype}")
    print(f"\tCUDA_GRAPHS: {args.graphs}")
    print(f"\tKERNEL_INJECT: {args.kernel_inject}")

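# Map the --dtype flag onto the torch dtype used for inference.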
if args.dtype == "int8":
    dtype = torch.int8
elif args.dtype == "fp16":
    dtype = torch.float16
else:
    dtype = torch.float32

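# Build a Hugging Face text-generation pipeline on this rank's device.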
pipe = pipeline("text-generation",
                model=args.model,
                framework="pt",
                device=args.local_rank)

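# Cast weights to half precision before (optionally) handing the model to DeepSpeed.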
if dtype == torch.float16:
    pipe.model.half()

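# Wrap the model with DeepSpeed inference: mp_size shards it across world_size ranks
# (tensor parallelism), replace_with_kernel_inject swaps in fused inference kernels,
# and enable_cuda_graph captures the forward pass as a CUDA graph. profile_model_time()
# records per-generation model-only times, read back via model_times() below.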
if args.deepspeed:
    pipe.model = deepspeed.init_inference(
        pipe.model,
        dtype=dtype,
        mp_size=args.world_size,
        replace_with_kernel_inject=args.kernel_inject,
        enable_cuda_graph=args.graphs,
    )
    pipe.model.profile_model_time()

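# Timed trials: synchronize the accelerator before and after each generation so the
# wall-clock measurement covers the full forward pass, not just the kernel launch.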
responses = []
times = []
mtimes = []
for i in range(args.trials):
    get_accelerator().synchronize()
    start = time.time()
    r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens)
    get_accelerator().synchronize()
    end = time.time()
    responses.append(r)
    times.append(end - start)
    # model_times() is only available when DeepSpeed's profile_model_time() is active
    if args.deepspeed:
        mtimes.append(sum(pipe.model.model_times()))

if args.local_rank == 0:
    print_latency(times, "(e2e) latency")
    print_latency(mtimes, "(model-only) latency")
    print_latency(map(lambda t: t / (args.max_tokens - 3),
                      times),
                  "(e2e) per token latency")
    print(f"RESPONSE 0:")
    print("-" * 30)
    print(responses[0][0]["generated_text"])
    print("-" * 30)