mlperf_conf.h 5.2 KB
Newer Older
yangzhong's avatar
yangzhong committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
const char* mlperf_conf =
"# The format of this config file is 'key = value'.\n"
"# The key has the format 'model.scenario.key'. Value is mostly int64_t.\n"
"# Model maybe '*' as wildcard. In that case the value applies to all models.\n"
"# All times are in milli seconds\n"
"\n"
"# Set performance_sample_count for each model.\n"
"# User can optionally set this to higher values in user.conf.\n"
"resnet50.*.performance_sample_count_override = 1024\n"
"ssd-mobilenet.*.performance_sample_count_override = 256\n"
"retinanet.*.performance_sample_count_override = 64\n"
"bert.*.performance_sample_count_override = 10833\n"
"dlrm.*.performance_sample_count_override = 204800\n"
"dlrm-v2.*.performance_sample_count_override = 204800\n"
"rnnt.*.performance_sample_count_override = 2513\n"
"gptj.*.performance_sample_count_override = 13368\n"
"mixtral-8x7b.*.performance_sample_count_override = 15000\n"
"llama2-70b.*.performance_sample_count_override = 24576\n"
"llama2-70b-interactive.*.performance_sample_count_override = 24576\n"
"llama3_1-405b.*.performance_sample_count_override = 8313\n"
"stable-diffusion-xl.*.performance_sample_count_override = 5000\n"
"rgat.*.performance_sample_count_override = 788379\n"
"# set to 0 to let entire sample set to be performance sample\n"
"3d-unet.*.performance_sample_count_override = 0\n"
"\n"
"# Set seeds.\n"
"*.*.qsl_rng_seed = 6023615788873153749\n"
"*.*.sample_index_rng_seed = 15036839855038426416\n"
"*.*.schedule_rng_seed = 9933818062894767841\n"
"# Set seeds for TEST_05.\n"
"*.*.test05_qsl_rng_seed = 7975553102935885558\n"
"*.*.test05_sample_index_rng_seed = 11403566307062068064\n"
"*.*.test05_schedule_rng_seed = 15816800565822761601\n"
"\n"
"\n"
"*.SingleStream.target_latency_percentile = 90\n"
"*.SingleStream.min_duration = 600000\n"
"\n"
"*.MultiStream.target_latency_percentile = 99\n"
"*.MultiStream.samples_per_query = 8\n"
"*.MultiStream.min_duration = 600000\n"
"*.MultiStream.min_query_count = 662\n"
"retinanet.MultiStream.target_latency = 528\n"
"\n"
"# 3D-UNet uses equal issue mode because it has non-uniform inputs\n"
"3d-unet.*.sample_concatenate_permutation = 1\n"
"\n"
"# R-GAT uses equal issue mode because it may have non-uniform inputs\n"
"rgat.*.sample_concatenate_permutation = 1\n"
"\n"
"# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario\n"
"gptj.*.sample_concatenate_permutation = 1\n"
"llama2-70b.*.sample_concatenate_permutation = 1\n"
"llama2-70b-interactive.*.sample_concatenate_permutation = 1\n"
"mixtral-8x7b.*.sample_concatenate_permutation = 1\n"
"llama3_1-405b.*.sample_concatenate_permutation = 1\n"
"\n"
"*.Server.target_latency = 10\n"
"*.Server.target_latency_percentile = 99\n"
"*.Server.target_duration = 0\n"
"*.Server.min_duration = 600000\n"
"resnet50.Server.target_latency = 15\n"
"retinanet.Server.target_latency = 100\n"
"bert.Server.target_latency = 130\n"
"dlrm.Server.target_latency = 60\n"
"dlrm-v2.Server.target_latency = 60\n"
"rnnt.Server.target_latency = 1000\n"
"gptj.Server.target_latency = 20000\n"
"stable-diffusion-xl.Server.target_latency = 20000\n"
"# Benchmarks that measure token latencies\n"
"llama2-70b.*.use_token_latencies = 1\n"
"llama2-70b-interactive.*.use_token_latencies = 1\n"
"mixtral-8x7b.*.use_token_latencies = 1\n"
"llama3_1-405b.*.use_token_latencies = 1\n"
"# gptj benchmark infers token latencies\n"
"gptj.*.infer_token_latencies = 1\n"
"gptj.*.token_latency_scaling_factor = 69\n"
"# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0\n"
"llama2-70b.Server.target_latency = 0\n"
"llama2-70b.Server.ttft_latency = 2000\n"
"llama2-70b.Server.tpot_latency = 200\n"
"\n"
"# Target Latencies for low latency setting\n"
"llama2-70b-interactive.Server.target_latency = 0\n"
"llama2-70b-interactive.Server.ttft_latency = 450\n"
"llama2-70b-interactive.Server.tpot_latency = 40\n"
"\n"
"mixtral-8x7b.Server.target_latency = 0\n"
"mixtral-8x7b.Server.ttft_latency = 2000\n"
"mixtral-8x7b.Server.tpot_latency = 200\n"
"\n"
"llama3_1-405b.Server.target_latency = 0\n"
"llama3_1-405b.Server.ttft_latency = 6000\n"
"llama3_1-405b.Server.tpot_latency = 175\n"
"\n"
"*.Offline.target_latency_percentile = 90\n"
"*.Offline.min_duration = 600000\n"
"\n"
"# In Offline scenario, we always have one query. But LoadGen maps this to\n"
"# min_sample_count internally in Offline scenario. If the dataset size is larger\n"
"# than 24576 we limit the min_query_count to 24576 and otherwise we use\n"
"# the dataset size as the limit\n"
"\n"
"resnet50.Offline.min_query_count = 24576\n"
"retinanet.Offline.min_query_count = 24576\n"
"dlrm-v2.Offline.min_query_count = 24576\n"
"bert.Offline.min_query_count = 10833\n"
"gptj.Offline.min_query_count = 13368\n"
"rnnt.Offline.min_query_count = 2513\n"
"3d-unet.Offline.min_query_count = 43\n"
"stable-diffusion-xl.Offline.min_query_count = 5000\n"
"llama2-70b.Offline.min_query_count = 24576\n"
"llama3_1-405b.Offline.min_query_count = 8313\n"
"mixtral-8x7b.Offline.min_query_count = 15000\n"
"rgat.Offline.min_query_count = 788379\n"
"\n"
"# These fields should be defined and overridden by user.conf.\n"
"*.SingleStream.target_latency = 10\n"
"*.MultiStream.target_latency = 80\n"
"*.Server.target_qps = 1.0\n"
"*.Offline.target_qps = 1.0\n"
"";