# The format of this config file is 'key = value', one setting per line.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds.

# Set performance_sample_count for each model.
# User can optionally set this to higher values in user.conf.
resnet50.*.performance_sample_count_override = 1024
ssd-mobilenet.*.performance_sample_count_override = 256
retinanet.*.performance_sample_count_override = 64
bert.*.performance_sample_count_override = 10833
dlrm.*.performance_sample_count_override = 204800
dlrm-v2.*.performance_sample_count_override = 204800
rnnt.*.performance_sample_count_override = 2513
gptj.*.performance_sample_count_override = 13368
mixtral-8x7b.*.performance_sample_count_override = 15000
llama2-70b.*.performance_sample_count_override = 24576
llama2-70b-interactive.*.performance_sample_count_override = 24576
llama3_1-405b.*.performance_sample_count_override = 8313
stable-diffusion-xl.*.performance_sample_count_override = 5000
rgat.*.performance_sample_count_override = 788379
# Set to 0 to use the entire sample set as the performance sample set.
3d-unet.*.performance_sample_count_override = 0

# Set seeds.
*.*.qsl_rng_seed = 6023615788873153749
*.*.sample_index_rng_seed = 15036839855038426416
*.*.schedule_rng_seed = 9933818062894767841

# Set seeds for TEST_05.
*.*.test05_qsl_rng_seed = 7975553102935885558
*.*.test05_sample_index_rng_seed = 11403566307062068064
*.*.test05_schedule_rng_seed = 15816800565822761601

# SingleStream scenario defaults.
*.SingleStream.target_latency_percentile = 90
*.SingleStream.min_duration = 600000

# MultiStream scenario defaults and per-model overrides.
*.MultiStream.target_latency_percentile = 99
*.MultiStream.samples_per_query = 8
*.MultiStream.min_duration = 600000
*.MultiStream.min_query_count = 662
retinanet.MultiStream.target_latency = 528

# 3D-UNet uses equal issue mode because it has non-uniform inputs.
3d-unet.*.sample_concatenate_permutation = 1

# R-GAT uses equal issue mode because it may have non-uniform inputs.
rgat.*.sample_concatenate_permutation = 1

# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode
# for all latency scenarios.
gptj.*.sample_concatenate_permutation = 1
llama2-70b.*.sample_concatenate_permutation = 1
llama2-70b-interactive.*.sample_concatenate_permutation = 1
mixtral-8x7b.*.sample_concatenate_permutation = 1
llama3_1-405b.*.sample_concatenate_permutation = 1

# Server scenario defaults.
*.Server.target_latency = 10
*.Server.target_latency_percentile = 99
*.Server.target_duration = 0
*.Server.min_duration = 600000

# Per-model Server target latencies.
resnet50.Server.target_latency = 15
retinanet.Server.target_latency = 100
bert.Server.target_latency = 130
dlrm.Server.target_latency = 60
dlrm-v2.Server.target_latency = 60
rnnt.Server.target_latency = 1000
gptj.Server.target_latency = 20000
stable-diffusion-xl.Server.target_latency = 20000

# Benchmarks that measure token latencies.
llama2-70b.*.use_token_latencies = 1
llama2-70b-interactive.*.use_token_latencies = 1
mixtral-8x7b.*.use_token_latencies = 1
llama3_1-405b.*.use_token_latencies = 1

# The gptj benchmark infers token latencies.
gptj.*.infer_token_latencies = 1
gptj.*.token_latency_scaling_factor = 69

# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7b and
# llama3_1-405b benchmarks; therefore target_latency = 0.
llama2-70b.Server.target_latency = 0
llama2-70b.Server.ttft_latency = 2000
llama2-70b.Server.tpot_latency = 200

# Target latencies for the low-latency setting.
llama2-70b-interactive.Server.target_latency = 0
llama2-70b-interactive.Server.ttft_latency = 450
llama2-70b-interactive.Server.tpot_latency = 40

mixtral-8x7b.Server.target_latency = 0
mixtral-8x7b.Server.ttft_latency = 2000
mixtral-8x7b.Server.tpot_latency = 200

llama3_1-405b.Server.target_latency = 0
llama3_1-405b.Server.ttft_latency = 6000
llama3_1-405b.Server.tpot_latency = 175

# Offline scenario defaults.
*.Offline.target_latency_percentile = 90
*.Offline.min_duration = 600000

# In Offline scenario, we always have one query. But LoadGen maps this to
# min_sample_count internally in Offline scenario. If the dataset size is larger
# than 24576 we limit the min_query_count to 24576 and otherwise we use
# the dataset size as the limit.
resnet50.Offline.min_query_count = 24576
retinanet.Offline.min_query_count = 24576
dlrm-v2.Offline.min_query_count = 24576
bert.Offline.min_query_count = 10833
gptj.Offline.min_query_count = 13368
rnnt.Offline.min_query_count = 2513
3d-unet.Offline.min_query_count = 43
stable-diffusion-xl.Offline.min_query_count = 5000
llama2-70b.Offline.min_query_count = 24576
llama3_1-405b.Offline.min_query_count = 8313
mixtral-8x7b.Offline.min_query_count = 15000
# NOTE(review): this value exceeds the 24576 cap described above - confirm it is intentional.
rgat.Offline.min_query_count = 788379

# These fields should be defined and overridden by user.conf.
*.SingleStream.target_latency = 10
*.MultiStream.target_latency = 80
*.Server.target_qps = 1.0
*.Offline.target_qps = 1.0