Unverified Commit 9f56b219 authored by Yuting Jiang, committed by GitHub

Benchmarks: Unify metric names of benchmarks (#252)

**Description**
Unify metric names across all benchmarks: lowercase names built as `{precision}_{model_action}_{measure}` for model benchmarks (e.g. `fp32_train_step_time`, `fp16_inference_throughput`), and short unit-style suffixes such as `_bw`, `_flops`, and `_time` for micro-benchmarks (e.g. `h2d_bw`, `fp32_flops`).
parent c13ed2a2
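To make the convention concrete, here is a minimal sketch; the `metric_name` helper and `PRECISION_ABBR` constant are illustrative only and are not code added by this PR, though the abbreviation mapping mirrors the `precision_metric` dict introduced in the diff below:

```python
# Illustrative sketch of the unified metric naming scheme (not superbenchmark code).
# PRECISION_ABBR mirrors the precision_metric dict added in this PR.
PRECISION_ABBR = {'float16': 'fp16', 'float32': 'fp32', 'float64': 'fp64', 'bfloat16': 'bf16'}


def metric_name(precision, model_action, measure):
    """Build a unified metric name, e.g. ('float32', 'train', 'step_time') -> 'fp32_train_step_time'."""
    short = PRECISION_ABBR.get(precision, precision)
    return '{}_{}_{}'.format(short, model_action, measure)


assert metric_name('float32', 'train', 'step_time') == 'fp32_train_step_time'
assert metric_name('float16', 'inference', 'throughput') == 'fp16_inference_throughput'
```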
@@ -373,7 +373,10 @@ def __process_model_result(self, model_action, precision, step_times):
)
return False
-metric = 'steptime_{}_{}'.format(model_action, precision)
+precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'float64': 'fp64', 'bfloat16': 'bf16'}
+if precision.value in precision_metric.keys():
+precision = precision_metric[precision.value]
+metric = '{}_{}_step_time'.format(precision, model_action)
self._result.add_raw_data(metric, step_times)
avg = statistics.mean(step_times)
self._result.add_result(metric, avg, reduce_type=ReduceType.MAX if model_action is ModelAction.TRAIN else None)
@@ -381,7 +384,7 @@ def __process_model_result(self, model_action, precision, step_times):
# The unit of step time is millisecond, use it to calculate the throughput with the unit samples/sec.
millisecond_per_second = 1000
throughput = [millisecond_per_second / step_time * self._args.batch_size for step_time in step_times]
-metric = 'throughput_{}_{}'.format(model_action, precision)
+metric = '{}_{}_throughput'.format(precision, model_action)
self._result.add_raw_data(metric, throughput)
avg = statistics.mean(throughput)
self._result.add_result(metric, avg, reduce_type=ReduceType.MIN if model_action is ModelAction.TRAIN else None)
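For reference, a quick sanity check of this formula under assumed values (2.0 ms step time, batch size 32; the product matches the `fp32_train_throughput` expectations in the fake-model tests further down in this diff):

```python
# Throughput (samples/sec) from a step time in milliseconds.
# step_time and batch_size are assumptions chosen for illustration,
# matching the fake-model test expectations later in this PR.
millisecond_per_second = 1000
step_time = 2.0    # milliseconds per step
batch_size = 32    # assumed batch size of the fake model
throughput = millisecond_per_second / step_time * batch_size
assert throughput == 16000.0    # cf. fp32_train_throughput expected below
```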
@@ -44,13 +44,13 @@ def test_rocm_onnxruntime_performance():
"samples_per_second": 274.455
"""
assert (benchmark._process_raw_result(0, raw_output))
-assert (benchmark.result['bert_large_uncased_ngpu_1'][0] == 21.829)
-assert (benchmark.result['bert_large_uncased_ngpu_8'][0] == 147.181)
-assert (benchmark.result['distilbert_base_uncased_ngpu_1'][0] == 126.827)
-assert (benchmark.result['distilbert_base_uncased_ngpu_8'][0] == 966.796)
-assert (benchmark.result['gpt2_ngpu_1'][0] == 20.46)
-assert (benchmark.result['gpt2_ngpu_8'][0] == 151.089)
-assert (benchmark.result['facebook_bart_large_ngpu_1'][0] == 66.171)
-assert (benchmark.result['facebook_bart_large_ngpu_8'][0] == 370.343)
-assert (benchmark.result['roberta_large_ngpu_1'][0] == 37.103)
-assert (benchmark.result['roberta_large_ngpu_8'][0] == 274.455)
+assert (benchmark.result['bert_large_uncased_ngpu_1_throughput'][0] == 21.829)
+assert (benchmark.result['bert_large_uncased_ngpu_8_throughput'][0] == 147.181)
+assert (benchmark.result['distilbert_base_uncased_ngpu_1_throughput'][0] == 126.827)
+assert (benchmark.result['distilbert_base_uncased_ngpu_8_throughput'][0] == 966.796)
+assert (benchmark.result['gpt2_ngpu_1_throughput'][0] == 20.46)
+assert (benchmark.result['gpt2_ngpu_8_throughput'][0] == 151.089)
+assert (benchmark.result['facebook_bart_large_ngpu_1_throughput'][0] == 66.171)
+assert (benchmark.result['facebook_bart_large_ngpu_8_throughput'][0] == 370.343)
+assert (benchmark.result['roberta_large_ngpu_1_throughput'][0] == 37.103)
+assert (benchmark.result['roberta_large_ngpu_8_throughput'][0] == 274.455)
@@ -38,7 +38,7 @@ def test_flops_performance_cuda(self):
# Negative case - MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE.
benchmark = benchmark_class(
benchmark_name,
-parameters='--num_warmup 200 --n 1024 --k 512 --m 2048 --precision FP32 TF32_TC FP16_TC INT8_TC'
+parameters='--num_warmup 200 --n 1024 --k 512 --m 2048 --precision fp32 tf32_tc fp16_tc int8_tc'
)
ret = benchmark._preprocess()
@@ -59,11 +59,11 @@ def test_flops_performance_cuda(self):
assert (benchmark._args.n == 1024)
assert (benchmark._args.k == 512)
assert (benchmark._args.m == 2048)
-assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC'])
-benchmark._CudaGemmFlopsBenchmark__precision_need_to_run = ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']
+assert (benchmark._args.precision == ['fp32', 'tf32_tc', 'fp16_tc', 'int8_tc'])
+benchmark._CudaGemmFlopsBenchmark__precision_need_to_run = ['fp32', 'tf32_tc', 'fp16_tc', 'int8_tc']
# Check results and metrics.
raw_output_FP32 = """
raw_output_fp32 = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
@@ -72,7 +72,7 @@ def test_flops_performance_cuda(self):
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tn_align1,passed,success,universal,16384,16384,16384,f32:row,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,482.034,6.22363,18249
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tt_align1,passed,success,universal,16384,16384,16384,f32:row,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.838,6.22616,18256.4
"""
raw_output_TF32_TC = """
raw_output_tf32_tc = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
@@ -81,7 +81,7 @@ def test_flops_performance_cuda(self):
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tn_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,86.5167,34.6754,101676
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tt_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,68.3621,43.884,128677
"""
raw_output_FP16_TC = """
raw_output_fp16_tc = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
@@ -90,13 +90,13 @@ def test_flops_performance_cuda(self):
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tn_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,39.0413,38.4209,225316
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tt_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,31.2994,47.9243,281048
"""
-assert (benchmark._process_raw_result(0, raw_output_FP32))
-assert (benchmark._process_raw_result(1, raw_output_TF32_TC))
-assert (benchmark._process_raw_result(2, raw_output_FP16_TC))
+assert (benchmark._process_raw_result(0, raw_output_fp32))
+assert (benchmark._process_raw_result(1, raw_output_tf32_tc))
+assert (benchmark._process_raw_result(2, raw_output_fp16_tc))
-assert (benchmark.result['FP32'][0] == 18369.7)
-assert (benchmark.result['TF32_TC'][0] == 128677)
-assert (benchmark.result['FP16_TC'][0] == 281048)
+assert (benchmark.result['fp32_flops'][0] == 18369.7)
+assert (benchmark.result['tf32_tc_flops'][0] == 128677)
+assert (benchmark.result['fp16_tc_flops'][0] == 281048)
# Negative case - Add invalid raw output.
assert (benchmark._process_raw_result(3, 'Invalid raw output') is False)
@@ -328,7 +328,7 @@ def test_cuda_memory_bw_performance(self):
bandwidthTest-D2D, Bandwidth = 762.8 GB/s, Time = 0.00009 s, Size = 68000000 bytes, NumDevsUsed = 1
Result = PASS
"""
-for i, metric in enumerate(['H2D_Mem_BW', 'D2H_Mem_BW', 'D2D_Mem_BW']):
+for i, metric in enumerate(['h2d_bw', 'd2h_bw', 'd2d_bw']):
assert (benchmark._process_raw_result(i, raw_output[i]))
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
@@ -519,19 +519,19 @@ def test_disk_performance_result_parsing(self):
assert (1 == len(benchmark.result[jobname_prefix + '_write_iops']))
assert (85066.128925 == benchmark.result[jobname_prefix + '_write_iops'][0])
-assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_95.000000']))
-assert (1941504 == benchmark.result[jobname_prefix + '_read_lat_ns_95.000000'][0])
-assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_99.000000']))
-assert (2244608 == benchmark.result[jobname_prefix + '_read_lat_ns_99.000000'][0])
-assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_99.900000']))
-assert (3620864 == benchmark.result[jobname_prefix + '_read_lat_ns_99.900000'][0])
-assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_95.000000']))
-assert (1908736 == benchmark.result[jobname_prefix + '_write_lat_ns_95.000000'][0])
-assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_99.000000']))
-assert (2072576 == benchmark.result[jobname_prefix + '_write_lat_ns_99.000000'][0])
-assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_99.900000']))
-assert (2605056 == benchmark.result[jobname_prefix + '_write_lat_ns_99.900000'][0])
+assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_95.0']))
+assert (1941504 == benchmark.result[jobname_prefix + '_read_lat_ns_95.0'][0])
+assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_99.0']))
+assert (2244608 == benchmark.result[jobname_prefix + '_read_lat_ns_99.0'][0])
+assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_99.9']))
+assert (3620864 == benchmark.result[jobname_prefix + '_read_lat_ns_99.9'][0])
+assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_95.0']))
+assert (1908736 == benchmark.result[jobname_prefix + '_write_lat_ns_95.0'][0])
+assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_99.0']))
+assert (2072576 == benchmark.result[jobname_prefix + '_write_lat_ns_99.0'][0])
+assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_99.9']))
+assert (2605056 == benchmark.result[jobname_prefix + '_write_lat_ns_99.9'][0])
# Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
@@ -72,7 +72,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
return True
-def test_memory_bw_performance_base():
+def test_gemm_flops_performance_base():
"""Test GemmFlopsBenchmark."""
# Positive case - memory=pinned.
benchmark = FakeGemmFlopsBenchmark('fake')
@@ -81,49 +81,49 @@ def test_memory_bw_performance_base():
assert (benchmark.return_code == ReturnCode.SUCCESS)
# Check command list
expected_command = [
'echo "--precision FP64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision FP32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision FP16 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision FP64_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision TF32_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision BF16_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision FP16_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision INT8_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision INT4_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"'
'echo "--precision fp64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision fp32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision fp16 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision fp64_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision tf32_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision bf16_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision fp16_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision int8_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision int4_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"'
]
for i in range(len(expected_command)):
command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_command[i])
for i, metric in enumerate(
-['FP64', 'FP32', 'FP16', 'FP64_TC', 'TF32_TC', 'BF16_TC', 'FP16_TC', 'INT8_TC', 'INT4_TC']
+['fp64', 'fp32', 'fp16', 'fp64_tc', 'tf32_tc', 'bf16_tc', 'fp16_tc', 'int8_tc', 'int4_tc']
):
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
# Positive case - memory=unpinned.
-benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision FP64 FP32 FP16')
+benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision fp64 fp32 fp16')
assert (benchmark._benchmark_type == BenchmarkType.MICRO)
assert (benchmark.run())
assert (benchmark.return_code == ReturnCode.SUCCESS)
# Check command list
expected_command = [
'echo "--precision FP64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision FP32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision FP16 --m 16384 --n 16384 --k 16384 --num_warmup 5"'
'echo "--precision fp64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision fp32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
'echo "--precision fp16 --m 16384 --n 16384 --k 16384 --num_warmup 5"'
]
for i in range(len(expected_command)):
command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_command[i])
-for i, metric in enumerate(['FP64', 'FP32', 'FP16']):
+for i, metric in enumerate(['fp64', 'fp32', 'fp16']):
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
-benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision FP64 BF64')
+benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision fp64 bf64')
assert (benchmark._benchmark_type == BenchmarkType.MICRO)
assert (benchmark.run() is True)
# Negative case - INVALID_ARGUMENT.
-benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision BF64')
+benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision bf64')
assert (benchmark._benchmark_type == BenchmarkType.MICRO)
assert (benchmark.run() is False)
assert (benchmark.return_code == ReturnCode.NO_SUPPORTED_PRECISION)
@@ -98,14 +98,19 @@ def test_gpcnet_network_test(self):
# Check function process_raw_data.
# Positive case - valid raw output.
assert (benchmark._process_raw_result(0, raw_output))
-test_name = 'IsolatedNetworkTests'
metric_list = [
-'RRTwo-sidedLat(8B)', 'RRGetLat(8B)', 'RRTwo-sidedBW(131072B)', 'RRPutBW(131072B)',
-'RRTwo-sidedBW+Sync(131072B)', 'NatTwo-sidedBW(131072B)', 'MultipleAllreduce(8B)', 'MultipleAlltoall(4096B)'
+'rr_two-sided_lat',
+'rr_get_lat',
+'rr_two-sided_bw',
+'rr_put_bw',
+'rr_two-sided+sync_bw',
+'nat_two-sided_bw',
+'multiple_allreduce_time',
+'multiple_alltoall_bw',
]
for metric_medium in metric_list:
-for suffix in ['Avg', '99%']:
-metric = test_name + '_' + metric_medium + '_' + suffix
+for suffix in ['avg', '99%']:
+metric = metric_medium + '_' + suffix
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
@@ -253,58 +258,10 @@ def test_gpcnet_network_load(self): # noqa: C901
assert (len(benchmark.result) == benchmark.default_metric_count)
# Positive case - valid raw output.
assert (benchmark._process_raw_result(0, raw_output))
-test_name = 'IsolatedNetworkTests'
-metric_list = ['RRTwo-sidedLat(8B)', 'RRTwo-sidedBW+Sync(131072B)', 'MultipleAllreduce(8B)']
+metric_list = ['rr_two-sided_lat_x', 'rr_two-sided+sync_bw_x', 'multiple_allreduce_x']
for metric_medium in metric_list:
-for suffix in ['Max', 'Min', 'Avg', '99.9%']:
-metric = test_name + '_' + metric_medium + '_' + suffix
-assert (metric in benchmark.result)
-assert (len(benchmark.result[metric]) == 1)
-assert (isinstance(benchmark.result[metric][0], numbers.Number))
-test_name = 'IsolatedCongestionTests'
-metric_list = ['GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)']
-for metric_medium in metric_list:
-for suffix in ['Max', 'Min', 'Avg', '99.9%']:
-metric = test_name + '_' + metric_medium + '_' + suffix
-assert (metric in benchmark.result)
-assert (len(benchmark.result[metric]) == 1)
-assert (isinstance(benchmark.result[metric][0], numbers.Number))
-test_name = 'NetworkTestsrunningwithCongestionTests(RRTwo-sidedLatNetworkTest)'
-metric_list = [
-'GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)', 'RRTwo-sidedLat(8B)'
-]
-for metric_medium in metric_list:
-for suffix in ['Max', 'Min', 'Avg', '99.9%']:
-metric = test_name + '_' + metric_medium + '_' + suffix
-assert (metric in benchmark.result)
-assert (len(benchmark.result[metric]) == 1)
-assert (isinstance(benchmark.result[metric][0], numbers.Number))
-test_name = 'NetworkTestsrunningwithCongestionTests(RRTwo-sidedBW+SyncNetworkTest)'
-metric_list = [
-'GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)',
-'RRTwo-sidedBW+Sync(131072B)'
-]
-for metric_medium in metric_list:
-for suffix in ['Max', 'Min', 'Avg', '99.9%']:
-metric = test_name + '_' + metric_medium + '_' + suffix
-assert (metric in benchmark.result)
-assert (len(benchmark.result[metric]) == 1)
-assert (isinstance(benchmark.result[metric][0], numbers.Number))
-test_name = 'NetworkTestsrunningwithCongestionTests(MultipleAllreduceNetworkTest)'
-metric_list = [
-'GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)', 'MultipleAllreduce(8B)'
-]
-for metric_medium in metric_list:
-for suffix in ['Max', 'Min', 'Avg', '99.9%']:
-metric = test_name + '_' + metric_medium + '_' + suffix
-assert (metric in benchmark.result)
-assert (len(benchmark.result[metric]) == 1)
-assert (isinstance(benchmark.result[metric][0], numbers.Number))
-test_name = 'NetworkTestsrunningwithCongestionTests-KeyResults'
-metric_list = ['RRTwo-sidedLat(8B)', 'RRTwo-sidedBW+Sync(131072B)', 'MultipleAllreduce(8B)']
-for metric_medium in metric_list:
-for suffix in ['Avg', '99%']:
-metric = test_name + '_' + metric_medium + '_' + suffix
+for suffix in ['avg', '99%']:
+metric = metric_medium + '_' + suffix
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
@@ -119,8 +119,8 @@ def _test_gpu_copy_bw_performance_result_parsing(self, platform):
else:
assert (len(benchmark.result[output_key]) == 1)
assert (isinstance(benchmark.result[output_key][0], numbers.Number))
-assert (output_key in test_raw_output_dict)
-assert (test_raw_output_dict[output_key] == benchmark.result[output_key][0])
+assert (output_key.strip('_bw') in test_raw_output_dict)
+assert (test_raw_output_dict[output_key.strip('_bw')] == benchmark.result[output_key][0])
# Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
@@ -158,7 +158,7 @@ def test_ib_loopback_all_sizes(self, mock_ib_devices, mock_numa_cores, mock_port
metric_list = []
for ib_command in benchmark._args.commands:
for size in ['8388608', '4194304', '1024', '2']:
-metric = 'IB_{}_{}_Avg_{}'.format(ib_command, size, str(benchmark._args.ib_index))
+metric = 'ib_{}_{}_ib{}_bw'.format(ib_command, size, str(benchmark._args.ib_index))
metric_list.append(metric)
for metric in metric_list:
assert (metric in benchmark.result)
@@ -270,7 +270,7 @@ def test_ib_loopback_8M_size(self, mock_ib_devices, mock_numa_cores, mock_port):
# Positive case - valid raw output.
metric_list = []
for ib_command in benchmark._args.commands:
-metric = 'IB_{}_8388608_Avg_{}'.format(ib_command, str(benchmark._args.ib_index))
+metric = 'ib_{}_8388608_ib{}_bw'.format(ib_command, str(benchmark._args.ib_index))
metric_list.append(metric)
for metric in metric_list:
assert (metric in benchmark.result)
@@ -27,6 +27,9 @@ def setUp(self):
def tearDown(self):
"""Method called after the test method has been called and the result recorded."""
self.__binary_file.unlink()
+p = Path('hostfile')
+if p.is_file():
+p.unlink()
def test_generate_config(self): # noqa: C901
"""Test util functions ."""
@@ -126,15 +129,18 @@ def test_ib_traffic_performance(self, mock_ib_devices):
# Check preprocess
# Negative cases
-parameters = '--ib_index 0 --iters 2000 --pattern one-to-one'
+parameters = '--ib_index 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
mock_ib_devices.return_value = None
ret = benchmark._preprocess()
assert (ret is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_MPI_INIT_FAILURE)
+hosts = ['node0\n', 'node1\n', 'node2\n', 'node3\n']
+with open('hostfile', 'w') as f:
+f.writelines(hosts)
os.environ['OMPI_COMM_WORLD_SIZE'] = '4'
-parameters = '--ib_index 0 --iters 2000 --pattern one-to-one'
+parameters = '--ib_index 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
mock_ib_devices.return_value = None
ret = benchmark._preprocess()
@@ -143,21 +149,21 @@ def test_ib_traffic_performance(self, mock_ib_devices):
# Positive cases
os.environ['OMPI_COMM_WORLD_SIZE'] = '3'
-parameters = '--ib_index 0 --iters 2000 --pattern one-to-one'
+parameters = '--ib_index 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
mock_ib_devices.return_value = ['mlx5_0']
ret = benchmark._preprocess()
assert (ret is True)
# Generate config
-parameters = '--ib_index 0 --iters 2000 --msg_size 33554432'
+parameters = '--ib_index 0 --iters 2000 --msg_size 33554432 --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
os.environ['OMPI_COMM_WORLD_SIZE'] = '4'
mock_ib_devices.return_value = ['mlx5_0']
ret = benchmark._preprocess()
Path('config.txt').unlink()
assert (ret)
-expect_command = 'ib_validation --hostfile /root/hostfile --cmd_prefix "ib_write_bw -F ' + \
+expect_command = 'ib_validation --hostfile hostfile --cmd_prefix "ib_write_bw -F ' + \
'--iters=2000 -d mlx5_0 -s 33554432" --input_config ' + os.getcwd() + '/config.txt'
command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
assert (command == expect_command)
@@ -167,14 +173,14 @@ def test_ib_traffic_performance(self, mock_ib_devices):
with open('test_config.txt', 'w') as f:
for line in config:
f.write(line + '\n')
-parameters = '--ib_index 0 --iters 2000 --msg_size 33554432 --config test_config.txt'
+parameters = '--ib_index 0 --iters 2000 --msg_size 33554432 --config test_config.txt --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
os.environ['OMPI_COMM_WORLD_SIZE'] = '2'
mock_ib_devices.return_value = ['mlx5_0']
ret = benchmark._preprocess()
Path('test_config.txt').unlink()
assert (ret)
-expect_command = 'ib_validation --hostfile /root/hostfile --cmd_prefix "ib_write_bw -F ' + \
+expect_command = 'ib_validation --hostfile hostfile --cmd_prefix "ib_write_bw -F ' + \
'--iters=2000 -d mlx5_0 -s 33554432" --input_config test_config.txt'
command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
@@ -36,7 +36,7 @@ def test_kernel_launch_overhead():
assert ('raw_output_0' in benchmark.raw_data)
assert (len(benchmark.raw_data['raw_output_0']) == 1)
assert (isinstance(benchmark.raw_data['raw_output_0'][0], str))
-for metric in ['event_overhead', 'wall_overhead']:
+for metric in ['event_time', 'wall_time']:
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
@@ -35,6 +35,6 @@ def test_pytorch_matmul():
# Check results and metrics.
assert (benchmark.run_count == 2)
assert (benchmark.return_code == ReturnCode.SUCCESS)
-assert (len(benchmark.raw_data['nosharding']) == benchmark.run_count)
-assert (len(benchmark.raw_data['nosharding'][0]) == benchmark._args.num_steps)
-assert (len(benchmark.result['nosharding']) == benchmark.run_count)
+assert (len(benchmark.raw_data['nosharding_time']) == benchmark.run_count)
+assert (len(benchmark.raw_data['nosharding_time'][0]) == benchmark._args.num_steps)
+assert (len(benchmark.result['nosharding_time']) == benchmark.run_count)
@@ -83,7 +83,7 @@ def test_memory_bw_performance_base():
for i in range(len(expected_command)):
command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_command[i])
-for i, metric in enumerate(['H2D_Mem_BW', 'D2H_Mem_BW', 'D2D_Mem_BW']):
+for i, metric in enumerate(['h2d_bw', 'd2h_bw', 'd2d_bw']):
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
@@ -97,7 +97,7 @@ def test_memory_bw_performance_base():
for i in range(len(expected_command)):
command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_command[i])
-for i, metric in enumerate(['H2D_Mem_BW', 'D2H_Mem_BW', 'D2D_Mem_BW']):
+for i, metric in enumerate(['h2d_bw', 'd2h_bw', 'd2d_bw']):
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
@@ -92,11 +92,11 @@ def test_rocm_flops_performance(self):
assert (benchmark._process_raw_result(3, raw_output_BF16_X))
assert (benchmark._process_raw_result(4, raw_output_INT8_X))
-assert (benchmark.result['FP64'][0] == 10037.5)
-assert (benchmark.result['FP32_xDLOPS'][0] == 39441.6)
-assert (benchmark.result['FP16_xDLOPS'][0] == 153728)
-assert (benchmark.result['BF16_xDLOPS'][0] == 81374.3)
-assert (benchmark.result['INT8_xDLOPS'][0] == 162675)
+assert (benchmark.result['fp64_flops'][0] == 10037.5)
+assert (benchmark.result['fp32_xdlops_flops'][0] == 39441.6)
+assert (benchmark.result['fp16_xdlops_flops'][0] == 153728)
+assert (benchmark.result['bf16_xdlops_flops'][0] == 81374.3)
+assert (benchmark.result['int8_xdlops_iops'][0] == 162675)
# Negative case - Add invalid raw output.
assert (benchmark._process_raw_result(4, 'Invalid raw output') is False)
@@ -159,11 +159,11 @@ def test_rocm_memory_bw_performance(self):
might occur with a mixture of architectural capabilities.
"""
-for i, metric in enumerate(['htod_524288kB', 'htod_524288kB']):
+for i, metric in enumerate(['h2d_bw', 'd2h_bw']):
assert (benchmark._process_raw_result(i, raw_output[i]))
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
-assert (benchmark.result['htod_524288kB'][0] == 24.6708)
-assert (benchmark.result['dtoh_524288kB'][0] == 27.9348)
+assert (benchmark.result['h2d_bw'][0] == 25.2351)
+assert (benchmark.result['d2h_bw'][0] == 27.9348)
@@ -44,7 +44,7 @@ def test_pytorch_sharding_matmul():
# Check results and metrics.
assert (benchmark.run_count == 2)
assert (benchmark.return_code == ReturnCode.SUCCESS)
-for metric in ['allreduce', 'allgather']:
+for metric in ['allreduce_time', 'allgather_time']:
assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
assert (len(benchmark.result[metric]) == benchmark.run_count)
@@ -52,15 +52,15 @@ def test_tcp_connectivity(self):
assert (benchmark.result)
# Check results and metrics.
-assert (benchmark.result['Successed_api.github.com'][0] == 10)
-assert (benchmark.result['Failed_api.github.com'][0] == 0)
-assert (benchmark.result['Success_Rate_api.github.com'][0] == 100.0)
-assert (isinstance(benchmark.result['Minimum_api.github.com'][0], numbers.Number))
-assert (isinstance(benchmark.result['Maximum_api.github.com'][0], numbers.Number))
-assert (isinstance(benchmark.result['Average_api.github.com'][0], numbers.Number))
-assert (isinstance(benchmark.result['Successed_localhost'][0], numbers.Number))
-assert (isinstance(benchmark.result['Failed_localhost'][0], numbers.Number))
-assert (isinstance(benchmark.result['Maximum_localhost'][0], numbers.Number))
-assert (isinstance(benchmark.result['Minimum_localhost'][0], numbers.Number))
-assert (isinstance(benchmark.result['Average_localhost'][0], numbers.Number))
+assert (benchmark.result['api.github.com_successed_count'][0] == 10)
+assert (benchmark.result['api.github.com_failed_count'][0] == 0)
+assert (benchmark.result['api.github.com_success_rate'][0] == 100.0)
+assert (isinstance(benchmark.result['api.github.com_time_min'][0], numbers.Number))
+assert (isinstance(benchmark.result['api.github.com_time_max'][0], numbers.Number))
+assert (isinstance(benchmark.result['api.github.com_time_avg'][0], numbers.Number))
+assert (isinstance(benchmark.result['localhost_successed_count'][0], numbers.Number))
+assert (isinstance(benchmark.result['localhost_failed_count'][0], numbers.Number))
+assert (isinstance(benchmark.result['localhost_time_max'][0], numbers.Number))
+assert (isinstance(benchmark.result['localhost_time_min'][0], numbers.Number))
+assert (isinstance(benchmark.result['localhost_time_avg'][0], numbers.Number))
assert (benchmark.return_code == ReturnCode.SUCCESS)
@@ -135,9 +135,9 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log):
self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
for tag in ['mean', '99']:
-self.assertEqual(0.5, benchmark.result[f'gpu_lat_ms_{tag}'][0])
-self.assertEqual(0.6, benchmark.result[f'host_lat_ms_{tag}'][0])
-self.assertEqual(1.0, benchmark.result[f'end_to_end_lat_ms_{tag}'][0])
+self.assertEqual(0.5, benchmark.result[f'model_0_gpu_time_{tag}'][0])
+self.assertEqual(0.6, benchmark.result[f'model_0_host_time_{tag}'][0])
+self.assertEqual(1.0, benchmark.result[f'model_0_end_to_end_time_{tag}'][0])
# Negative case - invalid raw output
self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output'))
@@ -223,10 +223,10 @@ def test_train():
expected_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
'"start_time": null, "end_time": null, "raw_data": {'
'"steptime_train_float32": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
'"result": {"return_code": [0], "steptime_train_float32": [2.0], "throughput_train_float32": [16000.0]}, '
'"reduce_op": {"steptime_train_float32": "max", "throughput_train_float32": "min"}}'
'"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
'"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0]}, '
'"reduce_op": {"fp32_train_step_time": "max", "fp32_train_throughput": "min"}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__train(Precision.FLOAT32))
@@ -249,10 +249,11 @@ def test_inference():
expected_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
'"start_time": null, "end_time": null, "raw_data": {'
'"steptime_inference_float16": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
'"throughput_inference_float16": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
'"result": {"return_code": [0], "steptime_inference_float16": [4.0], "throughput_inference_float16": '
'[8000.0]}, "reduce_op": {"steptime_inference_float16": null, "throughput_inference_float16": null}}'
'"fp16_inference_step_time": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
'"fp16_inference_throughput": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
'"result": {"return_code": [0], '
'"fp16_inference_step_time": [4.0], "fp16_inference_throughput": [8000.0]}, '
'"reduce_op": {"fp16_inference_step_time": null, "fp16_inference_throughput": null}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16))
@@ -280,31 +281,31 @@ def test_benchmark():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
expected_raw_data = {
-'steptime_train_float32': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
-'throughput_train_float32': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]],
-'steptime_train_float16': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
-'throughput_train_float16': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]
+'fp32_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
+'fp32_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]],
+'fp16_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
+'fp16_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]
}
assert (benchmark.raw_data == expected_raw_data)
expected_result = {
'return_code': [0],
-'steptime_train_float32': [2.0],
-'throughput_train_float32': [16000.0],
-'steptime_train_float16': [2.0],
-'throughput_train_float16': [16000.0]
+'fp32_train_step_time': [2.0],
+'fp32_train_throughput': [16000.0],
+'fp16_train_step_time': [2.0],
+'fp16_train_throughput': [16000.0]
}
assert (benchmark.result == expected_result)
expected_serialized_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, "start_time": null, '
'"end_time": null, "raw_data": {"steptime_train_float32": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
'"steptime_train_float16": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"throughput_train_float16": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
'"result": {"return_code": [0], "steptime_train_float32": [2.0], "throughput_train_float32": [16000.0], '
'"steptime_train_float16": [2.0], "throughput_train_float16": [16000.0]}, '
'"reduce_op": {"steptime_train_float32": "max", "throughput_train_float32": "min", '
'"steptime_train_float16": "max", "throughput_train_float16": "min"}}'
'"end_time": null, "raw_data": {"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
'"fp16_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"fp16_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
'"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0], '
'"fp16_train_step_time": [2.0], "fp16_train_throughput": [16000.0]}, '
'"reduce_op": {"fp32_train_step_time": "max", "fp32_train_throughput": "min", '
'"fp16_train_step_time": "max", "fp16_train_throughput": "min"}}'
)
assert (benchmark.serialized_result == expected_serialized_result)
@@ -188,8 +188,7 @@ def test_pytorch_base():
# Test results.
for metric in [
-'steptime_train_float32', 'steptime_inference_float32', 'throughput_train_float32',
-'throughput_inference_float32'
+'fp32_train_step_time', 'fp32_inference_step_time', 'fp32_train_throughput', 'fp32_inference_throughput'
]:
assert (len(benchmark.raw_data[metric]) == 1)
assert (len(benchmark.raw_data[metric][0]) == 64)